**<h1>PREPROCESSING</h1>**

In [1]:
import pandas as pd

<h3>We check for duplicates and drop every duplicate event</h3>

In [None]:
# Load the raw touch-event log and convert the compact, separator-free
# timestamp strings into pandas datetimes.
touch = pd.read_csv("DATA/touchevent.csv")
touch['timestamp'] = pd.to_datetime(
    touch['timestamp'],
    format='%Y%m%d%H%M%S%f',
)

# Two fully identical rows necessarily share a timestamp, so a repeated
# timestamp is used as the trigger for row-level de-duplication.
if touch['timestamp'].duplicated().any():
    touch = touch.drop_duplicates()

# The index is written as well; it reappears as an "Unnamed: 0" column when
# the file is read back.
touch.to_csv("DATA/touches_events_duplicates_dropped.csv")

In [2]:
# Reload the de-duplicated touch events written by the previous step.
df_touches = pd.read_csv("DATA/touches_events_duplicates_dropped.csv")

In [3]:
# The index saved with the CSV was read back as an extra "Unnamed: 0" column;
# discard it.
df_touches = df_touches.drop(columns=["Unnamed: 0"])

# Summary statistics of the integer-coded `day` column.
df_touches['day'].describe()

count    1.985914e+07
mean     2.020115e+07
std      4.047025e+01
min      2.020111e+07
25%      2.020112e+07
50%      2.020112e+07
75%      2.020120e+07
max      2.020122e+07
Name: day, dtype: float64

<h2>WE FILTER THE DATASET SAVING ONLY RECORDS OF THE FIRST TWO WEEKS</h2>

In [4]:
# Keep only the events whose integer `day` value lies in the inclusive range
# 20201110 .. 20201130.
# NOTE(review): if `day` encodes YYYYMMDD (as the previewed rows suggest), this
# window spans 21 calendar days, whereas the section heading speaks of "the
# first two weeks" - confirm which of the two is intended.
#
# `.copy()` turns the filtered result into an independent frame, so the column
# assignments done on `df_touches` in later cells do not raise
# SettingWithCopyWarning.
df_touches = df_touches[
    (df_touches['day'] >= 20201110) & (df_touches['day'] <= 20201130)
].copy()
df_touches

Unnamed: 0,experimentid,userid,day,timestamp
0,wenetItaly,144,20201117,2020-11-17 23:57:50
1,wenetItaly,144,20201117,2020-11-17 23:57:48
2,wenetItaly,144,20201117,2020-11-17 23:55:26
3,wenetItaly,144,20201117,2020-11-17 23:55:25
4,wenetItaly,144,20201117,2020-11-17 23:55:24
...,...,...,...,...
19859133,wenetItaly,55,20201128,2020-11-28 03:15:13
19859134,wenetItaly,55,20201128,2020-11-28 03:15:12
19859135,wenetItaly,55,20201128,2020-11-28 03:15:10
19859136,wenetItaly,55,20201128,2020-11-28 03:15:05


In [7]:
# Persist the date-filtered events; the per-minute section reloads this file.
df_touches.to_csv("DATA/touch_events_f2w.csv")

**<h2>CONVERT THE TIMESTAMPS TO DATETIME FORMAT</h2>**

In [8]:
# Parse the timestamp strings read from the CSV into pandas datetimes.
# NOTE(review): the format demands a fractional-seconds part (".%f"), but the
# values previewed in the output above show none. Recent pandas releases match
# an explicit format strictly - confirm this still parses on the version in use.
df_touches['timestamp'] = pd.to_datetime(df_touches['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')

**<h2>FLOOR THE TIMESTAMPS TO 1 SECOND AND COUNT THE TOUCHES PER USER PER SECOND</h2>**

In [9]:
# Truncate every timestamp to the start of its second. The lowercase 's' alias
# is used because the uppercase 'S' spelling is deprecated in recent pandas
# releases, while 's' is accepted by old and new versions alike.
df_touches['timestamp'] = df_touches['timestamp'].dt.floor('s')

# Number of touch events for each (user, second) pair.
touch_count = df_touches.groupby(['userid','timestamp']).size().reset_index(name='touch_count')

# Attach the count to every event row. Rows that fell into the same second are
# now identical, so they are collapsed below to keep each of them only once.
result_df = df_touches.merge(touch_count, on=['userid','timestamp'])
if result_df['timestamp'].duplicated().any():
    result_df = result_df.drop_duplicates()

In [11]:
# Show the per-second table.
result_df

Unnamed: 0,experimentid,userid,day,timestamp,touch_count
0,wenetItaly,144,20201117,2020-11-17 23:57:50,1
1,wenetItaly,144,20201117,2020-11-17 23:57:48,1
2,wenetItaly,144,20201117,2020-11-17 23:55:26,1
3,wenetItaly,144,20201117,2020-11-17 23:55:25,1
4,wenetItaly,144,20201117,2020-11-17 23:55:24,1
...,...,...,...,...,...
13451536,wenetItaly,55,20201128,2020-11-28 03:15:13,1
13451537,wenetItaly,55,20201128,2020-11-28 03:15:12,1
13451538,wenetItaly,55,20201128,2020-11-28 03:15:10,1
13451539,wenetItaly,55,20201128,2020-11-28 03:15:05,1


In [12]:
# Save the per-second touch counts to disk.
result_df.to_csv("DATA/touches_per_second.csv")

**<h2>CREATE DATASET FLOORED TO 1 MINUTE</h2>**

In [2]:
# Start again from the date-filtered events saved earlier in this notebook.
df_touches = pd.read_csv("DATA/touch_events_f2w.csv")

In [3]:
# Discard the "Unnamed: 0" column that the reloaded CSV carries, then show
# the frame.
df_touches = df_touches.drop(columns=["Unnamed: 0"])
df_touches

Unnamed: 0,experimentid,userid,day,timestamp
0,wenetItaly,144,20201117,2020-11-17 23:57:50
1,wenetItaly,144,20201117,2020-11-17 23:57:48
2,wenetItaly,144,20201117,2020-11-17 23:55:26
3,wenetItaly,144,20201117,2020-11-17 23:55:25
4,wenetItaly,144,20201117,2020-11-17 23:55:24
...,...,...,...,...
13451536,wenetItaly,55,20201128,2020-11-28 03:15:13
13451537,wenetItaly,55,20201128,2020-11-28 03:15:12
13451538,wenetItaly,55,20201128,2020-11-28 03:15:10
13451539,wenetItaly,55,20201128,2020-11-28 03:15:05


In [4]:
# Parse the timestamp strings read from the CSV into pandas datetimes.
df_touches['timestamp'] = pd.to_datetime(df_touches['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')

# Truncate every timestamp to the start of its minute. 'min' replaces the 'T'
# alias, which is deprecated in recent pandas releases; 'min' is accepted by
# old and new versions alike.
df_touches['timestamp'] = df_touches['timestamp'].dt.floor('min')

# Number of touch events for each (user, minute) pair, attached to every
# event row through a merge on the same two keys.
touch_count = df_touches.groupby(['userid','timestamp']).size().reset_index(name='touch_count')
result_df_1min = df_touches.merge(touch_count, on=['userid','timestamp'])

In [5]:
# Rows that share a minute bin are identical after the merge; when any
# timestamp repeats, collapse identical rows so each is kept once.
if result_df_1min['timestamp'].duplicated().any():
    result_df_1min = result_df_1min.drop_duplicates()

In [6]:
# Show the per-minute table.
result_df_1min

Unnamed: 0,experimentid,userid,day,timestamp,touch_count
0,wenetItaly,144,20201117,2020-11-17 23:57:00,2
2,wenetItaly,144,20201117,2020-11-17 23:55:00,7
9,wenetItaly,144,20201117,2020-11-17 23:54:00,9
18,wenetItaly,144,20201117,2020-11-17 23:53:00,2
20,wenetItaly,144,20201117,2020-11-17 23:51:00,30
...,...,...,...,...,...
13451450,wenetItaly,55,20201128,2020-11-28 03:19:00,12
13451462,wenetItaly,55,20201128,2020-11-28 03:18:00,1
13451463,wenetItaly,55,20201128,2020-11-28 03:17:00,3
13451466,wenetItaly,55,20201128,2020-11-28 03:16:00,42


In [7]:
# Save the per-minute touch counts to disk.
result_df_1min.to_csv("DATA/touches_per_minute.csv")

**<h2>CREATE DATASET FLOORED TO 30 MINUTES</h2>**

In [3]:
# Reload the date-filtered events before building the 30-minute table.
# On a top-to-bottom run `df_touches` has, by this point, already lost its
# "Unnamed: 0" column and had its timestamps floored to the minute by the
# per-minute section, so dropping the column again would raise a KeyError.
df_touches = pd.read_csv("DATA/touch_events_f2w.csv")

# Discard the "Unnamed: 0" column that the reloaded CSV carries, then show
# the frame.
df_touches = df_touches.drop("Unnamed: 0", axis=1)
df_touches

Unnamed: 0,experimentid,userid,day,timestamp
0,wenetItaly,144,20201117,2020-11-17 23:57:50
1,wenetItaly,144,20201117,2020-11-17 23:57:48
2,wenetItaly,144,20201117,2020-11-17 23:55:26
3,wenetItaly,144,20201117,2020-11-17 23:55:25
4,wenetItaly,144,20201117,2020-11-17 23:55:24
...,...,...,...,...
13451536,wenetItaly,55,20201128,2020-11-28 03:15:13
13451537,wenetItaly,55,20201128,2020-11-28 03:15:12
13451538,wenetItaly,55,20201128,2020-11-28 03:15:10
13451539,wenetItaly,55,20201128,2020-11-28 03:15:05


In [4]:
# Parse the timestamp strings read from the CSV into pandas datetimes.
df_touches['timestamp'] = pd.to_datetime(df_touches['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')

# Truncate every timestamp to the start of its 30-minute bin. '30min' replaces
# '30T', whose 'T' alias is deprecated in recent pandas releases; 'min' is
# accepted by old and new versions alike.
df_touches['timestamp'] = df_touches['timestamp'].dt.floor('30min')

# Number of touch events for each (user, 30-minute bin) pair, attached to
# every event row through a merge on the same two keys.
touch_count = df_touches.groupby(['userid','timestamp']).size().reset_index(name='touch_count')
result_df_30min = df_touches.merge(touch_count, on=['userid','timestamp'])

In [6]:
# Rows that share a 30-minute bin are identical after the merge; when any
# timestamp repeats, collapse identical rows so each is kept once.
if result_df_30min['timestamp'].duplicated().any():
    result_df_30min = result_df_30min.drop_duplicates()

In [7]:
# Show the 30-minute table.
result_df_30min

Unnamed: 0,experimentid,userid,day,timestamp,touch_count
0,wenetItaly,144,20201117,2020-11-17 23:30:00,93
93,wenetItaly,144,20201117,2020-11-17 23:00:00,1
94,wenetItaly,144,20201117,2020-11-17 22:00:00,10
104,wenetItaly,144,20201117,2020-11-17 21:30:00,37
141,wenetItaly,144,20201117,2020-11-17 21:00:00,40
...,...,...,...,...,...
13450610,wenetItaly,55,20201128,2020-11-28 09:00:00,287
13450897,wenetItaly,55,20201128,2020-11-28 08:30:00,335
13451232,wenetItaly,55,20201128,2020-11-28 08:00:00,196
13451428,wenetItaly,55,20201128,2020-11-28 03:30:00,21


In [8]:
# Save the 30-minute touch counts to disk.
result_df_30min.to_csv("DATA/touches_per_30minutes.csv")