In [1]:
import pandas as pd

In [2]:
dizio = pd.read_csv("DATA/dizio_apps.csv")
app = pd.read_csv("DATA/applicationevent.csv")

**<h2>Reformatting application to have a more clear nomenclature</h2>**


In [None]:
app['timestamp'] = pd.to_datetime(app['timestamp'], format='%Y%m%d%H%M%S%f')
result = app.merge(dizio, left_on='applicationname', right_on='appname', how='left')
result = result.drop('appname', axis=1)
result = result.drop('applicationname', axis=1)
result = result.drop('experimentid', axis=1)
result = result.drop('day', axis=1)
result.to_csv("DATA/app_cleaned.csv")

In [None]:
df = result.copy()

**<h2>Sort values per userid and timestamp to calculate the difference between adjacent timestamps</h2>**


In [4]:
df = df.sort_values(by=['userid', 'timestamp'])
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['time_diff'] = df['timestamp'].diff()


**<h2>Identify when app chages and set start timestamp and end timestamp to calculate, for each userid, the time spent using each app</h2>**


In [None]:
df['app_change'] = df['name'] != df['name'].shift(1)
df['app_group'] = (df['app_change'] | (df['userid'] != df['userid'].shift(1))).cumsum()
result_df = df.groupby(['userid', 'app_group', 'name'])['timestamp'].agg(['min', 'max']).reset_index()
result_df.columns = ['userid', 'app_group', 'name', 'start_timestamp', 'end_timestamp']
result_df.to_csv('DATA/app_cleaned_reduced.csv')

In [2]:
result_df = pd.read_csv("DATA/app_cleaned_reduced.csv")
result_df = result_df.drop("app_group", axis=1)
result_df = result_df.drop("Unnamed: 0", axis=1)
result_df.head()

Unnamed: 0,userid,name,start_timestamp,end_timestamp
0,1,Settings,2020-11-12 11:17:09.528,2020-11-12 11:17:14.540
1,1,Permission Controller,2020-11-12 11:17:19.540,2020-11-12 11:17:19.540
2,1,Samsung One UI Home,2020-11-12 11:17:30.192,2020-11-12 11:17:30.192
3,1,i-Log,2020-11-12 11:17:38.689,2020-11-12 11:17:43.697
4,1,Samsung One UI Home,2020-11-12 11:17:48.706,2020-11-12 11:17:59.655


In [111]:
tdiary = pd.read_stata("DATA/td_ita.dta")

In [10]:
tdiary = tdiary[['id', 'date_not','what', 'withw', 'where']]
tdiary['id'] = tdiary['id'].astype(int)
tdiary = tdiary[tdiary['what']=='Study/work group']
tdiary

Unnamed: 0,id,date_not,what,withw,where
19,0,2020-11-13 09:30:00,Study/work group,Alone,Home apartment /room
21,0,2020-11-13 10:30:00,Study/work group,Alone,Home apartment /room
23,0,2020-11-13 11:30:00,Study/work group,Alone,Home apartment /room
24,0,2020-11-13 12:00:00,Study/work group,Alone,Home apartment /room
34,0,2020-11-13 17:00:00,Study/work group,Alone,Home apartment /room
...,...,...,...,...,...
268452,266,2020-12-11 02:00:00,Study/work group,Alone,Home apartment /room
268453,266,2020-12-11 03:00:00,Study/work group,Alone,Home apartment /room
268467,266,2020-12-11 17:00:00,Study/work group,Alone,Home apartment /room
268468,266,2020-12-11 18:00:00,Study/work group,Alone,Home apartment /room


**<h2>Identify users that answered at more than 80% of the notifications</h2>**


In [112]:
data = tdiary
data_first2w = data[data['first2w'] == 'First two weeks']
total_notifications = data_first2w.groupby('id').size()
responses = data_first2w[data_first2w['what']!="No information"].groupby('id').size()
response_rate = (responses / total_notifications) * 100
users_to_remove = response_rate[response_rate < 80].index
filtered_data = data[~data['id'].isin(users_to_remove)]
filtered_data = filtered_data[['id', 'date_not','what', 'withw', 'where']]
filtered_data

Unnamed: 0,id,date_not,what,withw,where
0,0.0,2020-11-13 00:00:00,No information,No information,No information
1,0.0,2020-11-13 00:30:00,No information,No information,No information
2,0.0,2020-11-13 01:00:00,No information,No information,No information
3,0.0,2020-11-13 01:30:00,No information,No information,No information
4,0.0,2020-11-13 02:00:00,No information,No information,No information
...,...,...,...,...,...
268469,266.0,2020-12-11 19:00:00,Study/work group,Alone,Home apartment /room
268470,266.0,2020-12-11 20:00:00,Other Shopping,Alone,Home apartment /room
268471,266.0,2020-12-11 21:00:00,Other,Alone,Other university place
268472,266.0,2020-12-11 22:00:00,Sleeping,Alone,Home apartment /room


In [114]:
filtered_data.to_csv("DATA/tdiary_filtrato.csv")

**<h2>Create a function to check if a timestamp is within a study session</h2>**

In [115]:
tdiary = pd.read_csv("DATA/tdiary_filtrato.csv")

In [116]:
tdiary['date_not'] = pd.to_datetime(tdiary['date_not'])


def is_within_study_session(row, tdiary):
    date = row['start_timestamp']
    user_id = row['userid']
    study_sessions = tdiary[
        (tdiary['id'] == user_id) &
        (tdiary['date_not'] <= date) &
        (tdiary['date_not'] + pd.DateOffset(minutes=30) >= date)
    ]
    return not study_sessions.empty

In [117]:
result_df['study'] = result_df.apply(is_within_study_session, args=(tdiary,), axis=1)
result_df

Unnamed: 0,userid,name,start_timestamp,end_timestamp,study
0,1,Settings,2020-11-12 11:17:09.528,2020-11-12 11:17:14.540,False
1,1,Permission Controller,2020-11-12 11:17:19.540,2020-11-12 11:17:19.540,False
2,1,Samsung One UI Home,2020-11-12 11:17:30.192,2020-11-12 11:17:30.192,False
3,1,i-Log,2020-11-12 11:17:38.689,2020-11-12 11:17:43.697,False
4,1,Samsung One UI Home,2020-11-12 11:17:48.706,2020-11-12 11:17:59.655,False
...,...,...,...,...,...
557205,264,Scala 40 Online - Card Game,2020-12-12 00:56:37.483,2020-12-12 01:41:55.874,False
557206,264,Samsung One UI Home,2020-12-12 01:42:00.890,2020-12-12 01:42:00.890,False
557207,264,Instagram,2020-12-12 01:42:05.884,2020-12-12 01:46:41.471,False
557208,264,Follower Analyzer for Instagram,2020-12-12 01:46:46.479,2020-12-12 01:51:01.660,False


**<h2>Save two different dataset for studying and not studying</h2>**

In [118]:
study = result_df[result_df['study'] == True]
study.to_csv("DATA/APP_USAGExSTUDY.csv")

In [119]:
not_study = result_df[result_df['study'] == False]
not_study.to_csv("DATA/APP_USAGExNOTSTUDY.csv")

In [120]:
study = pd.read_csv("DATA/APP_USAGExSTUDY.csv")
not_study = pd.read_csv("DATA/APP_USAGExNOTSTUDY.csv")

In [121]:
dizio = pd.read_csv("DATA/dizio_apps.csv")

**<h2>Create and apply a function to reformat from apps names to apps classes</h2>**


In [122]:
names_transformer = dict()
for _, row in dizio.iterrows():
    names_transformer[row['name']]=row['class']
study['name'] = study['name'].map(names_transformer)
not_study['name'] = not_study['name'].map(names_transformer)

In [123]:
catDiz = {
    'Social': ['facebook', 'instagram', 'tiktok', 'social'],
    'Chat': ['communication', 'chat', 'phone/messages', 'whatsapp messenger', 'facebook messenger', 'telegram'],
    'Productivity': ['browser', 'tools', 'edit', 'edit & share docs on the go', 'edit', 'productivity', 'calendars & files', 'excel', 'ocr', 'new & modern web browser', 'private & safe web browser', 'reading'],
    'Media': ['fotolibri', 'music', 'photography', 'media', 'video chat & hang out with friends', 'podcasts & audio stories', 'sound & music', 'trakt client for tv shows', 'journal', 'results & scores', 'market & news', 'news & magazines'],
    'Shopping': ['clothes shopping', 'food shopping', 'shopping', 'shop', 'biglietti', 'food', 'food delivery', 'great pizza'],
    'Game': ['game', 'multiplayer'],
    'Health & Fitness': ['health & fitness', 'cycling & swimming', 'fresh & clean'],
    'Finanza': ['finance', 'money manager'],
    'Altro': ['art', 'travel & local', 'i-log', 'other', 'climate awareness', 'nan', 'javascript', 'free', 'dating', 'fresh & clean', 'results & scores', 'top stories & lifestyle', 'mssql and more', 'the app dei servizi pubblici', 'free', 'tracker', 'android devices', "l'app dei servizi pubblici", 'fertility', 'art', 'sync', 'faster way to search']
}

def rearrange_category(category):
    category = str(category)
    for cat in catDiz.keys():
        if category.lower().strip() in catDiz[cat]:
            return cat
    return 'Altro'


In [124]:
study = study.drop('Unnamed: 0', axis=1)
study = study[study['name']!='Launcher']
study = study[study['name']!='nan']

not_study = not_study.drop('Unnamed: 0', axis=1)
not_study = not_study[not_study['name']!='Launcher']
not_study = not_study[not_study['name']!='nan']

study['name'] = study['name'].map(rearrange_category)
not_study['name'] = not_study['name'].map(rearrange_category)

In [125]:
study

Unnamed: 0,userid,name,start_timestamp,end_timestamp,study
0,1,Social,2020-11-13 00:38:55.823,2020-11-13 00:38:55.823,True
2,1,Productivity,2020-11-13 00:39:06.495,2020-11-13 00:39:06.495,True
4,1,Productivity,2020-11-13 00:39:16.494,2020-11-13 00:39:21.514,True
6,1,Productivity,2020-11-13 08:10:02.180,2020-11-13 08:10:22.953,True
8,1,Productivity,2020-11-13 08:34:02.181,2020-11-13 09:05:48.118,True
...,...,...,...,...,...
324069,264,Chat,2020-12-11 23:17:17.942,2020-12-11 23:18:28.018,True
324070,264,Media,2020-12-11 23:18:33.020,2020-12-11 23:18:58.043,True
324071,264,Altro,2020-12-11 23:19:03.038,2020-12-11 23:19:58.073,True
324072,264,Altro,2020-12-11 23:20:03.076,2020-12-11 23:20:03.076,True


In [126]:
study.to_csv("DATA/CLASS_USAGExSTUDY.csv")

In [127]:
not_study

Unnamed: 0,userid,name,start_timestamp,end_timestamp,study
0,1,Productivity,2020-11-12 11:17:09.528,2020-11-12 11:17:14.540,False
1,1,Productivity,2020-11-12 11:17:19.540,2020-11-12 11:17:19.540,False
3,1,Altro,2020-11-12 11:17:38.689,2020-11-12 11:17:43.697,False
5,1,Chat,2020-11-12 11:18:04.663,2020-11-12 11:21:00.473,False
7,1,Chat,2020-11-12 11:21:55.668,2020-11-12 11:22:30.743,False
...,...,...,...,...,...
233130,264,Chat,2020-12-12 00:56:22.481,2020-12-12 00:56:32.483,False
233131,264,Game,2020-12-12 00:56:37.483,2020-12-12 01:41:55.874,False
233133,264,Social,2020-12-12 01:42:05.884,2020-12-12 01:46:41.471,False
233134,264,Social,2020-12-12 01:46:46.479,2020-12-12 01:51:01.660,False


In [128]:
not_study.to_csv("DATA/CLASS_USAGExNOTSTUDY.csv")

In [135]:
ids_to_keep = list(filtered_data.id.unique().astype(int))

In [137]:
print(ids_to_keep)

[0, 1, 2, 3, 4, 5, 6, 8, 9, 15, 18, 19, 20, 21, 26, 27, 28, 30, 31, 32, 33, 34, 35, 40, 41, 42, 43, 44, 45, 50, 52, 54, 57, 58, 59, 60, 61, 65, 66, 70, 73, 74, 75, 76, 77, 79, 80, 84, 87, 89, 91, 97, 98, 99, 100, 103, 105, 107, 109, 111, 112, 113, 114, 118, 119, 121, 126, 128, 130, 131, 132, 134, 136, 141, 144, 146, 148, 155, 157, 158, 160, 161, 162, 163, 165, 166, 169, 174, 176, 177, 187, 190, 191, 192, 194, 197, 198, 199, 200, 202, 203, 204, 206, 207, 208, 209, 212, 213, 215, 216, 217, 218, 222, 223, 225, 226, 229, 230, 231, 234, 238, 239, 240, 244, 250, 251, 252, 253, 254, 255, 256, 258, 259, 260, 261, 262, 264, 266]
