In [None]:
from matplotlib import pyplot
import pandas as pd
import numpy as np
from datetime import datetime

def days_between(d1, d2):
    """Return the absolute number of days between two ``YYYY-MM-DD`` date strings.

    Parameters
    ----------
    d1 : str or None
        First date. ``None`` stands for "no previous date yet" and yields 0
        (``d2`` is not parsed in that case).
    d2 : str
        Second date.

    Returns
    -------
    int
        ``abs(d2 - d1)`` in whole days, or 0 when ``d1`` is ``None``.

    Raises
    ------
    ValueError
        If a parsed string does not match ``%Y-%m-%d``.
    """
    # Identity test: `== None` defers to __eq__, which array-like objects overload
    if d1 is None:
        return 0
    date_format = "%Y-%m-%d"
    first = datetime.strptime(d1, date_format)
    second = datetime.strptime(d2, date_format)
    return abs((second - first).days)

# Load the raw measurements (columns: id, time, variable, value); the first
# CSV column is used as the row index.
pandas_df = pd.read_csv('dataset_mood_smartphone.csv', index_col=0, header=0)
# The number of measured days differs per patient, so collect the distinct
# patient ids once.
list_ids = pandas_df['id'].unique()

### Make a dictionary with the IDs and the dates

In [None]:
# Map every patient id to the dates (one 'YYYY-MM-DD' string per measurement
# row, duplicates included) on which that patient has data.
time_dict = {patient_id: [] for patient_id in list_ids}

# One pass over the two needed columns; each row is appended directly to its
# own patient's list instead of being compared against every known id.
for patient_id, timestamp in zip(pandas_df['id'], pandas_df['time']):
    # 'time' is '<date> <clock>'; keep only the date part
    time_dict[patient_id].append(timestamp.split()[0])


### See how many unique dates each ID has (to determine the aggregation)

In [None]:
# Print, per patient, how many distinct dates carry at least one measurement.
# The aggregation window should stay small: mood can change quickly, even
# within a single day.
for key, value in time_dict.items():
    unique_dates = set(value)
    print(len(unique_dates))
    # Sorted distinct dates of the patient handled in this iteration
    l = sorted(unique_dates)

72
68
77
70
74
50
67
71
67
72
72
79
74
75
73
66
63
62
75
100
80
58
73
70
78
86
101


### Check for missing values

In [None]:
# Sanity check on mood: print every mood row whose value is not one of the
# integers 1..10 (a NaN value fails the membership test as well).
# No printed rows means mood has no missing or off-scale values.
valid_moods = range(1, 11)
for index, row in pandas_df.iterrows():
    is_mood = row['variable'] == 'mood'
    if is_mood and row['value'] not in valid_moods:
        print(row)


In [None]:
# Collect the timestamps of valence rows whose value is not one of the
# integers -2..2 (NaN fails the membership test too); 156 such rows exist.
valence_scale = range(-2, 3)
mv_list = [
    row['time']
    for index, row in pandas_df.iterrows()
    if row['variable'] == 'circumplex.valence' and row['value'] not in valence_scale
]

print(len(mv_list))

156


In [None]:
# Count communication-app rows whose value is missing or not a float.
# A type check alone cannot find missing values because NaN is itself a float,
# so missing entries are tested for explicitly with pd.isna.
com_list = []
for index, row in pandas_df.iterrows():
    if row['variable'] != 'appCat.communication':
        continue
    value = row['value']
    if pd.isna(value) or not isinstance(value, float):
        com_list.append(row['time'])

print(len(com_list))

0


In [None]:
# Number of missing (NaN) entries in each column of the raw frame
pandas_df.isna().sum()

id            0
time          0
variable      0
value       202
dtype: int64

### Calculating the average mood per day per patient 

In [None]:
# Average mood per day per patient.
#
# The mood rows are walked in file order while the readings of the current
# (patient, date) group are summed. A finished day is emitted only when the
# next mood row is dated exactly one day away (days_between(...) == 1) --
# presumably so that every emitted day has a next-day reading; confirm.
# The accumulators are cleared at EVERY change of date or patient, emitted or
# not, so an emitted average only contains readings of its own day and
# patient. The last group in the file is never emitted.
last_value = None   # date of the previous mood row
last_id = None      # patient id of the previous mood row
total_mood = 0
amount_mood = 0
average_list = []
date_list = []
id_list = []
unique_id_list = []
for index, row in pandas_df.iterrows():
    if row['variable'] != 'mood':
        continue

    new_value = row['time'].split()[0]
    days = days_between(last_value, new_value)
    date_changed = last_value != new_value
    id_changed = last_id is not None and row['id'] != last_id

    if date_changed and amount_mood > 0 and days == 1:
        # Label the finished day with the patient it was collected for (the
        # previous row's id), not with the id of the row that closed it.
        # The frame needs unique ids to work with (it had none before).
        unique = last_value + '_' + last_id
        id_list.append(last_id)
        date_list.append(last_value)
        unique_id_list.append(unique)

        average = total_mood / amount_mood
        average_list.append(average)

    if date_changed or id_changed:
        # Start a fresh group so readings never spill into another day/patient
        total_mood = 0
        amount_mood = 0

    total_mood += row['value']
    amount_mood += 1
    last_value = new_value
    last_id = row['id']


In [None]:
# Build a new dataframe holding, per emitted day, the unique id, the patient
# id, the date and the average mood.
new_df = pd.DataFrame({
    'unique_id': unique_id_list,
    'id': id_list,
    'date': date_list,
    'average_mood': average_list,
})

### Other features

In [None]:
# Total number of calls per day per patient, keyed by '<date>_<id>' (the same
# format as new_df['unique_id']). Every 'call' row counts as one call.
# Counting per key keeps each day's total separate, whatever the gap between
# two call days, and covers the last day of every patient as well.
call_rows = pandas_df[pandas_df['variable'] == 'call']
# 'time' is '<date> <clock>'; the key uses only the date part
call_keys = call_rows['time'].str.split().str[0] + '_' + call_rows['id']
call_dict = call_keys.value_counts().to_dict()

In [None]:
# Attach the per-day call counts; unique_ids missing from call_dict become NaN
new_df['total_calls'] = new_df['unique_id'].map(call_dict)

In [None]:
# Total number of SMS per day per patient, keyed by '<date>_<id>' (the same
# format as new_df['unique_id']). Every 'sms' row counts as one message.
# Counting per key keeps each day's total separate, whatever the gap between
# two SMS days, and covers the last day of every patient as well.
sms_rows = pandas_df[pandas_df['variable'] == 'sms']
# 'time' is '<date> <clock>'; the key uses only the date part
sms_keys = sms_rows['time'].str.split().str[0] + '_' + sms_rows['id']
sms_dict = sms_keys.value_counts().to_dict()

In [None]:
# Attach the per-day SMS counts; unique_ids missing from sms_dict become NaN
new_df['total_sms'] = new_df['unique_id'].map(sms_dict)

In [None]:
# Average 'appCat.communication' value per day per patient, keyed by
# '<date>_<id>' (the same format as new_df['unique_id']).
# Grouping on the key built from each row's OWN date and id keeps patients and
# days apart and includes the last day of every patient.
# Missing readings (NaN) are skipped by the mean; a day with only missing
# readings stays NaN.
com_rows = pandas_df[pandas_df['variable'] == 'appCat.communication']
# 'time' is '<date> <clock>'; the key uses only the date part
com_keys = com_rows['time'].str.split().str[0] + '_' + com_rows['id']
com_dict = com_rows['value'].groupby(com_keys.to_numpy()).mean().to_dict()

In [None]:
# Attach the per-day communication averages; unique_ids missing from com_dict become NaN
new_df['average_com'] = new_df['unique_id'].map(com_dict)

In [None]:
# Prediction target: for every entry, the average mood of the entry that
# follows it in average_list. The last entry has no successor and receives a
# 0 placeholder. Deriving the end from the list itself (instead of a fixed
# index) keeps this correct for any number of entries, including none.
# NOTE(review): the following entry is the next *recorded* day; it is not
# checked to be the next calendar day or to belong to the same patient.
nextday = (average_list[1:] + [0]) if average_list else []

In [None]:
# Store the next-entry mood as the target column (one value per row of new_df)
new_df['mood_next_day'] = nextday

In [None]:
# Display the feature frame assembled so far
new_df

Unnamed: 0,unique_id,id,date,average_mood,total_calls,total_sms,average_com,mood_next_day
0,2014-02-26_AS14.01,AS14.01,2014-02-26,6.25,,,,6.25
1,2014-03-21_AS14.01,AS14.01,2014-03-21,6.25,6.0,,55.095526,6.40
2,2014-03-22_AS14.01,AS14.01,2014-03-22,6.40,,,51.697063,6.80
3,2014-03-23_AS14.01,AS14.01,2014-03-23,6.80,,,53.442031,6.00
4,2014-03-24_AS14.01,AS14.01,2014-03-24,6.00,,,47.541687,6.75
...,...,...,...,...,...,...,...,...
1211,2014-05-26_AS14.33,AS14.33,2014-05-26,5.40,10.0,3.0,34.320472,6.20
1212,2014-05-27_AS14.33,AS14.33,2014-05-27,6.20,1.0,2.0,59.382943,8.20
1213,2014-05-28_AS14.33,AS14.33,2014-05-28,8.20,10.0,1.0,37.238750,7.00
1214,2014-05-29_AS14.33,AS14.33,2014-05-29,7.00,5.0,,54.388125,6.80


In [None]:
# Remove the final row: it has no following entry, so its mood_next_day is
# only the 0 placeholder. Slicing to a fixed length (one less than the number
# of computed averages) makes the cell safe to re-run, unlike dropping the
# tail in place, which removes one more row on every execution.
new_df = new_df.iloc[:len(average_list) - 1].copy()

In [None]:
# Average 'appCat.social' value per day per patient, keyed by '<date>_<id>'
# (the same format as new_df['unique_id']).
# Grouping on the key built from each row's OWN date and id keeps patients and
# days apart and includes the last day of every patient.
# Missing readings (NaN) are skipped by the mean; a day with only missing
# readings stays NaN.
soc_rows = pandas_df[pandas_df['variable'] == 'appCat.social']
# 'time' is '<date> <clock>'; the key uses only the date part
soc_keys = soc_rows['time'].str.split().str[0] + '_' + soc_rows['id']
soc_dict = soc_rows['value'].groupby(soc_keys.to_numpy()).mean().to_dict()

In [None]:
# Attach the per-day social-app averages; unique_ids missing from soc_dict become NaN
new_df['average_soc'] = new_df['unique_id'].map(soc_dict)

In [None]:
# Average valence per day per patient, keyed by '<date>_<id>' (the same
# format as new_df['unique_id']).
# Grouping on the key built from each row's OWN date and id keeps patients and
# days apart and includes the last day of every patient.
# Valence contains missing readings: the mean skips them, so one NaN no longer
# turns the whole day into NaN; a day with only missing readings stays NaN.
val_rows = pandas_df[pandas_df['variable'] == 'circumplex.valence']
# 'time' is '<date> <clock>'; the key uses only the date part
val_keys = val_rows['time'].str.split().str[0] + '_' + val_rows['id']
val_dict = val_rows['value'].groupby(val_keys.to_numpy()).mean().to_dict()

In [None]:
# Attach the per-day valence averages; unique_ids missing from val_dict become NaN
new_df['average_valence'] = new_df['unique_id'].map(val_dict)

In [None]:
# Average arousal per day per patient, keyed by '<date>_<id>' (the same
# format as new_df['unique_id']).
# Grouping on the key built from each row's OWN date and id keeps patients and
# days apart and includes the last day of every patient.
# Missing readings (NaN) are skipped by the mean, so one NaN does not turn the
# whole day into NaN; a day with only missing readings stays NaN.
ar_rows = pandas_df[pandas_df['variable'] == 'circumplex.arousal']
# 'time' is '<date> <clock>'; the key uses only the date part
ar_keys = ar_rows['time'].str.split().str[0] + '_' + ar_rows['id']
ar_dict = ar_rows['value'].groupby(ar_keys.to_numpy()).mean().to_dict()

In [None]:
# Attach the per-day arousal averages; unique_ids missing from ar_dict become NaN
new_df['average_arousal'] = new_df['unique_id'].map(ar_dict)

In [None]:
# Average 'appCat.finance' value per day per patient, keyed by '<date>_<id>'
# (the same format as new_df['unique_id']).
# Grouping on the key built from each row's OWN date and id keeps patients and
# days apart and includes the last day of every patient.
# Missing readings (NaN) are skipped by the mean; a day with only missing
# readings stays NaN.
fi_rows = pandas_df[pandas_df['variable'] == 'appCat.finance']
# 'time' is '<date> <clock>'; the key uses only the date part
fi_keys = fi_rows['time'].str.split().str[0] + '_' + fi_rows['id']
fi_dict = fi_rows['value'].groupby(fi_keys.to_numpy()).mean().to_dict()

In [None]:
# Attach the per-day finance-app averages; unique_ids missing from fi_dict become NaN
new_df['average_finance'] = new_df['unique_id'].map(fi_dict)

In [None]:
# Average 'appCat.office' value per day per patient, keyed by '<date>_<id>'
# (the same format as new_df['unique_id']).
# Grouping on the key built from each row's OWN date and id keeps patients and
# days apart and includes the last day of every patient.
# Missing readings (NaN) are skipped by the mean; a day with only missing
# readings stays NaN.
of_rows = pandas_df[pandas_df['variable'] == 'appCat.office']
# 'time' is '<date> <clock>'; the key uses only the date part
of_keys = of_rows['time'].str.split().str[0] + '_' + of_rows['id']
of_dict = of_rows['value'].groupby(of_keys.to_numpy()).mean().to_dict()

In [None]:
# Attach the per-day office-app averages; unique_ids missing from of_dict become NaN
new_df['average_office'] = new_df['unique_id'].map(of_dict)

In [None]:
# Average 'appCat.entertainment' value per day per patient, keyed by
# '<date>_<id>' (the same format as new_df['unique_id']).
# Grouping on the key built from each row's OWN date and id keeps patients and
# days apart and includes the last day of every patient.
# Missing readings (NaN) are skipped by the mean; a day with only missing
# readings stays NaN.
ent_rows = pandas_df[pandas_df['variable'] == 'appCat.entertainment']
# 'time' is '<date> <clock>'; the key uses only the date part
ent_keys = ent_rows['time'].str.split().str[0] + '_' + ent_rows['id']
ent_dict = ent_rows['value'].groupby(ent_keys.to_numpy()).mean().to_dict()

In [None]:
# Attach the per-day entertainment-app averages; unique_ids missing from ent_dict become NaN
new_df['average_entertainment'] = new_df['unique_id'].map(ent_dict)

In [None]:
# Average 'appCat.game' value per day per patient, keyed by '<date>_<id>'
# (the same format as new_df['unique_id']).
# Grouping on the key built from each row's OWN date and id keeps patients and
# days apart and includes the last day of every patient.
# Missing readings (NaN) are skipped by the mean; a day with only missing
# readings stays NaN.
gam_rows = pandas_df[pandas_df['variable'] == 'appCat.game']
# 'time' is '<date> <clock>'; the key uses only the date part
gam_keys = gam_rows['time'].str.split().str[0] + '_' + gam_rows['id']
gam_dict = gam_rows['value'].groupby(gam_keys.to_numpy()).mean().to_dict()

In [None]:
# Attach the per-day game-app averages; unique_ids missing from gam_dict become NaN
new_df['average_game'] = new_df['unique_id'].map(gam_dict)

In [None]:
# Persist the feature frame; the DataFrame index is written as the first CSV column
new_df.to_csv('new_data.csv')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=d79defa5-2359-4699-82dc-d6bf5eddd7a7' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>