In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats


# read csv file
sdf = pd.read_csv('data/dataset_mood_smartphone.csv', index_col=0)

# drop rows where there's no number in column named 'value'
# sdf.dropna()

# retrieve index for which column named 'value' is negative
index_neg = sdf[sdf['value'] < 0].index

# drop those indexes
sdf.drop(index_neg, inplace=True)

display(sdf)

Unnamed: 0,id,time,variable,value
1,AS14.01,2014-02-26 13:00:00.000,mood,6.000
2,AS14.01,2014-02-26 15:00:00.000,mood,6.000
3,AS14.01,2014-02-26 18:00:00.000,mood,6.000
4,AS14.01,2014-02-26 21:00:00.000,mood,7.000
5,AS14.01,2014-02-27 09:00:00.000,mood,6.000
...,...,...,...,...
2770399,AS14.30,2014-04-11 07:51:16.948,appCat.weather,8.032
2772465,AS14.30,2014-04-19 11:00:32.747,appCat.weather,3.008
2774026,AS14.30,2014-04-26 10:19:07.434,appCat.weather,7.026
2774133,AS14.30,2014-04-27 00:44:48.450,appCat.weather,23.033


In [2]:
def timechange(sdf):
    sdf["time"] = pd.to_datetime(sdf["time"])
    sdf["hour"] = sdf["time"].dt.hour
    sdf["time"] = sdf["time"].dt.date

    return sdf
sdf = timechange(sdf)
display(sdf)



Unnamed: 0,id,time,variable,value,hour
1,AS14.01,2014-02-26,mood,6.000,13
2,AS14.01,2014-02-26,mood,6.000,15
3,AS14.01,2014-02-26,mood,6.000,18
4,AS14.01,2014-02-26,mood,7.000,21
5,AS14.01,2014-02-27,mood,6.000,9
...,...,...,...,...,...
2770399,AS14.30,2014-04-11,appCat.weather,8.032,7
2772465,AS14.30,2014-04-19,appCat.weather,3.008,11
2774026,AS14.30,2014-04-26,appCat.weather,7.026,10
2774133,AS14.30,2014-04-27,appCat.weather,23.033,0


In [3]:
# create sdf with statistical values from csv 
def features(sdf):
    # create list in order to store values in for loop
    base_stats = []
    # loop through rows of sdf
    for var in sdf.variable.unique(): # var are the distinct features in the column named 'variable'
        subsetseries = sdf[sdf["variable"] == var].value # subsetseries are the numbers given in column named 'value'
        description_dict = subsetseries.describe().round(2).to_dict() # description_dict is returning a dict consisting of count, mean, std, min, 25%, 50%, 75%, and max values          
        description_dict['trimmed mean'] = stats.trim_mean(subsetseries, 0.05) 
        description_dict["variable"] = var
        base_stats.append(description_dict)
    # create new sdf
    base_stats_sdf = pd.DataFrame(base_stats)
    base_stats_sdf = base_stats_sdf.set_index("variable")

    return base_stats_sdf
print(features(sdf))

                        count    mean     std   min    25%    50%     75%  \
variable                                                                    
mood                   5641.0    6.99    1.03  1.00   7.00   7.00    8.00   
circumplex.arousal     3464.0    0.61    0.61  0.00   0.00   1.00    1.00   
circumplex.valence     5167.0    0.80    0.51  0.00   0.00   1.00    1.00   
activity              22965.0    0.12    0.19  0.00   0.00   0.02    0.16   
screen                96578.0   75.34  253.82  0.04   5.32  20.04   62.54   
call                   5239.0    1.00    0.00  1.00   1.00   1.00    1.00   
sms                    1798.0    1.00    0.00  1.00   1.00   1.00    1.00   
appCat.builtin        91285.0   19.45  312.91  0.00   2.02   4.04    9.92   
appCat.communication  74276.0   43.34  128.91  0.01   5.22  16.23   45.48   
appCat.entertainment  27124.0   37.58  262.97  0.00   1.33   3.39   14.92   
appCat.finance          939.0   21.76   39.22  0.13   4.07   8.03   20.16   

In [4]:
# create sdf with statistical values from csv when appCat and circumplex values are summed
def features_aggr(sdf, aggr_features=False):


    # sum values for appCat and circumplex variables
    if aggr_features:
        sdf['variable'] = sdf['variable'].apply(lambda x: 'appCat.values' if 'appCat' in x else x)
        sdf['variable'] = sdf['variable'].apply(lambda x: 'circumplex.values' if 'circumplex' in x else x)

    # create list in order to store values in for loop
    statistics = []

    # loop through rows of sdf
    for var in sdf.variable.unique(): # var are the distinct features in the column named 'variable'
        col_val = sdf[sdf["variable"] == var].value # subsetseries are the numbers given in column named 'value'
        description_dict = col_val.describe().round(2).to_dict() # description_dict is returning a dict consisting of count, mean, std, min, 25%, 50%, 75%, and max values          
        description_dict['trimmed mean'] = stats.trim_mean(col_val, 0.05) 
        description_dict["variable"] = var
        statistics.append(description_dict)

    # create a new sdf
    statistics_sdf = pd.DataFrame(statistics)
    statistics_sdf = statistics_sdf.set_index("variable")

    return statistics_sdf
print(features_aggr(sdf, aggr_features=True))


                      count   mean     std   min   25%    50%    75%  \
variable                                                               
mood                 5641.0   6.99    1.03  1.00  7.00   7.00   8.00   
circumplex.values    8631.0   0.72    0.56  0.00  0.00   1.00   1.00   
activity            22965.0   0.12    0.19  0.00  0.00   0.02   0.16   
screen              96578.0  75.34  253.82  0.04  5.32  20.04  62.54   
call                 5239.0   1.00    0.00  1.00  1.00   1.00   1.00   
sms                  1798.0   1.00    0.00  1.00  1.00   1.00   1.00   
appCat.values      233401.0  34.59  253.03  0.00  3.01   7.04  25.50   

                        max  trimmed mean  
variable                                   
mood                  10.00      7.033090  
circumplex.values      2.00      0.723054  
activity               1.00      0.090174  
screen              9867.01     41.731878  
call                   1.00      1.000000  
sms                    1.00      1.000000  

In [5]:
# calculate average value per variable per day
display(sdf.groupby(['id', 'time', 'variable'])['value'].mean().to_frame().reset_index())


Unnamed: 0,id,time,variable,value
0,AS14.01,2014-02-17,call,1.000000
1,AS14.01,2014-02-18,call,1.000000
2,AS14.01,2014-02-19,call,1.000000
3,AS14.01,2014-02-19,sms,1.000000
4,AS14.01,2014-02-20,call,1.000000
...,...,...,...,...
8203,AS14.33,2014-05-30,circumplex.values,0.500000
8204,AS14.33,2014-05-30,mood,6.800000
8205,AS14.33,2014-05-30,screen,86.342245
8206,AS14.33,2014-05-31,circumplex.values,1.000000


In [6]:
import numpy as np

# store values in here
mood_days = []
moodless_days = []

# separate users
for users in sdf.id.unique():
    # loop through individual users
    # see: https://datatofish.com/select-rows-pandas-dataframe/
    user_rows = sdf.loc[sdf.id == users]


    days_with_mood = 0
    days_without_mood = 0
    
    # iterate through rows for one user only
    for i, day in user_rows.iterrows():
        if day['variable'] == 'mood':
            days_with_mood += 1
        else:
            days_without_mood += 1
    mood_days.append({users: days_with_mood})
    moodless_days.append({users: days_without_mood})

print("mood_days: ")
print(mood_days)
print("moodless_days: ")
print(moodless_days)





mood_days: 
[{'AS14.01': 222}, {'AS14.02': 159}, {'AS14.03': 221}, {'AS14.05': 241}, {'AS14.06': 203}, {'AS14.07': 192}, {'AS14.08': 299}, {'AS14.09': 197}, {'AS14.12': 185}, {'AS14.13': 244}, {'AS14.14': 164}, {'AS14.15': 269}, {'AS14.16': 231}, {'AS14.17': 237}, {'AS14.19': 221}, {'AS14.20': 202}, {'AS14.23': 159}, {'AS14.24': 238}, {'AS14.25': 131}, {'AS14.26': 329}, {'AS14.27': 199}, {'AS14.28': 169}, {'AS14.29': 187}, {'AS14.30': 224}, {'AS14.31': 198}, {'AS14.32': 128}, {'AS14.33': 192}]
moodless_days: 
[{'AS14.01': 21675}, {'AS14.02': 14295}, {'AS14.03': 14126}, {'AS14.05': 15379}, {'AS14.06': 17766}, {'AS14.07': 15738}, {'AS14.08': 7455}, {'AS14.09': 10624}, {'AS14.12': 17065}, {'AS14.13': 19193}, {'AS14.14': 9026}, {'AS14.15': 2519}, {'AS14.16': 3646}, {'AS14.17': 15566}, {'AS14.19': 11065}, {'AS14.20': 3371}, {'AS14.23': 21643}, {'AS14.24': 14099}, {'AS14.25': 12399}, {'AS14.26': 16018}, {'AS14.27': 14284}, {'AS14.28': 19032}, {'AS14.29': 17218}, {'AS14.30': 16871}, {'AS14.31

In [37]:
# make a dict of all variables that are in the csv file
feat_dict = {
        'mood': float(0),
        'circumplex.arousal': float(0),
        'circumplex.valence': float(0),
        'activity': float(0),
        'screen': float(0),
        'sms': float(0),
        'appCat.builtin': float(0),
        'appCat.communication': float(0),
        'appCat.entertainment': float(0),
        'appCat.finance': float(0),
        'appCat.game': float(0),
        'appCat.office': float(0),
        'appCat.other': float(0),
        'appCat.social': float(0),
        'appCat.travel': float(0),
        'appCat.unknown': float(0),
        'appCat.utilities': float(0),
        'appCat.weather': float(0),
    }

# calculate average value per variable per day and keep only one per variable per day left
display(sdf.groupby(['id', 'time', 'variable'])['value'].mean().to_frame().reset_index())
# display(new_sdf)

# tried to make a pivot table in order to reshape the file to make it better workable for making plots
# based on: https://pandas.pydata.org/docs/user_guide/reshaping.html
sdf_pivot = new_sdf.pivot_table(index=['id', 'time'], columns=[x for x in feat_dict.keys()], values=['value'])

# is not working unfortunately
display(sdf_pivot)

KeyError: 'mood'

In [None]:
# transposed_df = new_sdf.set_index(['id', 'time']).T.reset_index()
# transposed_df = new_sdf.set_index(['id', 'time']).stack().reset_index()
# new_sdf["idx", 'time'] = new_sdf.groupby(["id", 'time']).cumcount()
# transposed_df = new_sdf.pivot("idx", "id", "time").reset_index(drop=True).rename_axis(columns=None)
# dict_features = feat_dict()
# for key in dict_features:
#     print(key)