In [1]:
import pandas as pd
import datetime 
import pandasql as ps
from plotly import graph_objs as go
import numpy as np

In [50]:
df = pd.read_excel('Raw Data/Engaged Daily Merged v12.xlsx')
columns = ['ID', 'Study_Day', 'pounds', 'total_calories', 'active_minutes', 'pa_manual', 'Activity_total']
df = df[columns]
df.columns = ['ID', 'date', 'pounds', 'total_calories', 'active_minutes', 'pa_manual', 'Activity_total']
df.head(5)

Unnamed: 0,ID,date,pounds,total_calories,active_minutes,pa_manual,Activity_total
0,102,1,172.0,1157.485902,0.0,0.0,0.0
1,102,2,172.0,1008.985901,11.0,45.0,56.0
2,102,3,171.0,1048.671,0.0,0.0,0.0
3,102,4,170.0,1223.696803,0.0,0.0,0.0
4,102,5,170.0,1107.966673,0.0,20.0,20.0


In [51]:
df_weight = df[['date', 'pounds']]
df_weight = df_weight[pd.to_numeric(df_weight['pounds'], errors='coerce').notnull()]
df_weight = df_weight[df_weight.pounds != 0]
df_weight.groupby(by=['date']).count().to_csv('check/1.csv')


df_cal = df[['date', 'total_calories']]
df_cal = df_cal[pd.to_numeric(df_cal['total_calories'], errors='coerce').notnull()]
df_cal.groupby(by=['date']).count().to_csv('check/2.csv')


df_active_minutes = df[['date', 'active_minutes']]
df_active_minutes = df_active_minutes[pd.to_numeric(df_active_minutes['active_minutes'], errors='coerce').notnull()]
df_active_minutes.groupby(by=['date']).count().to_csv('check/3.csv')


df_pa_manual = df[['date', 'pa_manual']]
df_pa_manual = df_pa_manual[pd.to_numeric(df_pa_manual['pa_manual'], errors='coerce').notnull()]
df_pa_manual.groupby(by=['date']).count().to_csv('check/4.csv')

df_Activity_total = df[['date', 'Activity_total']]
df_Activity_total = df_Activity_total[pd.to_numeric(df_Activity_total['Activity_total'], errors='coerce').notnull()]
df_Activity_total.groupby(by=['date']).count().to_csv('check/5.csv')

In [52]:
df_1 = pd.read_csv('check/1.csv')
df_2 = pd.read_csv('check/2.csv')
df_3 = pd.read_csv('check/3.csv')
df_4 = pd.read_csv('check/4.csv')
df_5 = pd.read_csv('check/5.csv')

#outter join all
df_all = pd.merge(df_1, df_2, how="outer", on=['date'])
df_all = pd.merge(df_all, df_3, how="outer", on=['date'])
df_all = pd.merge(df_all, df_4, how="outer", on=['date'])
df_all = pd.merge(df_all, df_5, how="outer", on=['date'])

In [53]:
df_all

Unnamed: 0,date,pounds,total_calories,active_minutes,pa_manual,Activity_total
0,1,29.0,29,30,30,30
1,2,29.0,29,30,30,30
2,3,29.0,28,30,30,30
3,4,29.0,30,29,30,29
4,5,26.0,25,30,30,30
...,...,...,...,...,...,...
179,180,6.0,5,7,7,7
180,181,6.0,5,7,7,7
181,182,4.0,4,5,5,5
182,183,3.0,4,5,5,5


In [57]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_all['date'], y=df_all['pounds'], mode='lines', name='weight count'))
fig.add_trace(go.Scatter(x=df_all['date'], y=df_all['total_calories'], mode='lines', name='calories'))
fig.add_trace(go.Scatter(x=df_all['date'], y=df_all['active_minutes'], mode='lines', name='PA shimmer'))
fig.add_trace(go.Scatter(x=df_all['date'], y=df_all['pa_manual'], mode='lines', name='PA self-report'))
fig.add_trace(go.Scatter(x=df_all['date'], y=df_all['Activity_total'], mode='lines', name='PA total'))
fig.update_layout(title_text='Outcome Count Plot by Time', yaxis_title='Count', xaxis_title='Study Day')
fig.write_html("ENGAGED_count.html")

In [58]:
df_all.to_csv('check/ENGAGED_count_time.csv', index = False)

In [17]:
def Stats(df, column, IQR_range):
    l = list(df[column])
    for i in range(0, len(l)): 
        l[i] = float(l[i])
    Max = np.max(l)
    Min = np.min(l)
    Mean = np.mean(l)
    Median = np.median(l)
    STD = np.std(l)
    
    #outlier percentage
    q1 = np.percentile(l, 25)  
    q3 = np.percentile(l, 75)
    IQR = q3-q1
    low = q1 - IQR_range*IQR
    high = q3 + IQR_range*IQR

    outliersPercent = len([i for i in l if i < low or i > high]) / len(l) * 100
    return([Min, Max, Mean, Median, STD, IQR_range, low, high, outliersPercent])

In [31]:
df_shimmer = pd.read_csv('shimmer_min_compare.csv')
df_shimmer = df_shimmer.fillna(0)

df_shimmer['diff_no_cutoff'] = df_shimmer['active_minutes'] - df_shimmer['shimmer_no_cutoff'] 
df_shimmer['diff_16k_cutoff'] = df_shimmer['active_minutes'] - df_shimmer['shimmer_16k_cutoff'] 
df_shimmer['diff_20k_cutoff'] = df_shimmer['active_minutes'] - df_shimmer['shimmer_20k_cutoff'] 

In [40]:
# (1) pervasiveness of difference (frequency table with: (1) average(median sd range etc...) 
# number of rows/observations with difference != 0
df_diff_no = df_shimmer[df_shimmer['diff_no_cutoff']!=0]
df_diff_16k = df_shimmer[df_shimmer['diff_16k_cutoff']!=0]
df_diff_20k = df_shimmer[df_shimmer['diff_20k_cutoff']!=0]

df_diff_no.head(5)

Unnamed: 0,ID,date,active_minutes,shimmer_no_cutoff,shimmer_16k_cutoff,shimmer_20k_cutoff,diff_no_cutoff,diff_16k_cutoff,diff_20k_cutoff
2,102.0,2011-09-16,0.0,4.0,4.0,4.0,-4.0,-4.0,-4.0
3,102.0,2011-09-17,0.0,11.0,11.0,11.0,-11.0,-11.0,-11.0
7,102.0,2011-09-21,16.0,23.0,23.0,23.0,-7.0,-7.0,-7.0
9,102.0,2011-09-23,52.0,48.0,48.0,48.0,4.0,4.0,4.0
10,102.0,2011-09-24,0.0,3.0,3.0,3.0,-3.0,-3.0,-3.0


In [41]:
df_diff_count_0 = ps.sqldf("select ID as ID, count(*) as Diff_Count_0 from df_diff_no group by ID")

df_diff_count_16k = ps.sqldf("select ID as ID, count(*) as Diff_Count_16k from df_diff_16k group by ID")

df_diff_count_20k = ps.sqldf("select ID as ID, count(*) as Diff_Count_20k from df_diff_20k group by ID")

df_count = ps.sqldf("select ID as ID, count(*) as Total_Count from df_shimmer group by ID")

In [42]:
l1 = ['Number of different rows (0 cutoff)']  + Stats(df_diff_count_0, 'Diff_Count_0', 1.5)
l2 = ['Number of different rows (16k cutoff)']  + Stats(df_diff_count_16k, 'Diff_Count_16k', 1.5)
l3 = ['Number of different rows (20k cutoff)']  + Stats(df_diff_count_20k, 'Diff_Count_20k', 1.5)
df_all = pd.DataFrame([l1,l2,l3])
df_all.columns= ['Outcome', 'Min', 'Max', 'Mean', 'Median', 'STD', 'IQR_range', 'Low', 'High', 'Outliers Percentage %']
df_all

Unnamed: 0,Outcome,Min,Max,Mean,Median,STD,IQR_range,Low,High,Outliers Percentage %
0,Number of different rows (0 cutoff),8.0,169.0,81.375,85.0,46.930634,1.5,-91.375,245.625,0.0
1,Number of different rows (16k cutoff),8.0,169.0,81.25,85.0,46.800107,1.5,-91.375,245.625,0.0
2,Number of different rows (20k cutoff),8.0,169.0,81.25,85.0,46.800107,1.5,-91.375,245.625,0.0


In [43]:
result_1 = pd.merge(df_diff_count_0, df_diff_count_16k, how="inner", on=["ID"])
result_2 = pd.merge(result_1, df_diff_count_20k, how="inner", on=["ID"])
result_3 = pd.merge(result_2, df_count, how="inner", on=["ID"])
result_3['Diff_Percentage'] = result_3['Diff_Count_0'] / result_3['Total_Count'] * 100
result_3.to_csv('Engaged_shimmer_diff_count.csv', index=False)
result_3.head(5)

Unnamed: 0,ID,Diff_Count_0,Diff_Count_16k,Diff_Count_20k,Total_Count,Diff_Percentage
0,0.0,21,21,21,21,100.0
1,102.0,99,99,99,175,56.571429
2,103.0,155,154,154,183,84.699454
3,109.0,131,129,129,178,73.595506
4,110.0,129,129,129,177,72.881356


In [46]:
# (2) old and new shimmer count values by (1) negative difference and (2) positive difference
df_diff_no_pos = df_shimmer[df_shimmer['diff_no_cutoff']>0]
df_diff_no_neg = df_shimmer[df_shimmer['diff_no_cutoff']<0]

df_diff_16k_pos = df_shimmer[df_shimmer['diff_16k_cutoff']>0]
df_diff_16k_neg = df_shimmer[df_shimmer['diff_16k_cutoff']<0]

df_diff_20k_pos = df_shimmer[df_shimmer['diff_20k_cutoff']>0]
df_diff_20k_neg = df_shimmer[df_shimmer['diff_20k_cutoff']<0]

l1 = ['Positive Difference (0 cutoff)']  + Stats(df_diff_no_pos, 'diff_no_cutoff', 1.5)
l2 = ['Negative Difference (0 cutoff)']  + Stats(df_diff_no_neg, 'diff_no_cutoff', 1.5)

l3 = ['Positive Difference (16k cutoff)']  + Stats(df_diff_16k_pos, 'diff_16k_cutoff', 1.5)
l4 = ['Negative Difference (16k cutoff)']  + Stats(df_diff_16k_neg, 'diff_16k_cutoff', 1.5)

l5 = ['Positive Difference (20k cutoff)']  + Stats(df_diff_20k_pos, 'diff_20k_cutoff', 1.5)
l6 = ['Negative Difference (20k cutoff)']  + Stats(df_diff_20k_neg, 'diff_20k_cutoff', 1.5)

df_all = pd.DataFrame([l1,l2,l3,l4,l5,l6])
df_all.columns= ['Outcome', 'Min', 'Max', 'Mean', 'Median', 'STD', 'IQR_range', 'Low', 'High', 'Outliers Percentage %']
df_all

Unnamed: 0,Outcome,Min,Max,Mean,Median,STD,IQR_range,Low,High,Outliers Percentage %
0,Positive Difference (0 cutoff),1.0,134.0,27.741965,23.0,20.104102,1.5,-28.0,76.0,2.662994
1,Negative Difference (0 cutoff),-70.0,-1.0,-10.667327,-8.0,8.876503,1.5,-31.5,12.5,3.30033
2,Positive Difference (16k cutoff),1.0,134.0,27.741965,23.0,20.104102,1.5,-28.0,76.0,2.662994
3,Negative Difference (16k cutoff),-70.0,-1.0,-10.671079,-8.0,8.880169,1.5,-31.5,12.5,3.309067
4,Positive Difference (20k cutoff),1.0,134.0,27.741965,23.0,20.104102,1.5,-28.0,76.0,2.662994
5,Negative Difference (20k cutoff),-70.0,-1.0,-10.675711,-8.0,8.881122,1.5,-31.5,12.5,3.309067


### Validation

In [2]:
df = pd.read_csv('Final Data/ENGAGED.csv')
df.head(2)

Unnamed: 0,ID,Tech_id,upload_time,NUBIC,age,gender,Study_Day,Study_Day2,Day_o_week,Month,...,weight_time,food_time,shimmer_no_cutoff,shimmer_16k_cutoff,shimmer_20k_cutoff,weartime_6h,weartime_8h,weartime_10h,Activity_total_16k,Activity_total_20k
0,102,14,2011-09-14,66,56,1,1,1,4,9,...,999999,999999,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
1,102,14,2011-09-15,66,56,1,2,2,5,9,...,999999,999999,11.0,11.0,11.0,0.0,0.0,0.0,56.0,56.0


In [8]:
df_count = ps.sqldf("select ID as study_id, Week as week, count(*) as count from df group by ID, Week")
df_count

Unnamed: 0,study_id,week,count
0,102,1,7
1,102,2,7
2,102,3,7
3,102,4,7
4,102,5,7
...,...,...,...
760,323,22,5
761,323,23,5
762,323,24,6
763,323,25,6


In [10]:
def Stats_1(df):
    if(df.shape[0] == 0):
        return([0,0,0,0,0])
    
    l = list(df['count'])
    Max = np.max(l)
    Min = np.min(l)
    Mean = np.mean(l)
    Median = np.median(l)
    STD = np.std(l)
    return([Mean, Median, STD, Min, Max])

In [14]:
cond_3 = []
for eachWeek in list(set(df_count['week'])):
    df_temp = df_count[df_count['week'] == eachWeek]
    cond_3.append([eachWeek] + Stats_1(df_temp))

In [16]:
df_1 = pd.DataFrame(cond_3)
  
# specifying cloumn names
df_1.columns = ['Week', 'Mean', 'Median', 'STD', 'Min', 'Max']

df_1.to_csv('1.csv',index=False)

In [23]:
df_temp = pd.pivot_table(df_count, values='count', index=['week'],
                    columns=['study_id'], aggfunc=np.sum)
df_temp = df_temp.fillna(0)
df_temp.to_csv('2.csv')