### Weekly Timeline Alignment for 4 Outcome Tables
#### This notebook adds an additional feature, the goal number (stage of study, stored in the `goal` column), to the cleaned versions of the outcome tables

In [1]:
import pandas as pd
import numpy as np
import pandasql as ps
import time 
import datetime 
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [2]:
# Load the inputs from CSV.
# Cleaned outcome tables:
df_food = pd.read_csv('Result/Food/food_final.csv')
df_pa = pd.read_csv('Result/MVPA/MVPA.csv')
df_sed = pd.read_csv('Result/sed_clean.csv')
# Demographic table:
df_demo = pd.read_csv('Result/Demographic.csv')
# Raw user and goal tables:
df_user = pd.read_csv('Raw Data/users.csv')
df_goal = pd.read_csv('Raw Data/goals.csv')

### Merge all outcome variables

In [3]:
# Parse upload_time in every outcome table into pandas datetimes so the three
# tables share one comparable key type (this parses values; it does not
# produce mm/dd/yyyy strings).
for outcome_table in (df_sed, df_food, df_pa):
    outcome_table['upload_time'] = pd.to_datetime(outcome_table['upload_time'])

In [4]:
# Change user_id to study_id for PA and SED.
# Participants are the accounts whose username starts with 'mbc2'.
# The vectorised prefix test treats a missing username as "not a participant"
# (na=False) and does not depend on df_user having a default 0..n-1 index,
# unlike looking each username up by row label.
is_participant = df_user['username'].str.startswith('mbc2', na=False)
id_list = sorted(df_user.loc[is_participant, 'user_id'])

# Keep only participant rows and the two columns needed to translate ids.
df_user = df_user.loc[df_user['user_id'].isin(id_list), ['user_id', 'study_id']]

# Merge with PA and SED; the inner join drops rows whose user_id is not a
# participant account.
df_pa = pd.merge(df_pa, df_user, how='inner', on=['user_id'])
df_sed = pd.merge(df_sed, df_user, how='inner', on=['user_id'])

In [5]:
# Columns of interest (sed): keep only what the merged table needs.
sed_columns = ['study_id', 'upload_time', 'sed_total', 'sed_total_outlier']
df_sed = df_sed[sed_columns]

In [6]:
# Columns of interest (food): discard gender and the nutrient columns that are
# not carried into the merged outcome table.
# NOTE(review): 'calciumm' is spelled as in the original drop list --
# presumably it matches the CSV header; confirm before "correcting" it.
unused_food_columns = [
    'gender',
    'protein',
    'total_fat',
    'total_carbohydrate',
    'sugars',
    'fiber',
    'calciumm',
    'sodium',
    'cholesterol',
]
df_food = df_food.drop(columns=unused_food_columns)

In [7]:
# Merge all: outer-join the three outcome tables on participant and upload
# time, so a row is kept when any one table has a record for that key.
merge_keys = ['study_id', 'upload_time']
result = df_sed.merge(df_food, how='outer', on=merge_keys)
result_all = result.merge(df_pa, how='outer', on=merge_keys)

# Missing values are encoded with the sentinel 999999 instead of NaN.
result_all = result_all.fillna(999999)

### Starting Time Alignment (goal key)

In [8]:
# Align goal id with study id (participants only).
df_user = df_user[df_user['user_id'].isin(id_list)]
df_align = df_user[['user_id', 'study_id']]

# Build the goal table in one chain. `.assign` returns a new frame, which
# avoids setting a column on a column-subset of another frame (the pattern
# that triggers pandas' SettingWithCopyWarning).
goal_columns = ['study_id', 'goal_start_date', 'periodname', 'period_num']
df_goal = (
    df_goal[df_goal['user_id'].isin(id_list)]
    .merge(df_align, how='inner', on=['user_id'])
    .loc[:, goal_columns]
    # Shift period numbers up by one (a raw period_num of 0 becomes 1).
    .assign(period_num=lambda goals: goals['period_num'] + 1)
)

In [9]:
# Give every study_id present in df_goal a synthetic 'baseline' period
# (period_num 0) that starts on the placeholder date 1900-01-01.
# drop_duplicates keeps first-seen order, so the result is deterministic
# (a set has arbitrary ordering).
studyIds = list(df_goal['study_id'].drop_duplicates())
df_baseline = pd.DataFrame({
    'study_id': studyIds,
    'goal_start_date': ['1900-01-01'] * len(studyIds),
    'periodname': ['baseline'] * len(studyIds),
    'period_num': [0] * len(studyIds),
})

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and keeps the original row labels.
df_goal = pd.concat([df_goal, df_baseline])
df_goal = df_goal.sort_values(by=['study_id', 'period_num'])

In [10]:
# Parse goal start dates into datetimes so they can be compared with
# upload_time, then display the goal table.
df_goal['goal_start_date'] = pd.to_datetime(df_goal['goal_start_date'])
df_goal

Unnamed: 0,study_id,goal_start_date,periodname,period_num
147,1383.0,1900-01-01,baseline,0
156,1383.0,2012-10-15,intervention,1
157,1383.0,2012-10-29,intervention,2
158,1383.0,2012-11-12,intervention,3
159,1383.0,2012-11-26,intervention,4
...,...,...,...,...
2751,7831.0,2014-12-29,followup,9
2752,7831.0,2015-01-05,maintenance,10
2753,7831.0,2015-03-30,followup,11
2754,7831.0,2015-04-06,maintenance,12


In [11]:
#align the goal key to upload time (all)

def _build_goal_lookup(goals):
    """Map each study_id to ``(start_dates, period_nums)`` in the row order of ``goals``.

    Grouping once up front replaces a boolean filter over the whole goal
    table for every single outcome row.
    """
    lookup = {}
    for study_id, periods in goals.groupby('study_id', sort=False):
        lookup[study_id] = (list(periods['goal_start_date']),
                            list(periods['period_num']))
    return lookup


def _period_for_upload(start_dates, period_nums, upload_time):
    """Return the period_num of the last leading start date <= ``upload_time``.

    Start dates are scanned in the given order and scanning stops at the first
    one later than ``upload_time``. Returns None when not even the first start
    date has been reached, rather than letting index -1 wrap around to the
    participant's final period.
    """
    reached = -1
    for position, start_date in enumerate(start_dates):
        if upload_time >= start_date:
            reached = position
        else:
            break
    return period_nums[reached] if reached >= 0 else None


goal_lookup = _build_goal_lookup(df_goal)

periodnumList = []
# Iterate positionally so the result does not depend on result_all's index labels.
for study_id, upload_time in zip(result_all['study_id'], result_all['upload_time']):
    if study_id not in goal_lookup:
        raise ValueError(
            f"study_id {study_id!r} has no rows in df_goal; cannot assign a goal period"
        )
    start_dates, period_nums = goal_lookup[study_id]
    periodnum = _period_for_upload(start_dates, period_nums, upload_time)
    if periodnum is None:
        raise ValueError(
            f"upload_time {upload_time!r} of study_id {study_id!r} precedes every goal start date"
        )
    periodnumList.append(periodnum)

result_all['goal'] = periodnumList

### Merge demographic info

In [13]:
# add condition (group): reload the users table from disk and attach each
# row's `cond` value by study_id
df_user = pd.read_csv('Raw Data/users.csv')
df_condition = df_user.loc[:, ['study_id', 'cond']]
result_all = result_all.merge(df_condition, how='inner', on=['study_id'])

# drop user_id
result_all = result_all.drop(columns=['user_id'])

In [14]:
# merge demographic info (inner join: rows without a demographic record are dropped)
result_all = result_all.merge(df_demo, how='inner', on=['study_id'])

# drop date of birth
result_all = result_all.drop(columns=['sbl_dem_date_of_birth'])

In [15]:
#rename and reorder
# The new names are applied by POSITION, so this list must match the current
# column order of result_all exactly (pandas raises if the length differs).
# NOTE(review): the grouping comments below are inferred from the merge order
# (sed, food, PA, goal, cond, demographics) -- confirm against
# result_all.columns before relying on them.
renames = [
    # merge keys
    'study_id', 'upload_time',
    # sedentary columns
    'sed_min', 'sed_min_outlier',
    # food columns
    'calories', 'saturated_fatty_acids', 'fv_credit',
    'calories_outlier', 'fv_outlier', 'fat_outlier',
    # physical-activity columns
    'pa_min_app', 'pa_min_shimmer', 'pa_min',
    'pa_min_app_16k', 'pa_min_shimmer_16k', 'pa_min_16k',
    'pa_min_app_20k', 'pa_min_shimmer_20k', 'pa_min_20k',
    'pa_min_outlier', 'pa_min_outlier_16k', 'pa_min_outlier_20k',
    # goal period and study condition
    'goal', 'cond',
    # demographic columns
    'age', 'sex', 'relstatus', 'edlevel',
    'income', 'employ', 'race', 'ethnicity',
]

result_all.columns = renames

In [16]:
# Final column order: identifiers, goal/condition, demographics, outcome
# measures, then the outlier columns.
reoder = [
    # identifiers
    'study_id', 'upload_time',
    # goal period and study condition
    'goal', 'cond',
    # demographics
    'age', 'sex', 'relstatus', 'edlevel',
    'income', 'employ', 'race', 'ethnicity',
    # sedentary and physical-activity measures
    'sed_min',
    'pa_min_app', 'pa_min_shimmer', 'pa_min',
    'pa_min_app_16k', 'pa_min_shimmer_16k', 'pa_min_16k',
    'pa_min_app_20k', 'pa_min_shimmer_20k', 'pa_min_20k',
    # food measures
    'calories', 'saturated_fatty_acids', 'fv_credit',
    # outlier columns
    'sed_min_outlier',
    'pa_min_outlier', 'pa_min_outlier_16k', 'pa_min_outlier_20k',
    'calories_outlier', 'fat_outlier', 'fv_outlier',
]

result_all = result_all[reoder]

### EDA Part
#### Report on which participants have upload times outside the baseline period

In [19]:
# Rows assigned to goal period 11, and the distinct study_ids among them.
is_goal_11 = result_all['goal'] == 11
abnormal = result_all.loc[is_goal_11]
abnormal_IDs = list(set(abnormal['study_id']))

In [21]:
# Display the rows whose goal period is 11.
abnormal

Unnamed: 0,study_id,upload_time,goal,cond,age,sex,relstatus,edlevel,income,employ,...,calories,saturated_fatty_acids,fv_credit,sed_min_outlier,pa_min_outlier,pa_min_outlier_16k,pa_min_outlier_20k,calories_outlier,fat_outlier,fv_outlier
171,1640.0,2013-05-28,11,2,37,2,5,9,9,1,...,104.940000,0.05968,4.690,0.0,0.0,0.0,0.0,0.0,0.0,0.0
172,1640.0,2013-05-28,11,2,37,2,5,9,9,1,...,104.940000,0.05968,4.690,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173,1640.0,2013-05-28,11,2,37,2,5,9,9,1,...,104.940000,0.05968,4.690,0.0,0.0,0.0,0.0,0.0,0.0,0.0
174,1640.0,2013-05-28,11,2,37,2,5,9,9,1,...,104.940000,0.05968,4.690,0.0,0.0,0.0,0.0,0.0,0.0,0.0
175,1640.0,2013-05-28,11,2,37,2,5,9,9,1,...,104.940000,0.05968,4.690,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246204,7661.0,2015-03-14,11,2,43,2,1,8,4,2,...,299.760000,0.08563,6.335,999999.0,999999.0,999999.0,999999.0,0.0,0.0,0.0
246308,5047.0,2015-01-19,11,1,28,2,1,2,3,1,...,877.500000,13.00000,0.000,999999.0,0.0,0.0,0.0,0.0,0.0,0.0
246309,5047.0,2015-01-20,11,1,28,2,1,2,3,1,...,2142.500000,36.00000,0.000,999999.0,0.0,0.0,0.0,0.0,0.0,0.0
246310,5047.0,2015-01-21,11,1,28,2,1,2,3,1,...,1612.879747,14.50000,0.000,999999.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
### goal > 11
# Rows whose assigned goal period is greater than 11.
beyond_goal_11 = result_all['goal'] > 11
abnormal_2 = result_all.loc[beyond_goal_11]
abnormal_2

Unnamed: 0,study_id,upload_time,goal,cond,age,sex,relstatus,edlevel,income,employ,...,calories,saturated_fatty_acids,fv_credit,sed_min_outlier,pa_min_outlier,pa_min_outlier_16k,pa_min_outlier_20k,calories_outlier,fat_outlier,fv_outlier
183,1640.0,2013-06-24,12,2,37,2,5,9,9,1,...,385.120000,2.933820,9.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0
184,1640.0,2013-06-26,12,2,37,2,5,9,9,1,...,882.500000,7.219500,2.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
185,1640.0,2013-06-27,12,2,37,2,5,9,9,1,...,999999.000000,999999.000000,999999.00,0.0,0.0,0.0,0.0,999999.0,999999.0,999999.0
186,1640.0,2013-07-02,12,2,37,2,5,9,9,1,...,999999.000000,999999.000000,999999.00,0.0,1.0,1.0,1.0,999999.0,999999.0,999999.0
187,1640.0,2013-07-03,12,2,37,2,5,9,9,1,...,999999.000000,999999.000000,999999.00,0.0,0.0,0.0,0.0,999999.0,999999.0,999999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245327,6439.0,2015-03-03,12,2,43,2,1,4,4,1,...,1065.230000,3.733860,6.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0
245636,7039.0,2015-02-23,12,3,50,2,1,4,6,1,...,161.110000,0.164780,9.50,0.0,999999.0,999999.0,999999.0,0.0,0.0,0.0
245637,7039.0,2015-02-24,12,3,50,2,1,4,6,1,...,161.110000,0.164780,9.50,0.0,999999.0,999999.0,999999.0,0.0,0.0,0.0
246205,7661.0,2015-03-17,12,2,43,2,1,8,4,2,...,473.160000,0.180550,10.66,999999.0,999999.0,999999.0,999999.0,0.0,0.0,0.0


In [23]:
# Number of distinct study_ids with at least one row in goal period 12.
in_goal_12 = abnormal_2['goal'] == 12
goal_12 = abnormal_2.loc[in_goal_12]
abnormal_IDs = list(set(goal_12['study_id']))
len(abnormal_IDs)

102

In [24]:
# Number of distinct study_ids with at least one row in goal period 13.
in_goal_13 = abnormal_2['goal'] == 13
goal_13 = abnormal_2.loc[in_goal_13]
abnormal_IDs = list(set(goal_13['study_id']))
len(abnormal_IDs)

9

In [43]:
### Descriptive table for days of records by 'Goal'
# Count, per participant and goal period, the rows that have an upload_time.
# Ordering by goal as well as study_id makes the row order deterministic
# within a participant (ORDER BY study_id alone leaves ties unspecified).
count_query = """
    SELECT study_id, goal, count(upload_time) AS count
    FROM result_all
    GROUP BY goal, study_id
    ORDER BY study_id, goal
"""
count_table = ps.sqldf(count_query)
count_table

Unnamed: 0,study_id,goal,count
0,1383,1,14
1,1383,2,14
2,1383,3,14
3,1383,4,12
4,1383,5,13
...,...,...,...
1888,7831,6,2
1889,7831,7,2
1890,7831,8,15
1891,7831,9,4


In [64]:
# Per goal period: mean, minimum and maximum of the per-participant record
# counts held in count_table.
des_query = "SELECT goal, avg(count) as mean,  min(count) as min , max(count) as max from count_table group by goal"
des_table = ps.sqldf(des_query)
des_table

Unnamed: 0,goal,mean,min,max
0,1,12.744048,2,16
1,2,12.944785,4,14
2,3,12.570552,4,15
3,4,12.388889,2,18
4,5,12.348101,1,14
5,6,12.481481,1,87
6,7,6.075269,1,7
7,8,56.869822,1,95
8,9,6.0,1,7
9,10,57.661972,1,91


In [61]:
# Number of rows assigned to goal period 1.
len(result_all.loc[result_all['goal'] == 1, 'study_id'])

2141

In [27]:
#remove goal > 11 (keep only rows whose goal period is below 12)
keep_rows = result_all['goal'] < 12
result_all = result_all.loc[keep_rows]

In [28]:
#save results: write the final merged table to CSV without the index column
output_path = 'Result/Final/MBC2_all.csv'
result_all.to_csv(output_path, index=False)