### Weekly Timeline Alignment for 4 OutcomeTables 
#### This notebook adds an additional feature, `goal` (the goal period number, i.e. stage of study), to the cleaned versions of the outcome tables

In [1]:
import pandas as pd
import numpy as np
import pandasql as ps
import time 
import datetime 
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [19]:
# Load the cleaned outcome tables (food, physical activity at three
# cutoffs, sedentary time) ...
df_food = pd.read_csv('Outliers/df_food_outliers.csv')
df_pa = pd.read_csv('Result/MVPA/pa_combined.csv')
df_pa_16k = pd.read_csv('Result/MVPA/pa_combined_16k.csv')
df_pa_20k = pd.read_csv('Result/MVPA/pa_combined_20k.csv')
df_sed = pd.read_csv('Result/sed_clean.csv')

# ... and the raw user / goal tables plus the demographic table that are
# joined onto the outcomes further down.
df_user = pd.read_csv('Raw Data/users.csv')
df_goal = pd.read_csv('Raw Data/goals.csv')
df_demo = pd.read_csv('Result/Demographic.csv')

In [20]:
# Preview the first five rows of the physical-activity table.
df_pa.head()

Unnamed: 0,user_id,upload_time,pa_minute_app,pa_minute_shimmer,MVPA_min,study_id,gender,MVPA_min_outlier
0,1,2012-09-06,20.0,0.0,20.0,1436,1,0
1,1,2012-09-07,20.0,2.0,22.0,1436,1,0
2,1,2012-09-10,20.0,0.0,20.0,1436,1,0
3,1,2012-09-12,20.0,0.0,20.0,1436,1,0
4,1,2012-09-16,20.0,4.0,24.0,1436,1,0


### Merge all outcome variables

In [21]:
# Parse every upload_time column into pandas datetimes so that all outcome
# tables share one dtype for the (study_id, upload_time) merge key.
for outcome_table in (df_sed, df_food, df_pa, df_pa_16k, df_pa_20k):
    outcome_table['upload_time'] = pd.to_datetime(outcome_table['upload_time'])

In [22]:
# Columns of interest (sed): merge keys plus the sedentary total and its
# outlier flag.
sed_columns = ['study_id', 'upload_time', 'sed_total', 'sed_total_outlier']
df_sed = df_sed[sed_columns]

In [23]:
# Columns of interest (food): keep everything except gender.
df_food = df_food.drop(columns=['gender'])

In [24]:
# Columns of interest (pa): the same subset is kept for all three
# physical-activity tables.
pa_columns = [
    'study_id',
    'pa_minute_app',
    'pa_minute_shimmer',
    'upload_time',
    'MVPA_min',
    'MVPA_min_outlier',
]
df_pa = df_pa[pa_columns]
df_pa_16k = df_pa_16k[pa_columns]
df_pa_20k = df_pa_20k[pa_columns]

In [25]:
# Merge all outcomes: sed + food first, then one outer merge per
# physical-activity variant, always on (study_id, upload_time).
merge_keys = ['study_id', 'upload_time']
result = df_sed.merge(df_food, how='outer', on=merge_keys)

# Fill NaN with 999999.
# NOTE(review): 999999 is written into every missing cell of the merged
# tables, so anything reading these frames has to treat it as "missing".
result_all = result.merge(df_pa, how='outer', on=merge_keys).fillna(999999)
result_16k = result.merge(df_pa_16k, how='outer', on=merge_keys).fillna(999999)
result_20k = result.merge(df_pa_20k, how='outer', on=merge_keys).fillna(999999)

### Starting Time Alignment (goal key)

In [26]:
# All participants: users whose username starts with 'mbc2'.
# A vectorised string test replaces the row-by-row loop. It does not depend
# on df_user having a 0..n-1 index, and na=False makes a missing username
# count as "not a participant" instead of raising on the slice.
is_participant = df_user['username'].str.startswith('mbc2', na=False)
id_list = df_user.loc[is_participant, 'user_id'].sort_values().tolist()
print('There are', len(id_list), 'participants in intervention/follow-up period')

There are 212 participants in intervention/follow-up period


In [27]:
# Restrict users and goals to the participants collected in id_list.
df_user = df_user.loc[df_user['user_id'].isin(id_list)]
df_goal = df_goal.loc[df_goal['user_id'].isin(id_list)]

# Attach study_id to each goal via user_id, keep only the goal-schedule
# columns, and shift period_num up by one (period 0 is added separately as
# the baseline row).
df_align = df_user[['user_id', 'study_id']]
df_goal = (
    df_goal
    .merge(df_align, how='inner', on=['user_id'])
    .loc[:, ['study_id', 'goal_start_date', 'periodname', 'period_num']]
    .assign(period_num=lambda goals: goals['period_num'] + 1)
)

In [28]:
# Add one synthetic "baseline" row (period 0) per study. Its start date of
# 1900-01-01 precedes every real goal start, so any upload made before the
# first real goal falls into period 0.
# unique() keeps first-seen order, so the baseline frame is built the same
# way on every run (a set has no guaranteed order).
studyIds = list(df_goal['study_id'].unique())
df_baseline = pd.DataFrame({
    'study_id': studyIds,
    'goal_start_date': ['1900-01-01'] * len(studyIds),
    'periodname': ['baseline'] * len(studyIds),
    'period_num': [0] * len(studyIds),
})

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps each frame's original row labels.
df_goal = pd.concat([df_goal, df_baseline])
df_goal = df_goal.sort_values(by=['study_id', 'period_num'])

In [29]:
# Parse goal start dates into datetimes so they can be compared with the
# upload_time values, then display the goal schedule.
df_goal = df_goal.assign(goal_start_date=pd.to_datetime(df_goal['goal_start_date']))
df_goal

Unnamed: 0,study_id,goal_start_date,periodname,period_num
147,1383.0,1900-01-01,baseline,0
156,1383.0,2012-10-15,intervention,1
157,1383.0,2012-10-29,intervention,2
158,1383.0,2012-11-12,intervention,3
159,1383.0,2012-11-26,intervention,4
...,...,...,...,...
2751,7831.0,2014-12-29,followup,9
2752,7831.0,2015-01-05,maintenance,10
2753,7831.0,2015-03-30,followup,11
2754,7831.0,2015-04-06,maintenance,12


In [30]:
#align the goal key to upload time (all)

# Build each study's goal schedule once, as (start dates, period numbers)
# in df_goal's row order, instead of re-filtering df_goal for every row.
goal_schedule = {
    study_id: (list(df_sub['goal_start_date']), list(df_sub['period_num']))
    for study_id, df_sub in df_goal.groupby('study_id')
}

periodnumList = []
# Iterate the two columns positionally so the result does not depend on the
# frame having a 0..n-1 index.
for study_id, upload_time in zip(result_all['study_id'], result_all['upload_time']):
    if study_id not in goal_schedule:
        raise KeyError(f'study_id {study_id!r} has no rows in df_goal')
    start_dates, period_nums = goal_schedule[study_id]

    # Count the leading goal starts that are on or before the upload and
    # stop at the first later one; the last counted start is the period
    # the upload belongs to.
    index = 0
    for start_date in start_dates:
        if upload_time >= start_date:
            index = index + 1
        else:
            break
    periodnumList.append(period_nums[index - 1])

result_all['goal'] = periodnumList

In [31]:
#align the goal key to upload time (16k cutoff)

# Build each study's goal schedule once, as (start dates, period numbers)
# in df_goal's row order, instead of re-filtering df_goal for every row.
goal_schedule = {
    study_id: (list(df_sub['goal_start_date']), list(df_sub['period_num']))
    for study_id, df_sub in df_goal.groupby('study_id')
}

periodnumList = []
# Iterate the two columns positionally so the result does not depend on the
# frame having a 0..n-1 index.
for study_id, upload_time in zip(result_16k['study_id'], result_16k['upload_time']):
    if study_id not in goal_schedule:
        raise KeyError(f'study_id {study_id!r} has no rows in df_goal')
    start_dates, period_nums = goal_schedule[study_id]

    # Count the leading goal starts that are on or before the upload and
    # stop at the first later one; the last counted start is the period
    # the upload belongs to.
    index = 0
    for start_date in start_dates:
        if upload_time >= start_date:
            index = index + 1
        else:
            break
    periodnumList.append(period_nums[index - 1])

result_16k['goal'] = periodnumList

In [32]:
#align the goal key to upload time (20k cutoff)

# Build each study's goal schedule once, as (start dates, period numbers)
# in df_goal's row order, instead of re-filtering df_goal for every row.
goal_schedule = {
    study_id: (list(df_sub['goal_start_date']), list(df_sub['period_num']))
    for study_id, df_sub in df_goal.groupby('study_id')
}

periodnumList = []
# Iterate the two columns positionally so the result does not depend on the
# frame having a 0..n-1 index.
for study_id, upload_time in zip(result_20k['study_id'], result_20k['upload_time']):
    if study_id not in goal_schedule:
        raise KeyError(f'study_id {study_id!r} has no rows in df_goal')
    start_dates, period_nums = goal_schedule[study_id]

    # Count the leading goal starts that are on or before the upload and
    # stop at the first later one; the last counted start is the period
    # the upload belongs to.
    index = 0
    for start_date in start_dates:
        if upload_time >= start_date:
            index = index + 1
        else:
            break
    periodnumList.append(period_nums[index - 1])

result_20k['goal'] = periodnumList

### Merge demographic info

In [33]:
# Add condition (group): attach each participant's `cond` by study_id.
# The inner merge keeps only rows whose study_id appears in df_user.
df_condition = df_user.loc[:, ['study_id', 'cond']]
result_all = result_all.merge(df_condition, how='inner', on=['study_id'])
result_16k = result_16k.merge(df_condition, how='inner', on=['study_id'])
result_20k = result_20k.merge(df_condition, how='inner', on=['study_id'])

In [34]:
# Merge demographic info onto each result table by study_id.
# The inner merge keeps only rows whose study_id appears in df_demo.
result_all = result_all.merge(df_demo, how='inner', on=['study_id'])
result_16k = result_16k.merge(df_demo, how='inner', on=['study_id'])
result_20k = result_20k.merge(df_demo, how='inner', on=['study_id'])

In [44]:
# Rename and reorder, step 1: rename.
# The new names are applied by position (the i-th name replaces the i-th
# existing column), so the list follows the merge order used above:
# sed, food, pa, goal, cond, demographics.
# NOTE(review): the food and demographic column orders come from CSVs not
# shown here -- confirm they line up with these names.
renames = [
    # merge keys
    'study_id',
    'upload_time',
    # sedentary
    'sed_min',
    'sed_min_outlier',
    # food
    'calories',
    'protein',
    'total_fat',
    'total_carbohydrate',
    'sugars',
    'fiber',
    'calcium',
    'sodium',
    'saturated_fatty_acids',
    'cholesterol',
    'fv_credit',
    'calories_outlier',
    'fv_outlier',
    'fat_outlier',
    # physical activity
    'pa_min_app',
    'pa_min_shimmer',
    'pa_min',
    'pa_min_outlier',
    # goal period and condition
    'goal',
    'cond',
    # demographics
    'date_of_birth',
    'age',
    'sex',
    'relstatus',
    'edlevel',
    'income',
    'employ',
    'race',
    'ethnicity',
]

for merged_table in (result_all, result_16k, result_20k):
    merged_table.columns = renames

In [45]:
# Final column order: identifiers, goal/condition, demographics, outcome
# values, and the outlier flags last.
reoder = [
    # identifiers
    'study_id',
    'upload_time',
    # goal period and condition
    'goal',
    'cond',
    # demographics
    'date_of_birth',
    'age',
    'sex',
    'relstatus',
    'edlevel',
    'income',
    'employ',
    'race',
    'ethnicity',
    # sedentary and physical-activity outcomes
    'sed_min',
    'pa_min_app',
    'pa_min_shimmer',
    'pa_min',
    # food outcomes
    'calories',
    'protein',
    'total_fat',
    'total_carbohydrate',
    'sugars',
    'fiber',
    'calcium',
    'sodium',
    'saturated_fatty_acids',
    'cholesterol',
    'fv_credit',
    # outlier flags
    'calories_outlier',
    'fv_outlier',
    'fat_outlier',
    'sed_min_outlier',
    'pa_min_outlier',
]

result_all, result_16k, result_20k = (
    merged_table[reoder]
    for merged_table in (result_all, result_16k, result_20k)
)

In [46]:
# Save results: one CSV per physical-activity variant, without the index.
for csv_path, final_table in (
    ('Result/Final/MBC2_all.csv', result_all),
    ('Result/Final/MBC2_16k_cutoff.csv', result_16k),
    ('Result/Final/MBC2_20k_cutoff.csv', result_20k),
):
    final_table.to_csv(csv_path, index=False)