## Sedentary and MVPA Outcomes Extraction
#### This notebook extracts 212 participants' sedentary and physical activity minutes from the raw dataset at daily-level granularity
#### The inputs and outputs are in csv format, by participant's study ID

In [1]:
import pandas as pd
import numpy as np
import pandasql as ps
import time 
import datetime 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# load the four raw tables in one pass: sedentary uploads, device counts,
# manually entered (app) minutes and the user registry
df_sed, df_pa_device, df_pa_app, df_user = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Raw Data/sed_upload.csv',
        'Raw Data/s2_counts.csv',
        'Raw Data/manual_min_upload.csv',
        'Raw Data/users.csv',
    )
)

### Extract Participants (212 from intervention period)

In [3]:
#all participants: accounts whose username starts with 'mbc2'
# Vectorised prefix test instead of an index loop; na=False treats a missing
# username as "not a participant" rather than failing while slicing it, and the
# mask does not depend on df_user having a 0..n-1 index.
is_participant = df_user['username'].str.startswith('mbc2', na=False)
id_list = sorted(df_user.loc[is_participant, 'user_id'].tolist())
print('There are', len(id_list), 'participants in intervention/follow-up period')

There are 212 participants in intervention/follow-up period


In [4]:
# selecting rows for participants only
# The app-reported minutes are restricted as well: they are outer-merged with
# the device minutes later on, so any non-participant rows left in df_pa_app
# would otherwise end up in the MVPA output.
df_sed = df_sed[df_sed['user_id'].isin(id_list)]
df_pa_device = df_pa_device[df_pa_device['user_id'].isin(id_list)]
df_pa_app = df_pa_app[df_pa_app['user_id'].isin(id_list)]

#reset index so each filtered table is numbered 0..n-1 again
df_sed = df_sed.reset_index(drop=True)
df_pa_device = df_pa_device.reset_index(drop=True)
df_pa_app = df_pa_app.reset_index(drop=True)

### SED (dayp1_minute + dayp2_minute + dayp3_minute +  dayp4_minute )

In [5]:
# total sedentary minutes = the four day-part minute columns added left to right
sed_total = df_sed['dayp1_minute']
for day_part in ('dayp2_minute', 'dayp3_minute', 'dayp4_minute'):
    sed_total = sed_total + df_sed[day_part]
df_sed['sed_total'] = sed_total

In [6]:
# keep the day-part minutes plus their total, and expose 'when_sed' under the
# name 'upload_time' used by the rest of the notebook
sed_list_included = ['user_id', 'when_sed', 'dayp1_minute','dayp2_minute', 'dayp3_minute', 'dayp4_minute', 'sed_total']
df_sed = df_sed[sed_list_included].rename(columns={'when_sed': 'upload_time'})

In [7]:
#cleaned sedentary outcome table (version 1)
# parse the upload timestamps, then total sed_total per (user_id, upload_time)
df_sed['upload_time'] = pd.to_datetime(df_sed['upload_time'])
sed_daily_sql = "SELECT user_id, upload_time, sum(sed_total) from df_sed group by user_id, upload_time"
df_sed_clean = ps.sqldf(sed_daily_sql)
# preview the aggregated rows of one user (id 7)
df_sed_clean.loc[df_sed_clean['user_id'] == 7]

Unnamed: 0,user_id,upload_time,sum(sed_total)
81,7,2012-08-20 00:00:00.000000,65
82,7,2012-08-21 00:00:00.000000,105
83,7,2012-08-22 00:00:00.000000,110
84,7,2012-08-23 00:00:00.000000,40
85,7,2012-08-24 00:00:00.000000,120
...,...,...,...
141,7,2013-05-14 00:00:00.000000,15
142,7,2013-05-16 00:00:00.000000,30
143,7,2013-06-04 00:00:00.000000,15
144,7,2013-06-06 00:00:00.000000,15


### Tag Variable (0/1/2 - 0/1.5/3.0 IQR cutoff)

In [8]:
# outlier-tagging target: the sedentary total column
outcome = 'sed_total'
# NOTE: df is a second name for df_sed (no copy is made), so the tag column
# added to df below is also visible through df_sed
df = df_sed

def outliers(df, column, IQR_level):
    """Return the IQR-based outlier fences of one column.

    Parameters
    ----------
    df : DataFrame (or any mapping from column name to a sequence of numbers)
    column : name of the column to analyse
    IQR_level : multiplier applied to the inter-quartile range (e.g. 1.5 or 3.0)

    Returns
    -------
    list
        ``[low, high]`` with ``low = Q1 - IQR_level * IQR`` and
        ``high = Q3 + IQR_level * IQR``.
    """
    # one vectorised cast replaces the element-by-element float() loop
    values = np.asarray(df[column], dtype=float)

    # nanpercentile skips missing values; with plain percentile a single NaN
    # turns both fences into NaN and every later "value > fence" test is False
    q1, q3 = np.nanpercentile(values, [25, 75])
    IQR = q3 - q1
    low = q1 - IQR_level * IQR
    high = q3 + IQR_level * IQR

    return [low, high]

def outliers_tag(row):
    """Grade one row's outcome value against the current upper fences.

    Reads the module-level names ``outcome`` (column to inspect), ``high_1``
    and ``high_2`` (upper fences) at call time.

    Returns 2 when the value is above ``high_2``, 1 when it is above
    ``high_1`` but not ``high_2``, and 0 otherwise.
    """
    value = row[outcome]
    if value > high_2:
        return 2
    if value > high_1:
        return 1
    return 0


# upper fences at 1.5x and 3.0x IQR (index 1 of each [low, high] pair)
high_1, high_2 = (outliers(df, outcome, level)[1] for level in (1.5, 3.0))


# grade every row 0/1/2 and store the grade in '<outcome>_outlier'
outlier_name = outcome + '_outlier'
df[outlier_name] = df.apply(outliers_tag, axis=1)

In [9]:
# preview the first five tagged rows (head() defaults to 5)
df.head()

Unnamed: 0,user_id,upload_time,dayp1_minute,dayp2_minute,dayp3_minute,dayp4_minute,sed_total,sed_total_outlier
0,14,2012-08-13,40,0,0,60,100,0
1,14,2012-08-14,60,0,30,0,90,0
2,9,2012-08-14,30,0,0,90,120,0
3,14,2012-08-15,0,60,0,180,240,1
4,9,2012-08-15,45,0,0,120,165,0


In [10]:
# write the tagged sedentary table to disk, without the index column
df.to_csv(path_or_buf='Result/sed_clean.csv', index=False)

### PA (Actigraph data: shimmer counts)

In [11]:
# clean app data: keep timestamp, user and minutes, exposing 'min' under the
# name 'pa_minute_app'
df_pa_app = df_pa_app[['tstamp_phone','user_id', 'min']].rename(columns={'min': 'pa_minute_app'})

### Cutoff Values

In [12]:
def shimmer(df_pa_device, df_pa_app, cutoff):
    """Combine device-derived and app-reported activity into daily MVPA minutes.

    Parameters
    ----------
    df_pa_device : DataFrame with columns ``user_id``, ``tstamp`` and
        ``pa_count``; every row with ``pa_count >= 1900`` counts as one
        active minute.
    df_pa_app : DataFrame with columns ``user_id``, ``tstamp_phone`` and
        ``pa_minute_app``.
    cutoff : exclusive upper bound on ``pa_count`` for a row to count as
        active; ``0`` disables the upper bound.

    Returns
    -------
    DataFrame
        One row per (user_id, upload_time) with the columns ``user_id``,
        ``upload_time``, ``pa_minute_shimmer``, ``pa_minute_app`` and
        ``MVPA_min`` (in that order). Neither input frame is modified.

    Notes
    -----
    Rows without a user id or timestamp cannot be assigned to a user-day and
    are left out of the result.
    """
    day_keys = ['user_id', 'upload_time']

    #get day-level date (first 10 characters of the timestamp) on fresh
    #working frames, so the caller's tables are not given an extra column
    device = df_pa_device[['user_id', 'pa_count']].assign(
        upload_time=df_pa_device['tstamp'].astype('string').str[:10]
    )
    app = df_pa_app[['user_id', 'pa_minute_app']].assign(
        upload_time=df_pa_app['tstamp_phone'].astype('string').str[:10]
    )

    #active minute: at least 1900 counts, and below the cutoff when one is set
    active = device[device['pa_count'] >= 1900]
    if cutoff != 0:
        active = active[active['pa_count'] < cutoff]

    #group by user and each day, count total physical activity minutes per day
    device_daily = (
        active.groupby(day_keys, as_index=False)['pa_count']
        .count()
        .rename(columns={'pa_count': 'pa_minute_shimmer'})
    )

    #sum the app entries per user and day as well: several manual entries on
    #one day would otherwise yield duplicate (user, day) rows, each repeating
    #the device minutes, and later merges on these keys would multiply rows
    app_daily = app.groupby(day_keys, as_index=False)['pa_minute_app'].sum()

    #merge with app data, keeping days present in either source
    df_pa_all = pd.merge(device_daily, app_daily, how='outer', on=day_keys)

    #fill missing values with 0
    df_pa_all['pa_minute_app'] = df_pa_all['pa_minute_app'].fillna(0)
    df_pa_all['pa_minute_shimmer'] = df_pa_all['pa_minute_shimmer'].fillna(0)

    #get combined outcome (sum)
    df_pa_all['MVPA_min'] = df_pa_all['pa_minute_app'] + df_pa_all['pa_minute_shimmer']

    return df_pa_all

In [13]:
# daily tables for each upper cutoff: none (0), 16000 counts and 20000 counts
combine, combine_16000, combine_20000 = (
    shimmer(df_pa_device, df_pa_app, upper_cutoff)
    for upper_cutoff in (0, 16000, 20000)
)

In [14]:
# build one wide table holding all three cutoff variants
# label each variant's columns first (names are assigned by position)
for frame, labels in (
    (combine, ['user_id', 'upload_time', 'pa_minute_shimmer','pa_minute_app','MVPA_min']),
    (combine_16000, ['user_id', 'upload_time', 'pa_minute_shimmer_16k','pa_minute_app_16k','MVPA_min_16k']),
    (combine_20000, ['user_id', 'upload_time', 'pa_minute_shimmer_20k','pa_minute_app_20k','MVPA_min_20k']),
):
    frame.columns = labels

# outer-join the variants on user and day so a row missing from one variant is kept
join_keys = ["user_id", "upload_time"]
result_1 = combine.merge(combine_16000, how="outer", on=join_keys)
# cells left empty by the joins become 0
result_2 = result_1.merge(combine_20000, how="outer", on=join_keys).fillna(0)

result_2.head(5)

Unnamed: 0,user_id,upload_time,pa_minute_shimmer,pa_minute_app,MVPA_min,pa_minute_shimmer_16k,pa_minute_app_16k,MVPA_min_16k,pa_minute_shimmer_20k,pa_minute_app_20k,MVPA_min_20k
0,1,2012-08-27,2.0,0.0,2.0,2.0,0.0,2.0,2.0,0.0,2.0
1,1,2012-09-07,2.0,20.0,22.0,2.0,20.0,22.0,2.0,20.0,22.0
2,1,2012-09-08,19.0,0.0,19.0,19.0,0.0,19.0,19.0,0.0,19.0
3,1,2012-09-11,5.0,0.0,5.0,5.0,0.0,5.0,5.0,0.0,5.0
4,1,2012-09-16,4.0,20.0,24.0,4.0,20.0,24.0,4.0,20.0,24.0


### Tag Variable (0/1/2 - 0/1.5/3.0 IQR cutoff)

In [15]:
#### combined 0 cutoff
# outlier-tagging target: MVPA minutes without an upper count cutoff
outcome = 'MVPA_min'
# NOTE: df_pa is a second name for result_2 (no copy is made)
df_pa = result_2

def outliers(df, column, IQR_level):
    """Return the IQR-based outlier fences of one column.

    Parameters
    ----------
    df : DataFrame (or any mapping from column name to a sequence of numbers)
    column : name of the column to analyse
    IQR_level : multiplier applied to the inter-quartile range (e.g. 1.5 or 3.0)

    Returns
    -------
    list
        ``[low, high]`` with ``low = Q1 - IQR_level * IQR`` and
        ``high = Q3 + IQR_level * IQR``.
    """
    # one vectorised cast replaces the element-by-element float() loop
    values = np.asarray(df[column], dtype=float)

    # nanpercentile skips missing values; with plain percentile a single NaN
    # turns both fences into NaN and every later "value > fence" test is False
    q1, q3 = np.nanpercentile(values, [25, 75])
    IQR = q3 - q1
    low = q1 - IQR_level * IQR
    high = q3 + IQR_level * IQR

    return [low, high]

def outliers_tag(row):
    """Grade one row's outcome value against the current upper fences.

    Reads the module-level names ``outcome`` (column to inspect), ``high_1``
    and ``high_2`` (upper fences) at call time.

    Returns 2 when the value is above ``high_2``, 1 when it is above
    ``high_1`` but not ``high_2``, and 0 otherwise.
    """
    value = row[outcome]
    if value > high_2:
        return 2
    if value > high_1:
        return 1
    return 0

# upper fences at 1.5x and 3.0x IQR (index 1 of each [low, high] pair)
high_1, high_2 = (outliers(df_pa, outcome, level)[1] for level in (1.5, 3.0))

# grade every row 0/1/2 and store the grade in '<outcome>_outlier'
outlier_name = outcome + '_outlier'
df_pa[outlier_name] = df_pa.apply(outliers_tag, axis=1)

In [16]:
#### combined 16k cutoff (device minutes counted only when pa_count < 16000)
# define column of interest (outcome); outliers_tag reads this name at call time
outcome = 'MVPA_min_16k'

def outliers(df, column, IQR_level):
    """Return the IQR-based outlier fences of one column.

    Parameters
    ----------
    df : DataFrame (or any mapping from column name to a sequence of numbers)
    column : name of the column to analyse
    IQR_level : multiplier applied to the inter-quartile range (e.g. 1.5 or 3.0)

    Returns
    -------
    list
        ``[low, high]`` with ``low = Q1 - IQR_level * IQR`` and
        ``high = Q3 + IQR_level * IQR``.
    """
    # one vectorised cast replaces the element-by-element float() loop
    values = np.asarray(df[column], dtype=float)

    # nanpercentile skips missing values; with plain percentile a single NaN
    # turns both fences into NaN and every later "value > fence" test is False
    q1, q3 = np.nanpercentile(values, [25, 75])
    IQR = q3 - q1
    low = q1 - IQR_level * IQR
    high = q3 + IQR_level * IQR

    return [low, high]

def outliers_tag(row):
    """Grade one row's outcome value against the current upper fences.

    Reads the module-level names ``outcome`` (column to inspect), ``high_1``
    and ``high_2`` (upper fences) at call time.

    Returns 2 when the value is above ``high_2``, 1 when it is above
    ``high_1`` but not ``high_2``, and 0 otherwise.
    """
    value = row[outcome]
    if value > high_2:
        return 2
    if value > high_1:
        return 1
    return 0

# upper fences at 1.5x and 3.0x IQR (index 1 of each [low, high] pair)
high_1, high_2 = (outliers(df_pa, outcome, level)[1] for level in (1.5, 3.0))

# grade every row 0/1/2 and store the grade in '<outcome>_outlier'
outlier_name = outcome + '_outlier'
df_pa[outlier_name] = df_pa.apply(outliers_tag, axis=1)

In [17]:
#### combined 20k cutoff (device minutes counted only when pa_count < 20000)
# define column of interest (outcome); outliers_tag reads this name at call time
outcome = 'MVPA_min_20k'

def outliers(df, column, IQR_level):
    """Return the IQR-based outlier fences of one column.

    Parameters
    ----------
    df : DataFrame (or any mapping from column name to a sequence of numbers)
    column : name of the column to analyse
    IQR_level : multiplier applied to the inter-quartile range (e.g. 1.5 or 3.0)

    Returns
    -------
    list
        ``[low, high]`` with ``low = Q1 - IQR_level * IQR`` and
        ``high = Q3 + IQR_level * IQR``.
    """
    # one vectorised cast replaces the element-by-element float() loop
    values = np.asarray(df[column], dtype=float)

    # nanpercentile skips missing values; with plain percentile a single NaN
    # turns both fences into NaN and every later "value > fence" test is False
    q1, q3 = np.nanpercentile(values, [25, 75])
    IQR = q3 - q1
    low = q1 - IQR_level * IQR
    high = q3 + IQR_level * IQR

    return [low, high]

def outliers_tag(row):
    """Grade one row's outcome value against the current upper fences.

    Reads the module-level names ``outcome`` (column to inspect), ``high_1``
    and ``high_2`` (upper fences) at call time.

    Returns 2 when the value is above ``high_2``, 1 when it is above
    ``high_1`` but not ``high_2``, and 0 otherwise.
    """
    value = row[outcome]
    if value > high_2:
        return 2
    if value > high_1:
        return 1
    return 0

# upper fences at 1.5x and 3.0x IQR (index 1 of each [low, high] pair)
high_1, high_2 = (outliers(df_pa, outcome, level)[1] for level in (1.5, 3.0))

# grade every row 0/1/2 and store the grade in '<outcome>_outlier'
outlier_name = outcome + '_outlier'
df_pa[outlier_name] = df_pa.apply(outliers_tag, axis=1)

In [18]:
# preview the first five rows with all three tag columns (head() defaults to 5)
df_pa.head()

Unnamed: 0,user_id,upload_time,pa_minute_shimmer,pa_minute_app,MVPA_min,pa_minute_shimmer_16k,pa_minute_app_16k,MVPA_min_16k,pa_minute_shimmer_20k,pa_minute_app_20k,MVPA_min_20k,MVPA_min_outlier,MVPA_min_16k_outlier,MVPA_min_20k_outlier
0,1,2012-08-27,2.0,0.0,2.0,2.0,0.0,2.0,2.0,0.0,2.0,0,0,0
1,1,2012-09-07,2.0,20.0,22.0,2.0,20.0,22.0,2.0,20.0,22.0,0,0,0
2,1,2012-09-08,19.0,0.0,19.0,19.0,0.0,19.0,19.0,0.0,19.0,0,0,0
3,1,2012-09-11,5.0,0.0,5.0,5.0,0.0,5.0,5.0,0.0,5.0,0,0,0
4,1,2012-09-16,4.0,20.0,24.0,4.0,20.0,24.0,4.0,20.0,24.0,0,0,0


In [19]:
# write the tagged MVPA table to disk, without the index column
df_pa.to_csv(path_or_buf='Result/MVPA/MVPA.csv', index=False)