## Sedentary and MVPA Outcomes Extraction
#### This notebook extracts 212 participants' sedentary and physical activity minutes from the raw dataset at daily-level granularity
#### The inputs and outputs are in csv format, by participant's study ID

In [1]:
import pandas as pd
import numpy as np
import pandasql as ps
import time 
import datetime 
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load the three raw exports: sedentary uploads, device activity counts, users.
raw_paths = ('Raw Data/sed_upload.csv', 'Raw Data/s2_counts.csv', 'Raw Data/users.csv')
df_sed, df_pa_device, df_user = (pd.read_csv(path) for path in raw_paths)

### Extract Participants (212 from intervention period)

In [6]:
# All participants: users whose username starts with 'mbc2'.
# The prefix test is vectorised instead of looping over positional labels, so
# it no longer depends on df_user having a 0..n-1 index. astype(str) makes a
# missing or non-string username simply not match instead of raising when it
# is subscripted.
is_participant = df_user['username'].astype(str).str.startswith('mbc2')

# Sorted list of the matching user_ids; used later to filter the raw tables.
id_list = sorted(df_user.loc[is_participant, 'user_id'].tolist())
print('There are', len(id_list), 'participants in intervention/follow-up period')

There are 212 participants in intervention/follow-up period


In [8]:
# Keep only the rows that belong to study participants (user_id in id_list)
# and renumber the remaining rows from 0, discarding the old index.
df_sed = df_sed.loc[df_sed['user_id'].isin(id_list)].reset_index(drop=True)
df_pa_device = df_pa_device.loc[df_pa_device['user_id'].isin(id_list)].reset_index(drop=True)

### SED (dayp1_minute + dayp2_minute + dayp3_minute +  dayp4_minute )

In [9]:
# Total sedentary minutes per record = sum of the four day-part minute columns.
df_sed['sed_total'] = (
    df_sed['dayp1_minute']
    .add(df_sed['dayp2_minute'])
    .add(df_sed['dayp3_minute'])
    .add(df_sed['dayp4_minute'])
)

In [10]:
# Columns kept in the cleaned sedentary table; 'when_sed' is exposed as
# 'upload_time', every other column keeps its name.
sed_list_included = [
    'user_id',
    'when_sed',
    'dayp1_minute',
    'dayp2_minute',
    'dayp3_minute',
    'dayp4_minute',
    'sed_total',
]
df_sed = df_sed[sed_list_included].rename(columns={'when_sed': 'upload_time'})

In [11]:
# Preview the first five rows of the cleaned sedentary outcome table (version 1).
df_sed.head(n=5)

Unnamed: 0,user_id,upload_time,dayp1_minute,dayp2_minute,dayp3_minute,dayp4_minute,sed_total
0,1,2012-07-18,0,30,60,60,150
1,7,2012-07-20,0,30,30,30,90
2,8,2012-07-20,97,0,120,0,217
3,8,2012-07-21,71,60,90,180,401
4,7,2012-07-21,15,0,0,60,75


In [12]:
# Build the user_id -> (study_id, gender) lookup table.
df_redcap = pd.read_csv('Raw Data/mbc2_redcap.csv')
df_gender = df_redcap[['Participant ID', 'Sex']].rename(
    columns={'Participant ID': 'study_id', 'Sex': 'gender'}
)
df_user_info = df_user[['study_id', 'user_id']]
df_info = pd.merge(df_gender, df_user_info, on='study_id')
# Keep only the first character of the sex value as the gender code.
df_info['gender'] = df_info['gender'].astype('string').str.get(0)

### Tag Variable (0 = not above the 1.5×IQR upper fence, 1 = above the 1.5×IQR fence, 2 = above the 3.0×IQR fence)

In [13]:
# Attach study_id and gender to every sedentary record, matching on user_id.
df_redcap = pd.read_csv('Raw Data/mbc2_redcap.csv')
df_gender = pd.DataFrame({
    'study_id': df_redcap['Participant ID'],
    'gender': df_redcap['Sex'],
})
df_user_info = df_user.loc[:, ['study_id', 'user_id']]
df_info = pd.merge(df_gender, df_user_info, on='study_id')
# Gender code = first character of the sex value.
df_info['gender'] = df_info['gender'].astype('string').str.get(0)
df_sed = pd.merge(df_sed, df_info, on='user_id')

In [14]:
# Table to tag (df is an alias of df_sed, not a copy) and its outcome column.
df, outcome = df_sed, 'sed_total'

def outliers(df, column, IQR_level):
    """Return the ``[low, high]`` outlier fences for ``df[column]``.

    The fences are ``q1 - IQR_level * IQR`` and ``q3 + IQR_level * IQR``,
    where ``q1``/``q3`` are the 25th/75th percentiles and ``IQR = q3 - q1``.

    Missing values are ignored. ``np.percentile`` returns NaN as soon as one
    NaN is present, which would make every later ``>`` comparison against the
    fence False and silently tag nothing as an outlier.

    Args:
        df: table holding the column to analyse.
        column: name of the column; its values must be convertible to float.
        IQR_level: multiplier applied to the IQR (e.g. 1.5 or 3.0).

    Returns:
        A two-element list ``[low, high]``. Both entries are NaN when the
        column holds no non-missing value.
    """
    values = np.asarray(df[column], dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        return [np.nan, np.nan]

    # Both quartiles in a single pass over the data.
    q1, q3 = np.percentile(values, [25, 75])
    IQR = q3 - q1
    low = q1 - IQR_level * IQR
    high = q3 + IQR_level * IQR

    return [low, high]

def outliers_tag(row):
    """Return the outlier tag (0, 1 or 2) for one row.

    Reads the module-level names ``outcome`` (column to test) and
    ``male_1``/``male_2``/``female_1``/``female_2`` (upper fences). Rows whose
    ``gender`` is ``'1'`` use the male fences; every other row uses the
    female fences.

    Returns 2 when the value exceeds the ``*_2`` fence, 1 when it exceeds
    only the ``*_1`` fence, and 0 otherwise.
    """
    if row['gender'] == '1':  # male
        mild, extreme = male_1, male_2
    else:  # female
        mild, extreme = female_1, female_2

    value = row[outcome]
    if value > extreme:
        return 2
    if value > mild:
        return 1
    return 0

# Split the table by gender code ('1' = male, '2' = female).
df_male = df.loc[df['gender'] == '1']
df_female = df.loc[df['gender'] == '2']

# Upper fences only (index 1 of outliers()), at 1.5x and 3.0x IQR per gender.
male_1, male_2 = (outliers(df_male, outcome, level)[1] for level in (1.5, 3.0))
female_1, female_2 = (outliers(df_female, outcome, level)[1] for level in (1.5, 3.0))

# Tag every row 0/1/2 and store the result as '<outcome>_outlier'.
outlier_name = f'{outcome}_outlier'
df[outlier_name] = df.apply(outliers_tag, axis=1)

In [15]:
# Preview the first five tagged rows.
df.head(n=5)

Unnamed: 0,user_id,upload_time,dayp1_minute,dayp2_minute,dayp3_minute,dayp4_minute,sed_total,study_id,gender,sed_total_outlier
0,1,2012-07-18,0,30,60,60,150,1436,1,0
1,1,2012-07-30,0,0,0,120,120,1436,1,0
2,1,2012-07-31,60,0,0,120,180,1436,1,0
3,1,2012-08-01,90,0,0,90,180,1436,1,0
4,1,2012-08-02,0,0,60,120,180,1436,1,0


In [16]:
# Write the tagged sedentary table to disk, without the index column.
df.to_csv(path_or_buf='Result/sed_clean.csv', index=False)

### PA (Actigraph data: shimmer counts)

### Cutoff Values

In [22]:
def shimmer(df_pa_device, cutoff, mvpa_threshold=1900):
    """Count active device rows per user and day.

    A row is active when ``mvpa_threshold <= pa_count`` and, if ``cutoff`` is
    non-zero, ``pa_count < cutoff``. Each row is presumably one minute of
    device data (the result columns are named as minutes) -- confirm against
    the device export.

    The input frame is not modified: the day key is computed on a filtered
    copy rather than written back into the caller's table. The per-day count
    is a plain pandas group-by, so the frame is no longer copied into SQLite
    on every call.

    Args:
        df_pa_device: table with ``user_id``, ``tstamp`` and ``pa_count``
            columns. The first 10 characters of ``tstamp`` (as text) are used
            as the day key.
        cutoff: exclusive upper bound on ``pa_count``; ``0`` disables the
            upper bound.
        mvpa_threshold: inclusive lower bound on ``pa_count`` (default 1900,
            the value previously hard-coded).

    Returns:
        DataFrame with one row per (user_id, upload_time) that has at least
        one active row, sorted by those keys, with columns ``user_id``,
        ``upload_time``, ``pa_minute_shimmer``, ``pa_minute_app`` (always 0)
        and ``MVPA_min`` (their sum). Days without any active row are absent,
        not zero.
    """
    counts = df_pa_device['pa_count']
    is_active = counts >= mvpa_threshold
    if cutoff != 0:
        is_active = is_active & (counts < cutoff)

    # Work on the filtered copy so the caller's frame is left untouched.
    active = df_pa_device.loc[is_active, ['user_id', 'tstamp', 'pa_count']]
    active = active.assign(upload_time=active['tstamp'].astype('string').str[:10])

    # Group by user and day; dropna=False keeps rows whose timestamp is
    # missing as their own group, as SQL GROUP BY does with NULL.
    df_pa_device_count = (
        active.groupby(['user_id', 'upload_time'], as_index=False, dropna=False)['pa_count']
        .count()
        .rename(columns={'pa_count': 'pa_minute_shimmer'})
    )

    # No app-reported minutes are available here, so that component is 0.
    # A count is never missing, so no fill is needed on the shimmer column.
    df_pa_device_count['pa_minute_app'] = 0

    # Combined outcome (sum of both sources).
    df_pa_device_count['MVPA_min'] = (
        df_pa_device_count['pa_minute_app'] + df_pa_device_count['pa_minute_shimmer']
    )

    return df_pa_device_count

In [23]:
# Daily activity tables for each upper cutoff: none (0), 16000 and 20000.
combine, combine_16000, combine_20000 = (
    shimmer(df_pa_device, upper) for upper in (0, 16000, 20000)
)

In [24]:
# Preview the first five rows of the combined PA minutes (16000 cutoff).
combine_16000.head(n=5)

Unnamed: 0,user_id,upload_time,pa_minute_shimmer,pa_minute_app,MVPA_min
0,1,2012-07-18,14,0,14
1,1,2012-07-20,1,0,1
2,1,2012-07-23,2,0,2
3,1,2012-07-25,1,0,1
4,1,2012-08-13,2,0,2


### Tag Variable (0 = not above the 1.5×IQR upper fence, 1 = above the 1.5×IQR fence, 2 = above the 3.0×IQR fence)

In [25]:
# Rebuild the user_id -> (study_id, gender) lookup table.
df_redcap = pd.read_csv('Raw Data/mbc2_redcap.csv')
df_gender = df_redcap[['Participant ID', 'Sex']].copy()
df_gender.columns = ['study_id', 'gender']
df_user_info = df_user[['study_id', 'user_id']]
df_info = pd.merge(df_gender, df_user_info, on='study_id')
# Gender code = first character of the sex value.
df_info['gender'] = df_info['gender'].astype('string').str.get(0)

# Attach study_id and gender to each daily PA table (no cutoff, 16000, 20000).
df_pa_0, df_pa_16k, df_pa_20k = (
    daily.merge(df_info, on='user_id')
    for daily in (combine, combine_16000, combine_20000)
)

In [30]:
#### combined, no upper cutoff (cutoff = 0)
# Table to tag (df is an alias of df_pa_0, not a copy) and its outcome column.
df, outcome = df_pa_0, 'MVPA_min'

def outliers(df, column, IQR_level):
    """Return the ``[low, high]`` outlier fences for ``df[column]``.

    The fences are ``q1 - IQR_level * IQR`` and ``q3 + IQR_level * IQR``,
    where ``q1``/``q3`` are the 25th/75th percentiles and ``IQR = q3 - q1``.

    Missing values are ignored. ``np.percentile`` returns NaN as soon as one
    NaN is present, which would make every later ``>`` comparison against the
    fence False and silently tag nothing as an outlier.

    Args:
        df: table holding the column to analyse.
        column: name of the column; its values must be convertible to float.
        IQR_level: multiplier applied to the IQR (e.g. 1.5 or 3.0).

    Returns:
        A two-element list ``[low, high]``. Both entries are NaN when the
        column holds no non-missing value.
    """
    values = np.asarray(df[column], dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        return [np.nan, np.nan]

    # Both quartiles in a single pass over the data.
    q1, q3 = np.percentile(values, [25, 75])
    IQR = q3 - q1
    low = q1 - IQR_level * IQR
    high = q3 + IQR_level * IQR

    return [low, high]

def outliers_tag(row):
    """Tag one row as 0 (no outlier), 1 (above ``*_1``) or 2 (above ``*_2``).

    Uses the module-level ``outcome`` column name and the fences
    ``male_1``/``male_2`` for rows whose ``gender`` is ``'1'``, and
    ``female_1``/``female_2`` for all other rows.
    """
    mild, extreme = (male_1, male_2) if row['gender'] == '1' else (female_1, female_2)
    value = row[outcome]
    # Test the stricter fence first so a value above both is tagged 2.
    return 2 if value > extreme else (1 if value > mild else 0)

# Rows per gender code ('1' = male, '2' = female).
gender_code = df['gender']
df_male, df_female = df[gender_code == '1'], df[gender_code == '2']

# High fences at 1.5x IQR (*_1) and 3.0x IQR (*_2) for each gender.
male_1 = outliers(df_male, outcome, 1.5)[1]
male_2 = outliers(df_male, outcome, 3.0)[1]
female_1 = outliers(df_female, outcome, 1.5)[1]
female_2 = outliers(df_female, outcome, 3.0)[1]

# Store the 0/1/2 tag of every row in '<outcome>_outlier'.
outlier_name = '{}_outlier'.format(outcome)
df[outlier_name] = df.apply(outliers_tag, axis=1)

# Write the tagged table to disk, without the index column.
df.to_csv(path_or_buf='Result/MVPA/pa_combined.csv', index=False)

In [28]:
#### combined, 16000 upper cutoff
# Table to tag (df is an alias of df_pa_16k, not a copy) and its outcome column.
df, outcome = df_pa_16k, 'MVPA_min'

def outliers(df, column, IQR_level):
    """Return the ``[low, high]`` outlier fences for ``df[column]``.

    The fences are ``q1 - IQR_level * IQR`` and ``q3 + IQR_level * IQR``,
    where ``q1``/``q3`` are the 25th/75th percentiles and ``IQR = q3 - q1``.

    Missing values are ignored. ``np.percentile`` returns NaN as soon as one
    NaN is present, which would make every later ``>`` comparison against the
    fence False and silently tag nothing as an outlier.

    Args:
        df: table holding the column to analyse.
        column: name of the column; its values must be convertible to float.
        IQR_level: multiplier applied to the IQR (e.g. 1.5 or 3.0).

    Returns:
        A two-element list ``[low, high]``. Both entries are NaN when the
        column holds no non-missing value.
    """
    values = np.asarray(df[column], dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        return [np.nan, np.nan]

    # Both quartiles in a single pass over the data.
    q1, q3 = np.percentile(values, [25, 75])
    IQR = q3 - q1
    low = q1 - IQR_level * IQR
    high = q3 + IQR_level * IQR

    return [low, high]

def outliers_tag(row):
    """Return 2, 1 or 0 depending on which upper fence the row exceeds.

    The value tested is ``row[outcome]`` (``outcome`` is a module-level name).
    Rows with ``gender == '1'`` are compared with ``male_2`` then ``male_1``;
    all other rows with ``female_2`` then ``female_1``.
    """
    if row['gender'] == '1':  # male
        fences = ((2, male_2), (1, male_1))
    else:  # female
        fences = ((2, female_2), (1, female_1))

    # Fences are ordered strictest first; the first one exceeded decides.
    for tag, fence in fences:
        if row[outcome] > fence:
            return tag
    return 0

# Male ('1') and female ('2') subsets of the table.
df_male = df.loc[df['gender'] == '1']
df_female = df.loc[df['gender'] == '2']

# outliers() returns [low, high]; only the high fence (last item) is used.
male_1 = outliers(df_male, outcome, 1.5)[-1]
female_1 = outliers(df_female, outcome, 1.5)[-1]
male_2 = outliers(df_male, outcome, 3.0)[-1]
female_2 = outliers(df_female, outcome, 3.0)[-1]

# Per-row 0/1/2 tag, stored under '<outcome>_outlier'.
outlier_name = f'{outcome}_outlier'
df[outlier_name] = df.apply(outliers_tag, axis=1)

# Write the tagged table to disk, without the index column.
df.to_csv(path_or_buf='Result/MVPA/pa_combined_16k.csv', index=False)

In [29]:
#### combined, 20000 upper cutoff
# Table to tag (df is an alias of df_pa_20k, not a copy) and its outcome column.
df, outcome = df_pa_20k, 'MVPA_min'

def outliers(df, column, IQR_level):
    """Return the ``[low, high]`` outlier fences for ``df[column]``.

    The fences are ``q1 - IQR_level * IQR`` and ``q3 + IQR_level * IQR``,
    where ``q1``/``q3`` are the 25th/75th percentiles and ``IQR = q3 - q1``.

    Missing values are ignored. ``np.percentile`` returns NaN as soon as one
    NaN is present, which would make every later ``>`` comparison against the
    fence False and silently tag nothing as an outlier.

    Args:
        df: table holding the column to analyse.
        column: name of the column; its values must be convertible to float.
        IQR_level: multiplier applied to the IQR (e.g. 1.5 or 3.0).

    Returns:
        A two-element list ``[low, high]``. Both entries are NaN when the
        column holds no non-missing value.
    """
    values = np.asarray(df[column], dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        return [np.nan, np.nan]

    # Both quartiles in a single pass over the data.
    q1, q3 = np.percentile(values, [25, 75])
    IQR = q3 - q1
    low = q1 - IQR_level * IQR
    high = q3 + IQR_level * IQR

    return [low, high]

def outliers_tag(row):
    """Classify one row against the gender-specific upper fences.

    ``gender == '1'`` selects the male fences (``male_1``, ``male_2``); any
    other value selects the female ones (``female_1``, ``female_2``). The
    tested value is ``row[outcome]``; ``outcome`` and the fences are
    module-level names.

    Returns 2 above the ``*_2`` fence, 1 above only the ``*_1`` fence, else 0.
    """
    male = row['gender'] == '1'

    upper = male_2 if male else female_2
    if row[outcome] > upper:
        return 2

    lower = male_1 if male else female_1
    return 1 if row[outcome] > lower else 0

# One sub-table per gender code: '1' (male) first, then '2' (female).
df_male, df_female = (df[df['gender'] == code] for code in ('1', '2'))

# Upper fences (index 1 of outliers()) at 1.5x and 3.0x IQR, per gender.
male_1, male_2 = (outliers(df_male, outcome, level)[1] for level in (1.5, 3.0))
female_1, female_2 = (outliers(df_female, outcome, level)[1] for level in (1.5, 3.0))

# Add the 0/1/2 tag column named '<outcome>_outlier'.
outlier_name = '{}_outlier'.format(outcome)
df[outlier_name] = df.apply(outliers_tag, axis=1)

# Write the tagged table to disk, without the index column.
df.to_csv(path_or_buf='Result/MVPA/pa_combined_20k.csv', index=False)