## Sedentary and MVPA Outcomes Extraction
#### This notebook extracts 212 participants' sedentary and physical activity minutes from the raw dataset at daily granularity
#### The inputs and outputs are in csv format

In [2]:
import datetime
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandasql as ps
import seaborn as sns

In [23]:
# Load the four raw CSV tables, in the same order as the names on the left.
df_sed, df_pa, df_pa_device, df_user = (
    pd.read_csv(raw_csv)
    for raw_csv in (
        'Raw Data/sed_upload.csv',
        'Raw Data/manual_min_upload.csv',
        'Raw Data/s2_counts.csv',
        'Raw Data/users.csv',
    )
)

### Extract Participants (212 from intervention period)

In [4]:
# All participants: accounts whose username starts with 'mbc2'.
# The match is vectorized and NA-safe: a missing username counts as a
# non-match (na=False) instead of raising, and the lookup no longer depends
# on df_user having a default 0..n-1 index.
is_participant = df_user['username'].astype('string').str.startswith('mbc2', na=False)
id_list = sorted(df_user.loc[is_participant, 'user_id'].tolist())
print('There are', len(id_list), 'participants in intervention/follow-up period')

There are 212 participants in intervention/follow-up period


In [4]:
# Keep only the participants' rows in each table and renumber the rows from 0.
df_sed = df_sed.loc[df_sed['user_id'].isin(id_list)].reset_index(drop=True)
df_pa = df_pa.loc[df_pa['user_id'].isin(id_list)].reset_index(drop=True)
df_pa_device = df_pa_device.loc[df_pa_device['user_id'].isin(id_list)].reset_index(drop=True)

### SED (dayp1_minute + dayp2_minute + dayp3_minute +  dayp4_minute )

In [5]:
# Total sedentary minutes per row: the sum of the four day-part columns.
df_sed['sed_total'] = (
    df_sed['dayp1_minute']
    .add(df_sed['dayp2_minute'])
    .add(df_sed['dayp3_minute'])
    .add(df_sed['dayp4_minute'])
)

In [6]:
# Columns kept in the cleaned sedentary table.
sed_list_included = [
    'user_id',
    'when_sed',
    'dayp1_minute',
    'dayp2_minute',
    'dayp3_minute',
    'dayp4_minute',
    'sed_total',
]
df_sed = df_sed.loc[:, sed_list_included]

In [7]:
# Preview of the cleaned sedentary outcome table (first five rows)
df_sed.head()

Unnamed: 0,user_id,when_sed,dayp1_minute,dayp2_minute,dayp3_minute,dayp4_minute,sed_total
0,14,2012-08-13,40,0,0,60,100
1,14,2012-08-14,60,0,30,0,90
2,9,2012-08-14,30,0,0,90,120
3,14,2012-08-15,0,60,0,180,240
4,9,2012-08-15,45,0,0,120,165


### PA (Manually entered time: min)

In [14]:
# Columns kept from the self-reported activity table.
pa_list_included = ['user_id','tstamp_phone','min']
# .copy() makes df_pa an independent frame, so assigning to one of its columns
# later does not raise SettingWithCopyWarning.
df_pa = df_pa[pa_list_included].copy()

In [15]:
# Preview of the cleaned self-reported MVPA table (first five rows)
df_pa.head()

Unnamed: 0,user_id,tstamp_phone,min
0,14,2012-09-02 20:45:23,60
1,14,2012-09-02 20:46:01,90
2,1,2012-09-06 21:38:55,20
3,10000021,2012-09-07 11:21:24,30
4,1,2012-09-07 21:23:09,20


In [16]:
# Day-level view of the self-reported minutes.
# The result is stored under its own names and df_pa is left untouched:
# extractMVPA_shimmer() further down reads the raw 'tstamp_phone' and 'min'
# columns of df_pa, so overwriting df_pa here breaks a top-to-bottom run.

# get day-level date (first 10 characters of the timestamp, i.e. YYYY-MM-DD);
# assign() builds a new frame, which also avoids SettingWithCopyWarning
df_pa_dated = df_pa.assign(tstamp_phone=df_pa['tstamp_phone'].astype('string').str[:10])

# aggregate day-level minutes (sum)
df_pa_daily = ps.sqldf("SELECT user_id, tstamp_phone as upload_time, sum(min) as pa_minute_app FROM df_pa_dated group by user_id, tstamp_phone")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### MVPA (device)

In [24]:
# Preview of the device table (first five rows)
df_pa_device.head()

Unnamed: 0,_id,user_id,uc_id,tstamp,pa_count,tstamp_server_uploaded,battery
0,1,10000020,1,2012-08-08 06:57:35,355,2012-08-08 07:01:19,2631
1,2,10000020,2,2012-08-08 06:58:35,54,2012-08-08 07:01:19,2631
2,3,10000020,3,2012-08-08 06:59:35,54,2012-08-08 07:01:19,2631
3,4,10000020,4,2012-08-08 07:00:35,54,2012-08-08 07:01:19,2631
4,5,10000020,5,2012-08-08 07:01:35,55,2012-08-08 07:05:28,2651


In [28]:
def extractMVPA_shimmer(df_pa, df_pa_device, active_threshold=1900, abnormal_threshold=100000):
    """Combine self-reported and device-measured activity minutes per user and day.

    Neither input frame is modified.

    Parameters
    ----------
    df_pa : pandas.DataFrame
        Self-reported activity, in one of two layouts:
        raw uploads with columns 'user_id', 'tstamp_phone', 'min', or an
        already aggregated table with columns 'user_id', 'upload_time',
        'pa_minute_app'.
    df_pa_device : pandas.DataFrame
        Device readings with columns 'user_id', 'tstamp', 'pa_count'.
        Each row at or above ``active_threshold`` is counted as one active
        minute (presumably the device logs one row per minute -- confirm).
    active_threshold : number, default 1900
        Minimum 'pa_count' for a reading to count as active.
    abnormal_threshold : number, default 100000
        Days whose 'pa_minute_shimmer' reaches this value get
        'Abnormal_counts' = 1.

    Returns
    -------
    pandas.DataFrame
        One row per (user_id, upload_time) found in either source, with the
        columns 'user_id', 'upload_time' (YYYY-MM-DD text), 'pa_minute_app',
        'pa_minute_shimmer', 'MVPA_min' and 'Abnormal_counts'.
        Rows whose user_id or timestamp is missing are left out.

    Raises
    ------
    KeyError
        If a required column is missing from either input.
    """
    keys = ['user_id', 'upload_time']

    # --- self-reported minutes: normalise both layouts to the same columns ---
    pa_columns = set(df_pa.columns)
    if {'tstamp_phone', 'min'} <= pa_columns:
        app = df_pa[['user_id', 'tstamp_phone', 'min']].rename(
            columns={'tstamp_phone': 'upload_time', 'min': 'pa_minute_app'}
        )
    elif {'upload_time', 'pa_minute_app'} <= pa_columns:
        app = df_pa[['user_id', 'upload_time', 'pa_minute_app']]
    else:
        raise KeyError(
            "df_pa needs either ('tstamp_phone', 'min') or "
            "('upload_time', 'pa_minute_app') columns"
        )

    # get day-level date: the first 10 characters of the timestamp (YYYY-MM-DD)
    app = app.assign(upload_time=app['upload_time'].astype('string').str[:10])

    # aggregate day-level minutes (sum); min_count=1 keeps a day whose minutes
    # are all missing as NaN here, it is filled with 0 after the merge
    app_daily = app.groupby(keys)['pa_minute_app'].sum(min_count=1).reset_index()

    # --- device minutes ---
    # active minute: keep only readings at or above the threshold
    is_active = df_pa_device['pa_count'] >= active_threshold
    active = df_pa_device.loc[is_active, ['user_id', 'tstamp', 'pa_count']]
    active = active.assign(upload_time=active['tstamp'].astype('string').str[:10])

    # group by user and each day, count total physical activity minutes per day
    shimmer_daily = (
        active.groupby(keys)['pa_count']
        .count()
        .rename('pa_minute_shimmer')
        .reset_index()
    )

    # merge two; an outer join keeps days that appear in only one source
    combine = pd.merge(app_daily, shimmer_daily, how='outer', on=keys)

    # fill missing values with 0
    combine['pa_minute_app'] = combine['pa_minute_app'].fillna(0)
    combine['pa_minute_shimmer'] = combine['pa_minute_shimmer'].fillna(0)

    # get combined outcome (sum)
    combine['MVPA_min'] = combine['pa_minute_app'] + combine['pa_minute_shimmer']

    # more than 100000 as abnormal
    # NOTE(review): pa_minute_shimmer is a per-day count of active readings, so
    # with one reading per minute it cannot come close to 100000 and this flag
    # stays 0. The rule may have been meant for the raw 'pa_count' values --
    # confirm the intended definition before relying on this column.
    combine['Abnormal_counts'] = (combine['pa_minute_shimmer'] >= abnormal_threshold).astype('int64')

    return combine

In [29]:
# Combined day-level table for all rows, abnormal ones included
df_pa_device_combine = extractMVPA_shimmer(df_pa=df_pa, df_pa_device=df_pa_device)
df_pa_device_combine.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,user_id,upload_time,pa_minute_app,pa_minute_shimmer,MVPA_min,Abnormal_counts
0,1,2012-09-06,20.0,0.0,20.0,0
1,1,2012-09-07,20.0,2.0,22.0,0
2,1,2012-09-10,20.0,0.0,20.0,0
3,1,2012-09-12,20.0,0.0,20.0,0
4,1,2012-09-16,20.0,4.0,24.0,0


In [32]:
# Keep only the rows that are not flagged as abnormal
df_pa_device_normal = df_pa_device_combine.loc[df_pa_device_combine["Abnormal_counts"].eq(0)]
df_pa_device_normal.head()

Unnamed: 0,user_id,upload_time,pa_minute_app,pa_minute_shimmer,MVPA_min,Abnormal_counts
0,1,2012-09-06,20.0,0.0,20.0,0
1,1,2012-09-07,20.0,2.0,22.0,0
2,1,2012-09-10,20.0,0.0,20.0,0
3,1,2012-09-12,20.0,0.0,20.0,0
4,1,2012-09-16,20.0,4.0,24.0,0


In [36]:
# Self-reported (app) minutes only
df_pa = df_pa_device_combine.loc[:, ['user_id', 'upload_time', 'pa_minute_app']]
df_pa.head()

Unnamed: 0,user_id,upload_time,pa_minute_app
0,1,2012-09-06,20.0
1,1,2012-09-07,20.0
2,1,2012-09-10,20.0
3,1,2012-09-12,20.0
4,1,2012-09-16,20.0


In [37]:
# Device (shimmer) minutes only, dropping days whose count is 0
df_shimmer = df_pa_device_combine.loc[
    df_pa_device_combine['pa_minute_shimmer'].ne(0),
    ['user_id', 'upload_time', 'pa_minute_shimmer'],
]
df_shimmer.head()

Unnamed: 0,user_id,upload_time,pa_minute_shimmer
1,1,2012-09-07,2.0
4,1,2012-09-16,4.0
7,1,2012-09-19,1.0
8,1,2012-09-20,2.0
9,1,2012-09-23,11.0


### Save the results (local csv)

In [79]:
# DataFrame.to_csv does not create missing folders, so make sure the output
# folder exists before the first file is written.
os.makedirs('Result', exist_ok=True)
df_sed.to_csv('Result/sed_clean.csv', index=False)

In [39]:
# Self-reported minutes per user and day
df_pa.to_csv(path_or_buf='Result/pa_clean.csv', index=False)

In [38]:
# Device minutes per user and day (non-zero days only)
df_shimmer.to_csv(path_or_buf='Result/pa_shimmer_clean.csv', index=False)

In [33]:
# Combined table restricted to rows not flagged as abnormal
df_pa_device_normal.to_csv(path_or_buf='Result/pa_merged_normal.csv', index=False)

In [30]:
# Full combined table (app + device minutes)
df_pa_device_combine.to_csv(path_or_buf='Result/pa_merged.csv', index=False)