In [1]:
import pandas as pd
from zipfile import ZipFile

zip_file = ZipFile('archive.zip')
data_hrStp = pd.read_csv(zip_file.open('Fitabase Data 4.12.16-5.12.16/hourlySteps_merged.csv'), index_col=1)
data_hrCal = pd.read_csv(zip_file.open('Fitabase Data 4.12.16-5.12.16/hourlyCalories_merged.csv'), index_col=1)
data_mnSlp = pd.read_csv(zip_file.open('Fitabase Data 4.12.16-5.12.16/minuteSleep_merged.csv'), index_col=1)
data_mnInt = pd.read_csv(zip_file.open('Fitabase Data 4.12.16-5.12.16/minuteIntensitiesWide_merged.csv'), index_col=1)

data_hrStp.index = pd.to_datetime(data_hrStp.index)
data_hrCal.index = pd.to_datetime(data_hrCal.index)

In [2]:
## CONVERT minute Sleep file to hourly:

# Convert index to datetime (needed for resampling):
data_mnSlp.index = pd.to_datetime(data_mnSlp.index)

# Add separated columns for sleep quality: Per dictionary: 1 = asleep, 2 = restless, 3 = awake
data_mnSlp['1=asleep'] = data_mnSlp['value'].apply(lambda x: 1 if x == 1 else 0)
data_mnSlp['2=restless'] = data_mnSlp['value'].apply(lambda x: 1 if x == 2 else 0)
data_mnSlp['3=awake'] = data_mnSlp['value'].apply(lambda x: 1 if x == 3 else 0)
# Resample minute Sleep data into hourly Sleep data, keeping date and 3 columns for sleep quality
data_hrSlp = data_mnSlp[['1=asleep','2=restless','3=awake']].resample('H').sum()
data_hrSlp['Id'] = data_mnSlp['Id'].unique()[0]
data_hrSlp.index.name = 'ActivityHour'
data_hrSlp.head(5)


Unnamed: 0_level_0,1=asleep,2=restless,3=awake,Id
ActivityHour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-04-11 20:00:00,11,1,0,1503960366
2016-04-11 21:00:00,84,19,4,1503960366
2016-04-11 22:00:00,304,24,1,1503960366
2016-04-11 23:00:00,446,22,3,1503960366
2016-04-12 00:00:00,499,36,7,1503960366


In [3]:
## SUMMARIZE minute Intensities file to hourly:

data_mnInt.index = pd.to_datetime(data_mnInt.index)

# Add separated columns for intensity level: Per dictionary: 0 = Sedentary, 1 = Light, 2 = Moderate, 3 = Very Active
data_mnInt['0=sedentary'] = data_mnInt.eq(0).sum(axis=1)
data_mnInt['1=light'] = data_mnInt.eq(1).sum(axis=1)
data_mnInt['2=moderate'] = data_mnInt.eq(2).sum(axis=1)
data_mnInt['3=veryactive'] = data_mnInt.eq(3).sum(axis=1)

# Summarize by id/hour:
data_hrInt = data_mnInt[['Id','0=sedentary','1=light','2=moderate','3=veryactive']]
print('Summarized data:\n', data_hrInt.head(5))


Summarized data:
                              Id  0=sedentary  1=light  2=moderate  \
ActivityHour                                                        
2016-04-13 00:00:00  1503960366           46       14           0   
2016-04-13 01:00:00  1503960366           60        0           0   
2016-04-13 02:00:00  1503960366           60        0           0   
2016-04-13 03:00:00  1503960366           56        4           0   
2016-04-13 04:00:00  1503960366           60        0           0   

                     3=veryactive  
ActivityHour                       
2016-04-13 00:00:00             0  
2016-04-13 01:00:00             0  
2016-04-13 02:00:00             0  
2016-04-13 03:00:00             0  
2016-04-13 04:00:00             0  


In [4]:
#Counts:
print(data_hrStp.info())
print(data_hrCal.info())
print(data_hrSlp.info())
print(data_hrInt.info())
print()

merged_df = pd.merge(data_hrStp, data_hrCal, on=["Id", "ActivityHour"], how="inner")
merged_df = pd.merge(merged_df, data_hrInt, on=["Id", "ActivityHour"], how="left")
#merged_df = pd.merge(merged_df, data_hrSlp, on=["Id", "ActivityHour"], how="left")   # not working


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 22099 entries, 2016-04-12 00:00:00 to 2016-05-12 14:00:00
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Id         22099 non-null  int64
 1   StepTotal  22099 non-null  int64
dtypes: int64(2)
memory usage: 517.9 KB
None
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 22099 entries, 2016-04-12 00:00:00 to 2016-05-12 14:00:00
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Id        22099 non-null  int64
 1   Calories  22099 non-null  int64
dtypes: int64(2)
memory usage: 517.9 KB
None
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 734 entries, 2016-04-11 20:00:00 to 2016-05-12 09:00:00
Freq: H
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   1=asleep    734 non-null    int64
 1   2=restless  734 non-null    int64
 2   3=awake     73

In [5]:
merged_df.info()

#merged_df.fillna(0, inplace=True)
#export to excel for group assignment

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 22099 entries, 2016-04-12 00:00:00 to 2016-05-12 14:00:00
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            22099 non-null  int64  
 1   StepTotal     22099 non-null  int64  
 2   Calories      22099 non-null  int64  
 3   0=sedentary   21307 non-null  float64
 4   1=light       21307 non-null  float64
 5   2=moderate    21307 non-null  float64
 6   3=veryactive  21307 non-null  float64
dtypes: float64(4), int64(3)
memory usage: 1.3 MB


To investigate:
- We lost 338 records from Intensities...
- We lost xxx records from Sleep...

In [6]:
print(merged_df.head())
print()
print(merged_df.tail())


                             Id  StepTotal  Calories  0=sedentary  1=light  \
ActivityHour                                                                 
2016-04-12 00:00:00  1503960366        373        81          NaN      NaN   
2016-04-12 01:00:00  1503960366        160        61          NaN      NaN   
2016-04-12 02:00:00  1503960366        151        59          NaN      NaN   
2016-04-12 03:00:00  1503960366          0        47          NaN      NaN   
2016-04-12 04:00:00  1503960366          0        48          NaN      NaN   

                     2=moderate  3=veryactive  
ActivityHour                                   
2016-04-12 00:00:00         NaN           NaN  
2016-04-12 01:00:00         NaN           NaN  
2016-04-12 02:00:00         NaN           NaN  
2016-04-12 03:00:00         NaN           NaN  
2016-04-12 04:00:00         NaN           NaN  

                             Id  StepTotal  Calories  0=sedentary  1=light  \
ActivityHour                          