In [341]:
import pandas as pd
import os

from datetime import datetime
import random
import numpy as np
import math

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

In [2]:
raw_data_path = "../raw_data/"

file_names = os.listdir(raw_data_path)
file_path = [f'{raw_data_path}{filename}' for filename in file_names]

In [3]:
# read all data

# Intensities
df_minuteIntensitiesWide = pd.read_csv(file_path[0])
df_minuteIntensitiesNarrow = pd.read_csv(file_path[4])
df_hourlyIntensities = pd.read_csv(file_path[7])
df_dailyIntensities = pd.read_csv(file_path[17])

# Calories
df_minuteCaloriesWide = pd.read_csv(file_path[13])
df_minuteCaloriesNarrow = pd.read_csv(file_path[11])
df_hourlyCalories = pd.read_csv(file_path[9])
df_dailyCalories = pd.read_csv(file_path[8])

# Step
df_minuteStepsWide = pd.read_csv(file_path[15])
df_minuteStepsNarrow = pd.read_csv(file_path[14])
df_hourlySteps = pd.read_csv(file_path[6])
df_dailySteps = pd.read_csv(file_path[12])

# Sleep
df_sleepDay = pd.read_csv(file_path[2])
df_minuteSleep = pd.read_csv(file_path[10])

# HeartRate
df_heartrate_seconds = pd.read_csv(file_path[3])

# Standalone
df_minuteMETsNarrow = pd.read_csv(file_path[5])
df_weightLogInfo = pd.read_csv(file_path[16])
df_dailyActivity = pd.read_csv(file_path[1])

The brand is focusing on women audience and health focused, one of the aspect to determine health condition is METs and Weight. So in the next few parts, I will try to find the correlation between those 2 aspect with intensities, calories, steps, sleep, and heartRate. Let's take a look at the data table that contains weight or METs. 

In [4]:
display(df_weightLogInfo.head(5))
print(f'Total Data point : {len(df_weightLogInfo)}')

Unnamed: 0,Id,Date,WeightKg,WeightPounds,Fat,BMI,IsManualReport,LogId
0,1503960366,5/2/2016 11:59:59 PM,52.599998,115.963147,22.0,22.65,True,1462233599000
1,1503960366,5/3/2016 11:59:59 PM,52.599998,115.963147,,22.65,True,1462319999000
2,1927972279,4/13/2016 1:08:52 AM,133.5,294.31712,,47.540001,False,1460509732000
3,2873212765,4/21/2016 11:59:59 PM,56.700001,125.002104,,21.450001,True,1461283199000
4,2873212765,5/12/2016 11:59:59 PM,57.299999,126.324875,,21.690001,True,1463097599000


Total Data point : 67


sadly this table only store 67 data, let's check on how many Ids have their weight data and who?

In [5]:
print(df_weightLogInfo['Id'].value_counts())
print(f'Number of Id : {df_weightLogInfo["Id"].nunique()}')

6962181067    30
8877689391    24
4558609924     5
2873212765     2
1503960366     2
4319703577     2
5577150313     1
1927972279     1
Name: Id, dtype: int64
Number of Id : 8


In [71]:
df_weightLogInfo['Datetime'] = pd.to_datetime(df_weightLogInfo['Date'])
df_weightLogInfo['Date'] = [d.date() for d in df_weightLogInfo['Datetime']]
df_weightLogInfo['Time'] = [d.time() for d in df_weightLogInfo['Datetime']]

In [72]:
df_weightLogInfo

Unnamed: 0,Id,Date,WeightKg,WeightPounds,Fat,BMI,IsManualReport,LogId,Datetime,Time
0,1503960366,2016-05-02,52.599998,115.963147,22.0,22.650000,True,1462233599000,2016-05-02 23:59:59,23:59:59
1,1503960366,2016-05-03,52.599998,115.963147,,22.650000,True,1462319999000,2016-05-03 23:59:59,23:59:59
2,1927972279,2016-04-13,133.500000,294.317120,,47.540001,False,1460509732000,2016-04-13 01:08:52,01:08:52
3,2873212765,2016-04-21,56.700001,125.002104,,21.450001,True,1461283199000,2016-04-21 23:59:59,23:59:59
4,2873212765,2016-05-12,57.299999,126.324875,,21.690001,True,1463097599000,2016-05-12 23:59:59,23:59:59
...,...,...,...,...,...,...,...,...,...,...
62,8877689391,2016-05-06,85.000000,187.392923,,25.440001,False,1462517015000,2016-05-06 06:43:35,06:43:35
63,8877689391,2016-05-08,85.400002,188.274775,,25.559999,False,1462692953000,2016-05-08 07:35:53,07:35:53
64,8877689391,2016-05-09,85.500000,188.495234,,25.610001,False,1462775984000,2016-05-09 06:39:44,06:39:44
65,8877689391,2016-05-11,85.400002,188.274775,,25.559999,False,1462949507000,2016-05-11 06:51:47,06:51:47


In [110]:
fig = px.line(df_weightLogInfo, 
              x='Date', 
              y="WeightKg", 
              title='Weight Data plot',
              color='Id',
)
fig.add_scatter(x=df_weightLogInfo['Date'], 
                y=df_weightLogInfo['WeightKg'],
                mode='markers', 
                marker_size=5,
                fillcolor='red'
)
fig.show()

In [103]:
df_user_weight = df_weightLogInfo[['Id', 'WeightKg', 'BMI']].groupby('Id').max()
df_user_weight.reset_index(inplace=True)

df_user_weight.head(10)

Unnamed: 0,Id,WeightKg,BMI
0,1503960366,52.599998,22.65
1,1927972279,133.5,47.540001
2,2873212765,57.299999,21.690001
3,4319703577,72.400002,27.450001
4,4558609924,70.300003,27.459999
5,5577150313,90.699997,28.0
6,6962181067,62.5,24.389999
7,8877689391,85.800003,25.68


There's only 8 Id (out of 33) that have weight data within 67 data points, and 54 of those belongs to 2 Id. afterall those weight data doesnt have any significant changes, so we might not be able to get anything from changes of the weight data. Because the weight is very lightly changed troughout the timeline, we will use the average weight of the person to represent him/her. Let's now check on the next attribute, `METs`

In [6]:
df_minuteMETsNarrow.head(3)

Unnamed: 0,Id,ActivityMinute,METs
0,1503960366,4/12/2016 12:00:00 AM,10
1,1503960366,4/12/2016 12:01:00 AM,10
2,1503960366,4/12/2016 12:02:00 AM,10


since I'm not familiar yet with METs let's take a look on how METs changes troughout the day

In [7]:
df_30minuteMETs = df_minuteMETsNarrow.copy()
df_30minuteMETs['ActivityMinute'] = pd.to_datetime(df_30minuteMETs['ActivityMinute'])

In [8]:
df_30minuteMETs['Date'] = [d.date() for d in df_30minuteMETs['ActivityMinute']]
df_30minuteMETs['Time'] = [d.time() for d in df_30minuteMETs['ActivityMinute']]

In [9]:
# filter out the time to 10 Minutes
# this is only used for data visualization

time_idx = pd.date_range("01-01-01", periods=144, freq="10min").time
df_30minuteMETs = df_30minuteMETs[df_30minuteMETs['Time'].isin(time_idx)]

In [75]:
METs_groupbyTimeId_df = df_30minuteMETs.groupby(['Time', 'Id']).mean()

random_3_id = random.sample(set(METs_groupbyTimeId_df.index.get_level_values(1)),3)
METs_groupby_random_3_df = METs_groupbyTimeId_df[METs_groupbyTimeId_df.index.isin(random_3_id, level=1)]

fig = px.line(METs_groupby_random_3_df, 
              x=METs_groupby_random_3_df.index.get_level_values(0), 
              y="METs", 
              title='METs changes troughout the day',
              color=METs_groupby_random_3_df.index.get_level_values(1),
)
fig.update_layout(hovermode="x")
fig.show()

based on those sample, METs can change rapidly within minutes. We can also see that it's really low at early daytime. Let's have a look with all the data involved, because it'll be chaos if we display every Id, we will get the average METs every 10 min to get the full picture of the pattern (if exist).

In [65]:
METs_groupbyTime_df = df_30minuteMETs.groupby(['Time']).mean()

fig = px.line(METs_groupbyTime_df, 
              x=METs_groupbyTime_df.index.get_level_values(0), 
              y="METs", 
              title='METs changes troughout the day'
)
fig.update_layout(hovermode="x")
fig.show()

It's true that METs is lower during sleep time rather than woke up time. Since now we get the general idea on how METs changes troughout the day, Let's proceed to find a pattern or correlation for weight, METs, intensities, step, calories, sleep, and heart rate. To make it easier, let's try to find a patter within the `dailyActifity` table.

In [106]:
df_dailyActivity.head(3)

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
0,1503960366,4/12/2016,13162,8.5,8.5,0.0,1.88,0.55,6.06,0.0,25,13,328,728,1985
1,1503960366,4/13/2016,10735,6.97,6.97,0.0,1.57,0.69,4.71,0.0,21,19,217,776,1797
2,1503960366,4/14/2016,10460,6.74,6.74,0.0,2.44,0.4,3.91,0.0,30,11,181,1218,1776


My main focus for this table is to look at what are the effect of each features in respect to calories. From the first sight, i see that there's 2 key differences which is distance (Steps & Distance) and minutes (time). so for the next few visualization we will see on what is the best way to achieve bigger calories. Let's have a look first at the Total steps vs total distance, because my instinct is if the total steps is high but the distance is low, the activity might not be very active and will take a lot of time.

Since we already have TotalSteps and Total Distance, Let's make a new column called Active Minutes. I will accumulate veryActiveMinutes, FairlyActivesMinutes, and LightlyActiveMinutes. we're not going to use sedentary Minutes, because based on dictionary it's considered little to no exercise so it wont effect much to the calories.

0      366
1      257
2      222
3      272
4      267
      ... 
935    266
936    309
937    253
938    313
939    161
Length: 940, dtype: int64

In [135]:
activeMinutesCol = ['VeryActiveMinutes', 'FairlyActiveMinutes', 'LightlyActiveMinutes']

df_dailyActivity['TotalActiveMinutes'] = df_dailyActivity[activeMinutesCol].sum(axis=1)

In [136]:
df_dailyActivity[['TotalSteps', 'TotalDistance', 'TotalActiveMinutes', 'Calories']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TotalSteps,940.0,7637.910638,5087.150742,0.0,3789.75,7405.5,10727.0,36019.0
TotalDistance,940.0,5.489702,3.924606,0.0,2.62,5.245,7.7125,28.030001
TotalActiveMinutes,940.0,227.542553,121.776307,0.0,146.75,247.0,317.25,552.0
Calories,940.0,2303.609574,718.166862,0.0,1828.5,2134.0,2793.25,4900.0


In [175]:
fig = px.scatter(df_dailyActivity, 
                 x="TotalSteps", 
                 y="TotalDistance",
                 trendline="ols",
                 title="Total Step vs Total Distance")
fig.data[1].line.color = 'red'
fig.show()

it turns out that total step and total distance have a very linear relationship. but after 15k steps or 10 (I don't know the metric) distance, there seems to achieve higher distance with less total steps. Let's have a look on those data.

In [119]:
df_high_dist_steps = df_dailyActivity.loc[(
    df_dailyActivity['TotalSteps'] > 15000) | (df_dailyActivity['TotalDistance'] > 10)]

In [125]:
df_high_dist_steps["Id"] = df_high_dist_steps["Id"].apply(str)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [164]:
fig = px.scatter(df_high_dist_steps, 
                 x="TotalSteps", 
                 y="TotalDistance",
                 color="Id",
                 title="Total Distance vs Total Step (Zoomed in on higher value)")
fig.show()

it turns out that those "anomaly" is caused by only 1 user with id "8877689391" which is interesting, and we have the weight data on that user! so we will do some checking if needed. Since TotalDistance is highly correlated with TotalSteps, we can choose one of them and it will represent the other. so, I will choose TotalSteps and ignore TotalDistance.

In [173]:
fig = px.scatter(df_dailyActivity, 
                 x="TotalSteps", 
                 y="TotalActiveMinutes",
                 trendline="ols",
                 title="Total Active Minutes vs Total Step",
                 hover_data=["Id"]
)
fig.data[1].line.color = 'red'
fig.show()

it turns out that Total Steps and Total Active Minutes have a good linear correlation. Even though there are something unusual where a user have a total Step but no active minutes. But after I've hover through their ID, it only happends to 2 different user which is user "4319703577" and "8583815059", so there might be some issues with the software/hardware.

In [203]:
df = px.data.tips()
fig = px.histogram(df_dailyActivity, 
                   x="Calories", 
                   marginal="rug"
)
fig.show()

![Women Daily Calories](Images/womenDailyCalories.png)
![Men Daily Calories](Images/menDailyCalories.png)

based on the Image above, our data is somewhat accurate. Most of the data lies between 1200 and 3800. but there are data that's more or less than that. from my believe less might means the user might not use the app 24/7 so the calculation is off. but for the data that's more than 3.8k, it might be because the user is a male, because male do have higher daily calorie burn. So from this we know that our application can catch the data accurately, If I got some free time, we might going to check the data on those outliers (less than 1200 and more than 3800). to make a better visualization later, let's make a manually made cluster for calories per day. For this, I will make 10 groups with 500 gap each.

In [310]:
cal_dict = [f'{(n-1)*500} - {n*500}' for n in range(1, 11)]

In [311]:
calories_clust = ['< 1000'] + [f'{(n-1)*500} - {n*500}' for n in range(2, 10)] + ['> 4500']

def getCaloriesCluster(x):
    global cluster
    if x > 4500:
        return '> 4500'
    if x < 1000:
        return '< 1000'
    else:
        return calories_clust[math.floor(x/500)]

In [312]:
getCaloriesCluster(1600)

'1500 - 2000'

In [313]:
df_dailyActivity['CaloriesCluster'] = df_dailyActivity['Calories'].apply(getCaloriesCluster)

In [291]:
fig = px.scatter(df_dailyActivity, 
                 x="TotalSteps", 
                 y="Calories",
                 trendline="ols",
                 title="Calories vs Total Steps",
                 hover_data=["Id"]
)
fig.data[1].line.color = 'red'
fig.show()

In [292]:
fig = px.scatter(df_dailyActivity, 
                 x="TotalActiveMinutes", 
                 y="Calories",
                 trendline="ols",
                 title="Calories vs Total Active Minutes",
                 hover_data=["Id"]
)
fig.data[1].line.color = 'red'
fig.show()

for calories, there is actually something interesting here, where the Total Steps (or Total Distance) have more effect on calories rather than active minutes. So having a lot of active minutes doesn't mean high calories but high total steps leads to higher calories. But since we've combined all the Highly, Fairly, and Lightly Active minutes, it might be because the lightly active are more dominant, now we're going to find out does each of the stage of active minutes actually represent something.


In [293]:
df_dailyActivity[activeMinutesCol].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VeryActiveMinutes,940.0,21.164894,32.844803,0.0,0.0,4.0,32.0,210.0
FairlyActiveMinutes,940.0,13.564894,19.987404,0.0,0.0,6.0,19.0,143.0
LightlyActiveMinutes,940.0,192.812766,109.1747,0.0,127.0,199.0,264.0,518.0
Calories,940.0,2303.609574,718.166862,0.0,1828.5,2134.0,2793.25,4900.0


In [338]:
print(f'No Lightly Active Minutes : {(df_dailyActivity["LightlyActiveMinutes"] == 0).sum()/len(df_dailyActivity)*100}%')
print(f'No Fairly Active Minutes : {(df_dailyActivity["FairlyActiveMinutes"] == 0).sum()/len(df_dailyActivity)*100}%')
print(f'No Very Active Minutes : {(df_dailyActivity["VeryActiveMinutes"] == 0).sum()/len(df_dailyActivity)*100}%')

No Lightly Active Minutes : 8.936170212765958%
No Fairly Active Minutes : 40.85106382978723%
No Very Active Minutes : 43.51063829787234%


In [339]:
color_seq = [
    'rgb(255,255,255)',
    'rgb(237,248,177)',
    'rgb(199,233,180)',
    'rgb(127,205,187)',
    'rgb(65,182,196)',
    'rgb(34,94,168)',
    'rgb(37,52,148)',
    'rgb(255,0,0)',
    'rgb(255,0,0)',
]

In [360]:
df_dailyActivity

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories,TotalActiveMinutes,CaloriesCluster
0,1503960366,4/12/2016,13162,8.500000,8.500000,0.0,1.88,0.55,6.06,0.00,25,13,328,728,1985,366,1500 - 2000
1,1503960366,4/13/2016,10735,6.970000,6.970000,0.0,1.57,0.69,4.71,0.00,21,19,217,776,1797,257,1500 - 2000
2,1503960366,4/14/2016,10460,6.740000,6.740000,0.0,2.44,0.40,3.91,0.00,30,11,181,1218,1776,222,1500 - 2000
3,1503960366,4/15/2016,9762,6.280000,6.280000,0.0,2.14,1.26,2.83,0.00,29,34,209,726,1745,272,1500 - 2000
4,1503960366,4/16/2016,12669,8.160000,8.160000,0.0,2.71,0.41,5.04,0.00,36,10,221,773,1863,267,1500 - 2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,8877689391,5/8/2016,10686,8.110000,8.110000,0.0,1.08,0.20,6.80,0.00,17,4,245,1174,2847,266,2500 - 3000
936,8877689391,5/9/2016,20226,18.250000,18.250000,0.0,11.10,0.80,6.24,0.05,73,19,217,1131,3710,309,3500 - 4000
937,8877689391,5/10/2016,10733,8.150000,8.150000,0.0,1.35,0.46,6.28,0.00,18,11,224,1187,2832,253,2500 - 3000
938,8877689391,5/11/2016,21420,19.559999,19.559999,0.0,13.22,0.41,5.89,0.00,88,12,213,1127,3832,313,3500 - 4000


In [359]:
fig = px.bar(df_dailyActivity, x="Calories", y="%", color="week", title="Title", barmode='stack')
fig.update_layout(xaxis_title='Name', yaxis=dict(tickformat="%",))
fig.show()

In [340]:
fig = px.scatter(df_dailyActivity, 
                 x="VeryActiveMinutes", 
                 y="Calories",
                 color="CaloriesCluster",
                 category_orders={'CaloriesCluster' : calories_clust},
                 color_discrete_sequence=color_seq,
                 title="Calories vs Total Active Minutes",
                 hover_data=["Id", "Calories"]
)
fig.show()

In [325]:
fig = px.scatter(df_dailyActivity, 
                 x="LightlyActiveMinutes", 
                 y="VeryActiveMinutes",
                 color="CaloriesCluster",
                 category_orders={'CaloriesCluster' : calories_clust},
                 color_discrete_sequence=color_seq,
                 title="Calories vs Total Active Minutes",
                 hover_data=["Id"]
)
fig.show()

From both of those plot above, we know that Very Active Minutes have a huge role on calories, so because of this, for the next comparison between LightlyActiveMinutes and FairlyActiveMinutes, we will filter it out so we dont have any data with VeryActiveMinutes (VeryActiveMinutes = 0)

In [329]:
fig = px.scatter(df_dailyActivity, 
                 x="LightlyActiveMinutes", 
                 y="FairlyActiveMinutes",
                 color="CaloriesCluster",
                 category_orders={'CaloriesCluster' : calories_clust},
                 color_discrete_sequence=color_seq,
                 title="Calories vs Total Active Minutes",
                 hover_data=["Id"]
)
fig.show()

In [328]:
df_noActiveMinutes_DailyActivity = df_dailyActivity.loc[df_dailyActivity["VeryActiveMinutes"] == 0]

fig = px.scatter(df_noActiveMinutes_DailyActivity, 
                 x="LightlyActiveMinutes", 
                 y="FairlyActiveMinutes",
                 color="CaloriesCluster",
                 category_orders={'CaloriesCluster' : calories_clust},
                 color_discrete_sequence=color_seq,
                 title="Calories vs Total Active Minutes",
                 hover_data=["Id"]
)
fig.show()

In [335]:
df_dailyActivity['VeryActiveMinutes'].value_counts()

0      409
1       23
2       18
3       16
8       15
      ... 
100      1
105      1
106      1
107      1
210      1
Name: VeryActiveMinutes, Length: 122, dtype: int64

In [None]:
It seems like Fairly Active Minutes is mostly 0

In [None]:
coba plot data lightly vs fairly tanpa ada data yang very

In [None]:
# intensities
df_minuteIntensitiesNarrow

# step
df_minuteStepsNarrow

# calories
df_minuteCaloriesNarrow

# sleep
df_minuteSleep

# heart rate
df_heartrate_seconds