## Import Packages
Let's start by loading the libraries we would need for processing and visualizing the data.

In [1]:
import numpy as np # data arrays
import pandas as pd # data structure and data analysis
import matplotlib as plt # data visualization
import seaborn as sns # data visualisation
import datetime as dt # date time
import os # file handling

## Import data
Next, we'll load the fitbit datasets we're interested in exploring for insights.

In [14]:
# File path to all csv
dir_path = '../Data/'

# Read all CSVs into DataFrames
daily_activity = pd.read_csv(f'{dir_path}dailyActivity_merged.csv')
hourly_steps = pd.read_csv(f'{dir_path}hourlySteps_merged.csv')
hourly_calories = pd.read_csv(f'{dir_path}hourlyCalories_merged.csv')
sleepday = pd.read_csv(f'{dir_path}sleepDay_merged.csv')
daily_calories = pd.read_csv(f'{dir_path}dailyCalories_merged.csv')
daily_intensity = pd.read_csv(f'{dir_path}dailyCalories_merged.csv')
weight_log = pd.read_csv(f'{dir_path}weightLogInfo_merged.csv')
daily_steps = pd.read_csv(f'{dir_path}dailySteps_merged.csv')
heartrate_secs = pd.read_csv(f"{dir_path}heartrate_seconds_merged.csv")

# Dictionary containing DataFrame variables and their names
dfs = {
    'daily_activity': daily_activity,
    'hourly_steps': hourly_steps,
    'hourly_calories': hourly_calories,
    'sleepday': sleepday,
    'daily_calories': daily_calories,
    'daily_intensity': daily_intensity,
    'weight_log': weight_log,
    'daily_steps': daily_steps,
    'heartrate_secs': heartrate_secs
}

## EDA
Let's explore our data by viewing some statistical information about them.
We'll start by pulling the first 5 rows of each dataset.

In [39]:
# Display each DataFrame along with its name
for name, df in dfs.items():
    print(f"DataFrame Name: {name}")
    display(df.head())
    print("_"*50)

DataFrame Name: daily_activity


Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
0,1503960366,4/12/2016,13162,8.5,8.5,0.0,1.88,0.55,6.06,0.0,25,13,328,728,1985
1,1503960366,4/13/2016,10735,6.97,6.97,0.0,1.57,0.69,4.71,0.0,21,19,217,776,1797
2,1503960366,4/14/2016,10460,6.74,6.74,0.0,2.44,0.4,3.91,0.0,30,11,181,1218,1776
3,1503960366,4/15/2016,9762,6.28,6.28,0.0,2.14,1.26,2.83,0.0,29,34,209,726,1745
4,1503960366,4/16/2016,12669,8.16,8.16,0.0,2.71,0.41,5.04,0.0,36,10,221,773,1863


__________________________________________________
DataFrame Name: hourly_steps


Unnamed: 0,Id,ActivityHour,StepTotal
0,1503960366,4/12/2016 12:00:00 AM,373
1,1503960366,4/12/2016 1:00:00 AM,160
2,1503960366,4/12/2016 2:00:00 AM,151
3,1503960366,4/12/2016 3:00:00 AM,0
4,1503960366,4/12/2016 4:00:00 AM,0


__________________________________________________
DataFrame Name: hourly_calories


Unnamed: 0,Id,ActivityHour,Calories
0,1503960366,4/12/2016 12:00:00 AM,81
1,1503960366,4/12/2016 1:00:00 AM,61
2,1503960366,4/12/2016 2:00:00 AM,59
3,1503960366,4/12/2016 3:00:00 AM,47
4,1503960366,4/12/2016 4:00:00 AM,48


__________________________________________________
DataFrame Name: sleepday


Unnamed: 0,Id,SleepDay,TotalSleepRecords,TotalMinutesAsleep,TotalTimeInBed
0,1503960366,4/12/2016 12:00:00 AM,1,327,346
1,1503960366,4/13/2016 12:00:00 AM,2,384,407
2,1503960366,4/15/2016 12:00:00 AM,1,412,442
3,1503960366,4/16/2016 12:00:00 AM,2,340,367
4,1503960366,4/17/2016 12:00:00 AM,1,700,712


__________________________________________________
DataFrame Name: daily_calories


Unnamed: 0,Id,ActivityDay,Calories
0,1503960366,4/12/2016,1985
1,1503960366,4/13/2016,1797
2,1503960366,4/14/2016,1776
3,1503960366,4/15/2016,1745
4,1503960366,4/16/2016,1863


__________________________________________________
DataFrame Name: daily_intensity


Unnamed: 0,Id,ActivityDay,Calories
0,1503960366,4/12/2016,1985
1,1503960366,4/13/2016,1797
2,1503960366,4/14/2016,1776
3,1503960366,4/15/2016,1745
4,1503960366,4/16/2016,1863


__________________________________________________
DataFrame Name: weight_log


Unnamed: 0,Id,Date,WeightKg,WeightPounds,Fat,BMI,IsManualReport,LogId
0,1503960366,5/2/2016 11:59:59 PM,52.599998,115.963147,22.0,22.65,True,1462233599000
1,1503960366,5/3/2016 11:59:59 PM,52.599998,115.963147,,22.65,True,1462319999000
2,1927972279,4/13/2016 1:08:52 AM,133.5,294.31712,,47.540001,False,1460509732000
3,2873212765,4/21/2016 11:59:59 PM,56.700001,125.002104,,21.450001,True,1461283199000
4,2873212765,5/12/2016 11:59:59 PM,57.299999,126.324875,,21.690001,True,1463097599000


__________________________________________________
DataFrame Name: daily_steps


Unnamed: 0,Id,ActivityDay,StepTotal
0,1503960366,4/12/2016,13162
1,1503960366,4/13/2016,10735
2,1503960366,4/14/2016,10460
3,1503960366,4/15/2016,9762
4,1503960366,4/16/2016,12669


__________________________________________________
DataFrame Name: heartrate_secs


Unnamed: 0,Id,Time,Value
0,2022484408,4/12/2016 7:21:00 AM,97
1,2022484408,4/12/2016 7:21:05 AM,102
2,2022484408,4/12/2016 7:21:10 AM,105
3,2022484408,4/12/2016 7:21:20 AM,103
4,2022484408,4/12/2016 7:21:25 AM,101


__________________________________________________


Let's view the columns and their data types in each dataset.

In [38]:
for name, df in dfs.items():
    print(f"DataFrame Name: {name}")
    print(df.info())
    print("_"*50)

DataFrame Name: daily_activity
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Id                        940 non-null    int64  
 1   ActivityDate              940 non-null    object 
 2   TotalSteps                940 non-null    int64  
 3   TotalDistance             940 non-null    float64
 4   TrackerDistance           940 non-null    float64
 5   LoggedActivitiesDistance  940 non-null    float64
 6   VeryActiveDistance        940 non-null    float64
 7   ModeratelyActiveDistance  940 non-null    float64
 8   LightActiveDistance       940 non-null    float64
 9   SedentaryActiveDistance   940 non-null    float64
 10  VeryActiveMinutes         940 non-null    int64  
 11  FairlyActiveMinutes       940 non-null    int64  
 12  LightlyActiveMinutes      940 non-null    int64  
 13  SedentaryMinutes          940 non-

The provided code iterates through each DataFrame in the `dfs` dictionary and prints various information about each DataFrame:

- **Shape:** Displays the number of rows and columns in the DataFrame.
- **Unique ids:** Shows the count of unique IDs present in the 'Id' column of the DataFrame.
- **Nulls:** Prints the total count of null values in the entire DataFrame.
- **Duplicates:** Displays the count of duplicated rows in the DataFrame.

In [50]:
for name, df in dfs.items():
    print(f"DataFrame Name: {name}")
    print("Shape: ",df.shape)
    print("Unique ids: ",df.Id.nunique())
    print("Nulls: ",df.isna().values.sum())
    print("Duplicates: ",df.duplicated().sum())
    print("_"*50)

DataFrame Name: daily_activity
Shape:  (940, 15)
Unique ids:  33
Nulls:  0
Duplicates:  0
__________________________________________________
DataFrame Name: hourly_steps
Shape:  (22099, 3)
Unique ids:  33
Nulls:  0
Duplicates:  0
__________________________________________________
DataFrame Name: hourly_calories
Shape:  (22099, 3)
Unique ids:  33
Nulls:  0
Duplicates:  0
__________________________________________________
DataFrame Name: sleepday
Shape:  (413, 5)
Unique ids:  24
Nulls:  0
Duplicates:  3
__________________________________________________
DataFrame Name: daily_calories
Shape:  (940, 3)
Unique ids:  33
Nulls:  0
Duplicates:  0
__________________________________________________
DataFrame Name: daily_intensity
Shape:  (940, 3)
Unique ids:  33
Nulls:  0
Duplicates:  0
__________________________________________________
DataFrame Name: weight_log
Shape:  (67, 8)
Unique ids:  8
Nulls:  65
Duplicates:  0
__________________________________________________
DataFrame Name: daily_step