In [1]:
import pandas as pd
import numpy as np
import os


In [2]:
os.makedirs('processed_data', exist_ok=True)
os.makedirs('visualizations', exist_ok=True)

In [3]:
daily_activity = pd.read_csv('dailyActivity_merged.csv')
sleep_day = pd.read_csv('sleepDay_merged.csv')
heart_rate = pd.read_csv('heartrate_seconds_merged.csv')

In [4]:
print("Missing values in daily_activity:\n", daily_activity.isnull().sum())
print("\nMissing values in sleep_day:\n", sleep_day.isnull().sum())
print("\nMissing values in heart_rate:\n", heart_rate.isnull().sum())

Missing values in daily_activity:
 Id                          0
ActivityDate                0
TotalSteps                  0
TotalDistance               0
TrackerDistance             0
LoggedActivitiesDistance    0
VeryActiveDistance          0
ModeratelyActiveDistance    0
LightActiveDistance         0
SedentaryActiveDistance     0
VeryActiveMinutes           0
FairlyActiveMinutes         0
LightlyActiveMinutes        0
SedentaryMinutes            0
Calories                    0
dtype: int64

Missing values in sleep_day:
 Id                    0
SleepDay              0
TotalSleepRecords     0
TotalMinutesAsleep    0
TotalTimeInBed        0
dtype: int64

Missing values in heart_rate:
 Id       0
Time     0
Value    0
dtype: int64


In [5]:
print(f"\nDuplicates in daily_activity: {daily_activity.duplicated().sum()}")
print(f"Duplicates in sleep_day: {sleep_day.duplicated().sum()}")
print(f"Duplicates in heart_rate: {heart_rate.duplicated().sum()}")


Duplicates in daily_activity: 0
Duplicates in sleep_day: 3
Duplicates in heart_rate: 0


In [6]:
sleep_day = sleep_day.drop_duplicates()

In [7]:
def detect_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers

In [8]:
outliers_steps = detect_outliers(daily_activity, 'TotalSteps')
print(f"\nNumber of outliers in TotalSteps: {len(outliers_steps)}")
daily_activity = daily_activity[~daily_activity.index.isin(outliers_steps.index)]


Number of outliers in TotalSteps: 12


In [9]:
daily_activity['ActivityDate'] = pd.to_datetime(daily_activity['ActivityDate'])
sleep_day['SleepDay'] = pd.to_datetime(sleep_day['SleepDay'])
heart_rate['Time'] = pd.to_datetime(heart_rate['Time'])

  sleep_day['SleepDay'] = pd.to_datetime(sleep_day['SleepDay'])


In [10]:
daily_activity['ActivityDate'] = pd.to_datetime(daily_activity['ActivityDate'], format='%m/%d/%Y')
sleep_day['SleepDay'] = pd.to_datetime(sleep_day['SleepDay'], format='%m/%d/%Y %I:%M:%S %p')
heart_rate['Time'] = pd.to_datetime(heart_rate['Time'], format='%m/%d/%Y %I:%M:%S %p')

In [11]:
daily_activity['DayOfWeek'] = daily_activity['ActivityDate'].dt.day_name()
daily_activity['IsWeekend'] = daily_activity['DayOfWeek'].isin(['Saturday', 'Sunday'])
daily_activity['ActivityIntensityRatio'] = daily_activity['VeryActiveMinutes'] / (daily_activity['FairlyActiveMinutes'] + daily_activity['LightlyActiveMinutes'] + 1)

In [12]:
merged_data = pd.merge(daily_activity, sleep_day, left_on=['Id', 'ActivityDate'], right_on=['Id', 'SleepDay'], how='left')

In [13]:
heart_rate_daily = heart_rate.groupby(['Id', heart_rate['Time'].dt.date])['Value'].mean().reset_index()
heart_rate_daily.columns = ['Id', 'Date', 'AvgHeartRate']

In [14]:
merged_data = pd.merge(merged_data, heart_rate_daily, left_on=['Id', 'ActivityDate'], right_on=['Id', 'Date'], how='left')

ValueError: You are trying to merge on datetime64[ns] and object columns for key 'ActivityDate'. If you wish to proceed you should use pd.concat

In [15]:
print(merged_data['ActivityDate'].dtype)
print(heart_rate_daily['Date'].dtype)

datetime64[ns]
object


In [16]:
heart_rate_daily['Date'] = pd.to_datetime(heart_rate_daily['Date'])

In [17]:
merged_data = pd.merge(merged_data, heart_rate_daily, left_on=['Id', 'ActivityDate'], right_on=['Id', 'Date'], how='left')

In [18]:
merged_data['TotalMinutesAsleep'].fillna(merged_data['TotalMinutesAsleep'].mean(), inplace=True)
merged_data['AvgHeartRate'].fillna(merged_data['AvgHeartRate'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_data['TotalMinutesAsleep'].fillna(merged_data['TotalMinutesAsleep'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_data['AvgHeartRate'].fillna(merged_data['AvgHeartRate'].mean(), inplace=True)


In [19]:
merged_data = merged_data.fillna({
    'TotalMinutesAsleep': merged_data['TotalMinutesAsleep'].mean(),
    'AvgHeartRate': merged_data['AvgHeartRate'].mean()
})

In [20]:
merged_data.to_csv('merged_fitness_data.csv', index=False)

In [21]:
import matplotlib.pyplot as plt
import seaborn as sns

ModuleNotFoundError: No module named 'matplotlib'

In [1]:
conda install matplotlib

error: incomplete escape \U at position 28