In [1]:
%pip install -q -r ../requirements.txt


Note: you may need to restart the kernel to use updated packages.


# Step 1: Project Setup and Data Loading

First, we import the necessary libraries for data manipulation, visualization, and machine learning. We then define the paths to our datasets and load them into pandas DataFrames.

In [2]:
import pandas as pd
import numpy as np
import os
import joblib
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Locate the datasets: the project root is the parent of the current working
# directory, and both CSV files are read from its `data` folder.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_path = os.path.join(project_root, 'data')
activity_file, sleep_file = (
    os.path.join(data_path, csv_name)
    for csv_name in ('dailyActivity_merged.csv', 'sleepDay_merged.csv')
)

# Read each CSV into its own DataFrame.
df_activity, df_sleep = (
    pd.read_csv(csv_path) for csv_path in (activity_file, sleep_file)
)

# Preview the first rows of each frame under its own heading.
for heading, frame in (
    ("Activity Data Head:", df_activity),
    ("\nSleep Data Head:", df_sleep),
):
    print(heading)
    display(frame.head())

Activity Data Head:


Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
0,1503960366,4/12/2016,13162,8.5,8.5,0.0,1.88,0.55,6.06,0.0,25,13,328,728,1985
1,1503960366,4/13/2016,10735,6.97,6.97,0.0,1.57,0.69,4.71,0.0,21,19,217,776,1797
2,1503960366,4/14/2016,10460,6.74,6.74,0.0,2.44,0.4,3.91,0.0,30,11,181,1218,1776
3,1503960366,4/15/2016,9762,6.28,6.28,0.0,2.14,1.26,2.83,0.0,29,34,209,726,1745
4,1503960366,4/16/2016,12669,8.16,8.16,0.0,2.71,0.41,5.04,0.0,36,10,221,773,1863



Sleep Data Head:


Unnamed: 0,Id,SleepDay,TotalSleepRecords,TotalMinutesAsleep,TotalTimeInBed
0,1503960366,4/12/2016 12:00:00 AM,1,327,346
1,1503960366,4/13/2016 12:00:00 AM,2,384,407
2,1503960366,4/15/2016 12:00:00 AM,1,412,442
3,1503960366,4/16/2016 12:00:00 AM,2,340,367
4,1503960366,4/17/2016 12:00:00 AM,1,700,712


# Step 2: Data Cleaning and Merging

To analyze activity and sleep together, we need a single dataset. Here, we'll perform the following cleaning steps:
1.  **Convert date columns** to a consistent `datetime` format.
2.  **Merge** the two DataFrames on `Id` and the date.
3.  **Drop** unnecessary columns.
4.  **Remove** any duplicate entries.

In [3]:
# Activity dates arrive as month/day/year strings; parse them to datetimes.
df_activity['ActivityDate'] = pd.to_datetime(df_activity['ActivityDate'], format='%m/%d/%Y')

# Sleep timestamps carry a 12-hour clock time. Parse them, then derive a
# midnight-normalized `ActivityDate` column to serve as the merge key.
df_sleep['SleepDay'] = pd.to_datetime(df_sleep['SleepDay'], format='%m/%d/%Y %I:%M:%S %p')
df_sleep['ActivityDate'] = df_sleep['SleepDay'].dt.normalize()

# Inner-join on user and day, discard the columns the analysis does not use,
# and remove fully duplicated rows.
df_merged = (
    df_activity
    .merge(df_sleep, on=['Id', 'ActivityDate'], how='inner')
    .drop(columns=['TrackerDistance', 'LoggedActivitiesDistance', 'SleepDay'])
    .drop_duplicates()
)

print("Merged and Cleaned Data Info:")
df_merged.info()

print("\nFirst 5 rows of merged data:")
display(df_merged.head())

Merged and Cleaned Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 410 entries, 0 to 412
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Id                        410 non-null    int64         
 1   ActivityDate              410 non-null    datetime64[ns]
 2   TotalSteps                410 non-null    int64         
 3   TotalDistance             410 non-null    float64       
 4   VeryActiveDistance        410 non-null    float64       
 5   ModeratelyActiveDistance  410 non-null    float64       
 6   LightActiveDistance       410 non-null    float64       
 7   SedentaryActiveDistance   410 non-null    float64       
 8   VeryActiveMinutes         410 non-null    int64         
 9   FairlyActiveMinutes       410 non-null    int64         
 10  LightlyActiveMinutes      410 non-null    int64         
 11  SedentaryMinutes          410 non-null    int64         
 1

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories,TotalSleepRecords,TotalMinutesAsleep,TotalTimeInBed
0,1503960366,2016-04-12,13162,8.5,1.88,0.55,6.06,0.0,25,13,328,728,1985,1,327,346
1,1503960366,2016-04-13,10735,6.97,1.57,0.69,4.71,0.0,21,19,217,776,1797,2,384,407
2,1503960366,2016-04-15,9762,6.28,2.14,1.26,2.83,0.0,29,34,209,726,1745,1,412,442
3,1503960366,2016-04-16,12669,8.16,2.71,0.41,5.04,0.0,36,10,221,773,1863,2,340,367
4,1503960366,2016-04-17,9705,6.48,3.19,0.78,2.51,0.0,38,20,164,539,1728,1,700,712


# Step 3: Exploratory Data Analysis (EDA)

Now we explore the cleaned data to find initial insights and relationships. We will visualize:
1.  A **correlation heatmap** to see how different variables relate to each other.
2.  A **scatter plot** to investigate the relationship between `TotalSteps` and `Calories` burned.
3.  A **scatter plot** to see the connection between `VeryActiveMinutes` and `TotalMinutesAsleep`.

In [4]:
# Enable Plotly trendlines only if statsmodels can actually be imported.
# `trendline_kind` is passed to `px.scatter(trendline=...)` further down:
# 'ols' when statsmodels is usable, None (no trendline) otherwise.
try:
    import statsmodels.api as sm  # noqa: F401
except ImportError:
    # The package is missing from this environment.
    trendline_kind = None
    _trendline_msg = "statsmodels not installed; Plotly trendline disabled. Run `%pip install statsmodels` to enable OLS trendlines."
except Exception as exc:
    # The package is present but broken (for example a binary mismatch).
    # Stay best-effort, but report the real cause instead of claiming that
    # statsmodels is not installed.
    trendline_kind = None
    _trendline_msg = (
        f"statsmodels failed to import ({type(exc).__name__}: {exc}); "
        "Plotly trendline disabled."
    )
else:
    trendline_kind = 'ols'
    _trendline_msg = None

if _trendline_msg:
    print(_trendline_msg)


In [5]:
# 1. Correlation Heatmap
# `Id` is an identifier key (it is one of the merge keys), not a measurement,
# so it is left out of the correlation matrix. errors='ignore' keeps the cell
# working if the column is not present.
corr = df_merged.drop(columns=['Id'], errors='ignore').corr(numeric_only=True)
heatmap = go.Figure(data=go.Heatmap(
    z=corr.values,
    x=corr.columns,
    y=corr.columns,
    colorscale='Viridis',
    # Pin the colour scale to the full correlation range so colours are
    # comparable regardless of the values present.
    zmin=-1,
    zmax=1
))
heatmap.update_layout(title='Correlation Matrix of Fitness Data')
heatmap.show()

# 2. Steps vs. Calories
# `trendline_kind` is 'ols' or None, set by the statsmodels availability check.
scatter_steps_calories = px.scatter(
    df_merged,
    x='TotalSteps',
    y='Calories',
    trendline=trendline_kind,
    color='Calories',
    title='Total Steps vs. Calories Burned',
    labels={'TotalSteps': 'Total Steps', 'Calories': 'Calories Burned'}
)
scatter_steps_calories.show()

# 3. Sleep vs. Activity
scatter_sleep_activity = px.scatter(
    df_merged,
    x='VeryActiveMinutes',
    y='TotalMinutesAsleep',
    color='TotalMinutesAsleep',
    color_continuous_scale='Cividis_r',
    title='Sleep Duration vs. Very Active Minutes',
    labels={'VeryActiveMinutes': 'Very Active Minutes', 'TotalMinutesAsleep': 'Minutes Asleep'}
)
scatter_sleep_activity.show()

# Step 4: Feature Engineering & Preprocessing

Before training, we must prepare our data.
1.  **Select Features:** Choose the columns (`features`) that will be used to predict the `target`.
2.  **Split Data:** Divide the data into training and testing sets.
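3.  **Scale Features:** Use `StandardScaler` to standardize our features (zero mean, unit variance). This matters for regularized linear models like Ridge and Lasso regression, whose penalty depends on the magnitude of each coefficient and therefore on each feature's scale; tree-based models such as Random Forest and Gradient Boosting are largely unaffected by it.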

In [6]:
# Predictor columns and the column we want to predict.
features = [
    'TotalSteps',
    'TotalDistance',
    'VeryActiveMinutes',
    'FairlyActiveMinutes',
    'LightlyActiveMinutes',
    'SedentaryMinutes',
    'TotalMinutesAsleep',
]
target = 'Calories'

X, y = df_merged[features], df_merged[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

print(f"Training data shape: {X_train_scaled.shape}")
print(f"Testing data shape: {X_test_scaled.shape}")

Training data shape: (328, 7)
Testing data shape: (82, 7)


# Step 5: Model Training & Evaluation

Here's where we train our 5 different regression models. For each model, we will:
1.  **Train** it on the scaled training data.
2.  **Make predictions** on the scaled test data.
3.  **Evaluate** its performance using Mean Absolute Error (MAE), Mean Squared Error (MSE), and R-squared.

Finally, we'll visualize the R-squared scores to easily compare the models.

In [7]:
# Candidate regressors, keyed by the display name used in the results table.
# The fitted estimators stay in this dict so the best one can be looked up by
# name afterwards.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

results = []

for name, model in models.items():
    # Fit on the scaled training features, then score on the held-out split.
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({
        'Model': name,
        'MAE': mae,
        'MSE': mse,
        'R-squared': r2
    })

# One row per model, built in a single step from the collected records.
results_df = pd.DataFrame(results)

print("Model Performance Comparison:")
display(results_df)

# Visualize model performance
fig = px.bar(
    results_df.sort_values('R-squared', ascending=False),
    x='Model',
    y='R-squared',
    color='Model',
    title='Comparison of Model Performance (R-squared)',
    labels={'R-squared': 'R-squared Score'}
)
# R-squared can be negative for a model that does worse than predicting the
# mean, so extend the axis downwards when needed instead of clipping at 0.
# When every score is non-negative the axis stays at [0, 1].
r2_floor = min(0.0, float(results_df['R-squared'].min()))
fig.update_layout(yaxis=dict(range=[r2_floor, 1]))
fig.show()

Model Performance Comparison:


Unnamed: 0,Model,MAE,MSE,R-squared
0,Linear Regression,253.280243,115211.656255,0.705812
1,Ridge Regression,272.439028,124579.390975,0.681892
2,Lasso Regression,255.357007,116483.11108,0.702566
3,Random Forest,292.989634,143805.590516,0.632799
4,Gradient Boosting,298.646313,148538.596838,0.620714


# Step 6: Saving the Best Model

Based on the R-squared value, we identify the best-performing model. We then save this model and the scaler to disk using `joblib`. These files (`best_model.pkl` and `scaler.pkl`) will be loaded by our Streamlit application to make live predictions.

In [8]:
# Pick the model whose row has the highest R-squared in the results table.
best_row_label = results_df['R-squared'].idxmax()
best_model_name = results_df.loc[best_row_label, 'Model']
best_model = models[best_model_name]

# Destinations: model and scaler go in the project root, the cleaned dataset
# goes next to the raw data.
model_output_path = os.path.join(project_root, 'best_model.pkl')
scaler_output_path = os.path.join(project_root, 'scaler.pkl')
cleaned_data_output_path = os.path.join(data_path, 'cleaned_fitness_data.csv')

# Serialize the fitted estimator and the fitted scaler with joblib.
for artifact, destination in (
    (best_model, model_output_path),
    (scaler, scaler_output_path),
):
    joblib.dump(artifact, destination)

# Write the merged, cleaned frame without its index column.
df_merged.to_csv(cleaned_data_output_path, index=False)

print(f"Best model '{best_model_name}' saved to: {model_output_path}")
print(f"Scaler saved to: {scaler_output_path}")
print(f"Cleaned data saved to: {cleaned_data_output_path}")

Best model 'Linear Regression' saved to: d:\Python Projects\FitTrack-Data-Analysis\best_model.pkl
Scaler saved to: d:\Python Projects\FitTrack-Data-Analysis\scaler.pkl
Cleaned data saved to: d:\Python Projects\FitTrack-Data-Analysis\data\cleaned_fitness_data.csv
