# Step 1: Project Setup and Data Loading

First, we import the necessary libraries for data manipulation, visualization, and machine learning. We then define the paths to our datasets and load them into pandas DataFrames.

In [None]:
import pandas as pd
import numpy as np
import os
import joblib
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Locate the project folders. The project root is taken to be the parent of
# the current working directory, so this cell expects to be run from a
# sub-folder of the project.
# NOTE(review): presumably a notebooks/ directory -- confirm.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_path = os.path.join(project_root, 'data')
activity_file, sleep_file = (
    os.path.join(data_path, csv_name)
    for csv_name in ('dailyActivity_merged.csv', 'sleepDay_merged.csv')
)

# Read each CSV into its own DataFrame (activity first, then sleep).
df_activity = pd.read_csv(activity_file)
df_sleep = pd.read_csv(sleep_file)

# Preview the first rows of both tables.
print("Activity Data Head:")
display(df_activity.head())

print("\nSleep Data Head:")
display(df_sleep.head())

# Step 2: Data Cleaning and Merging

To analyze activity and sleep together, we need a single dataset. Here, we'll perform the following cleaning steps:
1.  **Convert date columns** to a consistent `datetime` format.
2.  **Merge** the two DataFrames on `Id` and the date.
3.  **Drop** unnecessary columns.
4.  **Remove** any duplicate entries.

In [None]:
# Activity dates are stored as month/day/year strings; parse them to datetimes.
df_activity['ActivityDate'] = pd.to_datetime(
    df_activity['ActivityDate'], format='%m/%d/%Y'
)

# Sleep timestamps also carry a 12-hour clock time. Parse them, then derive an
# `ActivityDate` column holding only the calendar day (as a datetime) so it
# can be used as a join key against the activity table.
df_sleep['SleepDay'] = pd.to_datetime(
    df_sleep['SleepDay'], format='%m/%d/%Y %I:%M:%S %p'
)
df_sleep['ActivityDate'] = pd.to_datetime(df_sleep['SleepDay'].dt.date)

# Keep only the (Id, day) pairs present in both tables, discard the columns
# that are not needed downstream, and remove fully duplicated rows.
df_merged = (
    df_activity
    .merge(df_sleep, how='inner', on=['Id', 'ActivityDate'])
    .drop(columns=['TrackerDistance', 'LoggedActivitiesDistance', 'SleepDay'])
    .drop_duplicates()
)

# Summarise the result.
print("Merged and Cleaned Data Info:")
df_merged.info()

print("\nFirst 5 rows of merged data:")
display(df_merged.head())

# Step 3: Exploratory Data Analysis (EDA)

Now we explore the cleaned data to find initial insights and relationships. We will visualize:
1.  A **correlation heatmap** to see how different variables relate to each other.
2.  A **scatter plot** to investigate the relationship between `TotalSteps` and `Calories` burned.
3.  A **scatter plot** to see the connection between `VeryActiveMinutes` and `TotalMinutesAsleep`.

In [None]:
# 1. Correlation Heatmap
# `Id` is the participant key the two tables were merged on, not a
# measurement, so it is excluded: correlating an arbitrary identifier with the
# metrics only adds a meaningless row/column to the matrix.
# `errors='ignore'` keeps the cell working if `Id` has already been removed.
corr = df_merged.drop(columns=['Id'], errors='ignore').corr(numeric_only=True)
heatmap = go.Figure(data=go.Heatmap(
    z=corr.values,
    x=corr.columns,
    y=corr.index,
    colorscale='Viridis',
    # Pin the colour scale to the full correlation range so that colours are
    # comparable regardless of the values actually present.
    zmin=-1,
    zmax=1
))
heatmap.update_layout(title='Correlation Matrix of Fitness Data')
heatmap.show()

# 2. Steps vs. Calories
# Each point is one merged row, coloured by calories.
# NOTE: trendline='ols' relies on the optional `statsmodels` package.
scatter_steps_calories = px.scatter(
    df_merged,
    x='TotalSteps',
    y='Calories',
    trendline='ols',
    color='Calories',
    title='Total Steps vs. Calories Burned',
    labels={'TotalSteps': 'Total Steps', 'Calories': 'Calories Burned'}
)
scatter_steps_calories.show()

# 3. Sleep vs. Activity
# Each point is one merged row, coloured by minutes asleep on the reversed
# Cividis scale.
scatter_sleep_activity = px.scatter(
    df_merged,
    x='VeryActiveMinutes',
    y='TotalMinutesAsleep',
    color='TotalMinutesAsleep',
    color_continuous_scale='Cividis_r',
    title='Sleep Duration vs. Very Active Minutes',
    labels={'VeryActiveMinutes': 'Very Active Minutes', 'TotalMinutesAsleep': 'Minutes Asleep'}
)
scatter_sleep_activity.show()

# Step 4: Feature Engineering & Preprocessing

Before training, we must prepare our data.
1.  **Select Features:** Choose the columns (`features`) that will be used to predict the `target`.
2.  **Split Data:** Divide the data into training and testing sets.
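3.  **Scale Features:** Use `StandardScaler` to standardize our features (zero mean, unit variance). This matters for the regularized linear models (Ridge and Lasso), whose penalty terms depend on the magnitude of each coefficient and therefore on the scale of each feature. Tree-based models such as Random Forest and Gradient Boosting are largely unaffected by scaling.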

In [None]:
# Response column and the predictors used to estimate it.
target = 'Calories'
features = [
    'TotalSteps',
    'TotalDistance',
    'VeryActiveMinutes',
    'FairlyActiveMinutes',
    'LightlyActiveMinutes',
    'SedentaryMinutes',
    'TotalMinutesAsleep',
]

X, y = df_merged[features], df_merged[target]

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
# NOTE(review): the split is row-wise and random, so rows sharing an `Id` can
# presumably land in both sets -- confirm whether a grouped split is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both sets so no test-set statistics leak into
# training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training data shape: {X_train_scaled.shape}")
print(f"Testing data shape: {X_test_scaled.shape}")

# Step 5: Model Training & Evaluation

Here's where we train our 5 different regression models. For each model, we will:
1.  **Train** it on the scaled training data.
2.  **Make predictions** on the scaled test data.
3.  **Evaluate** its performance using Mean Absolute Error (MAE), Mean Squared Error (MSE), and R-squared.

Finally, we'll visualize the R-squared scores to easily compare the models.

In [None]:
# Candidate regressors, all with default hyper-parameters. The two tree
# ensembles get a fixed seed so their results are reproducible.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# One dict of metrics per model, collected in the order the models are defined.
results = []

for name, model in models.items():
    # Fit on the scaled training data, then score on the held-out test data.
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({
        'Model': name,
        'MAE': mae,
        'MSE': mse,
        'R-squared': r2
    })

results_df = pd.DataFrame(results)

print("Model Performance Comparison:")
display(results_df)

# Visualize model performance, best model first.
fig = px.bar(
    results_df.sort_values('R-squared', ascending=False),
    x='Model',
    y='R-squared',
    color='Model',
    title='Comparison of Model Performance (R-squared)',
    labels={'R-squared': 'R-squared Score'}
)
# R-squared can be negative (a model that does worse than always predicting
# the mean). A hard-coded [0, 1] axis would clip such a bar out of view, so the
# lower bound is extended only when a negative score is present; otherwise the
# axis stays at [0, 1].
r2_floor = min(0.0, float(results_df['R-squared'].min()))
fig.update_layout(yaxis=dict(range=[r2_floor, 1]))
fig.show()

# Step 6: Saving the Best Model

Based on the R-squared value on the test set, we identify the best-performing model. We then save this model and the fitted scaler to disk using `joblib`. These files (`best_model.pkl` and `scaler.pkl`) will be loaded by our Streamlit application to make live predictions. We also export the cleaned, merged dataset to `data/cleaned_fitness_data.csv` for the app.

In [None]:
# Select the fitted estimator whose row in the results table has the highest
# R-squared score.
best_model_name = results_df.loc[results_df['R-squared'].idxmax(), 'Model']
best_model = models[best_model_name]

# Destinations: the two pickles go in the project root, the cleaned dataset
# goes next to the raw CSVs in the data folder.
model_output_path, scaler_output_path = (
    os.path.join(project_root, filename)
    for filename in ('best_model.pkl', 'scaler.pkl')
)
cleaned_data_output_path = os.path.join(data_path, 'cleaned_fitness_data.csv')

# Persist the estimator together with the scaler it was trained with, so that
# new inputs can be transformed the same way before prediction.
joblib.dump(best_model, model_output_path)
joblib.dump(scaler, scaler_output_path)

# Export the merged data for the app, without the DataFrame index.
df_merged.to_csv(cleaned_data_output_path, index=False)

print(f"Best model '{best_model_name}' saved to: {model_output_path}")
print(f"Scaler saved to: {scaler_output_path}")
print(f"Cleaned data saved to: {cleaned_data_output_path}")