In [11]:
# Importing packages
import re
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

warnings.filterwarnings("ignore")

In [2]:
# Normalizes column labels to lowercase snake_case right after loading.
def standardize_columns(df):
  """Rename the columns of ``df`` to lowercase snake_case, in place.

  Each label is converted to ``str`` and then:
    * leading/trailing whitespace is dropped and inner whitespace runs
      become a single underscore,
    * parentheses are removed,
    * ``/`` is spelled out as ``_per_``,
    * the result is lowercased.

  The frame passed in is modified (its ``columns`` are reassigned) and the
  same object is returned.
  """
  # One-pass character mapping: drop "(" and ")", expand "/" to "_per_".
  char_map = str.maketrans({"(": None, ")": None, "/": "_per_"})

  # str.split() with no argument trims both ends and splits on any
  # whitespace run, so joining with "_" covers trim + collapse + snake-case.
  df.columns = [
    "_".join(str(label).split()).translate(char_map).lower()
    for label in df.columns
  ]
  return df

In [3]:
# Caps (winsorizes) extreme values of one column using the IQR rule.
def cap_outliers_iqr(df, col, k=1.5):
  """Clip ``df[col]`` to ``[Q1 - k*IQR, Q3 + k*IQR]`` and return ``df``.

  Values outside the fences are not dropped; they are replaced by the
  nearest fence via ``Series.clip``. The column is overwritten on the frame
  that was passed in, and that same frame is returned.

  Nothing is changed when ``col`` is not a column of ``df`` or when the
  column has fewer than 10 non-missing values. Quartiles are computed on
  the non-missing values only.
  """
  if col in df.columns:
    observed = df[col].dropna()

    if len(observed) >= 10:
      lower_quartile = observed.quantile(0.25)
      upper_quartile = observed.quantile(0.75)
      spread = upper_quartile - lower_quartile   # interquartile range

      # Fences: anything beyond them is pulled back onto the fence.
      floor_value = lower_quartile - k * spread
      ceiling_value = upper_quartile + k * spread

      df[col] = df[col].clip(floor_value, ceiling_value)

  return df

In [4]:
# Takes a list of column names (usually from a DataFrame) and:
# Cleans invalid characters
# Standardizes formatting
# Ensures all column names are unique
# Prevents empty or broken names
# This is commonly used before model training, feature engineering, or saving datasets.

def clean_and_make_unique_feature_names(columns):
  """Return sanitized, unique feature names for ``columns``.

  Parameters
  ----------
  columns : iterable
      Original labels; each one is converted with ``str()``.

  Returns
  -------
  list of str
      One name per input label, in the same order. Each name:
        * has every character outside ``[A-Za-z0-9_]`` replaced by ``_``,
        * has runs of underscores collapsed to one,
        * has leading/trailing underscores removed,
        * falls back to ``"feature"`` if nothing is left,
        * gets a ``__<n>`` suffix from its second occurrence onward
          (``n`` is the running occurrence count, starting at 2).

  Because underscore runs are collapsed, a cleaned base name never contains
  ``__``; the ``__<n>`` suffix therefore cannot clash with another base name.
  """
  # Plain `re` on the scalar string; no need to build a pandas Series per name.
  invalid_chars = re.compile(r"[^A-Za-z0-9_]")
  underscore_runs = re.compile(r"_+")

  cleaned = []     # final list of cleaned column names
  counts = {}      # occurrences seen so far for each cleaned base name

  for c in columns:
    c = invalid_chars.sub("_", str(c))            # Replace invalid characters
    c = underscore_runs.sub("_", c).strip("_")    # Collapse runs, trim both ends

    if c == "":
      c = "feature"                               # Handle empty names

    counts[c] = counts.get(c, 0) + 1
    # First occurrence keeps the base name; later ones get a numbered suffix.
    cleaned.append(c if counts[c] == 1 else f"{c}__{counts[c]}")
  return cleaned

In [5]:
# It estimates a prediction interval for a regression model using bootstrap resampling of residuals.

def bootstrap_prediction_interval(
    model,
    X_train,          # training features used to compute residuals
    y_train,          # true training targets
    X_new,            # new data point for prediction
    n_boot=200,       # number of bootstrap simulations
    alpha=0.10,       # significance level (10% → 90% interval)
    random_state=42
):
    """Residual-bootstrap prediction interval for a single new observation.

    The model's training residuals (actual - predicted) are resampled with
    replacement and added to the point prediction for ``X_new``; the
    ``alpha/2`` and ``1 - alpha/2`` percentiles of those simulated values form
    the interval.

    Parameters
    ----------
    model : object with a ``predict(X)`` method (already fitted).
    X_train, y_train : training features and targets used for the residuals.
        ``y_train`` and the predictions may be 1-D or single-column 2-D; both
        are flattened before subtracting.
    X_new : features for the new observation. Only the first prediction
        returned for it is used.
    n_boot : number of residual draws (must be >= 1).
    alpha : significance level; the interval covers ``1 - alpha``.
    random_state : seed for ``numpy.random.RandomState``.

    Returns
    -------
    (y_new_pred, lower_bound, upper_bound)
        Point prediction as a Python float and the two percentile bounds.

    Raises
    ------
    ValueError
        If ``n_boot < 1`` or the targets and training predictions differ in
        length.

    NOTE(review): residuals come from the training data, so for a model that
    fits the training set very closely the interval may be too narrow —
    consider held-out residuals.
    """
    if n_boot < 1:
        raise ValueError("n_boot must be at least 1")

    # Seeded generator so repeated calls give the same interval.
    rng = np.random.RandomState(random_state)

    # Flatten both sides: an (n, 1) target minus an (n,) prediction would
    # otherwise broadcast to an (n, n) matrix instead of n residuals.
    y_true = np.asarray(y_train, dtype=float).ravel()
    y_fitted = np.asarray(model.predict(X_train), dtype=float).ravel()
    if y_true.shape != y_fitted.shape:
        raise ValueError(
            f"y_train has {y_true.size} values but the model returned "
            f"{y_fitted.size} training predictions"
        )

    # Residuals = actual - predicted
    residuals = y_true - y_fitted

    # Point prediction for the new input (first element, whatever the shape)
    y_new_pred = float(np.ravel(model.predict(X_new))[0])

    # Bootstrap: draw all residuals at once and shift the point prediction
    sampled_residuals = rng.choice(residuals, size=n_boot, replace=True)
    simulated_predictions = y_new_pred + sampled_residuals

    # Percentile bounds of the simulated predictions
    lower_bound = np.percentile(simulated_predictions, 100 * (alpha / 2))
    upper_bound = np.percentile(simulated_predictions, 100 * (1 - alpha / 2))

    # Return point prediction and prediction interval
    return y_new_pred, lower_bound, upper_bound


#### LOADING THE DATASET

In [6]:
# Read the raw workout CSV; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv("workout_data.csv")
# Report (rows, columns) of the frame as loaded.
print("Original shape:", df.shape)

Original shape: (20000, 54)


In [7]:
# Normalize the column labels (snake_case, lowercase); only names change,
# so the printed shape should match the one reported after loading.
df = standardize_columns(df)
print("After columns standardization: ", df.shape)

After columns standardization:  (20000, 54)


In [8]:
# Basic quality check: missing-value count per column, largest first
missing_per_column = df.isna().sum().sort_values(ascending=False)
print("Missing values (top 15):")
print(missing_per_column.head(15))

Missing values (top 15):
age                                0
gender                             0
weight_kg                          0
height_m                           0
max_bpm                            0
avg_bpm                            0
resting_bpm                        0
session_duration_hours             0
calories_burned                    0
workout_type                       0
fat_percentage                     0
water_intake_liters                0
workout_frequency_days_per_week    0
experience_level                   0
bmi                                0
dtype: int64


In [9]:
# Count rows that are exact duplicates of an earlier row; if there are any,
# drop them and renumber the index from 0.

duplicates = df.duplicated().sum()
print("Duplicate rows: ", duplicates)

if duplicates:
  df = df.drop_duplicates().reset_index(drop=True)
  print(f"After dropping duplicates: {df.shape}")

Duplicate rows:  0


In [12]:
# Type validation + unit checks
# Convert object columns to numeric when most values are numeric
for column_name in df.columns:
    # Process only object (string/mixed-type) columns
    if df[column_name].dtype == "object":

        # Try converting values to numeric; invalid values become NaN
        numeric_version = pd.to_numeric(df[column_name], errors="coerce")

        # Check if at least 80% of values are valid numbers
        if numeric_version.notna().mean() >= 0.80:

            # Coercion can turn up to 20% of the column into NaN. Report how
            # many values were lost so the conversion is never silent (the
            # missing-value report above was printed before this step).
            coerced_to_nan = int(
                numeric_version.isna().sum() - df[column_name].isna().sum()
            )
            if coerced_to_nan > 0:
                print(
                    f"Converted '{column_name}' to numeric: "
                    f"{coerced_to_nan} non-numeric value(s) became NaN"
                )

            # Replace original column with numeric version
            df[column_name] = numeric_version

# Sanity check for human height (in meters)
if "height_m" in df.columns:

    # Count rows with unrealistic height values.
    # Missing heights compare False on both sides, so they are not counted.
    invalid_height_count = (
        (df["height_m"] < 1.4) | (df["height_m"] > 2.1)
    ).sum()

    print(
        "Height sanity check (1.4–2.1 m): bad_rows =",
        int(invalid_height_count)
    )


# Sanity check for workout/session duration (in hours)
if "session_duration_hours" in df.columns:

    # Count sessions that are too short or too long (missing values are not counted)
    invalid_duration_count = (
        (df["session_duration_hours"] < 0.25) |
        (df["session_duration_hours"] > 2.0)
    ).sum()

    print(
        "Duration sanity check (0.25–2 hours): bad_rows =",
        int(invalid_duration_count)
    )

Height sanity check (1.4–2.1 m): bad_rows = 0
Duration sanity check (0.25–2 hours): bad_rows = 8
