### Imports

In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


### Load Preprocessed Dataset

In [11]:
# Location of the preprocessed dataset (relative path).
DATA_PATH = "gym_footfall_preprocessed.csv"

# Read the CSV into the working frame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Report (rows, columns), then preview the first rows as the cell's output.
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (1827, 26)


Unnamed: 0,date,day_of_week,is_sunday,month,year,is_vacation,is_gym_open,active_student_population,exam_intensity,academic_load_index,...,effective_capacity,daily_gym_footfall,exam_phase_midterm,exam_phase_none,weather_condition_extreme_heat,weather_condition_heavy_rain,weather_condition_normal,maintenance_severity_low,maintenance_severity_medium,maintenance_severity_none
0,2021-01-01,4,0,1,2021,0,1,238,0.75,0.555301,...,162,45,1,0,0,0,0,1,0,0
1,2021-01-02,5,0,1,2021,0,1,228,0.75,0.440999,...,180,28,1,0,0,0,0,0,0,1
2,2021-01-03,6,1,1,2021,0,0,214,0.75,0.504312,...,180,0,1,0,0,0,0,0,0,1
3,2021-01-04,0,0,1,2021,0,1,242,0.75,0.691768,...,180,36,1,0,0,0,0,0,0,1
4,2021-01-05,1,0,1,2021,0,1,207,0.75,0.528665,...,180,43,1,0,0,0,0,0,0,1


In [12]:
# Drop the date column (not used directly in modeling).
# errors="ignore" keeps this cell safe to re-run: once "date" has been removed,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=["date"], errors="ignore")

In [13]:
# Interaction features: each new column is the element-wise product of two terms.
df["exam_weather_interaction"] = df["exam_intensity"].mul(df["weather_impact_factor"])

df["maintenance_capacity_interaction"] = df["equipment_availability_ratio"].mul(
    df["effective_capacity"]
)

# Academic load weighted by the complement of exam intensity (1 - exam_intensity).
df["load_exam_interaction"] = df["academic_load_index"].mul(1 - df["exam_intensity"])

### Time Aware Train-Test Split

Train: 2021-2024

Test: 2025

In [14]:
# Chronological hold-out: rows with year <= 2024 train the model,
# rows with year == 2025 are reserved for evaluation.
train_df = df.loc[df["year"].le(2024)]
test_df = df.loc[df["year"].eq(2025)]

# Target is the daily footfall; every other column is used as a feature.
y_train = train_df["daily_gym_footfall"]
X_train = train_df.drop("daily_gym_footfall", axis=1)

y_test = test_df["daily_gym_footfall"]
X_test = test_df.drop("daily_gym_footfall", axis=1)

### Baseline Model - Linear Regression
Why Linear Regression?
1. Simple
2. Interpretable
3. Benchmark for improvement

### Train Linear Regression Model

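The baseline is fit with Ridge regression (linear regression with L2 regularization, `alpha=1.0`) on standardized features.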

In [15]:
from sklearn.linear_model import Ridge

In [16]:
# Two-stage pipeline: standardize the features, then fit a Ridge model (alpha=1.0).
linreg_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", Ridge(alpha=1.0)),
    ]
)


### Evaluate Linear Regression

In [17]:
# Fit on the training split, then predict the held-out test rows.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
y_pred_lr = linreg_pipeline.fit(X_train, y_train).predict(X_test)

# Test-set metrics; RMSE is the square root of the mean squared error.
r2_lr = r2_score(y_test, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
mae_lr = mean_absolute_error(y_test, y_pred_lr)

print("Improved Linear Regression Performance:")
print(f"MAE  : {mae_lr:.2f}")
print(f"RMSE : {rmse_lr:.2f}")
print(f"R²   : {r2_lr:.3f}")

Improved Linear Regression Performance:
MAE  : 8.67
RMSE : 11.66
R²   : 0.865


The Ridge-regularized linear regression model achieves an R² of 0.865 on the 2025 test set (MAE 8.67, RMSE 11.66), indicating that a large portion of the variance in gym footfall is explained by the engineered features. However, residual errors remain due to non-linear relationships and capacity constraints, motivating the use of ensemble-based models.