In [None]:
import pandas as pd
import numpy as np

2. Load Dataset

In [None]:
# Location of the raw footfall CSV, resolved relative to the working directory.
DATA_PATH = "gym_footfall_dataset.csv"

# Read the whole file into a DataFrame using pandas' default parsing.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Confirm the load and show the (rows, columns) dimensions.
print("Dataset loaded successfully.")
loaded_shape = df.shape
print("Shape:", loaded_shape)

Dataset loaded successfully.
Shape: (1827, 23)


In [None]:
# Preview the first five rows (the default row count, spelled out explicitly).
df.head(n=5)

Unnamed: 0,date,day_of_week,is_sunday,month,year,is_vacation,is_gym_open,active_student_population,exam_phase,exam_intensity,...,interuni_event,event_boost_factor,gym_adoption_ratio,maintenance_flag,maintenance_severity,equipment_availability_ratio,effective_capacity,base_demand,raw_demand,daily_gym_footfall
0,2021-01-01,4,0,1,2021,0,1,238,midterm,0.75,...,0,1.0,0.367074,1,low,0.9,162,87.363563,45.405505,45
1,2021-01-02,5,0,1,2021,0,1,228,midterm,0.75,...,0,1.0,0.262876,0,none,1.0,180,59.93568,28.721738,28
2,2021-01-03,6,1,1,2021,0,0,214,midterm,0.75,...,0,1.0,0.275735,0,none,1.0,180,59.007228,29.407258,0
3,2021-01-04,0,0,1,2021,0,1,242,midterm,0.75,...,0,1.0,0.361544,0,none,1.0,180,87.493582,36.183734,36
4,2021-01-05,1,0,1,2021,0,1,207,midterm,0.75,...,0,1.0,0.384251,0,none,1.0,180,79.539898,43.27203,43


In [None]:
# Print a per-column summary (non-null counts and dtypes) of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1827 entries, 0 to 1826
Data columns (total 23 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   date                          1827 non-null   object 
 1   day_of_week                   1827 non-null   int64  
 2   is_sunday                     1827 non-null   int64  
 3   month                         1827 non-null   int64  
 4   year                          1827 non-null   int64  
 5   is_vacation                   1827 non-null   int64  
 6   is_gym_open                   1827 non-null   int64  
 7   active_student_population     1827 non-null   int64  
 8   exam_phase                    1827 non-null   object 
 9   exam_intensity                1827 non-null   float64
 10  academic_load_index           1827 non-null   float64
 11  weather_condition             1827 non-null   object 
 12  weather_impact_factor         1827 non-null   float64
 13  int

In [None]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each original column becomes one row of the table.
summary_stats = df.describe(include="all")
summary_stats.transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
date,1827.0,1827.0,2021-01-01,1.0,,,,,,,
day_of_week,1827.0,,,,3.0,2.000548,0.0,1.0,3.0,5.0,6.0
is_sunday,1827.0,,,,0.142857,0.350023,0.0,0.0,0.0,0.0,1.0
month,1827.0,,,,6.520525,3.450954,1.0,4.0,7.0,10.0,12.0
year,1827.0,,,,2023.002189,1.41576,2021.0,2022.0,2023.0,2024.0,2026.0
is_vacation,1827.0,,,,0.16694,0.373025,0.0,0.0,0.0,0.0,1.0
is_gym_open,1827.0,,,,0.714286,0.451878,0.0,0.0,1.0,1.0,1.0
active_student_population,1827.0,,,,225.461412,14.718721,200.0,213.0,226.0,238.0,250.0
exam_phase,1827.0,3.0,midterm,616.0,,,,,,,
exam_intensity,1827.0,,,,0.765463,0.18356,0.55,0.55,0.75,1.0,1.0


Date Handling & Sorting

In [None]:
# Parse the date strings into datetime values in place on the frame.
df["date"] = pd.to_datetime(df["date"])

# Order the rows chronologically, then rebuild a fresh 0..n-1 index so that
# the positional order and the index labels agree.
chronological = df.sort_values(by="date")
df = chronological.reset_index(drop=True)

Missing Value Handling

In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

date                            0
day_of_week                     0
is_sunday                       0
month                           0
year                            0
is_vacation                     0
is_gym_open                     0
active_student_population       0
exam_phase                      0
exam_intensity                  0
academic_load_index             0
weather_condition               0
weather_impact_factor           0
interuni_event                  0
event_boost_factor              0
gym_adoption_ratio              0
maintenance_flag                0
maintenance_severity            0
equipment_availability_ratio    0
effective_capacity              0
base_demand                     0
raw_demand                      0
daily_gym_footfall              0
dtype: int64

In [None]:
# Safety fallback: the null audit above reports zero missing values, so this
# is expected to be a no-op. If it ever does remove rows, say so explicitly
# instead of shrinking the dataset silently.
rows_before = len(df)
df = df.dropna().reset_index(drop=True)  # keep the 0..n-1 index contiguous
rows_dropped = rows_before - len(df)
if rows_dropped:
    print(f"Warning: dropped {rows_dropped} row(s) containing missing values.")

Data Type Normalization
Convert Binary Flags to Integer

In [None]:
# Columns treated as binary flags; they are stored as plain integers.
binary_cols = [
    "is_sunday",
    "is_vacation",
    "is_gym_open",
    "interuni_event",
    "maintenance_flag"
]

df[binary_cols] = df[binary_cols].astype(int)

# Report only what this cell did: preprocessing continues in later cells, so
# announcing overall completion here would be premature.
print("Binary flag columns converted to integer.")

Data preprocessing completed successfully.


Convert Categorical Columns to the `category` Dtype

In [None]:
# String-valued columns that are stored with the pandas "category" dtype.
categorical_cols = ["exam_phase", "weather_condition", "maintenance_severity"]

# Cast every listed column in one astype call via a column -> dtype mapping.
category_dtypes = dict.fromkeys(categorical_cols, "category")
df = df.astype(category_dtypes)

print("Data preprocessing completed successfully.")

Data preprocessing completed successfully.


Target Variable Validation

In [None]:
# The target is a head count, so every value must be zero or greater.
footfall = df["daily_gym_footfall"]
assert footfall.ge(0).all(), "Negative footfall detected!"

# Distribution summary of the target variable.
footfall.describe()

count    1827.000000
mean       34.453749
std        26.805339
min         0.000000
25%         0.000000
50%        37.000000
75%        53.000000
max       120.000000
Name: daily_gym_footfall, dtype: float64

Closed-Day Consistency Check

In [None]:
# A violation is any row where the gym is marked closed yet footfall is positive.
gym_closed = df["is_gym_open"].eq(0)
has_visitors = df["daily_gym_footfall"].gt(0)
closed_day_violations = df.loc[gym_closed & has_visitors]

print("Closed-day violations:", len(closed_day_violations))


Closed-day violations: 0


Feature Selection for Modeling
Drop Non-Modeling Columns

In [None]:
# Both columns are intermediate synthetic variables, so they are excluded
# from the modeling frame.
intermediate_cols = ["base_demand", "raw_demand"]
df_model = df.drop(columns=intermediate_cols)


Encoding Categorical Features

In [None]:
# One-hot encode the categorical columns; drop_first removes the first level
# of each column so the remaining indicators are not redundant.
encoding_options = {"columns": categorical_cols, "drop_first": True}
df_encoded = pd.get_dummies(df_model, **encoding_options)

In [None]:
# Find every boolean column in the encoded frame and store it as 0/1 integers.
bool_frame = df_encoded.select_dtypes(include=["bool"])
bool_cols = bool_frame.columns
df_encoded[bool_cols] = df_encoded.loc[:, bool_cols].astype(int)

Final Dataset Check & Export

In [None]:
# Report the dimensions of the encoded frame that will be used for modeling.
final_shape = df_encoded.shape
print("Final modeling shape:", final_shape)

# Persist the preprocessed data; index=False keeps the row index out of the file.
df_encoded.to_csv(path_or_buf="gym_footfall_preprocessed.csv", index=False)

Final modeling shape: (1827, 25)
