In [1]:
# ===============================
# Core libraries
# ===============================
import numpy as np
import pandas as pd
import os
import sys

# ===============================
# Data visualisation
# ===============================
import matplotlib.pyplot as plt
import seaborn as sns

# ===============================
# Scikit-learn: preprocessing
# ===============================
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ===============================
# Scikit-learn: models
# ===============================
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# ===============================
# Scikit-learn: evaluation metrics
# ===============================
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ===============================
# MLflow
# ===============================
import mlflow
import mlflow.sklearn

# ===============================
# Model persistence
# ===============================
import joblib

In [28]:
mlflow.set_tracking_uri(uri="http://localhost:8080")

In [29]:
# Create a new MLflow Experiment
mlflow.set_experiment("ML_ASG_KEAGAN")

<Experiment: artifact_location='mlflow-artifacts:/919210661973396070', creation_time=1770341833794, experiment_id='919210661973396070', last_update_time=1770341833794, lifecycle_stage='active', name='ML_ASG_KEAGAN', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [12]:
RANDOM_STATE = 42

In [37]:
# ===============================
# Load dataset
# ===============================
data_path = "day_2011.csv"   # adjust path if needed
df_2011 = pd.read_csv(data_path)

# Preview the data
df_2011.head()

Unnamed: 0,dteday,season,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,01/01/2011,1,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,985
1,02/01/2011,1,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,801
2,03/01/2011,1,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,1349
3,04/01/2011,1,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,1562
4,05/01/2011,1,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,1600


In [40]:
# Dataset shape and structure
print(df.shape)
df_2011.info()

# Check for missing values
df_2011.isnull().sum()

(366, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   dteday      365 non-null    object 
 1   season      365 non-null    int64  
 2   mnth        365 non-null    int64  
 3   holiday     365 non-null    int64  
 4   weekday     365 non-null    int64  
 5   workingday  365 non-null    int64  
 6   weathersit  365 non-null    int64  
 7   temp        365 non-null    float64
 8   atemp       365 non-null    float64
 9   hum         365 non-null    float64
 10  windspeed   365 non-null    float64
 11  cnt         365 non-null    int64  
dtypes: float64(4), int64(7), object(1)
memory usage: 34.3+ KB


dteday        0
season        0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
cnt           0
dtype: int64

In [43]:
# ===============================
# Date feature handling (day-first format)
# ===============================
df_2011['dteday'] = pd.to_datetime(
    df_2011['dteday'],
    dayfirst=True,
    errors='coerce'
)
df_2011['year'] = df_2011['dteday'].dt.year
df_2011['month'] = df_2011['dteday'].dt.month
df_2011['day'] = df_2011['dteday'].dt.day

# Drop raw date column
df_2011.drop(columns=['dteday'], inplace=True)

In [44]:
# ===============================
# Drop leakage or irrelevant columns
# ===============================
leakage_cols = ['casual', 'registered']
df_2011.drop(columns=[col for col in leakage_cols if col in df.columns],
        inplace=True)

In [13]:
# ===============================
# Train-test split
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE
)

print(X_train.shape, X_test.shape)

(292, 13) (73, 13)


In [14]:
# ===============================
# Feature scaling (for linear models)
# ===============================
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
# ===============================
# Baseline Model: Linear Regression
# ===============================
lin_reg = LinearRegression()

# Train model
lin_reg.fit(X_train_scaled, y_train)

# Predictions
y_pred = lin_reg.predict(X_test_scaled)

# ===============================
# Evaluation metrics (version-safe)
# ===============================
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Linear Regression RMSE: {rmse:.2f}")
print(f"Linear Regression MAE: {mae:.2f}")
print(f"Linear Regression R¬≤: {r2:.4f}")

Linear Regression RMSE: 692.20
Linear Regression MAE: 504.65
Linear Regression R¬≤: 0.7433


In [25]:
# Start an MLflow run
with mlflow.start_run(run_name="ML2_ASG_MODEL1"):

    # Log model information
    mlflow.log_param("model_type", "LinearRegression")

    # Log evaluation metrics
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # Log the trained model (new API)
    mlflow.sklearn.log_model(
        sk_model=lin_reg,
        name="model"
    )

print("Baseline Linear Regression logged to MLflow")

üèÉ View run ML2_ASG_MODEL1 at: http://localhost:8080/#/experiments/919210661973396070/runs/b60260229dbc4d61975972b217936a6e
üß™ View experiment at: http://localhost:8080/#/experiments/919210661973396070
Baseline Linear Regression logged to MLflow


In [26]:
# ===============================
# Improved Model: Random Forest (Depth-Constrained)
# ===============================

rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,          # depth constraint (key improvement)
    random_state=RANDOM_STATE
)

# Train model
rf_model.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluation metrics
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest RMSE: {rmse_rf:.2f}")
print(f"Random Forest MAE: {mae_rf:.2f}")
print(f"Random Forest R¬≤: {r2_rf:.4f}")

Random Forest RMSE: 550.06
Random Forest MAE: 393.23
Random Forest R¬≤: 0.8379


In [30]:
# ===============================
# MLflow: Improved model logging
# ===============================
with mlflow.start_run(run_name="ML2_ASG_MODEL2"):

    # Log model parameters
    mlflow.log_param("model_type", "RandomForestRegressor")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", 5)

    # Log evaluation metrics
    mlflow.log_metric("rmse", rmse_rf)
    mlflow.log_metric("mae", mae_rf)
    mlflow.log_metric("r2", r2_rf)

    # Log model artifact
    mlflow.sklearn.log_model(
        sk_model=rf_model,
        name="model"
    )

print("Improved Random Forest model logged to MLflow")

üèÉ View run ML2_ASG_MODEL2 at: http://localhost:8080/#/experiments/919210661973396070/runs/ed78128bde7945f0a9cfca066efa0515
üß™ View experiment at: http://localhost:8080/#/experiments/919210661973396070
Improved Random Forest model logged to MLflow


In [35]:
# ===============================
# Load dataset
# ===============================
data_path = "day_2012.csv"   # adjust path if needed
df_2012 = pd.read_csv(data_path)

# Preview the data
df_2012.head()

Unnamed: 0,season,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt,year,month,day
0,1,1,0,0,0,1,0.37,0.375621,0.6925,0.192167,2294,2012,1,1
1,1,1,1,1,0,1,0.273043,0.252304,0.381304,0.329665,1951,2012,1,2
2,1,1,0,2,1,1,0.15,0.126275,0.44125,0.365671,2236,2012,1,3
3,1,1,0,3,1,2,0.1075,0.119337,0.414583,0.1847,2368,2012,1,4
4,1,1,0,4,1,1,0.265833,0.278412,0.524167,0.129987,3272,2012,1,5


In [36]:
# Dataset shape and structure
print(df.shape)
df_2012.info()

# Check for missing values
df_2012.isnull().sum()

(366, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   dteday      366 non-null    object 
 1   season      366 non-null    int64  
 2   mnth        366 non-null    int64  
 3   holiday     366 non-null    int64  
 4   weekday     366 non-null    int64  
 5   workingday  366 non-null    int64  
 6   weathersit  366 non-null    int64  
 7   temp        366 non-null    float64
 8   atemp       366 non-null    float64
 9   hum         366 non-null    float64
 10  windspeed   366 non-null    float64
 11  cnt         366 non-null    int64  
dtypes: float64(4), int64(7), object(1)
memory usage: 34.4+ KB


dteday        0
season        0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
cnt           0
dtype: int64

In [47]:
# Convert 'dteday' to datetime (day-first format)
df_2012['dteday'] = pd.to_datetime(df_2012['dteday'], dayfirst=True, errors='coerce')

# Extract numeric features
df_2012['year'] = df_2012['dteday'].dt.year
df_2012['month'] = df_2012['dteday'].dt.month
df_2012['day'] = df_2012['dteday'].dt.day

# Drop the raw date column
df_2012.drop(columns=['dteday'], inplace=True)

# Drop leakage columns if present
leakage_cols = ['casual', 'registered']
df_2012.drop(columns=[col for col in leakage_cols if col in df_2012.columns], inplace=True)

# Split features and target
X_2012 = df_2012.drop(columns=['cnt'])
y_2012 = df_2012['cnt']

print("2012 dataset cleaned and ready for Random Forest")

‚úÖ 2012 dataset cleaned and ready for Random Forest


In [48]:
# Random Forest (depth-constrained)
rf_2012 = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=RANDOM_STATE
)

rf_2012.fit(X_2012, y_2012)

y_pred_2012 = rf_2012.predict(X_2012)

rmse_2012 = np.sqrt(mean_squared_error(y_2012, y_pred_2012))
mae_2012 = mean_absolute_error(y_2012, y_pred_2012)
r2_2012 = r2_score(y_2012, y_pred_2012)

print(f"Random Forest (2012) RMSE: {rmse_2012:.2f}, MAE: {mae_2012:.2f}, R¬≤: {r2_2012:.4f}")

Random Forest (2012) RMSE: 553.62, MAE: 426.34, R¬≤: 0.9039


In [49]:
with mlflow.start_run(run_name="ML2_ASG_MODEL3"):

    # Log model info
    mlflow.log_param("model_type", "RandomForestRegressor")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", 5)

    # Log evaluation metrics for 2012-trained model
    mlflow.log_metric("rmse_2012_train", rmse_2012)
    mlflow.log_metric("mae_2012_train", mae_2012)
    mlflow.log_metric("r2_2012_train", r2_2012)

    # Log the trained model artifact
    mlflow.sklearn.log_model(
        sk_model=rf_2012,
        name="model_2012"
    )

print("Random Forest (2012) logged to MLflow")

  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


üèÉ View run ML2_ASG_MODEL3 at: http://localhost:8080/#/experiments/919210661973396070/runs/2ca0ebf6cb5744228b6836ab3c698c66
üß™ View experiment at: http://localhost:8080/#/experiments/919210661973396070
‚úÖ Random Forest (2012) logged to MLflow


In [50]:
# Evaluate 2011-trained RF on 2012
y_pred_drift = rf_model.predict(X_2012)  # rf_model = trained on 2011

rmse_drift = np.sqrt(mean_squared_error(y_2012, y_pred_drift))
mae_drift = mean_absolute_error(y_2012, y_pred_drift)
r2_drift = r2_score(y_2012, y_pred_drift)

with mlflow.start_run(run_name="RF_2011_on_2012"):

    mlflow.log_param("evaluation_type", "drift_test")
    mlflow.log_metric("rmse_drift", rmse_drift)
    mlflow.log_metric("mae_drift", mae_drift)
    mlflow.log_metric("r2_drift", r2_drift)

print("2011 model evaluated on 2012 and logged (drift analysis)")

üèÉ View run RF_2011_on_2012 at: http://localhost:8080/#/experiments/919210661973396070/runs/267804991da34b2fbccf415b56ce5b37
üß™ View experiment at: http://localhost:8080/#/experiments/919210661973396070
‚úÖ 2011 model evaluated on 2012 and logged (drift analysis)
