 # ML2 Assignment

## Task 1 Model Development and Experiment Design

You are required to develop and evaluate machine learning models for predicting daily bike
rental demand using only ‚Äúday_2011.csv‚Äù dataset through experiment design. This task
focuses on model development and experimentation.

In [4]:
!pip install mlflow scikit-learn pandas numpy matplotlib seaborn
!pip install charset_normalizer
!pip install chardet
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.datasets import load_iris, make_classification
import matplotlib.pyplot as plt
from mlflow.models import infer_signature
import seaborn as sns
from sklearn.metrics import (
    mean_squared_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score
)
import warnings
warnings.filterwarnings('ignore')
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("task 1 Model Development and Experiment Design")



<Experiment: artifact_location='mlflow-artifacts:/454394573250837312', creation_time=1769997916648, experiment_id='454394573250837312', last_update_time=1769997916648, lifecycle_stage='active', name='task 1 Model Development and Experiment Design', tags={'mlflow.experimentKind': 'custom_model_development'}>

### Data Preprocessing

Perform any necessary data preprocessing on the dataset. Some hints on
data preprocessing:
‚Ä¢ If using dteday, you must extract numeric features (e.g.,
month/weekday) and drop raw date

In [7]:
data = pd.read_csv('day_2011.csv')
data.head(20)

Unnamed: 0,dteday,season,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,01/01/2011,1,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,985
1,02/01/2011,1,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,801
2,03/01/2011,1,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,1349
3,04/01/2011,1,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,1562
4,05/01/2011,1,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,1600
5,06/01/2011,1,1,0,4,1,1,0.204348,0.233209,0.518261,0.089565,1606
6,07/01/2011,1,1,0,5,1,2,0.196522,0.208839,0.498696,0.168726,1510
7,08/01/2011,1,1,0,6,0,2,0.165,0.162254,0.535833,0.266804,959
8,09/01/2011,1,1,0,0,0,1,0.138333,0.116175,0.434167,0.36195,822
9,10/01/2011,1,1,0,1,1,1,0.150833,0.150888,0.482917,0.223267,1321


In [8]:
#Check for null values
data.isnull().mean().sort_values(ascending=True)

dteday        0.0
season        0.0
mnth          0.0
holiday       0.0
weekday       0.0
workingday    0.0
weathersit    0.0
temp          0.0
atemp         0.0
hum           0.0
windspeed     0.0
cnt           0.0
dtype: float64

In [9]:
#Change dteday to datetime
data['dteday'] = pd.to_datetime(data['dteday'], dayfirst=True)
data

Unnamed: 0,dteday,season,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,2011-01-01,1,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,985
1,2011-01-02,1,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,801
2,2011-01-03,1,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,1349
3,2011-01-04,1,1,0,2,1,1,0.200000,0.212122,0.590435,0.160296,1562
4,2011-01-05,1,1,0,3,1,1,0.226957,0.229270,0.436957,0.186900,1600
...,...,...,...,...,...,...,...,...,...,...,...,...
360,2011-12-27,1,12,0,2,1,2,0.325000,0.327633,0.762500,0.188450,1162
361,2011-12-28,1,12,0,3,1,1,0.299130,0.279974,0.503913,0.293961,2302
362,2011-12-29,1,12,0,4,1,1,0.248333,0.263892,0.574167,0.119412,2423
363,2011-12-30,1,12,0,5,1,1,0.311667,0.318812,0.636667,0.134337,2999


In [10]:
#Create new columns to extract numeric features
data['year'] = data['dteday'].dt.year
data['month'] = data['dteday'].dt.month
data['day'] = data['dteday'].dt.day
data['weekday'] = data['dteday'].dt.weekday   # Monday=0, Sunday=6
data['is_weekend'] = data['weekday'].isin([5, 6]).astype(int)
data

Unnamed: 0,dteday,season,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt,year,month,day,is_weekend
0,2011-01-01,1,1,0,5,0,2,0.344167,0.363625,0.805833,0.160446,985,2011,1,1,1
1,2011-01-02,1,1,0,6,0,2,0.363478,0.353739,0.696087,0.248539,801,2011,1,2,1
2,2011-01-03,1,1,0,0,1,1,0.196364,0.189405,0.437273,0.248309,1349,2011,1,3,0
3,2011-01-04,1,1,0,1,1,1,0.200000,0.212122,0.590435,0.160296,1562,2011,1,4,0
4,2011-01-05,1,1,0,2,1,1,0.226957,0.229270,0.436957,0.186900,1600,2011,1,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,2011-12-27,1,12,0,1,1,2,0.325000,0.327633,0.762500,0.188450,1162,2011,12,27,0
361,2011-12-28,1,12,0,2,1,1,0.299130,0.279974,0.503913,0.293961,2302,2011,12,28,0
362,2011-12-29,1,12,0,3,1,1,0.248333,0.263892,0.574167,0.119412,2423,2011,12,29,0
363,2011-12-30,1,12,0,4,1,1,0.311667,0.318812,0.636667,0.134337,2999,2011,12,30,0


In [11]:
#drop unnecessary columns
data = data.drop(columns=['dteday'])
data = data.drop(columns=['month'])
data = data.drop(columns=['workingday'])
data

Unnamed: 0,season,mnth,holiday,weekday,weathersit,temp,atemp,hum,windspeed,cnt,year,day,is_weekend
0,1,1,0,5,2,0.344167,0.363625,0.805833,0.160446,985,2011,1,1
1,1,1,0,6,2,0.363478,0.353739,0.696087,0.248539,801,2011,2,1
2,1,1,0,0,1,0.196364,0.189405,0.437273,0.248309,1349,2011,3,0
3,1,1,0,1,1,0.200000,0.212122,0.590435,0.160296,1562,2011,4,0
4,1,1,0,2,1,0.226957,0.229270,0.436957,0.186900,1600,2011,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,1,12,0,1,2,0.325000,0.327633,0.762500,0.188450,1162,2011,27,0
361,1,12,0,2,1,0.299130,0.279974,0.503913,0.293961,2302,2011,28,0
362,1,12,0,3,1,0.248333,0.263892,0.574167,0.119412,2423,2011,29,0
363,1,12,0,4,1,0.311667,0.318812,0.636667,0.134337,2999,2011,30,0


In [12]:
#Check the weather situation
data['weathersit'].values

array([2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1,
       1, 1, 2, 3, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1,
       1, 2, 3, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1,
       2, 3, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 3, 1, 1, 2, 1,
       1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1,
       1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1,
       1, 1, 2, 1, 1, 2, 3, 3, 3, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 3, 2,
       2, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 2, 2, 2,

In [13]:
#Categorical columns
categorical_cols = ['season', 'weathersit', 'weekday','is_weekend', 'holiday']

data = pd.get_dummies(data, columns=categorical_cols)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mnth          365 non-null    int64  
 1   temp          365 non-null    float64
 2   atemp         365 non-null    float64
 3   hum           365 non-null    float64
 4   windspeed     365 non-null    float64
 5   cnt           365 non-null    int64  
 6   year          365 non-null    int32  
 7   day           365 non-null    int32  
 8   season_1      365 non-null    bool   
 9   season_2      365 non-null    bool   
 10  season_3      365 non-null    bool   
 11  season_4      365 non-null    bool   
 12  weathersit_1  365 non-null    bool   
 13  weathersit_2  365 non-null    bool   
 14  weathersit_3  365 non-null    bool   
 15  weekday_0     365 non-null    bool   
 16  weekday_1     365 non-null    bool   
 17  weekday_2     365 non-null    bool   
 18  weekday_3     365 non-null    

In [14]:
#Rename the columns
data['Clear'] = data['weathersit_1']
data['Mist'] = data['weathersit_2']
data['Light Snow'] = data['weathersit_3'] #Do not have 4th weather situation
data['Spring'] = data['season_1']
data['Summer'] = data['season_2']
data['Fall'] = data['season_3']
data['Winter'] = data['season_4']
data['Monday'] = data['weekday_0']
data['Tuesday'] = data['weekday_1']
data['Wednesday'] = data['weekday_2']
data['Thursday'] = data['weekday_3']
data['Friday'] = data['weekday_4']
data['Saturday'] = data['weekday_5']
data['Sunday'] = data['weekday_6']
data['Weekend'] = data['is_weekend_1']
data['Weekday'] = data['is_weekend_0']
data['Not Holiday'] = data['holiday_0']
data['Holiday'] = data['holiday_1']
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 44 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mnth          365 non-null    int64  
 1   temp          365 non-null    float64
 2   atemp         365 non-null    float64
 3   hum           365 non-null    float64
 4   windspeed     365 non-null    float64
 5   cnt           365 non-null    int64  
 6   year          365 non-null    int32  
 7   day           365 non-null    int32  
 8   season_1      365 non-null    bool   
 9   season_2      365 non-null    bool   
 10  season_3      365 non-null    bool   
 11  season_4      365 non-null    bool   
 12  weathersit_1  365 non-null    bool   
 13  weathersit_2  365 non-null    bool   
 14  weathersit_3  365 non-null    bool   
 15  weekday_0     365 non-null    bool   
 16  weekday_1     365 non-null    bool   
 17  weekday_2     365 non-null    bool   
 18  weekday_3     365 non-null    

### Experiment Design

You are required to actively design and conduct experiments to develop
a suitable prediction model using day_2011.csv.
Specifically, you must:
‚Ä¢ Use Linear Regression to develop the baseline model.
‚Ä¢ Develop one improvement strategy such as tree-based model
with depth constraints or regularised regression
For each experiment, explain:
‚Ä¢ Why it is performed
‚Ä¢ What improvement or trade-off is expected

In [17]:
from sklearn.model_selection import train_test_split
X = data.drop('cnt', axis=1)
y = data['cnt']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")
print(f"Features: {X_train.shape[1]}")

Training samples: 292
Testing samples: 73
Features: 43


In [18]:
X_train.head()

Unnamed: 0,mnth,temp,atemp,hum,windspeed,year,day,season_1,season_2,season_3,...,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,Weekend,Weekday,Not Holiday,Holiday
211,7,0.805833,0.729796,0.480833,0.164813,2011,31,False,False,True,...,False,False,False,False,False,True,True,False,True,False
340,12,0.41,0.400246,0.970417,0.266175,2011,7,False,False,False,...,False,True,False,False,False,False,False,True,True,False
202,7,0.848333,0.840896,0.580417,0.1331,2011,22,False,False,True,...,False,False,False,True,False,False,False,True,True,False
75,3,0.415,0.410333,0.602917,0.209579,2011,17,True,False,False,...,False,False,True,False,False,False,False,True,True,False
84,3,0.265833,0.257571,0.394167,0.209571,2011,26,False,True,False,...,False,False,False,False,True,False,True,False,True,False


In [19]:
#Linear regression as baseline
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

In [20]:
#Linear regression model evaluation
y_pred = model.predict(X_train)  # Predict on the train set
print('training accuracy is: ', model.score(X_train,y_train))
print('testing accuracy is: ', model.score(X_test,y_test))

training accuracy is:  0.8224086466222853
testing accuracy is:  0.7403140386127525


In [21]:
# Try regularised regression
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1)
ridge.fit(X_train, y_train)

ridge.score(X_train, y_train), ridge.score(X_test, y_test)

(0.8122901558220414, 0.7584550130003851)

In [22]:
#Tune regularisation strength
from sklearn.linear_model import RidgeCV

alphas = [0.01, 0.1, 1, 10, 100]
ridge_cv = RidgeCV(alphas=alphas)
ridge_cv.fit(X_train, y_train)

ridge_cv.alpha_
ridge_cv.score(X_test, y_test)

0.7440846572666122

In [23]:
#Try tree-based model with depth constraints
from sklearn.tree import DecisionTreeRegressor

tree = DecisionTreeRegressor(
    max_depth=4,
    min_samples_leaf=20,
    random_state=42
)

tree.fit(X_train, y_train)

tree.score(X_train, y_train), tree.score(X_test, y_test)

(0.7784733024603685, 0.6851699606531438)

In [24]:
#Try random forest
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=10,
    min_samples_leaf=5,
    random_state=42
)

rf.fit(X_train, y_train)

rf.score(X_train, y_train), rf.score(X_test, y_test)

(0.9081488620356043, 0.8312274754095471)

In [26]:
with mlflow.start_run(run_name="baseline_model") as base_run:
    base_run_id = base_run.info.run_id

    #Tag: describes the role of this run. Here setting the tag "role" = "fraud_baseline" helps to later identify runs that served as baseline
    mlflow.set_tag("role","baseline_regression")
    
    # Log various paramater info in MLFlow
    mlflow.log_param("model_type","LinearRegression")
    mlflow.log_param("feature_count",X_train.shape[1])

    #Train the model
    model = LinearRegression()
    model.fit(X_train,y_train)

    #The model prdicts on test data and key evaluation metrics are obtained.
    pred = model.predict(X_test)
    mse = mean_squared_error(y_test,pred)
    r2 = r2_score(y_test,pred)

    #log the evaluation metrics in MLFlow
    mlflow.log_metric("baseline_mse",mse)
    mlflow.log_metric("baseline_r2",r2)

    #Log the baseline model to MLFlow
    sig = infer_signature(X_train,pred)
    mlflow.sklearn.log_model(model, name="regression_baseline_model",
                             input_example=X_test.iloc[:5],
                             signature=sig)

    #Log the artifacts (histogram image) in MLFlow
    #mlflow.log_artifact("baseline_medinc_hist.png")

print("Baseline Regression Run ID:", base_run_id)
print("Baseline MSE on original data:",mse)
print("Baseline R2 on original data:",r2)

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

üèÉ View run baseline_model at: http://localhost:5000/#/experiments/454394573250837312/runs/c3ab059dbb3840a6bb1858aec83f466b
üß™ View experiment at: http://localhost:5000/#/experiments/454394573250837312
Baseline Regression Run ID: c3ab059dbb3840a6bb1858aec83f466b
Baseline MSE on original data: 484727.94348642457
Baseline R2 on original data: 0.7403140386127525
