In [1]:
import datetime
import pandas as pd
import numpy as np
import requests
import zipfile
import io
import json

from sklearn import datasets, ensemble, model_selection
from scipy.stats import anderson_ksamp

In [2]:
# Download the UCI Bike Sharing archive and load the hourly table from it.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into an immediate, readable failure
# instead of a confusing BadZipFile raised from an HTML error page.
response = requests.get(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip",
    timeout=60,
)
response.raise_for_status()
content = response.content
# Both the archive and the member file are closed when the block exits.
with zipfile.ZipFile(io.BytesIO(content)) as arc, arc.open("hour.csv") as hour_csv:
    raw_data = pd.read_csv(hour_csv, header=0, sep=',', parse_dates=['dteday'])

In [3]:
# Index every row by its full timestamp: the calendar day (`dteday`) plus the
# hour of day (`hr`). Computed column-wise instead of with a row-by-row
# apply(), which is much slower on the full hourly table and gives the same
# timestamps.
raw_data.index = pd.DatetimeIndex(
    raw_data['dteday'].dt.normalize() + pd.to_timedelta(raw_data['hr'], unit='h')
)

In [4]:
# Preview the first rows to confirm the timestamp index and the loaded columns.
raw_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
2011-01-01 00:00:00,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2011-01-01 01:00:00,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2011-01-01 02:00:00,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
2011-01-01 03:00:00,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
2011-01-01 04:00:00,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [5]:
from scipy import stats
import random


# Significance level: a feature is flagged when its test p-value falls below it.
p_value_thres = 0.05
# Running count of flagged features.
rejected = 0

numerical_features = ['temp', 'atemp', 'hum', 'windspeed', 'mnth', 'hr', 'weekday']
categorical_features = ['season', 'holiday', 'workingday']

# Two consecutive time windows of the hourly data that are compared against each other.
reference = raw_data.loc['2011-01-01 00:00:00':'2011-01-28 23:00:00']
current = raw_data.loc['2011-01-29 00:00:00':'2011-02-28 23:00:00']

# Two-sample Kolmogorov-Smirnov test of reference vs. current for each numerical feature.
for col in numerical_features:
    test = stats.ks_2samp(reference[col], current[col])
    print([col, test])
    is_rejected = test.pvalue < p_value_thres
    if is_rejected:
        rejected += 1
        print("col rejected {}".format(col))


['temp', KstestResult(statistic=0.3268630919426928, pvalue=6.026276035162504e-32, statistic_location=0.22, statistic_sign=1)]
col rejected temp
['atemp', KstestResult(statistic=0.3233995435947986, pvalue=2.855460838157155e-31, statistic_location=0.2273, statistic_sign=1)]
col rejected atemp
['hum', KstestResult(statistic=0.10336407541938417, pvalue=0.0014918623865842317, statistic_location=0.35, statistic_sign=-1)]
col rejected hum
['windspeed', KstestResult(statistic=0.07590999725436713, pvalue=0.04055120974818981, statistic_location=0.3284, statistic_sign=1)]
col rejected windspeed
['mnth', KstestResult(statistic=0.9026425591098748, pvalue=1.472945287377825e-288, statistic_location=1, statistic_sign=1)]
col rejected mnth
['hr', KstestResult(statistic=0.011077053260776609, pvalue=0.999999999980103, statistic_location=3, statistic_sign=-1)]
['weekday', KstestResult(statistic=0.022863019926092965, pvalue=0.9929320453958753, statistic_location=1, statistic_sign=-1)]


In [6]:
from scipy.stats import chi2_contingency
def drift_chisquare(sample1, sample2):
    """Return the chi-square p-value for two sets of category counts.

    Parameters
    ----------
    sample1, sample2 : pandas.Series or sequence of counts
        Per-category counts for the two samples, e.g. the result of
        ``Series.value_counts()``. When both are pandas Series they are
        aligned on their index (the category label) before testing, and a
        category absent from one sample is counted as 0 there. Any other
        input is passed through positionally, so the caller must then make
        sure both sequences list the same categories in the same order.

    Returns
    -------
    float
        The p-value reported by ``scipy.stats.chi2_contingency``.
    """
    if isinstance(sample1, pd.Series) and isinstance(sample2, pd.Series):
        # value_counts() orders categories by frequency, so the same label can
        # sit at different positions in the two Series (or be missing from
        # one). Align on the label so each table column is one category.
        table = (
            pd.concat([sample1, sample2], axis=1, keys=["sample1", "sample2"])
            .fillna(0)
            .T
            .to_numpy()
        )
    else:
        table = [sample1, sample2]
    return chi2_contingency(table)[1]

In [7]:
# Show which distinct values each categorical feature takes in the reference window.
for col in categorical_features:
    observed_levels = set(reference[col].values)
    print(col, observed_levels)

season {1}
holiday {0, 1}
workingday {0, 1}


In [8]:
# Chi-square check of reference vs. current for every categorical feature.
# Each feature whose p-value is below the significance level adds one to the
# running `rejected` count.
for col in categorical_features:
    reference_counts = reference[col].value_counts()
    current_counts = current[col].value_counts()
    test = drift_chisquare(reference_counts, current_counts)
    print([col, test])
    is_rejected = test < p_value_thres
    if is_rejected:
        rejected += 1
        print("col rejected {}".format(col))


['season', 1.0]
['holiday', 0.6986573626612528]
['workingday', 0.5917879941201512]


In [9]:
## Modeling

# Column names used throughout the modelling and monitoring cells.
target = 'cnt'
prediction = 'prediction'
numerical_features = ['temp', 'atemp', 'hum', 'windspeed', 'mnth', 'hr', 'weekday']
categorical_features = ['season', 'holiday', 'workingday', ]#'weathersit']

# The model is trained on the reference window only; the current window is
# kept aside for scoring in later cells.
reference = raw_data.loc['2011-01-01 00:00:00':'2011-01-28 23:00:00']
current = raw_data.loc['2011-01-29 00:00:00':'2011-02-28 23:00:00']

# Seed the split as well as the regressor: without random_state the 70/30
# split differs on every run, so the reference metrics below would not be
# reproducible after a kernel restart.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    reference[numerical_features + categorical_features],
    reference[target],
    test_size=0.3,
    random_state=0
)
regressor = ensemble.RandomForestRegressor(random_state = 0)

regressor.fit(X_train, y_train)

preds_test = regressor.predict(X_test)

## validation
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Hold-out metrics on the reference window; later cells log and compare against them.
ref_mae=mean_absolute_error(y_test,preds_test)
ref_mse=mean_squared_error(y_test,preds_test)
ref_r2 = r2_score(y_test,preds_test)

print("MAE",ref_mae)
print("MSE",ref_mse)
print("R2",ref_r2)


MAE 11.561559139784945
MSE 318.20751666666666
R2 0.8669866653805993


In [10]:
### Feb data metrics
# Score the reference-trained regressor on the whole current window.
feature_columns = numerical_features + categorical_features
current_x = current[feature_columns]
current_y = current[target]

current_pred = regressor.predict(current_x)

# Error metrics are printed with a label; R2 is printed as a bare number.
for label, metric_fn in (("MAE", mean_absolute_error), ("MSE", mean_squared_error)):
    print(label, metric_fn(current_y, current_pred))

r2 = r2_score(current_y, current_pred)
print(r2)

MAE 20.500931849791375
MSE 1128.504183727399
0.708084004632376


In [11]:
## MLflow use case

In [12]:
import mlflow
# Imported explicitly because a later cell calls mlflow.sklearn.log_model;
# this avoids depending on the submodule being loaded implicitly.
import mlflow.sklearn
from mlflow.tracking import MlflowClient
import os
# Select the experiment that all following runs are recorded under.
# NOTE(review): the name contains an en dash ("–"), not an ASCII hyphen —
# confirm this is intended before anyone looks the experiment up by name.
mlflow.set_experiment("Bicycle–Sharing")

2024/05/11 12:49:53 INFO mlflow.tracking.fluent: Experiment with name 'Bicycle–Sharing' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/sunnypanchal/Desktop/Neovarsity%20DSML%20research%20papers/SCALER/BusinessCase_Data_Exploration-/Bike-Sharing-Dataset/mlruns/236125827191098616', creation_time=1715446193178, experiment_id='236125827191098616', last_update_time=1715446193178, lifecycle_stage='active', name='Bicycle–Sharing', tags={}>

In [13]:
# Record the reference model's hold-out metrics and the fitted model in one MLflow run.
reference_metrics = (("MAE", ref_mae), ("MSE", ref_mse), ("R2", ref_r2))
with mlflow.start_run():
    mlflow.set_tag('mlflow.runName','Refrence_run')
    for metric_name, metric_value in reference_metrics:
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(regressor, "model")



In [14]:
# Consecutive (begin, end) windows, both inclusive, that the model is scored on.
# Each window starts the hour after the previous one ends, so no hour is
# evaluated in two batches.
experiment_batches = [
    ('2011-01-29 00:00:00','2011-02-07 23:00:00'),
    ('2011-02-08 00:00:00','2011-02-14 23:00:00'),
    ('2011-02-15 00:00:00','2011-02-21 23:00:00'),
]

In [15]:
# Score the reference-trained regressor on each batch window and record one
# MLflow run per batch.
for date in experiment_batches:
    begin, end = date
    with mlflow.start_run():
        # The run name shows the whole window: its begin AND its end.
        mlflow.set_tag('mlflow.runName','Refrence_run {} to {} '.format(str(begin),str(end)))
        # Log parameters
        mlflow.log_param("begin", begin)
        mlflow.log_param("end", end)

        # Get metrics
        current_data=current.loc[begin:end]
        current_x=current_data[numerical_features + categorical_features]
        current_y=current_data[target]
        current_pred = regressor.predict(current_x)

        mae=mean_absolute_error(current_y,current_pred)
        mse=mean_squared_error(current_y,current_pred)
        r2 = r2_score(current_y,current_pred)

        # Metrics are stored rounded to three decimals.
        mlflow.log_metric('MAE', round(mae, 3))
        mlflow.log_metric('MSE', round(mse, 3))
        mlflow.log_metric('R2', round(r2, 3))
