In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import requests
import zipfile
import io
import json
from sklearn import datasets, ensemble, model_selection
from scipy import stats


In [2]:
content = requests.get("https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip").content
with zipfile.ZipFile(io.BytesIO(content)) as arc:
    raw_data = pd.read_csv(arc.open("hour.csv"), header=0, sep=',', parse_dates=['dteday'])

In [3]:
raw_data

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,2012-12-31,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119
17375,17376,2012-12-31,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89
17376,17377,2012-12-31,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90
17377,17378,2012-12-31,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61


In [4]:
raw_data["dteday"]

0       2011-01-01
1       2011-01-01
2       2011-01-01
3       2011-01-01
4       2011-01-01
           ...    
17374   2012-12-31
17375   2012-12-31
17376   2012-12-31
17377   2012-12-31
17378   2012-12-31
Name: dteday, Length: 17379, dtype: datetime64[ns]

In [5]:
# raw_data.index = raw_data.apply(lambda row: datetime.datetime.combine(row.dteday.date(), datetime.time(row.hr)),
#                                 axis=1)

In [6]:
ls

Model Building and Exp - Main Notebook.ipynb
Readme.txt
Scaler_ML_system_design_1.ipynb
day.csv
hour.csv


In [7]:
with open('Readme.txt', 'r') as file:
    # Read the entire contents of the file
    content = file.read()

# Print the content of the file
print(content)

Bike Sharing Dataset

Hadi Fanaee-T

Laboratory of Artificial Intelligence and Decision Support (LIAAD), University of Porto
INESC Porto, Campus da FEUP
Rua Dr. Roberto Frias, 378
4200 - 465 Porto, Portugal


Background 

Bike sharing systems are new generation of traditional bike rentals where whole process from membership, rental and return 
back has become automatic. Through these systems, user is able to easily rent a bike from a particular position and return 
back at another position. Currently, there are about over 500 bike-sharing programs around the world which is composed of 
over 500 thousands bicycles. Today, there exists great interest in these systems due to their important role in traffic, 
environmental and health issues. 

Apart from interesting real world applications of bike sharing systems, the characteristics of data being generated by
these systems make them attractive for the research. Opposed to other transport services such as bus or subway, the duration
of tra

# Loading the files : 

In [8]:
df_day = pd.read_csv("day.csv")

In [9]:
df_hour = pd.read_csv("hour.csv")

In [10]:
display(df_day.head())
print(df_day.shape)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


(731, 16)


In [11]:

display(df_hour.head())
print(df_hour.shape)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


(17379, 17)


In [12]:
df_hour["dteday"].nunique()

731

In [13]:
df_hour.dtypes

instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object

In [14]:
df_hour["dteday"] = pd.to_datetime(df_hour["dteday"])
df_day["dteday"] = pd.to_datetime(df_day["dteday"])

In [15]:
df_day.dteday.max()

Timestamp('2012-12-31 00:00:00')

In [16]:
(df_day.dteday).max(),(df_day.dteday).min(),(df_day.dteday).max() - (df_day.dteday).min()

(Timestamp('2012-12-31 00:00:00'),
 Timestamp('2011-01-01 00:00:00'),
 Timedelta('730 days 00:00:00'))

In [17]:
df_day = df_day.set_index("dteday")
df_hour = df_hour.set_index("dteday")

In [18]:
import mlflow

In [19]:
df_day.columns

Index(['instant', 'season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual',
       'registered', 'cnt'],
      dtype='object')

In [20]:
df_day = df_day.drop(["yr",'instant','casual','registered'],axis=1)
df_hour = df_hour.drop(["yr",'instant','casual','registered'],axis=1)

In [21]:
# temp : Normalized temperature in Celsius. The values are divided to 41 (max)
# atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max)
# hum: Normalized humidity. The values are divided to 100 (max)
# windspeed: Normalized wind speed. The values are divided to 67 (max)

df_day["temp"] = df_day["temp"]*41
df_hour["temp"] = df_hour["temp"]*41

df_day["atemp"] = df_day["atemp"]*50
df_hour["atemp"] = df_hour["atemp"]*50

df_day["hum"] = df_day["hum"]*100
df_hour["hum"] = df_hour["hum"]*100

df_day["windspeed"] = df_day["windspeed"]*67
df_hour["windspeed"] = df_hour["windspeed"]*67



# season : season (1:springer, 2:summer, 3:fall, 4:winter)


df_day["season"] = df_day["season"].replace({1:"springer", 
                          2:"summer", 
                          3:"fall", 
                          4:"winter"})

df_hour["season"] = df_hour["season"].replace({1:"springer", 
                          2:"summer", 
                          3:"fall", 
                          4:"winter"})


# weathersit : 
# 		- 1: Clear, Few clouds, Partly cloudy, Partly cloudy
# 		- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
# 		- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
# 		- 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog




In [22]:
df_hour

Unnamed: 0_level_0,season,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
dteday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2011-01-01,springer,1,0,0,6,0,1,9.84,14.395,81.0,0.0000,16
2011-01-01,springer,1,1,0,6,0,1,9.02,13.635,80.0,0.0000,40
2011-01-01,springer,1,2,0,6,0,1,9.02,13.635,80.0,0.0000,32
2011-01-01,springer,1,3,0,6,0,1,9.84,14.395,75.0,0.0000,13
2011-01-01,springer,1,4,0,6,0,1,9.84,14.395,75.0,0.0000,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31,springer,12,19,0,1,1,2,10.66,12.880,60.0,11.0014,119
2012-12-31,springer,12,20,0,1,1,2,10.66,12.880,60.0,11.0014,89
2012-12-31,springer,12,21,0,1,1,1,10.66,12.880,60.0,11.0014,90
2012-12-31,springer,12,22,0,1,1,1,10.66,13.635,56.0,8.9981,61


In [23]:
df_hour.groupby("season")["cnt"].mean().sort_values()
# season : season (1:springer, 2:summer, 3:fall, 4:winter)


mean_target = df_hour.groupby("season")["cnt"].mean()

# Map the mean target values to the "season" column
df_hour["season"] = df_hour["season"].map(mean_target)

# distribution tests : 

## ks-test for numerical columns : (checking distributions)

In [24]:
from scipy import stats
import random

alpha = 0.05
rejected = 0  # Null Hypothesis 

numerical_features = ['temp', 'atemp', 'hum', 'windspeed', 'mnth', 'hr', 'weekday']
categorical_features = ['season', 'holiday', 'workingday','weathersit']

reference = df_hour.loc['2011-01-01 00:00:00':'2011-01-28 23:00:00']
current = df_hour.loc['2011-01-29 00:00:00':'2011-02-28 23:00:00']


In [25]:
reference.shape,current.shape

((618, 12), (719, 12))

In [26]:
from scipy import stats

# ANSI escape codes for colors
RED = '\033[91m'
GREEN = '\033[92m'
RESET = '\033[0m'  # Reset color to default

for col in numerical_features:
    test = stats.ks_2samp(
                            reference[col],
                            current[col]
                        )
    
    print()
    print()
#     print(f"-------{col}-----")
    
#     print(test)
    
    if test[1] < alpha:
        print(f"{RED}{col} rejected{RESET}", ":", test[1].round(3))
    else:
        print(f"{GREEN}{col} accepted{RESET}", ":", test[1].round(3))




[91mtemp rejected[0m : 0.0


[91matemp rejected[0m : 0.0


[91mhum rejected[0m : 0.001


[91mwindspeed rejected[0m : 0.041


[91mmnth rejected[0m : 0.0


[92mhr accepted[0m : 1.0


[92mweekday accepted[0m : 0.993


## Chisquare test of goodness of fit : 

In [27]:
from scipy import stats

# ANSI escape codes for colors
RED = '\033[91m'
GREEN = '\033[92m'
RESET = '\033[0m'  # Reset color to default

for col in categorical_features:
    test = stats.chi2_contingency(
                            reference[col].value_counts(),
                            current[col].value_counts()
                        )[1]
    
    print()
    print()
#     print(f"-*-------{col}-----")
    
#     print(test)
    
    if test < alpha:
        print(f"{RED}{col} rejected{RESET}", ":", test)
    else:
        print(f"{GREEN}{col} accepted{RESET}", ":", test)




[92mseason accepted[0m : 1.0


[92mholiday accepted[0m : 1.0


[92mworkingday accepted[0m : 1.0


[92mweathersit accepted[0m : 1.0


In [28]:
## Modeling

target = 'cnt'
prediction = 'prediction'
numerical_features = ['temp', 'atemp', 'hum', 'windspeed', 'mnth', 'hr', 'weekday']
categorical_features = ['season', 'holiday', 'workingday', 'weathersit']


reference = df_hour.loc['2011-01-01 00:00:00':'2011-01-28 23:00:00']
current = df_hour.loc['2011-01-29 00:00:00':'2011-02-28 23:00:00']

reference.shape,current.shape

((618, 12), (719, 12))

In [29]:

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    reference[numerical_features + categorical_features],
    reference[target],
    test_size=0.2
)

regressor = ensemble.RandomForestRegressor(
                                           random_state = 0,
                                           ccp_alpha=0.01,
                                           n_jobs=-1
                                        )

regressor.fit(X_train, 
              y_train)

preds_test = regressor.predict(X_test)

## validation
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

ref_mae=mean_absolute_error(y_test,preds_test)
ref_mse=mean_squared_error(y_test,preds_test)
ref_r2 = r2_score(y_test,preds_test)

print("MAE",ref_mae)
print("MSE",ref_mse)
print("R2",ref_r2)


MAE 10.16963488499469
MSE 202.76118803579868
R2 0.9157432231892222


In [30]:
### February data metrics

current_x=current[numerical_features + categorical_features]
current_y=current[target]

current_pred = regressor.predict(current_x)
print("MAE",mean_absolute_error(current_y,current_pred))
print("MSE",mean_squared_error(current_y,current_pred))

r2 = r2_score(current_y,current_pred)
print("R2",r2)

MAE 19.410829150492795
MSE 996.9796323770443
R2 0.7421061384235905


In [31]:
import mlflow
from mlflow.tracking import MlflowClient
import os

In [32]:


experiment_name = "Bicycle–Sharing"

client = MlflowClient()
experiment = client.get_experiment_by_name(experiment_name)

if experiment is None:
    mlflow.create_experiment(experiment_name)

# Set the experiment (it will either be the existing one or the newly created one)
mlflow.set_experiment(experiment_name)


<Experiment: artifact_location='file:///Users/sunnypanchal/Desktop/Neovarsity%20DSML%20research%20papers/SCALER/BusinessCase_Data_Exploration-/Bike-Sharing-Dataset/mlruns/897812197077923964', creation_time=1715447823280, experiment_id='897812197077923964', last_update_time=1715447823280, lifecycle_stage='active', name='Bicycle–Sharing', tags={}>

In [33]:
with mlflow.start_run():
    mlflow.set_tag('mlflow.runName','Refrence_run')
    mlflow.log_metric("MAE",ref_mae)
    mlflow.log_metric("MSE",ref_mse)
    mlflow.log_metric("R2",ref_r2)
    mlflow.sklearn.log_model(regressor, "model")

In [34]:
experiment_batches = [
    ('2011-01-29 00:00:00','2011-02-07 23:00:00'),
    ('2011-02-07 00:00:00','2011-02-14 23:00:00'),
    ('2011-02-15 00:00:00','2011-02-21 23:00:00'),
]

In [35]:
for date in experiment_batches:
    with mlflow.start_run():
        mlflow.set_tag('mlflow.runName','Refrence_run {} to {} '.format(str(date[0]),str(date[0])))
        mlflow.log_param("begin", date[0])
        mlflow.log_param("end", date[1])

        # Get metrics
        current_data=current.loc[date[0]:date[1]]
        current_x=current_data[numerical_features + categorical_features]
        current_y=current_data[target]
        current_pred = regressor.predict(current_x)

        mae=mean_absolute_error(current_y,current_pred)
        mse=mean_squared_error(current_y,current_pred)
        r2 = r2_score(current_y,current_pred)
        
        mlflow.log_metric('MAE', round(mae, 3))
        mlflow.log_metric('MSE', round(mse, 3))
        mlflow.log_metric('R2', round(r2, 3))
