In [247]:
import streamlit as st
import plotly.graph_objects as go
import plotly.express as px
import scipy.stats as stats
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
#import h2o
#from h2o.automl import H2OAutoML
warnings.filterwarnings('ignore')
import base64
import pickle


In [248]:
df= pd.read_csv("./../data/day.csv")
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [249]:
df.shape

(731, 16)

In [250]:
df.dtypes

instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object

In [251]:
df.rename(columns={'instant':'rec_id','dteday':'datetime','yr':'year','mnth':'month','weathersit':'weather_condition',
                       'hum':'humidity','cnt':'total_count'},inplace=True)

In [252]:
df.head()

Unnamed: 0,rec_id,datetime,season,year,month,holiday,weekday,workingday,weather_condition,temp,atemp,humidity,windspeed,casual,registered,total_count
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [253]:
df['datetime']=pd.to_datetime(df.datetime)
df['season']=df.season.astype('category')
df['year']=df.year.astype('category')
df['month']=df.month.astype('category')
df['holiday']=df.holiday.astype('category')
df['weekday']=df.weekday.astype('category')
df['workingday']=df.workingday.astype('category')
df['weather_condition']=df.weather_condition.astype('category')

In [254]:
df.describe()

Unnamed: 0,rec_id,datetime,temp,atemp,humidity,windspeed,casual,registered,total_count
count,731.0,731,731.0,731.0,731.0,731.0,731.0,731.0,731.0
mean,366.0,2012-01-01 00:00:00,0.495385,0.474354,0.627894,0.190486,848.176471,3656.172367,4504.348837
min,1.0,2011-01-01 00:00:00,0.05913,0.07907,0.0,0.022392,2.0,20.0,22.0
25%,183.5,2011-07-02 12:00:00,0.337083,0.337842,0.52,0.13495,315.5,2497.0,3152.0
50%,366.0,2012-01-01 00:00:00,0.498333,0.486733,0.626667,0.180975,713.0,3662.0,4548.0
75%,548.5,2012-07-01 12:00:00,0.655417,0.608602,0.730209,0.233214,1096.0,4776.5,5956.0
max,731.0,2012-12-31 00:00:00,0.861667,0.840896,0.9725,0.507463,3410.0,6946.0,8714.0
std,211.165812,,0.183051,0.162961,0.142429,0.077498,686.622488,1560.256377,1937.211452


In [255]:
df.isnull().sum()

rec_id               0
datetime             0
season               0
year                 0
month                0
holiday              0
weekday              0
workingday           0
weather_condition    0
temp                 0
atemp                0
humidity             0
windspeed            0
casual               0
registered           0
total_count          0
dtype: int64

In [256]:
fig = px.bar(df, x='month', y='total_count', color='season', barmode='group', 
             title='Season', 
             labels={'total_count': '', 'month': 'Month', 'season': 'Season'}, 
             color_discrete_sequence=px.colors.qualitative.Bold
             
            
            )

fig.update_layout(title_font_size=20, title_font_family='Arial')
fig.show()

In [257]:
colors = px.colors.qualitative.Set1[:len(df['weekday'].unique())]

fig = px.bar(df, x='month', y='total_count', color='weekday', barmode='group', 
             title='Weekday', 
             labels={'total_count': 'Total Count', 'month': 'Month', 'weekday': 'Weekday'}, 
             color_discrete_sequence=colors)

fig.update_layout(title_font_size=20, title_font_family='Arial')
fig.show()

In [258]:
fig = px.bar(df, x='workingday', y='total_count', color='season', 
             title='Workingday', 
             labels={'total_count': '', 'workingday': 'Workingday', 'season': 'Season'}, 
             color_discrete_sequence=px.colors.qualitative.Bold)

fig.update_layout(title_font_size=20, title_font_family='Arial')
fig.show()

In [259]:
fig = px.violin(df, x='holiday', y='total_count', color='season', 
                title='Holiday wise distribution of counts', 
                labels={'total_count': 'Total Count', 'holiday': 'Holiday', 'season': 'Season'},
                color_discrete_sequence=px.colors.qualitative.Bold)

# Show the plot
fig.show()

In [260]:
colors = px.colors.qualitative.Set1[:len(df['weather_condition'].unique())]

fig = px.bar(df, x='weather_condition', y='total_count', color='weather_condition',
             color_discrete_sequence=colors,
             title='Weather condition wise monthly distribution of counts', 
             labels={'total_count': 'Total Count', 'weather_condition': 'Weather Condition'}
            )

fig.update_layout(title_font_size=20, title_font_family='Arial')
fig.show()

In [261]:
fig = px.box(df, y='total_count', 
             title='total_count outliers', 
             labels={'total_count': 'Total Count'}, 
            color_discrete_sequence=px.colors.qualitative.Bold,)

fig.update_layout(title_font_size=20, title_font_family='Arial')
fig.show()

In [262]:
colors = px.colors.qualitative.Set1[:len(df[['temp', 'windspeed', 'humidity']].columns)]

fig = px.box(df[['temp', 'windspeed', 'humidity']], 
             title='Temp, Windspeed, Humidity Outliers', 
             labels={'value': 'Value', 'variable': 'Variable'}, 
             color='variable', color_discrete_sequence=colors)

fig.update_layout(title_font_size=20, title_font_family='Arial')
fig.show()

### Plotting Q-Q Plot

In [263]:
slope, intercept, r_value, p_value, std_err = stats.linregress(qq_plot[0][0], qq_plot[0][1])
x_values = np.array([min(qq_plot[0][0]), max(qq_plot[0][0])])
y_values = slope * x_values + intercept

fig = sp.make_subplots(rows=1, cols=2, subplot_titles=("Histogram of total_count", "Q-Q Plot of total_count"))

histogram_trace = go.Histogram(x=df['total_count'], marker=dict(color='skyblue'))
fig.add_trace(histogram_trace, row=1, col=1)

qq_plot_trace = go.Scatter(x=qq_plot[0][0], y=qq_plot[0][1], mode='markers', marker=dict(color='salmon'))
fig.add_trace(qq_plot_trace, row=1, col=2)

correlation_line_trace = go.Scatter(x=x_values, y=y_values, mode='lines', 
                                    line=dict(color='green', width=2), 
                                    name=f'Correlation Line (r={r_value:.2f})')
fig.add_trace(correlation_line_trace, row=1, col=2)
fig.update_layout(title="Histogram and Q-Q Plot of total_count", height=600, width=1200)
fig.show()

In [264]:
correMtr = df[["temp", "atemp", "humidity", "windspeed", "casual", "registered", "total_count"]].corr()

fig = go.Figure(data=go.Heatmap(
    z=correMtr.values,
    x=correMtr.columns,
    y=correMtr.index,
    colorscale='RdBu',
    colorbar=dict(title='Correlation'),
    hoverongaps=False))

fig.update_layout(title='Correlation  of attributes',
                  xaxis=dict(title=''),
                  yaxis=dict(title=''))

fig.show()

In [265]:
cat_attributes=['season','holiday','workingday','weather_condition','year']
num_attributes=['temp','windspeed','humidity','month','weekday']

In [266]:
df =pd.get_dummies(df,columns=cat_attributes)
df.dtypes

rec_id                          int64
datetime               datetime64[ns]
month                        category
weekday                      category
temp                          float64
atemp                         float64
humidity                      float64
windspeed                     float64
casual                          int64
registered                      int64
total_count                     int64
season_1                         bool
season_2                         bool
season_3                         bool
season_4                         bool
holiday_0                        bool
holiday_1                        bool
workingday_0                     bool
workingday_1                     bool
weather_condition_1              bool
weather_condition_2              bool
weather_condition_3              bool
year_0                           bool
year_1                           bool
dtype: object

In [267]:
df=df.drop('datetime',axis=1)
df=df.drop('rec_id',axis=1)

In [268]:
df

Unnamed: 0,month,weekday,temp,atemp,humidity,windspeed,casual,registered,total_count,season_1,...,season_4,holiday_0,holiday_1,workingday_0,workingday_1,weather_condition_1,weather_condition_2,weather_condition_3,year_0,year_1
0,1,6,0.344167,0.363625,0.805833,0.160446,331,654,985,True,...,False,True,False,True,False,False,True,False,True,False
1,1,0,0.363478,0.353739,0.696087,0.248539,131,670,801,True,...,False,True,False,True,False,False,True,False,True,False
2,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349,True,...,False,True,False,False,True,True,False,False,True,False
3,1,2,0.200000,0.212122,0.590435,0.160296,108,1454,1562,True,...,False,True,False,False,True,True,False,False,True,False
4,1,3,0.226957,0.229270,0.436957,0.186900,82,1518,1600,True,...,False,True,False,False,True,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,12,4,0.254167,0.226642,0.652917,0.350133,247,1867,2114,True,...,False,True,False,False,True,False,True,False,False,True
727,12,5,0.253333,0.255046,0.590000,0.155471,644,2451,3095,True,...,False,True,False,False,True,False,True,False,False,True
728,12,6,0.253333,0.242400,0.752917,0.124383,159,1182,1341,True,...,False,True,False,True,False,False,True,False,False,True
729,12,0,0.255833,0.231700,0.483333,0.350754,364,1432,1796,True,...,False,True,False,True,False,True,False,False,False,True


In [270]:
x= df.iloc[:,0:-1] 

y= df.iloc[:,-14] 
print(y)


0       985
1       801
2      1349
3      1562
4      1600
       ... 
726    2114
727    3095
728    1341
729    1796
730    2729
Name: total_count, Length: 731, dtype: int64


# Linear Model

In [271]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3, random_state=42)

In [272]:
model= LinearRegression()

In [273]:
model.fit(x_train,y_train)

In [274]:
lr=model.score(x_train,y_train)
print('Accuracy of the model :',lr)
print('Model coefficients :',model.coef_)
print('Model intercept value :',model.intercept_)

Accuracy of the model : 1.0
Model coefficients : [ 1.35466831e-13 -3.62766481e-13 -4.59344287e-12  1.98308294e-12
 -1.10548380e-12  2.15262976e-12  3.33333333e-01  3.33333333e-01
  6.66666667e-01 -9.86854880e-14 -1.61690415e-14  7.09438522e-14
  4.39107154e-14  1.57326148e-15 -1.57327543e-15  1.02910377e-14
 -1.02912845e-14  1.06225325e-14 -6.98331660e-14  5.92107652e-14
 -3.38238992e-14]
Model intercept value : 1.8189894035458565e-12


In [249]:
import math
#Root mean square error 
rmse=math.sqrt(metrics.mean_squared_error(y_test,pred))
#Mean absolute error
mae=metrics.mean_absolute_error(y_test,pred)
print('Root mean square error :',rmse)
print('Mean absolute error :',mae)

Root mean square error : 4814.4357371503875
Mean absolute error : 4381.904795454545


# Random Forest Regressor

In [250]:
model=RandomForestRegressor(n_estimators=200)
model.fit(x_train,y_train)
model_score =model.score(x_train,y_train)
pred=model.predict(x_test)
print('Accuracy of the model :',model_score)

Accuracy of the model : 0.9999708781820529


In [253]:
with open('./../data/model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [252]:
rmse = math.sqrt(metrics.mean_squared_error(y_test,pred))
print('Root mean square error :',rmse)
mae=metrics.mean_absolute_error(y_test,pred)
print('Mean absolute error :',mae)

Root mean square error : 37.656745692324016
Mean absolute error : 13.49436363636364


# H20

In [230]:
import h2o
from h2o.automl import H2OAutoML
h2o.init(max_mem_size='6G')  

Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,4 hours 40 mins
H2O_cluster_timezone:,Europe/Paris
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,3 months and 28 days
H2O_cluster_name:,H2O_from_python_user_pozv4z
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,5.646 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [143]:
df = h2o.import_file("./../data/day.csv")
df_train,df_test= df.split_frame(ratios=[.8])

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [144]:
y = "cnt" 
x = df.columns 
x.remove(y)
x.remove('casual')
x.remove('registered')

In [145]:
aml = H2OAutoML(max_runtime_secs=300,max_models = 10, seed = 10, verbosity="info", nfolds=2)

In [146]:
aml.train(x=x,y=y, training_frame=df_train)

AutoML progress: |█
19:14:05.200: Project: AutoML_2_20240417_191405
19:14:05.201: Setting stopping tolerance adaptively based on the training frame: 0.04170288281141495
19:14:05.201: Build control seed: 10
19:14:05.202: training frame: Frame key: AutoML_2_20240417_191405_training_py_2_sid_9ce9    cols: 16    rows: 575  chunks: 1    size: 22048  checksum: -7949923955752313434
19:14:05.202: validation frame: NULL
19:14:05.202: leaderboard frame: NULL
19:14:05.202: blending frame: NULL
19:14:05.202: response column: cnt
19:14:05.202: fold column: null
19:14:05.202: weights column: null
19:14:05.206: AutoML: XGBoost is not available; skipping it.
19:14:05.206: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (7g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), grid_1 (4g, 60w), lr_annealing (7g, 10w)]},

key,value
Stacking strategy,cross_validation
Number of base models (used / total),3/5
# GBM base models (used / total),1/1
# DRF base models (used / total),1/2
# DeepLearning base models (used / total),1/1
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,2
Metalearner fold_column,

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid
mae,440.72488,1.3424302,441.6741,439.77563
mean_residual_deviance,408226.22,3523.55,410717.75,405734.7
mse,408226.22,3523.55,410717.75,405734.7
null_deviance,1071459600.0,117866770.0,1154804000.0,988115200.0
r2,0.8898104,0.0103746,0.8971463,0.8824745
residual_deviance,117368776.0,1878999.2,118697420.0,116040120.0
rmse,638.92285,2.7574143,640.8726,636.9731
rmsle,0.2993137,0.1067211,0.3747769,0.2238505


In [None]:
lb = aml.leaderboard

In [None]:
lb
print(lb)

In [147]:
model_ids = list(aml.leaderboard['model_id'].as_data_frame().iloc[:,0])


converting H2O frame to pandas dataframe using single-thread.  For faster conversion using multi-thread, install datatable (for Python 3.9 or lower), or polars and pyarrow (for Python 3.10 or above).



In [148]:
model_ids

['StackedEnsemble_BestOfFamily_1_AutoML_2_20240417_191405',
 'StackedEnsemble_AllModels_1_AutoML_2_20240417_191405',
 'GBM_2_AutoML_2_20240417_191405',
 'GBM_4_AutoML_2_20240417_191405',
 'GBM_3_AutoML_2_20240417_191405',
 'GBM_5_AutoML_2_20240417_191405',
 'DRF_1_AutoML_2_20240417_191405',
 'XRT_1_AutoML_2_20240417_191405',
 'GBM_grid_1_AutoML_2_20240417_191405_model_1',
 'DeepLearning_1_AutoML_2_20240417_191405',
 'GBM_1_AutoML_2_20240417_191405',
 'GLM_1_AutoML_2_20240417_191405']

In [149]:
aml.leader.model_performance(df_test)

In [150]:
h2o.get_model([mid for mid in model_ids if "StackedEnsemble" in mid][0])

key,value
Stacking strategy,cross_validation
Number of base models (used / total),3/5
# GBM base models (used / total),1/1
# DRF base models (used / total),1/2
# DeepLearning base models (used / total),1/1
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,2
Metalearner fold_column,

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid
mae,440.72488,1.3424302,441.6741,439.77563
mean_residual_deviance,408226.22,3523.55,410717.75,405734.7
mse,408226.22,3523.55,410717.75,405734.7
null_deviance,1071459600.0,117866770.0,1154804000.0,988115200.0
r2,0.8898104,0.0103746,0.8971463,0.8824745
residual_deviance,117368776.0,1878999.2,118697420.0,116040120.0
rmse,638.92285,2.7574143,640.8726,636.9731
rmsle,0.2993137,0.1067211,0.3747769,0.2238505


In [151]:
output= h2o.get_model([mid for mid in model_ids if "StackedEnsemble" in mid][0])
output.params

{'model_id': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'ModelKeyV3',
    'schema_type': 'Key<Model>'},
   'name': 'StackedEnsemble_BestOfFamily_1_AutoML_2_20240417_191405',
   'type': 'Key<Model>',
   'URL': '/3/Models/StackedEnsemble_BestOfFamily_1_AutoML_2_20240417_191405'},
  'input': None},
 'training_frame': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type': 'Key<Frame>'},
   'name': 'AutoML_2_20240417_191405_training_py_2_sid_9ce9',
   'type': 'Key<Frame>',
   'URL': '/3/Frames/AutoML_2_20240417_191405_training_py_2_sid_9ce9'},
  'input': {'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type': 'Key<Frame>'},
   'name': 'AutoML_2_20240417_191405_training_py_2_sid_9ce9',
   'type': 'Key<Frame>',
   'URL': '/3/Frames/AutoML_2_20240417_191405_training_py_2_sid_9ce9'}},
 'response_column': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
  

In [152]:
aml.leader

key,value
Stacking strategy,cross_validation
Number of base models (used / total),3/5
# GBM base models (used / total),1/1
# DRF base models (used / total),1/2
# DeepLearning base models (used / total),1/1
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,2
Metalearner fold_column,

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid
mae,440.72488,1.3424302,441.6741,439.77563
mean_residual_deviance,408226.22,3523.55,410717.75,405734.7
mse,408226.22,3523.55,410717.75,405734.7
null_deviance,1071459600.0,117866770.0,1154804000.0,988115200.0
r2,0.8898104,0.0103746,0.8971463,0.8824745
residual_deviance,117368776.0,1878999.2,118697420.0,116040120.0
rmse,638.92285,2.7574143,640.8726,636.9731
rmsle,0.2993137,0.1067211,0.3747769,0.2238505


In [153]:
y_pred=aml.leader.predict(df_test)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


In [154]:
print(y_pred)

  predict
  1328.18
  1481.31
  1177.38
  1517.31
  1022.46
  1046.97
  1207.97
  1360.84
  1418.7
  1758.38
[156 rows x 1 column]
