In [181]:
import streamlit as st
import plotly.graph_objects as go
import plotly.express as px
import scipy.stats as stats
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#import h2o
#from h2o.automl import H2OAutoML
warnings.filterwarnings('ignore')
import base64
import pickle


In [182]:
df= pd.read_csv("./../data/day.csv")
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [183]:
df.shape

(731, 16)

In [184]:
df.dtypes

instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object

In [185]:
df.rename(columns={'instant':'rec_id','dteday':'datetime','yr':'year','mnth':'month','weathersit':'weather_condition',
                       'hum':'humidity','cnt':'total_count'},inplace=True)

In [186]:
df.head()

Unnamed: 0,rec_id,datetime,season,year,month,holiday,weekday,workingday,weather_condition,temp,atemp,humidity,windspeed,casual,registered,total_count
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [187]:
df['datetime']=pd.to_datetime(df.datetime)
df['season']=df.season.astype('category')
df['year']=df.year.astype('category')
df['month']=df.month.astype('category')
df['holiday']=df.holiday.astype('category')
df['weekday']=df.weekday.astype('category')
df['workingday']=df.workingday.astype('category')
df['weather_condition']=df.weather_condition.astype('category')

In [188]:
df.describe()

Unnamed: 0,rec_id,datetime,temp,atemp,humidity,windspeed,casual,registered,total_count
count,731.0,731,731.0,731.0,731.0,731.0,731.0,731.0,731.0
mean,366.0,2012-01-01 00:00:00,0.495385,0.474354,0.627894,0.190486,848.176471,3656.172367,4504.348837
min,1.0,2011-01-01 00:00:00,0.05913,0.07907,0.0,0.022392,2.0,20.0,22.0
25%,183.5,2011-07-02 12:00:00,0.337083,0.337842,0.52,0.13495,315.5,2497.0,3152.0
50%,366.0,2012-01-01 00:00:00,0.498333,0.486733,0.626667,0.180975,713.0,3662.0,4548.0
75%,548.5,2012-07-01 12:00:00,0.655417,0.608602,0.730209,0.233214,1096.0,4776.5,5956.0
max,731.0,2012-12-31 00:00:00,0.861667,0.840896,0.9725,0.507463,3410.0,6946.0,8714.0
std,211.165812,,0.183051,0.162961,0.142429,0.077498,686.622488,1560.256377,1937.211452


In [189]:
df.isnull().sum()

rec_id               0
datetime             0
season               0
year                 0
month                0
holiday              0
weekday              0
workingday           0
weather_condition    0
temp                 0
atemp                0
humidity             0
windspeed            0
casual               0
registered           0
total_count          0
dtype: int64

In [190]:
fig = px.bar(df, x='month', y='total_count', color='season', barmode='group', 
             title='Season', 
             labels={'total_count': '', 'month': 'Month', 'season': 'Season'}, 
             color_discrete_sequence=px.colors.qualitative.Bold
             
            
            )

fig.update_layout(title_font_size=20, title_font_family='Arial')
fig.show()

In [191]:
colors = px.colors.qualitative.Set1[:len(df['weekday'].unique())]

fig = px.bar(df, x='month', y='total_count', color='weekday', barmode='group', 
             title='Weekday', 
             labels={'total_count': 'Total Count', 'month': 'Month', 'weekday': 'Weekday'}, 
             color_discrete_sequence=colors)

fig.update_layout(title_font_size=20, title_font_family='Arial')
fig.show()

In [192]:
fig = px.bar(df, x='workingday', y='total_count', color='season', 
             title='Workingday', 
             labels={'total_count': '', 'workingday': 'Workingday', 'season': 'Season'}, 
             color_discrete_sequence=px.colors.qualitative.Bold)

fig.update_layout(title_font_size=20, title_font_family='Arial')
fig.show()

In [193]:
fig = px.violin(df, x='holiday', y='total_count', color='season', 
                title='Holiday wise distribution of counts', 
                labels={'total_count': 'Total Count', 'holiday': 'Holiday', 'season': 'Season'},
                color_discrete_sequence=px.colors.qualitative.Bold)

# Show the plot
fig.show()

In [194]:
colors = px.colors.qualitative.Set1[:len(df['weather_condition'].unique())]

fig = px.bar(df, x='weather_condition', y='total_count', color='weather_condition',
             color_discrete_sequence=colors,
             title='Weather condition wise monthly distribution of counts', 
             labels={'total_count': 'Total Count', 'weather_condition': 'Weather Condition'}
            )

fig.update_layout(title_font_size=20, title_font_family='Arial')
fig.show()

In [195]:
fig = px.box(df, y='total_count', 
             title='total_count outliers', 
             labels={'total_count': 'Total Count'}, 
            color_discrete_sequence=px.colors.qualitative.Bold,)

fig.update_layout(title_font_size=20, title_font_family='Arial')
fig.show()

In [196]:
colors = px.colors.qualitative.Set1[:len(df[['temp', 'windspeed', 'humidity']].columns)]

fig = px.box(df[['temp', 'windspeed', 'humidity']], 
             title='Temp, Windspeed, Humidity Outliers', 
             labels={'value': 'Value', 'variable': 'Variable'}, 
             color='variable', color_discrete_sequence=colors)

fig.update_layout(title_font_size=20, title_font_family='Arial')
fig.show()

### Plotting Q-Q Plot

In [197]:
slope, intercept, r_value, p_value, std_err = stats.linregress(qq_plot[0][0], qq_plot[0][1])
x_values = np.array([min(qq_plot[0][0]), max(qq_plot[0][0])])
y_values = slope * x_values + intercept

fig = sp.make_subplots(rows=1, cols=2, subplot_titles=("Histogram of total_count", "Q-Q Plot of total_count"))

histogram_trace = go.Histogram(x=df['total_count'], marker=dict(color='skyblue'))
fig.add_trace(histogram_trace, row=1, col=1)

qq_plot_trace = go.Scatter(x=qq_plot[0][0], y=qq_plot[0][1], mode='markers', marker=dict(color='salmon'))
fig.add_trace(qq_plot_trace, row=1, col=2)

correlation_line_trace = go.Scatter(x=x_values, y=y_values, mode='lines', 
                                    line=dict(color='green', width=2), 
                                    name=f'Correlation Line (r={r_value:.2f})')
fig.add_trace(correlation_line_trace, row=1, col=2)
fig.update_layout(title="Histogram and Q-Q Plot of total_count", height=600, width=1200)
fig.show()

NameError: name 'qq_plot' is not defined

In [198]:
correMtr = df[["temp", "atemp", "humidity", "windspeed", "casual", "registered", "total_count"]].corr()

fig = go.Figure(data=go.Heatmap(
    z=correMtr.values,
    x=correMtr.columns,
    y=correMtr.index,
    colorscale='RdBu',
    colorbar=dict(title='Correlation'),
    hoverongaps=False))

fig.update_layout(title='Correlation  of attributes',
                  xaxis=dict(title=''),
                  yaxis=dict(title=''))

fig.show()

In [199]:
cat_attributes=['season','holiday','workingday','weather_condition','year',]
num_attributes=['temp','windspeed','humidity','month','weekday','casual','registered']

In [200]:
df =pd.get_dummies(df,columns=cat_attributes)
df.dtypes

rec_id                          int64
datetime               datetime64[ns]
month                        category
weekday                      category
temp                          float64
atemp                         float64
humidity                      float64
windspeed                     float64
casual                          int64
registered                      int64
total_count                     int64
season_1                         bool
season_2                         bool
season_3                         bool
season_4                         bool
holiday_0                        bool
holiday_1                        bool
workingday_0                     bool
workingday_1                     bool
weather_condition_1              bool
weather_condition_2              bool
weather_condition_3              bool
year_0                           bool
year_1                           bool
dtype: object

In [201]:


df.drop(columns=['year_0', 'datetime', 'year_1','rec_id','casual','registered'], inplace=True)

In [202]:
df

Unnamed: 0,month,weekday,temp,atemp,humidity,windspeed,total_count,season_1,season_2,season_3,season_4,holiday_0,holiday_1,workingday_0,workingday_1,weather_condition_1,weather_condition_2,weather_condition_3
0,1,6,0.344167,0.363625,0.805833,0.160446,985,True,False,False,False,True,False,True,False,False,True,False
1,1,0,0.363478,0.353739,0.696087,0.248539,801,True,False,False,False,True,False,True,False,False,True,False
2,1,1,0.196364,0.189405,0.437273,0.248309,1349,True,False,False,False,True,False,False,True,True,False,False
3,1,2,0.200000,0.212122,0.590435,0.160296,1562,True,False,False,False,True,False,False,True,True,False,False
4,1,3,0.226957,0.229270,0.436957,0.186900,1600,True,False,False,False,True,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,12,4,0.254167,0.226642,0.652917,0.350133,2114,True,False,False,False,True,False,False,True,False,True,False
727,12,5,0.253333,0.255046,0.590000,0.155471,3095,True,False,False,False,True,False,False,True,False,True,False
728,12,6,0.253333,0.242400,0.752917,0.124383,1341,True,False,False,False,True,False,True,False,False,True,False
729,12,0,0.255833,0.231700,0.483333,0.350754,1796,True,False,False,False,True,False,True,False,True,False,False


In [203]:

X = df.drop(columns=['total_count'])
y = df['total_count']

# Linear Model

In [204]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3, random_state=42)

<IPython.core.display.Javascript object>

TypeError: Expected sequence or array-like, got <class 'NoneType'>

In [205]:
model= LinearRegression()

<IPython.core.display.Javascript object>

In [206]:
model.fit(x_train,y_train)

In [207]:
lr=model.score(x_train,y_train)
print('Accuracy of the model :',lr)
print('Model coefficients :',model.coef_)
print('Model intercept value :',model.intercept_)

Accuracy of the model : 0.8168856718870386
Model coefficients : [  -25.70728219    69.2743212   2364.09928112  3216.21303342
  -991.55313076 -2369.19150262  -959.00545157   226.04568759
   -54.07118179   787.03094577   220.75533267  -220.75533267
   -77.05846074    77.05846074   776.69765227   263.49319497
 -1040.19084724  -990.23530671   990.23530671]
Model intercept value : 2044.8345088579717


# Random Forest Regressor

In [208]:
model=RandomForestRegressor(n_estimators=200)
model.fit(x_train,y_train)
model_score =model.score(x_train,y_train)
pred=model.predict(x_test)
print('Accuracy of the model :',model_score)
print(x_test.iloc[0])

Accuracy of the model : 0.9806398497562846
month                        12
weekday                       2
temp                   0.475833
atemp                  0.469054
humidity                0.73375
windspeed              0.174129
season_1                  False
season_2                  False
season_3                  False
season_4                   True
holiday_0                  True
holiday_1                 False
workingday_0              False
workingday_1               True
weather_condition_1        True
weather_condition_2       False
weather_condition_3       False
year_0                    False
year_1                     True
Name: 703, dtype: object


In [209]:
with open('./../data/model.pkl', 'wb') as file:
    pickle.dump(model, file)

