## Seoul bike sharing demand prediction

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt


In [None]:
# Load the Seoul bike-sharing CSV; the path is relative to the notebook's working directory.
csv_path = "seoul_projects/seoulbikeencoded.csv"
df = pd.read_csv(csv_path)
# Show only the first record as a quick sanity check of the parsed columns.
df.head(n=1)

In [None]:
# Drop the unit suffixes from the measurement columns so they are easier to reference.
column_aliases = {
    'Temperature(°C)': 'Temperature',
    'Humidity(%)': 'Humidity',
    'Wind speed (m/s)': 'Wind speed',
    'Visibility (10m)': 'Visibility',
    'Dew point temperature(°C)': 'Dew point temperature',
    'Solar Radiation (MJ/m2)': 'Solar Radiation',
    'Rainfall(mm)': 'Rainfall',
    'Snowfall (cm)': 'Snowfall',
}
df = df.rename(columns=column_aliases)

In [None]:
# Preview the first five rows with the shortened column names.
df.head(n=5)

In [None]:
# List the column labels currently present in the frame.
df.columns

In [None]:
# Print column dtypes and non-null counts before any dtype conversion.
df.info()

In [None]:
# Cast the rented-bike count to float and Hour to object; as an object column,
# Hour is skipped by the float/int dtype selections used in the plots below and
# is one-hot encoded later by get_dummies.
dtype_overrides = {'Rented Bike Count': 'float', 'Hour': 'object'}
df = df.astype(dtype_overrides)

In [None]:
# Re-check the dtypes after the astype conversion.
df.info()

In [None]:
# Summary statistics of the numeric columns, shaded with a colour gradient for quick scanning.
summary_stats = df.describe()
summary_stats.style.background_gradient()

In [None]:
# Number of distinct values per column, smallest first (sort_values is ascending by default).
unique_counts = df.nunique()
unique_counts.sort_values()

In [None]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

In [None]:
import missingno as msno

In [None]:
# Nullity matrix: shows where values are missing in each column.
# `labels` is a boolean switch for drawing the column names, so pass True
# rather than a list that merely happens to be truthy.
msno.matrix(df, labels=True, figsize=(30,16), fontsize=12)

In [None]:
import seaborn as sns

## Bar plot

In [None]:

# One bar chart per numeric column, with Hour on the x axis, laid out on a 4x3 grid.
numeric_cols = df.select_dtypes(include=['float64','int']).columns
# rc_context keeps the black axes background local to this figure instead of
# leaving it set in the global rcParams for every later plot.
with plt.rc_context({'axes.facecolor': 'black'}):
    fig = plt.figure(figsize=(18,18))
    for i, col in enumerate(numeric_cols):
        ax = fig.add_subplot(4,3,i+1)
        sns.barplot(data=df,x='Hour',y=col,ax=ax,edgecolor="black",palette='viridis_r')
    # Title and layout are figure-level, so apply them once after all panels exist
    # rather than on every loop iteration.
    fig.suptitle('Data distribution of continuous variables')
    fig.tight_layout()

In [None]:
# Box plot of every float64/int64 column (one panel each on a 5x2 grid) to inspect
# spread and outliers before any transformation.
plt.rcParams['axes.facecolor']='white'  # must be set before the axes are created
fig = plt.figure(figsize=(18,18))
for i, col in enumerate(df.select_dtypes(include=['float64','int64']).columns):
    ax = fig.add_subplot(5,2,i+1)
    sns.boxplot(data=df,x=col,ax=ax,color='blue')
# Figure-level title and layout only need to run once, after the last panel is drawn.
fig.suptitle('Box plot of continuous variables')
fig.tight_layout()

In [None]:
# Names of all float64/int64 columns; these are the columns the power transform is applied to.
numeric_frame = df.select_dtypes(include=['float64','int64'])
var = numeric_frame.columns.tolist()

In [None]:
from sklearn.preprocessing import PowerTransformer

# Yeo-Johnson power transform of every column listed in `var`, written back in place.
# NOTE(review): the transformer is fitted on the full frame before the train/test
# split, and `var` presumably includes the target ('Rented Bike Count' was cast to
# float earlier) -- confirm this is intended, since it leaks test-set statistics and
# means the models predict the transformed target.
sc_X = PowerTransformer(method='yeo-johnson')
numeric_block = df[var]
df[var] = sc_X.fit_transform(numeric_block)

In [None]:
# Histogram with KDE overlay for every float64/int64 column (5x2 grid), drawn after
# the power transform to check the resulting distributions.
plt.rcParams['axes.facecolor']='white'  # must be set before the axes are created
fig = plt.figure(figsize=(18,18))
for i, col in enumerate(df.select_dtypes(include=['float64','int64']).columns):
    ax = fig.add_subplot(5,2,i+1)
    sns.histplot(data=df,x=col,ax=ax,color='red', kde=True)
# Figure-level title and layout only need to run once, after the last panel is drawn.
fig.suptitle('Data distribution of continuous variables')
fig.tight_layout()

In [None]:
# Box plot of every float64/int64 column (5x2 grid) after the power transform,
# for comparison with the pre-transform box plots.
plt.rcParams['axes.facecolor']='white'  # must be set before the axes are created
fig = plt.figure(figsize=(18,18))
for i, col in enumerate(df.select_dtypes(include=['float64','int64']).columns):
    ax = fig.add_subplot(5,2,i+1)
    sns.boxplot(data=df,x=col,ax=ax,color='blue')
# Figure-level title and layout only need to run once, after the last panel is drawn.
fig.suptitle('Box plot of continuous variables')
fig.tight_layout()

In [None]:
# Pairwise correlation of the float-typed columns, drawn as an annotated heatmap
# with the colour scale centred on zero.
corr_matrix = df.select_dtypes(include=['float']).corr()
plt.figure(figsize=(8,8))
sns.heatmap(corr_matrix, annot=True, center=0)
plt.show()

In [None]:
# One-hot encode the categorical columns; drop_first removes one level per column
# so the dummy columns are not perfectly collinear.
categorical_cols = ['Holiday','Seasons','Functioning Day','Hour']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# Target is the column at position 1; features are every column from position 2 onward.
# NOTE(review): selection is positional -- presumably column 1 is 'Rented Bike Count'
# and column 0 is a non-feature column; confirm against the CSV's column order.
y = df.iloc[:, 1]
X = df.iloc[:, 2:]

In [None]:
# Feature selection
# Feature selection methods are intended to reduce the number of input variables to those that are
# believed to be most useful to a model in order to predict the target variable.

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [None]:
# Score every feature against the target with the univariate F-test;
# k='all' keeps all features, so the selector is used only for its scores.
fs = SelectKBest(f_regression, k='all')
fs.fit(X, y)

In [None]:
# Express each feature's F-score as a percentage of the total score.
total_score = sum(fs.scores_)
feature_contribution = (fs.scores_ / total_score) * 100

In [None]:
#feature importance graph

In [None]:
# Print each feature's share of the total F-score, then plot the raw scores.
for feature_name, share in zip(X.columns, feature_contribution):
    print(f'{feature_name}:{share:.2f}%')

plt.figure(figsize=(12,6))
sns.barplot(x=X.columns, y=fs.scores_)
plt.show()

In [None]:
# From the above bar graph we can see the feature importance and we will include only those features 
# which are more important for our model.

## Splitting our data set into train and test set

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the same
# scaling to both splits so no test-set information reaches the scaler.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Importing different regression models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,StackingRegressor
from sklearn.model_selection import cross_val_score

In [None]:
# Collects the evaluation metrics of every fitted model for later comparison.
model_comparison = dict()

In [None]:
# Base learners for the stacking ensemble, as (name, estimator) pairs.
# Every stochastic estimator is seeded so repeated runs give the same scores.
estimators=[
    ('linear regression',LinearRegression()),
    ('Decision Tree',DecisionTreeRegressor(random_state=0)),
    ('random forest',RandomForestRegressor(n_estimators=10,random_state=0)),
    ('bagging',BaggingRegressor(RandomForestRegressor(n_estimators=10,random_state=0),random_state=0)),
]

In [None]:
# Candidate regressors, keyed by the display name used in the report below.
# Every stochastic estimator is seeded so repeated runs give the same scores.
models={
    "LinearRegression":LinearRegression(),
    "DecisionTreeRegressor":DecisionTreeRegressor(random_state=0),
    "RandomForestRegressor":RandomForestRegressor(n_estimators=10,random_state=0),
    "BaggingRegressor":BaggingRegressor(RandomForestRegressor(n_estimators=10,random_state=0),random_state=0),
    "StackingRegressor":StackingRegressor(estimators=estimators,final_estimator=LinearRegression(),passthrough=True)
}
for model_name, model in models.items():
    model.fit(X_train,y_train)  # train on the scaled training split

    y_pred=model.predict(X_test)

    # Compute each hold-out metric once and reuse it for printing and for the summary.
    test_r2 = r2_score(y_test,y_pred)
    test_mse = mean_squared_error(y_test,y_pred)

    print(model_name)
    print(f"Model R-Square : {test_r2*100:.2f}%")
    # MSE is an absolute error in squared target units, not a percentage.
    print(f"Model MSE : {test_mse:.4f}")
    # No `scoring` is passed, so cross_val_score uses the estimator's default scorer
    # (R^2 for scikit-learn regressors); the "Accuracy" label below is that score.
    accuracies = cross_val_score(estimator = model, X = X_train, y = y_train, cv = 5)
    print("Cross Val Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Cross Val Standard Deviation: {:.2f} %".format(accuracies.std()*100))
    # Key by model name: a bare loop index is unreadable and collides with other
    # loops that also store results under index 0.
    model_comparison[model_name]=[test_r2,test_mse,accuracies.mean(),accuracies.std()]
    print("----------------------------------------------------------------------------------------------")
    print('\n')

In [None]:
# Hyperparameter search space for the random forest.
# max_features uses 1.0 (all features) instead of the string 'auto': for
# RandomForestRegressor 'auto' meant "use every feature" and is no longer
# accepted by current scikit-learn releases, while 1.0 is valid in old and new versions.
rf_params={
    "max_depth":[5,8,15,None,10],
    "max_features":[5,7,1.0,10],
    "min_samples_split":[2,8,15,20],
    "n_estimators":[100,200,500,1000]
}

In [None]:
# Display the search space for a visual check.
rf_params

In [None]:
# Models to tune, as (short name, estimator, parameter grid) triples.
# The forest is seeded so the search scores do not vary between runs.
randomcv_models=[
    ("RF",RandomForestRegressor(random_state=0),rf_params)
]

In [None]:
# Display the list of models queued for tuning.
randomcv_models

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Best hyperparameters found for each tuned model, keyed by the model's short name.
model_params={}

for name, model, params in randomcv_models:
    # Randomised search: 100 sampled parameter combinations, 3-fold CV each, on all cores.
    # random_state pins which combinations get sampled so a re-run reproduces the result.
    # The search object is not called `random`, to avoid shadowing the stdlib module name.
    random_search=RandomizedSearchCV(estimator=model,
                                     param_distributions=params,
                                     n_iter=100,
                                     cv=3,
                                     verbose=2,
                                     n_jobs=-1,
                                     random_state=0)
    random_search.fit(X_train,y_train)
    model_params[name]=random_search.best_params_

for model_name, best_params in model_params.items():
    print(f"-----------Best params for {model_name}---------")
    print(best_params)

In [None]:
#Fitting 3 folds for each of 100 candidates, totalling 300 fits
# -----------Best params for RF---------
# {'n_estimators': 1000, 'min_samples_split': 2, 'max_features': 10, 'max_depth': None}


In [None]:
# Refit the random forest with the hyperparameters recorded from the randomized
# search and evaluate it with the same metrics as the baseline models.
best_models={
    # Seeded so the saved model and its reported scores are reproducible.
    "RandomForestRegressor":RandomForestRegressor(n_estimators=1000, min_samples_split=2, max_features=10,
                                                  max_depth=None, random_state=0),
}
for best_name, bestmodel in best_models.items():
    bestmodel.fit(X_train,y_train)  # train on the scaled training split

    y_pred=bestmodel.predict(X_test)

    # Compute each hold-out metric once and reuse it for printing and for the summary.
    test_r2 = r2_score(y_test,y_pred)
    test_mse = mean_squared_error(y_test,y_pred)

    print(best_name)
    print(f"Model R-Square : {test_r2*100:.2f}%")
    # MSE is an absolute error in squared target units, not a percentage.
    print(f"Model MSE : {test_mse:.4f}")
    # No `scoring` is passed, so cross_val_score uses the estimator's default scorer
    # (R^2 for scikit-learn regressors); the "Accuracy" label below is that score.
    accuracies = cross_val_score(estimator = bestmodel, X = X_train, y = y_train, cv = 5)
    print("Cross Val Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Cross Val Standard Deviation: {:.2f} %".format(accuracies.std()*100))
    # Store under a distinct key: writing to the loop index would overwrite the
    # entry already stored at index 0 by the baseline comparison.
    model_comparison[f"{best_name} (tuned)"]=[test_r2,test_mse,accuracies.mean(),accuracies.std()]
    print("----------------------------------------------------------------------------------------------")
    print('\n')

In [None]:
import pickle
import os

# Destination of the serialised model.
# Raw string so the Windows backslashes are taken literally ("\M" and "\s" are not
# valid escape sequences in a normal string literal). The name `dir` is avoided
# because it would shadow the built-in.
# NOTE(review): machine-specific absolute path; consider a path relative to the
# project so the notebook runs on other machines.
model_dir=r"D:\ML Projects 2080\seoul house prediction"
model_file_name="Seoul_bike_sharing_model_v1.pkl"
model_file_path=os.path.join(model_dir,model_file_name)

In [None]:
# Serialise the fitted model; the context manager guarantees the file handle is
# flushed and closed even if pickling raises.
with open(model_file_path,"wb") as model_file:
    pickle.dump(bestmodel,model_file)

In [None]:
# First row of the scaled test features.
X_test[0,:]

In [None]:
# Target values of the held-out test split.
y_test