#**Import Libraries, Models ...**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, mean_absolute_error
from keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

#**1 - Read datasets**

In [None]:
# Load the dataset from 'hour.csv' (path relative to the notebook's working
# directory) into a DataFrame and echo it as the cell output.
data = pd.read_csv('hour.csv')
data

In [None]:
# Show only the first record to inspect the column layout.
data.head(1)

#**Get column names of the dataset**



In [None]:
# Column names of `data`, grouped into categorical and numeric features.
category_features = [
    'season', 'holiday', 'mnth', 'hr',
    'weekday', 'workingday', 'weathersit',
]
number_features = [
    'temp', 'atemp', 'hum', 'windspeed',
]
# Summary statistics of the numeric columns only.
data.loc[:, number_features].describe()

#**Check the data type of every variable before training; if there are qualitative (non-numeric) values, encoding is applied.**

In [None]:
# Print the per-column dtype and non-null count summary of the frame.
data.info()

#**Check for null values**


In [None]:
# Number of missing values per column (isna is the canonical spelling of isnull).
data.isna().sum()

#**PRE-PROCESSING PHASE**

#**Remove outliers from data**

#**Detecting outliers using Box plots**

In [None]:
# 3x2 grid of box plots of the rental count, overall and grouped by one
# explanatory column per panel.
fig, axes = plt.subplots(nrows=3, ncols=2)
fig.set_size_inches(15, 15)

# One entry per panel in row-major order: (grouping column, x-axis label, title).
# A grouping column of None draws the ungrouped distribution of "cnt".
boxplot_panels = [
    (None, None, "Box Plot On Count"),
    ("mnth", 'Month', "Box Plot On Count Across Months"),
    ("weathersit", 'Weather Situation', "Box Plot On Count Across Weather Situations"),
    ("workingday", 'Working Day', "Box Plot On Count Across Working Day"),
    ("hr", 'Hour Of The Day', "Box Plot On Count Across Hour Of The Day"),
    ("temp", 'Temperature', "Box Plot On Count Across Temperature"),
]

for panel_ax, (group_col, x_label, panel_title) in zip(axes.flat, boxplot_panels):
  # Only pass x / xlabel when the panel is grouped, so the first panel is
  # drawn exactly like a plain single-variable box plot.
  group_kwargs = {} if group_col is None else {"x": group_col}
  sns.boxplot(data=data, y="cnt", orient="v", ax=panel_ax, **group_kwargs)
  label_kwargs = {} if x_label is None else {"xlabel": x_label}
  panel_ax.set(ylabel='Count', title=panel_title, **label_kwargs)

#**Remove outliers**

In [None]:
# Drop rows whose rental count lies outside the 1.5 * IQR fences.
print(f"Samples in train set with outliers: {len(data)}")
rental_counts = data['cnt']
q1 = rental_counts.quantile(0.25)
q3 = rental_counts.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
# between() is inclusive on both ends, i.e. lower_bound <= cnt <= upper_bound.
inside_fences = rental_counts.between(lower_bound, upper_bound)
data_without_outlier = data.loc[inside_fences]
print(f"Samples in train set without outliers: {len(data_without_outlier)}")

# Distribution of the remaining counts.
sns.distplot(data_without_outlier.cnt);


In [None]:
# From here on `data` refers to the outlier-filtered frame.
# NOTE(review): the outlier cell above reads `data`, so re-running it after
# this rebinding recomputes the quartiles on already-filtered rows.
data = data_without_outlier

#**Correlation Analysis**

In [None]:
# Pairwise correlations between the numeric features and the target.
matrix = data[number_features + ['cnt']].corr()

# Build the mask from a copy of the correlation values: entries on and below
# the diagonal are zeroed, the ones above keep their (non-zero) value and are
# therefore treated as "masked" by the heatmap.
heat = matrix.values.copy()
heat[np.tril_indices_from(heat)] = False

fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(
    matrix,
    mask=heat,
    vmax=1.0,
    vmin=0.0,
    square=True,
    annot=True,
    cmap="Reds",
    ax=ax,
)

#**Rename columns to make them more meaningful and readable**


In [None]:
# Abbreviated column name -> readable column name.
readable_names = {
    'yr': 'year',
    'mnth': 'month',
    'weathersit': 'weather',
    'hr': 'hour',
    'hum': 'humidity',
    'cnt': 'count',
}
data = data.rename(columns=readable_names)
data.describe()

#**Remove unnecessary features**


In [None]:
# Columns that are not used as model inputs.
unused_columns = ['instant', 'dteday', 'year', 'casual', 'registered', 'atemp']
data = data.drop(columns=unused_columns)
data.head()

#**Dataset normalization**

In [None]:
# Rescale the frame with sklearn's `normalize`; the result replaces `data`.
# NOTE(review): `normalize` is called with its defaults on the whole frame,
# which still contains the 'count' target column. Per the sklearn docs the
# default is per-sample (row-wise) L2 scaling — confirm that scaling each row,
# target included, is really intended rather than per-feature scaling.
data = preprocessing.normalize(data)

In [None]:
# Wrap the normalized values in a DataFrame again and restore the column labels.
normalized_columns = [
    'season', 'month', 'hour', 'holiday', 'weekday', 'workingday',
    'weather', 'temp', 'humidity', 'windspeed', 'count',
]
data = pd.DataFrame(data, columns=normalized_columns)
data.head()

#**Implementation**

In [None]:
# Make sure `data` is a labelled frame, then split it into target and inputs.
data = pd.DataFrame(
    data,
    columns=['season','month','hour','holiday','weekday','workingday','weather','temp','humidity','windspeed','count'],
)
y = data['count']  # Outputs
data = data.drop('count', axis=1)  # Remove the output column from the inputs
print("Data shape : ", data.shape)
print("Target shape : ", y.shape)

In [None]:
# Fix the dimensions of y: turn it into an (n_samples, 1) column vector.
y = np.array(y.values).reshape(len(data), 1)
print(y.shape)


In [None]:
# Append a constant column of ones so the last coefficient acts as the bias.
bias_column = np.ones([data.shape[0], 1])
data = np.hstack((data, bias_column))
data.shape

**Conclusion:** At the end of the descriptive analysis chapter, we can note the following points:

- The columns "casual" and "registered" contain direct information about the bike-sharing count that is to be predicted (data leakage). Therefore they are excluded from the feature set.
- The variables "temp" and "atemp" are strongly correlated. To reduce the dimensionality of the predictive model, the feature "atemp" is dismissed.
- The variables "hr" and "temp" seem to be promising features for the bike-sharing count prediction.


#**MODEL BUILDING**

In [None]:
# Empty list, presumably meant to collect the fitted models.
# NOTE(review): nothing in the visible notebook appends to or reads it.
Matrice_Models = []

#**Split Dataset into TrainSets, TestSets** 

In [None]:
# 60 % of the rows go to training; the held-out 40 % is then halved into
# validation and test sets (20 % / 20 % of the whole data set).
x_train, x_rest, y_train, y_rest = train_test_split(data, y, test_size=0.40, random_state=0)
# Split the held-out remainder, NOT the full data set: re-splitting `data`
# would put training rows into the validation and test sets.
x_valid, x_test, y_valid, y_test = train_test_split(x_rest, y_rest, test_size=0.50, random_state=0)

In [None]:
# Force every target split into an (n_samples, 1) column vector.
y_train, y_valid, y_test = (
    np.array(target_part).reshape(len(target_part), 1)
    for target_part in (y_train, y_valid, y_test)
)

# **Linear Regression**

In [None]:
# Random starting coefficients, one per input column, as a column vector.
# Drawn from a dedicated seeded generator so that Restart & Run All reproduces
# the same starting point without touching NumPy's global random state.
Theta = np.random.RandomState(0).randn(x_train.shape[1], 1)

In [None]:
def model(X,theta):
  """Linear hypothesis: return the product of the inputs X with theta.

  X     -- input matrix, one row per sample.
  theta -- coefficient vector/matrix compatible with ``X.dot``.
  """
  predictions = X.dot(theta)
  return predictions

In [None]:
def cost_function(X,y,theta):
  """Half mean-squared-error cost of the linear model.

  Computes 1/(2m) * sum((model(X, theta) - y)**2), where m = len(y).

  X     -- input matrix, one row per sample.
  y     -- targets, same shape as the model output.
  theta -- coefficients passed on to ``model``.
  """
  m = len(y)
  # Parenthesised on purpose: `1/2*m` evaluates as (1/2)*m, which multiplies
  # the cost by m instead of dividing by 2m.
  return 1/(2*m) * np.sum((model(X,theta) - y)**2)

In [None]:
# Cost on the training set for the initial (untrained) coefficients Theta.
cost_function(x_train,y_train,Theta)

In [None]:
def grad(X,y,theta):
  """Gradient of the cost with respect to theta: 1/m * X^T (model(X, theta) - y).

  X     -- input matrix, one row per sample.
  y     -- targets, same shape as the model output.
  theta -- current coefficients.
  """
  n_samples = len(y)
  residuals = model(X,theta) - y
  return 1/n_samples * X.T.dot(residuals)

In [None]:
def gradient_decent(X,y,theta, learning_rate, n_iteration):
  """Run batch gradient descent for a fixed number of steps.

  X, y          -- training inputs and targets.
  theta         -- starting coefficients (not modified in place).
  learning_rate -- step size applied to every gradient.
  n_iteration   -- number of update steps.

  Returns (theta, cost_history): the final coefficients and an array holding
  the cost measured after each update.
  """
  cost_history = np.zeros(n_iteration)
  for step in range(n_iteration):
    update = learning_rate * grad(X,y,theta)
    theta = theta - update
    cost_history[step] = cost_function(X,y,theta)
  return theta, cost_history

In [None]:
def coef_determoination(y,pred):
  """Coefficient of determination: 1 - SS_res / SS_tot.

  y    -- observed values (anything convertible to an array).
  pred -- predicted values, same shape as y.
  """
  observed = np.array(y)
  predicted = np.array(pred)
  # Residual sum of squares versus total sum of squares around the mean of y.
  residual_ss = np.square(observed - predicted).sum()
  total_ss = np.square(observed - observed.mean()).sum()
  return 1 - residual_ss/total_ss

In [None]:
# Sanity check: the inputs' column count must equal Theta's row count.
for checked_array in (x_train, Theta):
  print(checked_array.shape)

In [None]:
# Hyper-parameter settings to try; the two lists are paired position by position.
Iterations = [2000, 1000, 1000]
LR = [0.7, 0.09, 0.3]
for i, (iteration, learningRate) in enumerate(zip(Iterations, LR), start=1):
  # Pass the settings of THIS run (the scalars from the loop), not the whole
  # lists: gradient_decent needs a single step size and a single step count.
  Theta__Final, cost_history = gradient_decent(x_train,y_train,Theta,learning_rate=learningRate,n_iteration=iteration)
  y_pred_train = model(x_train,Theta__Final)
  y_pred_valid = model(x_valid,Theta__Final)
  y_pred_test = model(x_test,Theta__Final)
  # R^2 of run i on each split.
  test_r2 = coef_determoination(y_test, y_pred_test)
  print('Training coef_determoination ',i,':', coef_determoination(y_train, y_pred_train))
  print('Validation coef_determoination ',i,':', coef_determoination(y_valid, y_pred_valid))
  print('Test coef_determoination ',i,':', test_r2)
  # After the loop this (like Theta__Final and cost_history) holds the values
  # of the last configuration.
  score_gradient_decent = test_r2*100


In [None]:
# Cost per gradient-descent step. The x-axis is sized from the history itself
# because the number of steps depends on the run that produced it.
plt.plot(range(len(cost_history)), cost_history)

#**Normal Equation in Linear Regression**

In [None]:
# Closed-form least squares: theta = (X^T X)^-1 X^T y, built step by step.
gram_matrix = x_train.T.dot(x_train)
gram_inverse = np.linalg.inv(gram_matrix)
xty = x_train.T.dot(y_train)
theta_best_values = gram_inverse.dot(xty)
# Shapes of every intermediate factor.
print(x_train.shape)
print(x_train.T.shape)
print(gram_matrix.shape)
print(gram_inverse.shape)
print(xty.shape)
# Display best values obtained.
print(theta_best_values)

In [None]:
# Predictions of the closed-form solution on each split.
y_pred_train, y_pred_valid, y_pred_test = (
    model(split_inputs, theta_best_values)
    for split_inputs in (x_train, x_valid, x_test)
)

In [None]:
# R^2 (coefficient of determination) of the closed-form solution per split.
for split_name, split_truth, split_pred in (
    ('Training', y_train, y_pred_train),
    ('Validation', y_valid, y_pred_valid),
    ('Test', y_test, y_pred_test),
):
  print(split_name + ' coef_determoination :', coef_determoination(split_truth, split_pred))
# Test-set R^2 as a percentage, kept for the final model comparison.
score_equation_normal = coef_determoination(y_test, y_pred_test)*100

********************************************************************************************

#**Polynomial regression using Sklearn**

In [None]:
polynomial_features = PolynomialFeatures(degree=3)

# Fit the feature expansion on the training split only and merely apply it to
# the validation and test splits; a transformer should never be re-fitted on
# held-out data.
x_poly = polynomial_features.fit_transform(x_train)
x_poly_valid = polynomial_features.transform(x_valid)
x_poly_test = polynomial_features.transform(x_test)

# Keep the fitted estimator under its own name: rebinding `LinearRegression`
# to an instance would make the class unusable for the rest of the session.
poly_regressor = LinearRegression()
poly_regressor.fit(x_poly, y_train)
Y_pred = poly_regressor.predict(x_poly_test)
# R^2 of the degree-3 polynomial model on each split.
print('Training coef_determoination :', poly_regressor.score(x_poly,y_train))
print('Validation coef_determoination :', poly_regressor.score(x_poly_valid,y_valid))
print('Test coef_determoination :', poly_regressor.score(x_poly_test,y_test))
score_polynomial_regression = poly_regressor.score(x_poly_test,y_test)*100

In [None]:
# Re-import so that `LinearRegression` is the class again even if an earlier
# cell rebound the name to a fitted instance.
from sklearn.linear_model import LinearRegression
polynomial_features = PolynomialFeatures(degree=5)

# Fit the feature expansion on the training split only and merely apply it to
# the validation and test splits.
x_poly = polynomial_features.fit_transform(x_train)
x_poly_valid = polynomial_features.transform(x_valid)
x_poly_test = polynomial_features.transform(x_test)

# Keep the fitted estimator under its own name so the class is not shadowed.
poly_regressor = LinearRegression()
poly_regressor.fit(x_poly, y_train)
Y_pred = poly_regressor.predict(x_poly_test)
# R^2 of the degree-5 polynomial model on each split.
print('Training coef_determoination :', poly_regressor.score(x_poly,y_train))
print('Validation coef_determoination :', poly_regressor.score(x_poly_valid,y_valid))
print('Test coef_determoination :', poly_regressor.score(x_poly_test,y_test))
# Overwrites the degree-3 score: the comparison uses the degree-5 model.
score_polynomial_regression = poly_regressor.score(x_poly_test,y_test)*100

********************************************************************************************

#**SVR**

In [None]:
# Support-vector regression with regularisation parameter C=100.
modelsk = SVR(C=100)
# y_train is an (n, 1) column vector; SVR expects a 1-D target, so flatten it
# instead of relying on sklearn's implicit conversion (and its warning).
modelsk.fit(x_train, y_train.ravel())
print('Training coef_determoination :', modelsk.score(x_train,y_train))
print('Validation coef_determoination :', modelsk.score(x_valid,y_valid))
print('Test coef_determoination :', modelsk.score(x_test,y_test))
score_svr = modelsk.score(x_test,y_test)*100

#**ANN**

In [None]:
from sklearn.neural_network import MLPRegressor
# Multi-layer perceptron with three hidden layers of 150 units each.
# y_train is an (n, 1) column vector; flatten it to the 1-D target the
# estimator expects instead of relying on sklearn's implicit conversion.
regr = MLPRegressor((150,150,150), random_state=1, max_iter=100).fit(x_train, y_train.ravel())
print('Training coef_determoination :', regr.score(x_train, y_train))
print('Validation coef_determoination :', regr.score(x_valid, y_valid))
print('Test coef_determoination :', regr.score(x_test, y_test))
score_ann = regr.score(x_test,y_test)*100

In [None]:
from sklearn.neural_network import MLPRegressor
# Multi-layer perceptron with a single hidden layer of 150 units.
# `(150,)` is a one-element tuple; `(150)` is just the integer 150.
# y_train is flattened to the 1-D target the estimator expects.
regr = MLPRegressor((150,), random_state=1, max_iter=100).fit(x_train, y_train.ravel())
print('Training coef_determoination :', regr.score(x_train, y_train))
print('Validation coef_determoination :', regr.score(x_valid, y_valid))
print('Test coef_determoination :', regr.score(x_test, y_test))
# Overwrites the score of the three-layer network from the previous cell.
score_ann = regr.score(x_test,y_test)*100

In [None]:
# Test-set scores (R^2 * 100) of each model, in the order:
# gradient descent, normal equation, polynomial regression, SVR, ANN.
Scores = [score_gradient_decent, score_equation_normal, score_polynomial_regression, score_svr, score_ann] 
Scores

In [None]:
# Bar chart comparing the test-set scores of the five models.
# The positions and heights get their own names: binding them to `x` / `y`
# would overwrite the regression target `y` used by the splitting cell.
# (matplotlib.pyplot is already imported as `plt` in the imports cell.)

# x-coordinates of the bars
bar_positions = [1, 2, 3, 4, 5]
# heights of the bars
bar_heights = [score_gradient_decent, score_equation_normal, score_polynomial_regression, score_svr, score_ann]

# labels for bars, same order as bar_heights
tick_label = ["GD", "EN", "POL", "SVR", "ANN"]

# plotting the bar chart; the last bar (ANN) is highlighted in red
plt.bar(bar_positions, bar_heights, tick_label = tick_label,
        width = 0.5, color = ['black', 'black','black','black','red'])

# naming the x-axis
plt.xlabel('Models')
# naming the y-axis
plt.ylabel('Scores')
# plot title
plt.title('Models Comparaison')

# function to show the plot
plt.show()

#**Experiment without the preprocessing phase**

In [None]:
# Reload 'hour.csv' into a separate frame for the second experiment and echo it.
dataNP = pd.read_csv('hour.csv')
dataNP

In [None]:
# Categorical and numeric column-name groups.
# NOTE(review): neither list is referenced again in the visible cells of this
# experiment.
category_features = ['season', 'holiday', 'mnth', 'hr', 'weekday', 'workingday', 'weathersit']
number_features = ['temp', 'atemp', 'hum', 'windspeed']

In [None]:
# Drop rows whose rental count lies outside the 1.5 * IQR fences.
print(f"Samples in train set with outliers: {len(dataNP)}")
rental_countsNP = dataNP['cnt']
q1 = rental_countsNP.quantile(0.25)
q3 = rental_countsNP.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
# between() is inclusive on both ends, i.e. lower_bound <= cnt <= upper_bound.
inside_fencesNP = rental_countsNP.between(lower_bound, upper_bound)
data_without_outlier = dataNP.loc[inside_fencesNP]
print(f"Samples in train set without outliers: {len(data_without_outlier)}")

# Distribution of the remaining counts.
sns.distplot(data_without_outlier.cnt);


In [None]:
# Abbreviated column name -> readable column name.
readable_namesNP = {
    'yr': 'year',
    'mnth': 'month',
    'weathersit': 'weather',
    'hr': 'hour',
    'hum': 'humidity',
    'cnt': 'count',
}
dataNP = dataNP.rename(columns=readable_namesNP)

In [None]:
# Keep only the modelling columns, with the target 'count' last.
model_columnsNP = [
    'season', 'month', 'hour', 'holiday', 'weekday', 'workingday',
    'weather', 'temp', 'humidity', 'windspeed', 'count',
]
dataNP = pd.DataFrame(dataNP, columns=model_columnsNP)
dataNP.head()

In [None]:
dataNP = data_without_outlier

In [None]:
dataNP = np.array(dataNP)

In [None]:
yNP = dataNP[:,-1] # Outputs
dataNP = dataNP[:,:-2] # Remove the coloumn of outputs from datasets
print("Data shape : ", dataNP.shape)
print("Target shape : ", yNP.shape)

In [None]:
#regler les demesions de y
yNP = np.array(yNP)
yNP =yNP.reshape(dataNP.shape[0],1)
print(yNP.shape)

In [None]:
#Add the bias column for our dataset
dataNP = np.hstack((data,np.ones([data.shape[0],1])))
dataNP.shape

In [None]:
# 60 % of the rows go to training; the held-out 40 % is then halved into
# validation and test sets (20 % / 20 % of the whole data set).
x_trainNP, x_restNP, y_trainNP, y_restNP = train_test_split(dataNP, yNP, test_size=0.40, random_state=0)
# Split the held-out remainder, NOT the full data set: re-splitting `dataNP`
# would put training rows into the validation and test sets.
x_validNP, x_testNP, y_validNP, y_testNP = train_test_split(x_restNP, y_restNP, test_size=0.50, random_state=0)

In [None]:
# Force every target split into an (n_samples, 1) column vector.
y_trainNP, y_validNP, y_testNP = (
    np.array(target_part).reshape(len(target_part), 1)
    for target_part in (y_trainNP, y_validNP, y_testNP)
)

#**Linear Regression**

In [None]:
# Random starting coefficients, one per input column, as a column vector.
# Drawn from a dedicated seeded generator so that Restart & Run All reproduces
# the same starting point without touching NumPy's global random state.
ThetaNP = np.random.RandomState(0).randn(x_trainNP.shape[1], 1)

In [None]:
# Cost on the training set for the initial (untrained) coefficients ThetaNP.
cost_function(x_trainNP,y_trainNP,ThetaNP)

In [None]:
# Sanity check: the inputs' column count must equal ThetaNP's row count.
for checked_array in (x_trainNP, ThetaNP):
  print(checked_array.shape)

In [None]:
# Batch gradient descent on this experiment's training split: 10000 steps
# with step size 0.7, starting from ThetaNP.
# NOTE(review): 0.7 is a large step size — confirm that the cost history
# actually decreases for the feature scale used in this experiment.
Theta__FinalNP, cost_historyNP = gradient_decent(x_trainNP,y_trainNP,ThetaNP,learning_rate=0.7,n_iteration=10000)

In [None]:
# Predictions of the gradient-descent solution on each split.
y_pred_trainNP, y_pred_validNP, y_pred_testNP = (
    model(split_inputs, Theta__FinalNP)
    for split_inputs in (x_trainNP, x_validNP, x_testNP)
)

In [None]:
# Cost per gradient-descent step (one entry per step of the run above).
plt.plot(range(len(cost_historyNP)), cost_historyNP)

In [None]:
# R^2 (coefficient of determination) of the gradient-descent solution per split.
for split_name, split_truth, split_pred in (
    ('Training', y_trainNP, y_pred_trainNP),
    ('Validation', y_validNP, y_pred_validNP),
    ('Test', y_testNP, y_pred_testNP),
):
  print(split_name + ' coef_determoination :', coef_determoination(split_truth, split_pred))
# Test-set R^2 as a percentage.
score_gradient_decentNP = coef_determoination(y_testNP, y_pred_testNP)*100

#**Polynomial regression using Sklearn**

In [None]:
# Re-import so that `LinearRegression` is the class again even if an earlier
# cell rebound the name to a fitted instance.
from sklearn.linear_model import LinearRegression

polynomial_features = PolynomialFeatures(degree=3)
# Fit the feature expansion on the training split only and merely apply it to
# the validation and test splits.
x_polyNP = polynomial_features.fit_transform(x_trainNP)
x_poly_validNP = polynomial_features.transform(x_validNP)
x_poly_testNP = polynomial_features.transform(x_testNP)

LinearRegressionNP = LinearRegression()
LinearRegressionNP.fit(x_polyNP, y_trainNP)
Y_predNP = LinearRegressionNP.predict(x_poly_testNP)
# R^2 of the degree-3 polynomial model on each split.
print('Training coef_determoination :', LinearRegressionNP.score(x_polyNP,y_trainNP))
print('Validation coef_determoination :', LinearRegressionNP.score(x_poly_validNP,y_validNP))
print('Test coef_determoination :', LinearRegressionNP.score(x_poly_testNP,y_testNP))
# NOTE(review): this reuses the un-suffixed name and therefore replaces the
# score stored by the first experiment.
score_polynomial_regression = LinearRegressionNP.score(x_poly_testNP,y_testNP)*100

#**SVR**

In [None]:
# Support-vector regression with regularisation parameter C=100.
modelsk = SVR(C=100)
# y_trainNP is an (n, 1) column vector; SVR expects a 1-D target, so flatten
# it instead of relying on sklearn's implicit conversion (and its warning).
modelsk.fit(x_trainNP, y_trainNP.ravel())
print('Training coef_determoination :', modelsk.score(x_trainNP,y_trainNP))
print('Validation coef_determoination :', modelsk.score(x_validNP,y_validNP))
print('Test coef_determoination :', modelsk.score(x_testNP,y_testNP))
# NOTE(review): reuses the un-suffixed name of the first experiment's score.
score_svr = modelsk.score(x_testNP,y_testNP)*100

#**ANN**

In [None]:
from sklearn.neural_network import MLPRegressor
# Multi-layer perceptron with three hidden layers of 150 units each.
# y_trainNP is an (n, 1) column vector; flatten it to the 1-D target the
# estimator expects instead of relying on sklearn's implicit conversion.
regr = MLPRegressor((150,150,150), random_state=1, max_iter=100).fit(x_trainNP, y_trainNP.ravel())
print('Training coef_determoination :', regr.score(x_trainNP, y_trainNP))
print('Validation coef_determoination :', regr.score(x_validNP, y_validNP))
print('Test coef_determoination :', regr.score(x_testNP, y_testNP))
# NOTE(review): reuses the un-suffixed name of the first experiment's score.
score_ann = regr.score(x_testNP,y_testNP)*100

#**Normal Equation in Linear Regression**

In [None]:
# Closed-form least squares: theta = (X^T X)^-1 X^T y, built step by step.
gram_matrixNP = x_trainNP.T.dot(x_trainNP)
gram_inverseNP = np.linalg.inv(gram_matrixNP)
xtyNP = x_trainNP.T.dot(y_trainNP)
theta_best_valuesNP = gram_inverseNP.dot(xtyNP)
# Shapes of every intermediate factor.
print(x_trainNP.shape)
print(x_trainNP.T.shape)
print(gram_matrixNP.shape)
print(gram_inverseNP.shape)
print(xtyNP.shape)
# Display best values obtained.
print(theta_best_valuesNP)
# Predictions of the closed-form solution on each split.
y_pred_trainNP, y_pred_validNP, y_pred_testNP = (
    model(split_inputs, theta_best_valuesNP)
    for split_inputs in (x_trainNP, x_validNP, x_testNP)
)
# R^2 (coefficient of determination) per split.
for split_name, split_truth, split_pred in (
    ('Training', y_trainNP, y_pred_trainNP),
    ('Validation', y_validNP, y_pred_validNP),
    ('Test', y_testNP, y_pred_testNP),
):
  print(split_name + ' coef_determoination :', coef_determoination(split_truth, split_pred))
# Test-set R^2 as a percentage.
score_equation_normal = coef_determoination(y_testNP, y_pred_testNP)*100