# SeoulBike Rental Prediction

## Importing Required Libraries

In [1]:
import pandas as pd
import datetime as dt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestRegressor

## Reading CSV data file

In [None]:
dataset = pd.read_csv('data/SeoulBikeData.csv', encoding='unicode_escape')

## Encoding Categorical Columns

In [None]:
label_encoder = LabelEncoder()

dataset['Seasons'] = label_encoder.fit_transform(dataset['Seasons'])
dataset['Holiday'] = label_encoder.fit_transform(dataset['Holiday'])
dataset['Functioning Day'] = label_encoder.fit_transform(dataset['Functioning Day'])

## Grouping Data on Per Day Basis

In [None]:
# Present data in file is on hourly basis. But, we need to predict on daily basis. Therefore, grouping the rows on the basis of date.

dataset_per_day = dataset.groupby("Date").agg({'Rented Bike Count': sum, 'Temperature(°C)': 'mean', 'Wind speed (m/s)': 'mean', 'Visibility (10m)': 'mean',
                            'Dew point temperature(°C)': 'mean', 'Solar Radiation (MJ/m2)': 'mean', 'Rainfall(mm)': 'mean', 'Snowfall (cm)': 'mean',
                            'Seasons': 'mean', 'Holiday': 'mean', 'Functioning Day': 'mean'})

In [None]:
# Resetting index
dataset_per_day = dataset_per_day.reset_index()

## Converting Date Column to 'year', 'month' and 'day'

In [None]:
dataset_per_day['Date'] = pd.to_datetime(dataset_per_day['Date'])   # As 'Date' column is in string format. Coverting it to datetime.
dataset_per_day['year']= dataset_per_day['Date'].dt.year            # Extracting year.
dataset_per_day['month']= dataset_per_day['Date'].dt.month          # Extracting month number.
dataset_per_day['day']= dataset_per_day['Date'].dt.day              # Extracting day of the month.

dataset_per_day = dataset_per_day.drop(['Date'], axis = 1)          # Dropping 'Date' column.

## Checking Multicollinearity using VIF (Variable Inflation Factors)

In [3]:
def calc_vif(X):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return(vif)

In [4]:
X = dataset_per_day.iloc[:,:-1]
calc_vif(X)

Unnamed: 0,variables,VIF
0,Rented Bike Count,21.534565
1,Temperature(°C),92.995027
2,Wind speed (m/s),12.111444
3,Visibility (10m),15.63942
4,Dew point temperature(°C),38.985307
5,Solar Radiation (MJ/m2),16.004387
6,Rainfall(mm),1.809374
7,Snowfall (cm),1.196113
8,Seasons,4.369128
9,Holiday,20.900324


In [5]:
# Dropping 'Temperature(°C)' and 'year' as they have large VIF

dataset_per_day = dataset_per_day.drop(['Temperature(°C)', 'year'], axis = 1)

In [6]:
dataset_per_day

Unnamed: 0,Rented Bike Count,Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day,month,day
0,4290,1.454167,1894.833333,-13.866667,0.255833,0.000000,0.000000,3,0,1.0,1,1
1,5377,1.608333,1923.625000,-15.012500,0.337083,0.000000,0.904167,3,1,1.0,1,2
2,5132,3.554167,1084.000000,-6.387500,0.747500,0.104167,0.000000,1,0,1.0,1,3
3,17388,1.570833,831.833333,9.370833,0.302083,0.000000,0.000000,1,1,1.0,1,4
4,26820,1.437500,456.458333,15.120833,0.517500,0.000000,0.000000,1,0,1.0,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...
360,31681,1.941667,1246.208333,11.491667,1.122500,0.000000,0.000000,1,1,1.0,5,31
361,22897,1.383333,1993.791667,20.216667,1.020833,0.000000,0.000000,2,1,1.0,7,31
362,27817,1.600000,1609.000000,18.491667,0.923750,0.004167,0.000000,2,1,1.0,8,31
363,21545,1.666667,1960.916667,-1.408333,0.604583,0.000000,0.000000,0,1,1.0,10,31


## Defining Predictor and Target Variable

In [7]:
target = "Rented Bike Count"

X = dataset_per_day.drop(columns=target)
y = dataset_per_day[target]

## Splitting the Dataset in Train and Test

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

## Applying Ordinary Least Squares(OLS) Method

In [8]:
## Applying OLS method from statsmodel.api to get matrices like 'R-squared', 'Adj. R-squared', 'p-value' etc.

X_train_sm = X_train
X_train_sm = sm.add_constant(X_train_sm)
lm = sm.OLS(y_train, X_train_sm).fit()
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:      Rented Bike Count   R-squared:                       0.810
Model:                            OLS   Adj. R-squared:                  0.802
Method:                 Least Squares   F-statistic:                     108.2
Date:                Tue, 16 Nov 2021   Prob (F-statistic):           5.47e-94
Time:                        20:15:09   Log-Likelihood:                -2865.5
No. Observations:                 292   AIC:                             5755.
Df Residuals:                     280   BIC:                             5799.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 

## Defining Linear Regression and Fitting the Data

In [9]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Predictions(Linear Regression)

In [10]:
y_pred_test_lr = lr_model.predict(X_test)
print('Linear Regression - Training Metrics')
print('MAE:', mean_absolute_error(y_test, y_pred_test_lr))
print('MSE:', mean_squared_error(y_test, y_pred_test_lr))
print('R2:', r2_score(y_test, y_pred_test_lr))

Linear Regression - Training Metrics
MAE: 2979.193633146087
MSE: 14410051.710787022
R2: 0.872979925496069


## Defining Random Forest Regressor and Fitting the Data

In [13]:
rf_regressor = RandomForestRegressor(n_estimators = 50, random_state = 42)
rf_regressor.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=50, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

## Predictions(Random Forest Regressor)

In [14]:
y_pred_test_rf = rf_regressor.predict(X_test)
print('Random Forest - Training Metrics')
print('MAE:', mean_absolute_error(y_test, y_pred_test_rf))
print('MSE:', mean_squared_error(y_test, y_pred_test_rf))
print('R2:', r2_score(y_test, y_pred_test_rf))

Random Forest - Training Metrics
MAE: 2691.80794520548
MSE: 12118023.55552329
R2: 0.8931834329428029
