In [None]:
#Reading and Understanding the data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#libraries to build the model
import statsmodels
import statsmodels.api as sm
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings('ignore')

## STEPS TO PERFORM MULTIPLE LINEAR REGRESSION

1) Reading and Understanding the data 
2) Cleaning the data
3) Visualizing the data using EDA
4) Preparing the data for modelling(train-test split, rescaling)
5) Training the model
6) Residual Analysis
7) Predictions and Evaluations on test set

## Step 1: Reading and Understanding the Data

In [None]:
# Load day.csv into a DataFrame; the path is relative to the notebook's working directory.
bikesharing = pd.read_csv("day.csv")
bikesharing.head()

In [None]:
bikesharing.shape

In [None]:
#Target variable(cnt) is a continuous variable and from predictor variables, some are continuous & some are categorical

In [None]:
bikesharing.info()
#In the output we can see that there are no missing values. In case there are any, we can perform imputation on the same 

In [None]:
# Find columns in which every value is zero.
# This cell only lists them; nothing is removed here.
zero_coloumns = bikesharing.columns[(bikesharing == 0).all(axis=0)]
zero_coloumns

In [None]:
# Find columns that hold a single distinct value across all rows
Same_valuecols = bikesharing.columns[bikesharing.nunique().eq(1)]
Same_valuecols

In [None]:
bikesharing.describe()

In [None]:
#Display all the coloumns
bikesharing.columns

#### Quick Data Summary:

    - No Null values in the data
    - 730 rows, 16 columns
    - No columns with all same values
    - Target variable(cnt) is a continuous variable and from predictor variables, some are continuous & some are categorical

## Step 2: Cleaning the dataset
        - Dropping irrelevant columns and relabelling the categorical codes (no imputation is needed because the data has no missing values)

In [None]:
bikesharing.head()

In [None]:
# Drop columns that add nothing to the model:
#   - dteday: no day-to-day analysis is done here, and month, year and weekday already exist as separate columns
#   - instant: it is just an index
#   - casual and registered: the target variable cnt is the sum of the two
#
# The result is reassigned instead of using inplace=True, so the statement reads as
# "old frame -> new frame" and no DataFrame object is mutated behind the reader's back.
bikesharing = bikesharing.drop(columns=['registered', 'casual', 'dteday', 'instant'])

In [None]:
bikesharing.head()

In [None]:
bikesharing.shape

In [None]:
bikesharing.info()

In [None]:
#The categorical variables season, mnth, weekday and weathersit are stored as integer codes.
    # - The codes are labels rather than quantities, so they should be converted to strings

In [None]:
# Swap the integer codes of the four categorical columns for readable string labels.
# assign() overwrites each column in its existing position.
bikesharing = bikesharing.assign(
    season=bikesharing["season"].replace(
        {1: "spring", 2: "summer", 3: "fall", 4: "winter"}
    ),
    mnth=bikesharing["mnth"].replace(
        {
            1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may', 6: 'jun',
            7: 'jul', 8: 'aug', 9: 'sept', 10: 'oct', 11: 'nov', 12: 'dec',
        }
    ),
    weekday=bikesharing["weekday"].replace(
        {0: 'sun', 1: 'mon', 2: 'tue', 3: 'wed', 4: 'thu', 5: 'fri', 6: 'sat'}
    ),
    weathersit=bikesharing["weathersit"].replace(
        {1: 'good', 2: 'moderate', 3: 'bad', 4: 'severe'}
    ),
)

bikesharing.head(20)

In [None]:
bikesharing.info()

## Step 3: Visualizing the data using EDA

In [None]:
#Visualizing the data
#Linear Relationship and multicollinearity check

In [None]:
# Pairwise scatter plots of the numeric columns, to look for linear relationships
sns.pairplot(data=bikesharing)
plt.show()

#### From the pairplots, we can see that at least some of the numeric variables have a linear relationship
Hence a linear model is worth considering.

Additionally below are the inferences:
    - cnt and temp are highly correlated
    - cnt and atemp are highly correlated

In [None]:
bikesharing.head()

In [None]:
# Visualising the categorical variables: one box plot of cnt per categorical predictor.
# A single loop replaces seven copy-pasted subplot/boxplot pairs; the 2x4 grid and
# the panel order are unchanged.
plt.figure(figsize=(20, 12))
for position, column in enumerate(
    ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit'],
    start=1,
):
    plt.subplot(2, 4, position)
    sns.boxplot(x=column, y='cnt', data=bikesharing)
plt.show()

### From the boxplots, we can see that:
    -  count of total rental bikes is higher during the fall season and relatively low in spring season
    -  count of total rental bikes increases significantly in the year 2019 from 2018
    -  count of total rental bikes increases somewhat linearly in first half of the year and then it starts decreasing
    -  count of total rental bikes is higher on non-holidays than on holidays, probably because people commute to the office
    -  count of total rental bikes is higher during clear weather and dips during rainy days

### Step 4: Preparing the dataset for Modelling
    - Encoding: 
    - Converting Binary Variables to 0/1 
    - Other Categorical variables to dummy variables
    - Splitting the data into train and test  
    - Rescaling of variables

In [None]:
# Correlation heatmap of the numeric columns.
# season, mnth, weekday and weathersit hold string labels at this point, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older versions
# silently skipped them), so the frame is restricted to numeric dtypes explicitly.
plt.figure(figsize = (16,10))
sns.heatmap(
    bikesharing.select_dtypes(include='number').corr(),
    annot=True,
    cmap="YlGnBu",
)
plt.show()

In [None]:
print(bikesharing.season.value_counts())

In [None]:
print(bikesharing.yr.value_counts())

In [None]:
#From the Heatmap, we can see that count of total rental bikes is highly correlated with temp and atemp
    # - However, we need to do the encoding and scaling first

#### Dummy Variable Encoding

In [None]:
bikesharing.head(10)

In [None]:
bikesharing.shape

In [None]:
# One indicator column per season level; no level is dropped here
status = pd.get_dummies(bikesharing.season)
status.head()

In [None]:
# Season dummies with the redundant first level dropped.
# drop_first expects a bool; the string 'True' only worked because any non-empty
# string is truthy (the string 'False' would have dropped the first level as well).
status1 = pd.get_dummies(bikesharing["season"], drop_first=True)
status1.head()

In [None]:
# Month dummies with the redundant first level dropped.
# drop_first expects a bool; the string 'True' only worked because any non-empty
# string is truthy.
status2 = pd.get_dummies(bikesharing["mnth"], drop_first=True)
status2.head()

In [None]:
# Weekday dummies with the redundant first level dropped.
# drop_first expects a bool; the string 'True' only worked because any non-empty
# string is truthy.
status3 = pd.get_dummies(bikesharing["weekday"], drop_first=True)
status3.head()

In [None]:
# Indicator columns for weathersit. Unlike the season/mnth/weekday cells, drop_first
# is not passed, so every level present in the data gets its own column.
# NOTE(review): with an intercept these columns are perfectly collinear unless one
# level is removed later — confirm that a downstream cell handles it.
status4 = pd.get_dummies(bikesharing["weathersit"])
status4.head()

In [None]:
# Append the season, month, weekday and weather indicator columns to 'bikesharing'
bikesharing = pd.concat(
    [bikesharing, status1, status2, status3, status4],
    axis="columns",
)
bikesharing.head()

In [None]:
bikesharing.shape

In [None]:
bikesharing.describe()

In [None]:
#Print columns after creating dummies
bikesharing.columns

In [None]:
# Drop the original season column (this cell removes only 'season')
bikesharing = bikesharing.drop(columns='season')
bikesharing.head()

In [None]:
# Drop the original mnth column
bikesharing = bikesharing.drop(columns='mnth')
bikesharing.head()

In [None]:
# Drop the original weekday column
bikesharing = bikesharing.drop(columns='weekday')
bikesharing.head()

In [None]:
import os
os.getcwd()