In [None]:
# required libs
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [None]:
data = pd.read_csv("day.csv")

1. Data Exploration

In [None]:
data.head(6)

In [None]:
data.columns

In [None]:
data.describe()

In [None]:
data.info()

In [4]:
# Replace the numeric season codes with readable labels.
# Series.replace returns a new object-dtype column, so no string is ever
# written into the existing integer column (pandas >= 2.1 deprecates that kind
# of incompatible-dtype assignment). Codes that are not in the mapping are
# left unchanged, exactly as with the previous per-code .loc assignments.
season_labels = {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'}
data['season'] = data['season'].replace(season_labels)

In [7]:
data['season'].astype('category').value_counts()

season
fall      188
summer    184
spring    180
winter    178
Name: count, dtype: int64

In [None]:
def chg_mnth(x):
    """Translate month numbers (1-12) in a Series into lowercase three-letter names.

    Parameters
    ----------
    x : pandas.Series
        Series of month numbers.

    Returns
    -------
    pandas.Series
        Same index as ``x``; values outside 1-12 become NaN (``Series.map``
        semantics).
    """
    month_names = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                   'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    # enumerate(..., start=1) pairs 1 -> 'jan', ..., 12 -> 'dec'
    lookup = dict(enumerate(month_names, start=1))
    return x.map(lookup)

In [None]:
# Recode the month numbers in place of the original 'mnth' column.
data['mnth'] = chg_mnth(data['mnth'])

In [None]:
def chg_weekday(x):
    """Translate weekday numbers (0-6) in a Series into short day names.

    Parameters
    ----------
    x : pandas.Series
        Series of weekday numbers, where 0 maps to 'Sun' and 6 to 'Sat'.

    Returns
    -------
    pandas.Series
        Same index as ``x``; values outside 0-6 become NaN (``Series.map``
        semantics).
    """
    # Position in the list is the weekday number; labels are kept exactly as
    # they were (including the lowercase 'wed') because they end up in the
    # dummy column names.
    day_names = ['Sun', 'Mon', 'Tue', 'wed', 'Thu', 'Fri', 'Sat']
    return x.map(dict(enumerate(day_names)))

In [None]:
# Recode the weekday numbers in place of the original 'weekday' column.
data['weekday'] = chg_weekday(data['weekday'])

In [None]:
data['holiday'].astype('category').value_counts()

In [None]:
# Replace the numeric weather-situation codes with letter labels.
# Series.replace returns a new object-dtype column, so no string is ever
# written into the existing integer column (pandas >= 2.1 deprecates that kind
# of incompatible-dtype assignment). Codes that are not in the mapping are
# left unchanged, exactly as with the previous per-code .loc assignments.
weathersit_labels = {1: 'A', 2: 'B', 3: 'C'}
data['weathersit'] = data['weathersit'].replace(weathersit_labels)

In [None]:
data['weathersit'].astype('category').value_counts()

2. Data Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Histogram with a KDE overlay, normalised to a density.
# histplot replaces the deprecated sns.distplot.
sns.histplot(data['temp'], kde=True, stat='density')

In [None]:
# Histogram with a KDE overlay, normalised to a density.
# histplot replaces the deprecated sns.distplot.
sns.histplot(data['atemp'], kde=True, stat='density')

In [None]:
# Histogram with a KDE overlay, normalised to a density.
# histplot replaces the deprecated sns.distplot.
sns.histplot(data['windspeed'], kde=True, stat='density')

In [None]:
# Histogram with a KDE overlay, normalised to a density.
# histplot replaces the deprecated sns.distplot.
sns.histplot(data['cnt'], kde=True, stat='density')

In [None]:
# convert date into datetime format (second resolution)
# NOTE(review): the string layout of 'dteday' is not visible in this notebook;
# if it is day-first (dd-mm-yyyy) this implicit parse may be ambiguous.
# Confirm, and consider pd.to_datetime with an explicit format.
data['dteday'] = data['dteday'].astype('datetime64[s]')

In [None]:
data.info()

In [None]:
# Keep only the columns that are neither float64, int64 nor datetime64 --
# presumably the columns recoded to string labels above (verify with .dtypes).
# This name is reassigned later in the data-preparation section.
data_catageries = data.select_dtypes(exclude=['float64', 'datetime64', 'int64'])

In [None]:
data_catageries

In [None]:
# Box plots of the daily count ('cnt') against each grouping column, laid out
# on a 3x3 grid; only the first seven slots are used.
plt.figure(figsize=(20, 20))
box_columns = ['yr', 'mnth', 'weekday', 'workingday', 'season', 'holiday', 'weathersit']
for slot, column in enumerate(box_columns, start=1):
    plt.subplot(3, 3, slot)
    sns.boxplot(x=column, y='cnt', data=data)
plt.show()

In [None]:
data.info()

In [None]:
# Cast the three count columns to float64 in one step so that they are picked
# up by the float64-only selection used for the numeric plots below.
intvar = ['casual', 'registered', 'cnt']
data[intvar] = data[intvar].astype('float64')

In [None]:
data_numeric = data.select_dtypes('float64')
data_numeric.head()

In [None]:
# pairplot: plots every column of data_numeric against every other column
sns.pairplot(data_numeric)
plt.show()

In [None]:
corr = data_numeric.corr()
corr

In [None]:
# heatmap of the data_numeric correlation matrix
# Explicit boolean mask (True = hidden) covering the strict upper triangle, so
# each pair is shown once and the diagonal stays visible. Building the mask
# from the correlation values themselves would leave any exactly-zero
# correlation unmasked.
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)

fig, ax = plt.subplots()
fig.set_size_inches(10, 10)

sns.heatmap(corr, mask=mask, vmax=1, square=True, annot=True, ax=ax)

In [None]:
# Drop 'atemp' (presumably because it is nearly collinear with 'temp' in the
# heatmap above -- confirm). Non-in-place and errors='ignore' so the cell can
# be re-run without raising KeyError once the column is already gone.
data = data.drop(columns='atemp', errors='ignore')

In [None]:
data.info()

3. Data Preparation

In [None]:
data.info()

In [None]:
data_catageries = data.select_dtypes(include=['object'])
data_catageries

In [None]:
# One-hot encode the object-dtype columns as 0/1 integers.
# drop_first=True drops the first level of every column, so k levels become
# k-1 indicator columns.
data_dummy = pd.get_dummies(data_catageries, drop_first= True, dtype='int')

In [None]:
data_dummy.head()

In [None]:
# Keep only the non-categorical columns for the model; 'instant' and 'dteday'
# are dropped as well.
# NOTE(review): this rebinds `data`, so re-running the cell raises KeyError
# (the columns are already gone) until the frame is reloaded.
data = data.drop(list(data_catageries.columns)+['instant', 'dteday'], axis=1)

In [None]:
# Final modelling frame: remaining columns of `data` side by side with the
# dummy columns (axis=1 concatenates column-wise, aligned on the index).
fdata = pd.concat([data, data_dummy], axis=1)

In [None]:
fdata.head()

4. Model Building & Evaluation

In [None]:
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
#split train and test datasets (70% train / 30% test)
# random_state=100 fixes the shuffle performed by train_test_split.
# NOTE(review): np.random.seed(0) looks redundant next to the explicit
# random_state -- confirm nothing else relies on the global seed.
np.random.seed(0)
df_train, df_test = train_test_split(fdata, train_size=0.7, test_size=0.3, random_state=100)

In [None]:
df_train

In [None]:
# Min-max scale the listed columns of the training split.
# The scaler is fitted here on df_train only; the same fitted object is later
# applied to df_test with transform().
from sklearn.preprocessing import MinMaxScaler
scalar = MinMaxScaler()
# columns to scale; this list is reused when the test split is scaled
var = ['temp', 'hum', 'windspeed', 'casual', 'registered', 'cnt']
df_train[var] = scalar.fit_transform(df_train[var])

In [None]:
plt.figure(figsize= (30, 30))    
sns.heatmap(df_train.corr(), annot=True, cmap="YlGnBu")
plt.show()

In [None]:
# Separate the target from the predictors.
# The target must be popped BEFORE the feature matrix is built: building
# x_train first leaves 'cnt' inside it, i.e. the model would be trained on the
# very value it is supposed to predict (target leakage).
y_train = df_train.pop('cnt')
# 'casual' and 'registered' are excluded from the predictors as well.
x_train = df_train.drop(['casual', 'registered'], axis=1)

In [None]:
# Training data: OLS fit on every column of x_train
import statsmodels.api as sm
# sm.OLS does not add an intercept by itself, so a constant column is added
x_train_lm = sm.add_constant(x_train)

# fitted results object; inspected below via .params and .summary()
lr = sm.OLS(y_train,x_train_lm).fit()

In [None]:
lr.params

In [None]:
LR = LinearRegression()

LR.fit(x_train, y_train)

In [None]:
print(LR.coef_)
print(LR.intercept_)

In [None]:
lr.summary()

In [None]:
# removing the features that have less influence on the target
from sklearn.feature_selection import RFE 

# Recursive feature elimination with the LR estimator, keeping 15 features
rfe1 = RFE(estimator=LR, n_features_to_select=15)
rfe1.fit(x_train, y_train)
# support_: boolean mask over x_train's columns (used below to pick col1)
print(rfe1.support_)
# ranking_: per-feature rank from the elimination (selected features get rank 1)
print(rfe1.ranking_)

In [None]:
col1 = x_train.columns[rfe1.support_]

In [None]:
col1

In [None]:
# Refit OLS on the 15 RFE-selected features only.
x_train_rfe1 = x_train[col1]
x_train_rfe1 = sm.add_constant(x_train_rfe1)
# Fit on the reduced design matrix built above; fitting on x_train_lm would
# simply reproduce the all-features model and ignore the RFE selection.
lm = sm.OLS(y_train, x_train_rfe1).fit()
lm.summary()

In [None]:
# evaluating VIFs
from statsmodels.stats.outliers_influence import variance_inflation_factor

a = x_train_rfe1.drop('const', axis=1)

In [None]:
# Variance inflation factor for every column of `a`, rounded to two decimals
# and listed from highest to lowest.
vif = (
    pd.DataFrame({
        'features': a.columns,
        'VIF': [variance_inflation_factor(a.values, idx) for idx in range(a.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by='VIF', ascending=False)
)
vif

In [None]:
# Second, stricter recursive feature elimination: keep only 7 features.
# The estimator is the same LR object used for the first RFE pass. The former
# `lm = LinearRegression()` line was removed: that object was never used and
# it overwrote the fitted OLS results stored in `lm`.
rfe2 = RFE(estimator=LR, n_features_to_select=7)

# fit with 7 features
rfe2.fit(x_train, y_train)
print(rfe2.support_)
print(rfe2.ranking_)

In [None]:
# Names of the 7 columns kept by rfe2
col2 = x_train.columns[rfe2.support_]

# OLS on the reduced feature set, with an explicit intercept column;
# lm2 is the model used for the test-set predictions further down.
x_train_rfe2 = x_train[col2]
x_train_rfe2 = sm.add_constant(x_train_rfe2)
lm2 = sm.OLS(y_train,x_train_rfe2).fit()
lm2.summary()

In [None]:
# Variance inflation factors for the 7-feature model (intercept column
# removed first), rounded to two decimals and listed from highest to lowest.
b = x_train_rfe2.drop('const', axis=1)
vif1 = (
    pd.DataFrame({
        'features': b.columns,
        'VIF': [variance_inflation_factor(b.values, idx) for idx in range(b.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by='VIF', ascending=False)
)
vif1

In [None]:
# Scale the test split with the scaler that was fitted on the training split
# (transform only -- the scaler is not refitted on test data).
# NOTE(review): running this cell twice rescales already-scaled values.
df_test[var] =scalar.transform(df_test[var])
df_test

In [None]:
# Separate the target from the predictors for the test split.
# The target must be popped BEFORE the feature matrix is built: building
# x_test first leaves 'cnt' inside it (target leakage into the features).
y_test = df_test.pop('cnt')
# 'casual' and 'registered' are excluded from the predictors as well.
x_test = df_test.drop(['casual', 'registered'], axis=1)

In [None]:
x_test

In [None]:
c = x_train_rfe2.drop('const', axis=1)
c.columns

In [None]:
# Build the test design matrix with the same 7 columns the model was fit on.
x_test_rfe2 = x_test[col2]
# has_constant='add' forces the intercept column to be added. With the default
# ('skip'), add_constant silently returns the frame unchanged whenever one of
# the selected columns happens to be constant in the test split, and the
# design matrix would then no longer match the columns lm2 was fitted on.
x_test_rfe2 = sm.add_constant(x_test_rfe2, has_constant='add')
x_test_rfe2.info()

In [None]:
y_pred = lm2.predict(x_test_rfe2)

In [None]:
# Actual vs. predicted target on the test split; labelled so that the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set(xlabel='y_test', ylabel='y_pred', title='y_test vs y_pred')
plt.show()

In [None]:
from sklearn.metrics import r2_score 
r2_score(y_test, y_pred)

In [None]:
# Correlation between the 7 features selected by rfe2.
plt.figure(figsize=(8,5))

# col2 was taken from x_train's columns, so index x_train: `data` no longer
# holds the dummy-encoded columns (they only exist in fdata and the splits)
# and indexing it with col2 can raise KeyError.
sns.heatmap(x_train[col2].corr(), cmap='YlGnBu', annot=True)
plt.show()