#Simple Linear Regression for Bike Sharing Demand

##Preparation

In [206]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

from sklearn.metrics import r2_score, mean_squared_log_error,mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [207]:
train_df = pd.read_csv("train.csv", encoding='latin-1')
test_df = pd.read_csv("test.csv", encoding ='latin-1')
df_sample_submission=pd.read_csv("samplesubmission.csv", encoding ='latin-1')

In [208]:
train_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [209]:
test_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [210]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [211]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6493 entries, 0 to 6492
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    6493 non-null   object 
 1   season      6493 non-null   int64  
 2   holiday     6493 non-null   int64  
 3   workingday  6493 non-null   int64  
 4   weather     6493 non-null   int64  
 5   temp        6493 non-null   float64
 6   atemp       6493 non-null   float64
 7   humidity    6493 non-null   int64  
 8   windspeed   6493 non-null   float64
dtypes: float64(3), int64(5), object(1)
memory usage: 456.7+ KB


##Preprocessing modeling

In [212]:
x = train_df.drop(['count'],axis=1,inplace=False)
y = train_df['count']

In [213]:
#Splitting train and test

X_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.2,random_state=0)

In [214]:
train_df['datetime']=pd.to_datetime(train_df['datetime'])

In [215]:
train_df['Year'] = train_df['datetime'].dt.year
train_df['Month'] = train_df['datetime'].dt.month
train_df['Day'] = train_df['datetime'].dt.day
train_df['Hour'] = train_df['datetime'].dt.hour

In [216]:
#dropping collumn that has no effecr on the target table
train_df.drop(['datetime','casual','registered'],axis=1,inplace=True)

In [217]:
train_df = pd.get_dummies(train_df , columns = ['Hour'],prefix = ['Hour'] )
train_df = pd.get_dummies(train_df , columns = ['season'],prefix = ['season'] )

This time i didn't handle outliers because I wanted to see the outcomes is without removing outliers

In [218]:
train_df['log_count']=np.log(train_df['count'])

In [219]:
df_test=pd.concat([x_test , y_test] , axis=1)
df_test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
6638,2012-03-13 21:00:00,1,0,1,1,23.78,27.275,56,7.0015,44,200,244
7975,2012-06-12 16:00:00,2,0,1,2,27.06,29.545,89,19.0012,30,209,239
5915,2012-02-02 16:00:00,1,0,1,1,18.86,22.725,55,19.0012,18,211,229
8050,2012-06-15 19:00:00,2,0,1,1,28.7,31.82,42,11.0014,98,369,467
5894,2012-02-01 19:00:00,1,0,1,1,22.14,25.76,52,19.0012,20,315,335


In [220]:
df_test['datetime']=pd.to_datetime(df_test['datetime'])

df_test['Year'] = df_test['datetime'].dt.year
df_test['Month'] = df_test['datetime'].dt.month
df_test['Day'] = df_test['datetime'].dt.day
df_test['Hour'] = df_test['datetime'].dt.hour

In [221]:
#dropping collumn that has no effecr on the target table
df_test.drop(['datetime','casual','registered'],axis=1,inplace=True)

In [222]:
df_test = pd.get_dummies(df_test , columns = ['Hour'],prefix = ['Hour'] )
df_test = pd.get_dummies(df_test , columns = ['season'],prefix = ['season'] )

This time i didn't handle outliers because I wanted to see the outcomes is without removing outliers

In [223]:
df_test['log_count']=np.log(df_test['count'])

In [224]:
#Setting up train data
train_df.columns

Index(['holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity',
       'windspeed', 'count', 'Year', 'Month', 'Day', 'Hour_0', 'Hour_1',
       'Hour_2', 'Hour_3', 'Hour_4', 'Hour_5', 'Hour_6', 'Hour_7', 'Hour_8',
       'Hour_9', 'Hour_10', 'Hour_11', 'Hour_12', 'Hour_13', 'Hour_14',
       'Hour_15', 'Hour_16', 'Hour_17', 'Hour_18', 'Hour_19', 'Hour_20',
       'Hour_21', 'Hour_22', 'Hour_23', 'season_1', 'season_2', 'season_3',
       'season_4', 'log_count'],
      dtype='object')

In [225]:

y_train = train_df['log_count']
X_train = train_df.drop(['log_count', 'count'], axis=1, inplace=False)

In [226]:
df_test.columns

Index(['holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity',
       'windspeed', 'count', 'Year', 'Month', 'Day', 'Hour_0', 'Hour_1',
       'Hour_2', 'Hour_3', 'Hour_4', 'Hour_5', 'Hour_6', 'Hour_7', 'Hour_8',
       'Hour_9', 'Hour_10', 'Hour_11', 'Hour_12', 'Hour_13', 'Hour_14',
       'Hour_15', 'Hour_16', 'Hour_17', 'Hour_18', 'Hour_19', 'Hour_20',
       'Hour_21', 'Hour_22', 'Hour_23', 'season_1', 'season_2', 'season_3',
       'season_4', 'log_count'],
      dtype='object')

In [227]:
y_df_test = df_test['log_count']
x_df_test = df_test.drop(['log_count', 'count'],axis=1,inplace=False)

In [228]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
x_df_test = ss.transform(x_df_test)

##regression and evaluation Models

###linear regression

In [229]:
model = LinearRegression()
model.fit(X_train,y_train)
y_train_predict = model.predict(X_train)
y_df_test_predict = model.predict(x_df_test)
r2_train = r2_score(y_train, y_train_predict)
r2_validate = r2_score(y_df_test, y_df_test_predict)
print("r2 on trainset :",r2_train)
print("r2 on testset  :",r2_validate)

train_msle = mean_squared_log_error(y_train, y_train_predict)
validate_msle = mean_squared_log_error(y_df_test, y_df_test_predict)

print(train_msle)
print(validate_msle)

r2 on trainset : 0.8234952643811674
r2 on testset  : 0.8190738453388956
0.02945987735030102
0.030553724166054085


In [230]:
y_train_predict.min()

0.23118169989366244

##preprocess the test data

In [231]:
test_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [232]:
#convert datetime from object to datetime64
test_df['datetime']=pd.to_datetime(test_df['datetime'])

#extract information like day, month and year, ect.... from column datetime
test_df['Year'] = test_df['datetime'].dt.year
test_df['Month'] = test_df['datetime'].dt.month
test_df['Day'] = test_df['datetime'].dt.day
test_df['Hour'] = test_df['datetime'].dt.hour

#drop datetime
test_df.drop(['datetime'],axis=1,inplace=True)
#one hot encoder
test_df = pd.get_dummies(test_df , columns = ['Hour'],prefix = ['Hour'] )
test_df = pd.get_dummies(test_df , columns = ['season'],prefix = ['season'])

In [233]:
test_df.columns

Index(['holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity',
       'windspeed', 'Year', 'Month', 'Day', 'Hour_0', 'Hour_1', 'Hour_2',
       'Hour_3', 'Hour_4', 'Hour_5', 'Hour_6', 'Hour_7', 'Hour_8', 'Hour_9',
       'Hour_10', 'Hour_11', 'Hour_12', 'Hour_13', 'Hour_14', 'Hour_15',
       'Hour_16', 'Hour_17', 'Hour_18', 'Hour_19', 'Hour_20', 'Hour_21',
       'Hour_22', 'Hour_23', 'season_1', 'season_2', 'season_3', 'season_4'],
      dtype='object')

In [234]:
predictions = model.predict(test_df)
predictions = np.exp(predictions)

In [235]:
submission = pd.DataFrame({
    'DateTime': df_sample_submission['datetime'],
    'count': predictions
})

submission.to_csv('submission.csv', index=False)

In [238]:

submission_finish = pd.read_csv("submission.csv", encoding='latin-1')
submission_finish.head()

Unnamed: 0,DateTime,count
0,2011-01-20 00:00:00,inf
1,2011-01-20 01:00:00,inf
2,2011-01-20 02:00:00,inf
3,2011-01-20 03:00:00,inf
4,2011-01-20 04:00:00,inf
