<a href="https://colab.research.google.com/github/Auraachn/Bike-Sharing-Demand/blob/main/Simple_Linear_Regression_BSD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Simple Linear Regression for Bike Sharing Demand

##Preparation

In [526]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

from sklearn.metrics import r2_score, mean_squared_log_error,mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [527]:
# Correct URLs to raw data files
train_url = "https://raw.githubusercontent.com/Auraachn/Bike-Sharing-Demand/main/train.csv"
test_url = "https://raw.githubusercontent.com/Auraachn/Bike-Sharing-Demand/d42aa4e2c8671f293122940fa46ccfeba7894b51/test.csv"
sample_submission_url = "https://raw.githubusercontent.com/Auraachn/Bike-Sharing-Demand/main/sampleSubmission.csv"

# Read the CSV files into DataFrames
df_train = pd.read_csv(train_url, encoding='latin-1')
df_test = pd.read_csv(test_url, encoding='latin-1')
df_sample_submission = pd.read_csv(sample_submission_url, encoding='latin-1')

In [528]:
print('length of Train',len(df_train))
print('length of test',len(df_test))
print('length of gender submission',len(df_sample_submission))

length of Train 10886
length of test 6493
length of gender submission 6493


In [529]:
df_train.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
      dtype='object')

Splitting Data

In [530]:
x = df_train[['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered']]
y = df_train[['count']]

In [531]:
from sklearn.model_selection import train_test_split
x_Train, x_Validate, y_Train, y_Validate = train_test_split(x, y, test_size= 0.3, random_state= 0)

##Data Preprocessing

Convert date time collumn from object to datetime

In [532]:
df_train['datetime']=pd.to_datetime(df_train['datetime'])
df_train['Year'] = df_train['datetime'].dt.year
df_train['Month'] = df_train['datetime'].dt.month
df_train['Day'] = df_train['datetime'].dt.day
df_train['Hour'] = df_train['datetime'].dt.hour
df_train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,Year,Month,Day,Hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,1,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,1,2
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,1,3
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,1,4


here, I choose to remove or drop casual and registered because its not exist in test set so does it will not give any importance to the prediction

In [533]:
df_train.drop(['datetime','casual','registered'],axis=1,inplace=True)

In [534]:
df_train = pd.get_dummies(df_train , columns = ['Hour'],prefix = ['Hour'] )

In [535]:
df_train = pd.get_dummies(df_train , columns = ['season'],prefix = ['season'] )

I choose not to handle the outliers to see the outcome

In [536]:
df_train['log_count']=np.log(df_train['count'])

Preparation of the validate set depend on the results of preparation of train set

In [537]:
df_validate=pd.concat([x_Validate , y_Validate] , axis=1)

In [538]:
df_validate.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
6638,2012-03-13 21:00:00,1,0,1,1,23.78,27.275,56,7.0015,44,200,244
7975,2012-06-12 16:00:00,2,0,1,2,27.06,29.545,89,19.0012,30,209,239
5915,2012-02-02 16:00:00,1,0,1,1,18.86,22.725,55,19.0012,18,211,229
8050,2012-06-15 19:00:00,2,0,1,1,28.7,31.82,42,11.0014,98,369,467
5894,2012-02-01 19:00:00,1,0,1,1,22.14,25.76,52,19.0012,20,315,335


In [539]:
df_validate['datetime']=pd.to_datetime(df_validate['datetime'])
df_validate['Year'] = df_validate['datetime'].dt.year
df_validate['Month'] = df_validate['datetime'].dt.month
df_validate['Day'] = df_validate['datetime'].dt.day
df_validate['Hour'] = df_validate['datetime'].dt.hour
df_validate.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,Year,Month,Day,Hour
6638,2012-03-13 21:00:00,1,0,1,1,23.78,27.275,56,7.0015,44,200,244,2012,3,13,21
7975,2012-06-12 16:00:00,2,0,1,2,27.06,29.545,89,19.0012,30,209,239,2012,6,12,16
5915,2012-02-02 16:00:00,1,0,1,1,18.86,22.725,55,19.0012,18,211,229,2012,2,2,16
8050,2012-06-15 19:00:00,2,0,1,1,28.7,31.82,42,11.0014,98,369,467,2012,6,15,19
5894,2012-02-01 19:00:00,1,0,1,1,22.14,25.76,52,19.0012,20,315,335,2012,2,1,19


In [540]:
df_validate.drop(['datetime','casual','registered'],axis=1,inplace=True)

In [541]:
df_validate = pd.get_dummies(df_validate , columns = ['Hour'],prefix = ['Hour'] )

In [542]:
df_validate = pd.get_dummies(df_validate , columns = ['season'],prefix = ['season'] )

In [543]:
df_validate['log_count']=np.log(df_validate['count'])

In [544]:
x_train=df_train[['season_1', 'season_2', 'season_3'
                  , 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'Year', 'Month', 'Day', 'Hour_1',
       'Hour_2', 'Hour_3', 'Hour_4', 'Hour_5', 'Hour_6', 'Hour_7', 'Hour_8',
       'Hour_9', 'Hour_10', 'Hour_11', 'Hour_12', 'Hour_13', 'Hour_14',
       'Hour_15', 'Hour_16', 'Hour_17', 'Hour_18', 'Hour_19', 'Hour_20',
       'Hour_21', 'Hour_22', 'Hour_23']]
y_train=df_train[[ 'log_count' ]]

In [545]:
y_train.min()

log_count    0.0
dtype: float64

In [546]:
x_validate=df_validate[['season_1', 'season_2', 'season_3'
                  , 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'Year', 'Month', 'Day', 'Hour_1',
       'Hour_2', 'Hour_3', 'Hour_4', 'Hour_5', 'Hour_6', 'Hour_7', 'Hour_8',
       'Hour_9', 'Hour_10', 'Hour_11', 'Hour_12', 'Hour_13', 'Hour_14',
       'Hour_15', 'Hour_16', 'Hour_17', 'Hour_18', 'Hour_19', 'Hour_20',
       'Hour_21', 'Hour_22', 'Hour_23']]
y_validate=df_validate[[ 'log_count' ]]

In [547]:
x_validate

Unnamed: 0,season_1,season_2,season_3,holiday,workingday,weather,temp,atemp,humidity,windspeed,...,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23
6638,True,False,False,0,1,1,23.78,27.275,56,7.0015,...,False,False,False,False,False,False,False,True,False,False
7975,False,True,False,0,1,2,27.06,29.545,89,19.0012,...,False,False,True,False,False,False,False,False,False,False
5915,True,False,False,0,1,1,18.86,22.725,55,19.0012,...,False,False,True,False,False,False,False,False,False,False
8050,False,True,False,0,1,1,28.70,31.820,42,11.0014,...,False,False,False,False,False,True,False,False,False,False
5894,True,False,False,0,1,1,22.14,25.760,52,19.0012,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5521,True,False,False,0,1,2,8.20,11.365,59,6.0032,...,False,False,False,False,False,False,False,False,False,False
10626,False,False,False,0,0,2,16.40,20.455,94,7.0015,...,False,False,False,False,False,False,False,False,False,False
8126,False,True,False,0,1,3,24.60,28.790,78,12.9980,...,False,False,False,False,False,False,False,False,False,True
1633,False,True,False,0,1,1,16.40,20.455,71,8.9981,...,False,False,False,False,False,False,False,False,False,True


In [548]:
y_validate

Unnamed: 0,log_count
6638,5.497168
7975,5.476464
5915,5.433722
8050,6.146329
5894,5.814131
...,...
5521,1.609438
10626,2.484907
8126,4.304065
1633,4.127134


##Regression and evaluation Models

In [549]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_validate = sc.transform(x_validate)

In [550]:
linear_regression = LinearRegression()
linear_regression.fit(x_train, y_train)
y_train_predict = linear_regression.predict(x_train)
y_validate_predict = linear_regression.predict(x_validate)
r2_train = r2_score(y_train, y_train_predict)
r2_vali = r2_score(y_validate, y_validate_predict)
print("r2 on trainset:",r2_train)
print("r2 on testset:",r2_vali)

train_msle = mean_squared_log_error(y_train, y_train_predict)
vali_msle = mean_squared_log_error(y_validate, y_validate_predict)
print(train_msle)
print(vali_msle)

r2 on trainset: 0.823504138998709
r2 on testset: 0.8207676854962006
0.02941859373249949
0.03149093565860882


In [551]:
print(y_validate_predict)

[[5.17123235]
 [5.84032886]
 [5.37083657]
 ...
 [4.52429231]
 [4.03991828]
 [4.82008515]]


In [552]:
y_train_predict.min()

0.2290285516874473

#Preprocess the test data

In [553]:
df_test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [554]:
df_test['datetime']=pd.to_datetime(df_test['datetime'])
df_test['Year'] = df_test['datetime'].dt.year
df_test['Month'] = df_test['datetime'].dt.month
df_test['Day'] = df_test['datetime'].dt.day
df_test['Hour'] = df_test['datetime'].dt.hour
df_test.drop(['datetime'],axis=1,inplace=True)

In [555]:
df_test = pd.get_dummies(df_test , columns = ['Hour'],prefix = ['Hour'] )

In [556]:
df_test = pd.get_dummies(df_test , columns = ['season'],prefix = ['season'])

In [557]:
df_test=df_test[['season_1', 'season_2', 'season_3'
                  , 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'Year', 'Month', 'Day', 'Hour_1',
       'Hour_2', 'Hour_3', 'Hour_4', 'Hour_5', 'Hour_6', 'Hour_7', 'Hour_8',
       'Hour_9', 'Hour_10', 'Hour_11', 'Hour_12', 'Hour_13', 'Hour_14',
       'Hour_15', 'Hour_16', 'Hour_17', 'Hour_18', 'Hour_19', 'Hour_20',
       'Hour_21', 'Hour_22', 'Hour_23']]

In [558]:
sc = StandardScaler()
df_test = sc.fit_transform(df_test)

In [559]:
predictions = linear_regression.predict(df_test)
predictions = np.exp(predictions)

In [560]:
predictions

array([[ 15.1577724 ],
       [  9.13616985],
       [  5.25600857],
       ...,
       [194.33483026],
       [157.38584013],
       [102.54839896]])

In [562]:
predictions_fix = predictions.flatten()

In [563]:
submission = pd.DataFrame({
    'DateTime': df_sample_submission['datetime'],
    'Count': predictions_fix})

submission.to_csv('submission.csv', index=False)

In [564]:
print(submission.head())


              DateTime      Count
0  2011-01-20 00:00:00  15.157772
1  2011-01-20 01:00:00   9.136170
2  2011-01-20 02:00:00   5.256009
3  2011-01-20 03:00:00   2.923032
4  2011-01-20 04:00:00   2.055214
