## Bike Sharing Pridicting model
### Imports

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score

### Dataset from UCI Repository
https://archive.ics.uci.edu/dataset/560/seoul+bike+sharing+demand

In [28]:
df = pd.read_csv("seoul_bike_data/SeoulBikeData.csv", encoding="latin-1")
df.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


### Preprocessing Dataset

In [29]:
def processByLable(label, dataframe): 
    uniqueValues = dataframe[label].unique()
    valueToIndex = {value: index for index, value in enumerate(uniqueValues)}
    
    dataframe[label] = dataframe[label].map(valueToIndex)
    print(valueToIndex)

processByLable("Seasons", df)
processByLable("Holiday", df)
processByLable("Functioning Day", df)
df.head()

{'Winter': 0, 'Spring': 1, 'Summer': 2, 'Autumn': 3}
{'No Holiday': 0, 'Holiday': 1}
{'Yes': 0, 'No': 1}


Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,0,0,0
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,0,0,0
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,0,0,0
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,0,0,0
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,0,0,0


### Separating trainning dataset from validation and testing

In [30]:
train, valid, test = np.split(df.sample(frac=1), [int(0.6*len(df)), int(0.8*len(df))])

  return bound(*args, **kwds)


### Scaling and oversampling

In [31]:
def scaleDataset(dataframe, oversample=False):
    x = dataframe[dataframe.columns[1:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    scaler = StandardScaler()
    x = scaler.fit_transform(x)

    if oversample:
        ros = RandomOverSampler()
        x, y = ros.fit_resample(x, y)

    data = np.hstack((x, np.reshape(y, (-1, 1))))

    return data, x, y

In [32]:
train, x_train, y_train = scaleDataset(train)
valid, x_valid, y_valid = scaleDataset(valid)
test, x_test, y_test = scaleDataset(test)

### Gradient Decent

In [33]:
from sklearn.linear_model import SGDRegressor

GD_model = SGDRegressor()
GD_model = GD_model.fit(x_train, y_train)
y_predict = GD_model.predict(x_test)

In [34]:
mae = mean_absolute_error(y_test, y_predict)
mse = mean_squared_error(y_test, y_predict)
rmse = root_mean_squared_error(y_test, y_predict)
r2s = r2_score(y_test, y_predict)

print(f"mean absolute error: {mae:.4f}\nmean squared error: {mse:.4f}\nroot mean squared error: {rmse:.4f}\nr(squared) score: {r2s:.4f}")

mean absolute error: 0.0818
mean squared error: 0.0281
root mean squared error: 0.1676
r(squared) score: 0.1637
