# Bike Sharing Demand

# Information

This dataset has been acquired from Kaggle: https://www.kaggle.com/c/bike-sharing-demand/data

The analysis of this dataset has two phases. The first phase covers exploratory data analysis and data visualization to gain insights into the dataset. In the second phase, I use machine learning algorithms to predict the number of bike rentals in 2011 and 2012. At the end, I check my predictions by uploading them to Kaggle, which evaluates how good they are.

# Exploratory data analysis

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor


from sklearn import set_config 
set_config(transform_output='pandas')

import matplotlib.font_manager as fm

In [None]:
# Load the training set; the first CSV column becomes the index and is parsed as dates.
training_csv_path = "data/trainingset_bikeshare.csv"
df = pd.read_csv(training_csv_path, parse_dates=True, index_col=0)
df.head()

In [None]:
# Preview the last rows of the training frame.
df.tail()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Inspect the dtype of the 'weather' column.
df['weather'].dtype

In [None]:
# Print the value range of a few columns of interest.
col = ['workingday', 'weather', 'count', 'holiday']

for column_name in col:
    # Dedicated names avoid shadowing the built-in max()/min(); the previous
    # version also had a trailing comma that turned the maximum into a 1-tuple.
    column_max = df[column_name].max()
    column_min = df[column_name].min()
    print(f"max and min for {column_name}: {column_max} and {column_min}")

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heat map.
# numeric_only=True keeps this cell working when it is re-run after
# non-numeric columns (e.g. 'time') have been added to df.
df_corr = df.corr(numeric_only=True)

plt.figure(figsize=(10, 8))

sns.heatmap(df_corr, annot=True)

plt.title('Correlation Heat Map')

In [None]:
# Bar chart of rental count per season.
sns.set(style='whitegrid')
plt.figure(figsize=(8, 6))

# Optional colours: pass palette=custom_palette (or palette='colorblind') to barplot.
custom_palette = ["orange", "lightblue", "lightgreen", "violet"]

plot = sns.barplot(data=df, x='season', y='count')

# Titles and axis labels.
plt.title("Season's effect on bike rentals", weight='bold', fontsize=13)
plt.xlabel('Season', fontsize=12)
plt.ylabel('Rental counts', fontsize=12)

# Hide the top and right frame lines.
for side in ('top', 'right'):
    plot.spines[side].set_visible(False)

# Tick label sizes.
plt.setp(plot.get_yticklabels(), fontsize=12)
plt.setp(plot.get_xticklabels(), fontsize=12)

plt.grid(False)

In [None]:
# Add a 'time' column holding the time-of-day part of the datetime index.
df['time'] = df.index.time
df.head()

In [None]:
# Add a 'year' column taken from the datetime index.

df['year'] = df.index.year

df.head()

In [None]:
# Add a 'month' column taken from the datetime index.

df['month'] = df.index.month
df.tail()

In [None]:
# Add a 'weekday' column taken from the index's dayofweek attribute.

df['weekday'] = df.index.dayofweek
df.tail(10)

In [None]:
#sns.set(style= 'whitegrid')

import matplotlib.font_manager as fm

# Rental count against temperature, one line per year.
plt.figure(figsize=(10, 8))
plot = sns.lineplot(data=df, x='temp', y='count', hue='year')

# Titles and axis labels.
plt.title("Temperature's effect on bike rentals", weight='bold', fontsize=12)
plt.xlabel('Temperature', fontsize=11)
plt.ylabel('Rental counts', fontsize=11)

# Hide the top and right frame lines.
for side in ('top', 'right'):
    plot.spines[side].set_visible(False)

# Tick label sizes.
plt.setp(plot.get_yticklabels(), fontsize=11)
plt.setp(plot.get_xticklabels(), fontsize=11)

# Legend: borderless frame, custom title font and explicit year labels.
legend = plt.legend(title='Year', loc='upper left', fontsize=10)
legend.set_title('Year', prop=fm.FontProperties(size=11))
legend.get_frame().set_linewidth(0)
for position, year_label in enumerate(('2011', '2012')):
    legend.texts[position].set_text(year_label)

plt.grid(False)

In [None]:
# Rows from 1 to 19 December 2012 (label-based slice on the datetime index).
dec_start, dec_end = '2012-12-01', '2012-12-19'
Dec2012 = df.loc[dec_start:dec_end]
Dec2012.tail()


In [None]:
# Number of rows and columns in the December 2012 slice.
Dec2012.shape

In [None]:
# Daily mean of the rental columns for December 2012, rounded to two decimals.
rental_columns = ['casual', 'registered', 'count']
daily_dec_rentals = Dec2012[rental_columns].resample('D')
downsampled = daily_dec_rentals.mean().round(2)
downsampled.head()

In [None]:
# Daily totals of the rental columns for December 2012.
rental_columns = ['casual', 'registered', 'count']
daily_dec_rentals = Dec2012[rental_columns].resample('D')
downsampled = daily_dec_rentals.sum()
downsampled.head()

In [None]:
# Rows from 1 to 19 July 2012 (label-based slice on the datetime index).
july_start, july_end = '2012-07-01', '2012-07-19'
July2012 = df.loc[july_start:july_end]
July2012.head()

In [None]:
# Daily rental totals for July 2012, ordered from the quietest day upwards.
july_daily_totals = July2012[['casual', 'registered', 'count']].resample('D').sum()
July_downsample = july_daily_totals.sort_values(by='count', ascending=True)
July_downsample.head()

In [None]:
# Total rentals per (season, year) pair, as a Series with a two-level index.
group_df = df.groupby(['season', 'year'])['count'].sum()
group_df.head()

In [None]:
# Same totals as a flat frame with 'season', 'year' and 'count' columns.
season_year_totals = df.groupby(['season', 'year'])['count'].agg('sum')
group_df = season_year_totals.reset_index()
group_df.head()

In [None]:

# Grouped bar chart of total rentals per season, split by year.
custom_palette = ["orange", "lightgreen"]

plot = sns.barplot(
    data=group_df,
    x='season',
    y='count',
    hue='year',
    palette=custom_palette,
    errorbar=None,
)

# Hide the top and right frame lines.
for side in ('top', 'right'):
    plot.spines[side].set_visible(False)

# Titles and axis labels.
plt.title('Rental counts per year and season', weight='bold', fontsize=11)
plt.xlabel('Season', fontsize=11)
plt.ylabel('Rental count', fontsize=11)

# Legend: borderless frame, larger title and explicit year labels.
legend = plt.legend(title='Year', loc='upper left', fontsize=11)
legend.get_frame().set_linewidth(0)
legend.set_title('Year', prop=fm.FontProperties(size=14))
for position, year_label in enumerate(('2011', '2012')):
    legend.texts[position].set_text(year_label)

# Tick label sizes.
plt.setp(plot.get_yticklabels(), fontsize=11)
plt.setp(plot.get_xticklabels(), fontsize=11)

plt.show()



In [None]:
#sns.set(style= 'whitegrid')

# Rental count per year, one line per season.
plt.figure(figsize=(10, 8))

plot = sns.lineplot(x='year', y='count', hue='season', data=df)

# The x axis is the year, so the title and label say so (they previously
# carried the text copied over from the temperature plot).
plt.title("Rental counts per year by season", weight='bold', fontsize=12)
plt.xlabel('Year', fontsize=11)
plt.ylabel('Rental counts', fontsize=11)

# Only whole years are meaningful tick positions.
plt.xticks(sorted(df['year'].unique()))

plot.spines['top'].set_visible(False)
plot.spines['right'].set_visible(False)

# Legend: borderless frame and custom title font.
legend = plt.legend(title='Season', loc='upper left', fontsize=11)
legend.get_frame().set_linewidth(0)  # Remove legend frame border
legend.set_title('Season', prop=fm.FontProperties(size=11))

# Explicit legend labels for the four seasons.
legend.texts[0].set_text('1')
legend.texts[1].set_text('2')
legend.texts[2].set_text('3')
legend.texts[3].set_text('4')

# Tick label sizes.
plt.setp(plot.get_yticklabels(), fontsize=11)
plt.setp(plot.get_xticklabels(), fontsize=11)

plt.grid(False)



In [None]:
# 2x2 grid: rental count against each climate factor, one line per season.
# (The extra plt.figure(...) call was dropped: plt.subplots creates its own
# figure, so the earlier call only produced an empty one.)
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

# A figure-level title does not depend on the data limits of any single axes,
# unlike plt.text with hard-coded data coordinates. The hue is 'season'.
fig.suptitle(
    "Comparison of climate factors by season based on count",
    fontsize=14,
    style='oblique',
    alpha=0.6,
)

# ax.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
climate_factors = ["humidity", "windspeed", "atemp", "temp"]
for axis, factor in zip(ax.flat, climate_factors):
    axx = sns.lineplot(data=df, x=factor, y="count", hue='season', ax=axis)
    axx.spines['top'].set_visible(False)
    axx.spines['right'].set_visible(False)

In [None]:
# Hourly sums of the selected columns.
# NOTE(review): 'workingday' and 'weather' are summed as plain numbers here,
# just like 'count' - confirm that is the intended aggregation.
group_newdf = df[['workingday', 'weather', 'count']].resample('H').sum()
group_newdf.head()

In [None]:
# Normalise 'time' to datetime.time objects.
# Converting through str first makes this work whether the column holds
# 'HH:MM:SS' strings or datetime.time objects (as produced by df.index.time);
# pd.to_datetime does not accept datetime.time objects directly.
df['time'] = pd.to_datetime(df['time'].astype(str), format='%H:%M:%S').dt.time
df.head()

In [None]:
# Bounds of the afternoon/evening window (13:00 to 20:00) as datetime.time objects.
start_time, end_time = (
    pd.Timestamp(clock_text).time() for clock_text in ('13:00:00', '20:00:00')
)

In [None]:
# Keep only rows whose time of day lies within [start_time, end_time], bounds included.
in_time_window = df['time'].between(start_time, end_time)
time_df = df[in_time_window]
time_df

In [None]:
# Total rentals per (workingday, weather, time) combination inside the time window.
window_group_keys = ['workingday', 'weather', 'time']
window_totals = time_df.groupby(window_group_keys)['count'].sum()
group_time_df = window_totals.reset_index()
group_time_df.head()

In [None]:
# Show the rows recorded with weather level 4.
df[df['weather'] == 4]


In [None]:
# As seen above there is only 1 row with weather 4, which is similar in
# description to level 3, so it is relabelled as 3.
group_time_df.loc[group_time_df['weather'] == 4, 'weather'] = 3
# Relabelling the already-aggregated frame can leave two rows for the same
# (workingday, weather, time) key, so aggregate again to merge them; otherwise
# a plot that averages rows would be skewed by the stray small row.
group_time_df = group_time_df.groupby(
    ['workingday', 'weather', 'time'], as_index=False
)['count'].sum()
# The same relabelling is applied to the main data frame.
df.loc[df['weather'] == 4, 'weather'] = 3

In [None]:
# Rentals between 13:00 and 20:00, split by working day and weather level.
plt.figure(figsize=(8, 6))
color_palette = sns.color_palette("Blues_r")

plot = sns.barplot(
    x='workingday',
    y='count',
    hue='weather',
    data=group_time_df,
    palette=color_palette,
    errorbar=None,
)

plt.title('The rental counts between 1-8pm based on working day', weight='bold', fontsize=11)
plt.xlabel('Working day', fontsize=11)
plt.ylabel('Rental count', fontsize=11)

plot.spines['top'].set_visible(False)
plot.spines['right'].set_visible(False)

# Legend: borderless frame; the hue is 'weather', so the title says so
# (it was previously overwritten with 'Season').
legend = plt.legend(title='Weather', loc='upper right', fontsize=11)
legend.get_frame().set_linewidth(0)  # Remove legend frame border
legend.set_title('Weather', prop=fm.FontProperties(size=11))

# Explicit legend labels for the three remaining weather levels.
legend.texts[0].set_text('1')
legend.texts[1].set_text('2')
legend.texts[2].set_text('3')

# Bars are ordered by workingday value: 0 (not a working day) comes first,
# then 1 (working day). The labels follow that order.
working_day_labels = ['Weekend & holiday', 'Week day']
# Pin the tick positions before relabelling them.
plot.set_xticks(plot.get_xticks())
plot.set_xticklabels(working_day_labels)

plt.setp(plot.get_yticklabels(), fontsize=11)
plt.setp(plot.get_xticklabels(), fontsize=11)


# Adding the year can also be informative

**It would be useful to add the year to this plot as well.**

# Machine learning models

In [None]:
# Preview the frame before modelling.
df.head()

In [None]:
# Log-transform the 'count' column with log1p, overwriting it in place.
# NOTE(review): re-running this cell applies the transform a second time.
df['count'] = np.log1p(df['count'])

In [None]:
# Distribution of the log-transformed target; the curve is now less skewed.
plt.figure(figsize=(8, 10))
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its replacement for a histogram plus density curve.
sns.histplot(df['count'], bins=60, color="red", kde=True, stat="density")

In [None]:
# Column to predict.
target_variable = 'count'

# Columns that will be scaled.
numerical_features = ['temp', 'atemp', 'humidity', 'windspeed']

# Columns that will be one-hot encoded.
categorical_features = [
    'season',
    'workingday',
    'holiday',
    'weather',
    'time',
    'year',
    'month',
    'weekday',
]

# Model inputs: numerical columns first, then the categorical ones.
features = [*numerical_features, *categorical_features]

In [None]:
# Feature matrix and target vector.
x = df[features]
y = df[target_variable]

In [None]:
# Number of rows and feature columns.
x.shape

In [None]:
# Split into training and validation parts with a fixed seed.
split_parts = train_test_split(x, y, random_state=42)
x_train, x_val, y_train, y_val = split_parts

# Shapes of the four parts.
x_train.shape, x_val.shape, y_train.shape, y_val.shape

In [None]:
# Preprocessing: one-hot encode the categorical columns and min-max scale the
# numerical ones; any other column is passed through unchanged.
column_transform = ColumnTransformer(
    [
        (
            "encoder",
            # `sparse` was renamed to `sparse_output`; the old keyword is no
            # longer accepted by current scikit-learn releases. Dense output
            # is needed because set_config(transform_output='pandas') is active.
            OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop='first'),
            categorical_features,
        ),
        ('scaling', MinMaxScaler(), numerical_features),
    ],
    remainder='passthrough',
)

In [None]:
# Fit the preprocessing on the training features and transform them.
x_train_transform = column_transform.fit_transform(x_train)
x_train_transform

In [None]:
# Apply the already-fitted transformation to the validation set (transform
# only, no refit) so that both sets end up with the same columns.
x_val_transform = column_transform.transform(x_val)
x_val_transform

In [None]:
# Ordinary least squares baseline on the transformed training data.
linear_reg = LinearRegression(fit_intercept=True).fit(x_train_transform, y_train)
linear_reg

In [None]:
# Score of the linear model on the training set.
training_score = linear_reg.score(x_train_transform, y_train)
rounded_training_score = round(training_score, 6)
print(f'The training score is: {rounded_training_score}')

In [None]:
# Score of the linear model on the validation set.
validation_score = linear_reg.score(x_val_transform, y_val)
rounded_validation_score = round(validation_score, 6)
print(f'The validation score is: {rounded_validation_score}')

## Grid estimator

In [None]:
# Pipeline: preprocessing followed by a Lasso regression.
lasso_steps = [
    ('preprocessor', column_transform),
    ('lasso', Lasso()),
]
estimator = Pipeline(steps=lasso_steps)

In [None]:
# Hyperparameter grid for the 'lasso' pipeline step ('<step>__<parameter>' keys).
param_grid = dict(
    lasso__alpha=[100.0, 10.0, 1.0, 0.1, 0.01],
    lasso__max_iter=[5_000, 10_000, 20_000],
)

In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold grid search over the Lasso pipeline, scored by R^2, using all cores.
lasso_search_settings = {
    'estimator': estimator,
    'param_grid': param_grid,
    'scoring': 'r2',
    'cv': 5,
    'n_jobs': -1,
    'verbose': 1,
}
GS = GridSearchCV(**lasso_search_settings)

In [None]:
import time

# Run the grid search on the raw training features (the pipeline does the
# preprocessing itself) and report the wall-clock duration.
ti = time.time()
GS.fit(x_train, y_train)
tf = time.time()

elapsed_seconds = round(tf - ti, 2)
print(f"time taken: {elapsed_seconds} sec")

### Best parameters

In [None]:
# Best hyperparameter combination found by the Lasso grid search.
print(GS.best_params_) # to get only the best parameter values that we searched for

### Best score


In [None]:
# Best cross-validated score (scoring='r2'), rounded to six decimals.
round(GS.best_score_,6)

### Instantiate best model

In [None]:
# Pipeline carrying the best hyperparameter combination from the grid search.
best_model_lasso = GS.best_estimator_ # the model with the best sets of hyperparameters
best_model_lasso

### Build model

In [None]:
# Fit the selected pipeline on the full training split.
# NOTE(review): GridSearchCV refits the best estimator by default, so this
# call is presumably redundant - confirm before removing.
best_model_lasso.fit(x_train,y_train);

### Model performance

In [None]:
# Score the tuned Lasso pipeline on the training and validation splits.
training_score = best_model_lasso.score(x_train, y_train)
test_score = best_model_lasso.score(x_val, y_val)

rounded_train, rounded_test = round(training_score, 6), round(test_score, 6)
print(f'Train score: {rounded_train}')
print(f'Test score : {rounded_test}')

## Random Forest

In [None]:
#RF_estimator = Pipeline(
#    steps=[
#        ('preprocessor', column_transform),   # preprocessing step
#        ('RF', RandomForestRegressor()) # random forest regression
#    ]
#)

In [None]:
#param_grid = {
#    'RF__n_estimators': [50,100,200,300,500],
#    'RF__max_depth': [5,10,20,None],
#    'RF__min_samples_split': [2, 5, 10]
#}

In [None]:
# Random-forest pipeline and its own hyperparameter grid. Both are defined
# here because the cells that used to define them are commented out, which
# left RF_estimator undefined and param_grid pointing at the Lasso grid.
RF_estimator = Pipeline(
    steps=[
        ('preprocessor', column_transform),  # preprocessing step
        ('RF', RandomForestRegressor()),  # random forest regression
    ]
)

rf_param_grid = {
    'RF__n_estimators': [50, 100, 200, 300, 500],
    'RF__max_depth': [5, 10, 20, None],
    'RF__min_samples_split': [2, 5, 10],
}

# 5-fold grid search over the random-forest pipeline, scored by R^2.
GS_RF = GridSearchCV(
    estimator=RF_estimator,
    param_grid=rf_param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
import time

# Run the random-forest grid search and report the wall-clock duration.
ti = time.time()
GS_RF.fit(x_train, y_train)
tf = time.time()

elapsed_seconds = round(tf - ti, 2)
print(f"time taken: {elapsed_seconds} sec")

In [None]:
# Best hyperparameter combination found by the random-forest grid search.
print(GS_RF.best_params_) # to get only the best parameter values that we searched for


In [None]:
# Best cross-validated score of the random-forest search, rounded to six
# decimals (the previous call printed the raw score followed by a literal 6).
print(round(GS_RF.best_score_, 6))

In [None]:
# Pipeline carrying the best hyperparameter combination from the random-forest search.
best_model = GS_RF.best_estimator_ # the model with the best sets of hyperparameters
best_model

In [None]:
# Fit the selected random-forest pipeline on the full training split.
best_model.fit(x_train,y_train);

In [None]:
# Score the tuned random-forest pipeline on the training and validation splits.
training_score = best_model.score(x_train, y_train)
test_score = best_model.score(x_val, y_val)

rounded_train, rounded_test = round(training_score, 6), round(test_score, 6)
print(f'Train score: {rounded_train}')
print(f'Test score : {rounded_test}')


## Evaluation of the prediction on Kaggle

In [None]:
# Load the Kaggle test set; the first CSV column becomes the index.
test_csv_path = 'data/test.csv'
test_df = pd.read_csv(test_csv_path, index_col=0)
test_df.head()


In [None]:
# Count missing values per column in the test set.
test_df.isna().sum()

In [None]:
# to check whether level 4 exist which was the case in the training data that we replaced with 3
# Show the test rows recorded with weather level 4.
test_df[test_df['weather'] == 4]

In [None]:
# as in the training dataset, we replaced level 4 with 3 as there was only one row. Therefore, we do this in this test data too.
# Relabel weather level 4 as 3 in the test data, mirroring the training data.
is_weather_level_four = test_df['weather'] == 4
test_df.loc[is_weather_level_four, 'weather'] = 3

In [None]:
# Turn the index into a DatetimeIndex so that its date parts can be read.
test_df.index = pd.DatetimeIndex(test_df.index)

# New column name -> DatetimeIndex attribute that supplies its values.
datetime_parts = {
    'time': 'time',
    'year': 'year',
    'month': 'month',
    'weekday': 'dayofweek',
}
for column_name, index_attribute in datetime_parts.items():
    test_df[column_name] = getattr(test_df.index, index_attribute)

test_df.head()

In [None]:
# Move the datetime index back into an ordinary column.
test_df = test_df.reset_index()
test_df.head()

In [None]:
# Same feature lists as used for training.
numerical_features = ['temp', 'atemp', 'humidity', 'windspeed']
categorical_features = [
    'season',
    'workingday',
    'holiday',
    'weather',
    'time',
    'year',
    'month',
    'weekday',
]

# Model inputs: numerical columns first, then the categorical ones.
features = [*numerical_features, *categorical_features]

In [None]:
# Feature matrix for the test set (this rebinds the name `x` used for the training features).
x = test_df[features]

In [None]:
# Predict with the tuned Lasso pipeline. The raw features are passed because
# the pipeline applies the column transformation itself. The model was
# trained on the log1p-transformed 'count', so predictions are on that scale.
predict = best_model_lasso.predict(x) # we use x here because in the gridsearchcv above we did the column transformation
predict

In [None]:
# Undo the log1p transform that was applied to the target before training.
# Rental counts cannot be negative, so clip at 0: a negative prediction on the
# log scale would otherwise turn into a negative count in the submission.
x_test_pred_exp = np.clip(np.expm1(predict), 0, None)
x_test_pred_exp

In [None]:
# Build the submission table (datetime + predicted count) and write it without the row index.
submission_columns = {
    'datetime': test_df['datetime'],
    'count': x_test_pred_exp,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)