In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the train/test splits; the relative paths assume the CSV files sit in
# the notebook's working directory.
train = pd.read_csv("train_bsd.csv")
test = pd.read_csv("test_bsd.csv")

print(train.shape)
print(test.shape)

# Keep both frames in one list so every feature-engineering step can be
# applied to train and test in the same loop.
datasets = [train, test]

# Convert the "datetime" column to real timestamps so that the .dt accessor
# can be used on it afterwards.
for dataset in datasets:
    dataset["datetime"] = pd.to_datetime(dataset["datetime"])

# Submission template; its "count" column is overwritten with the model's
# predictions at the end of the notebook.
submit = pd.read_csv("sampleSubmission.csv")
print(submit.shape)

# Kept as the LAST expression of the cell on purpose: a bare expression
# anywhere else in a cell is evaluated and silently discarded, so the preview
# was never rendered while it sat in the middle of the cell.
submit.head()

(10886, 12)
(6493, 9)
(6493, 2)


In [49]:
# Column suffix -> full weekday name as produced by ``Series.dt.day_name()``.
# Insertion order (Sun..Sat) is the order in which the indicator columns are
# appended to each frame.
DAY_NAME_BY_SUFFIX = {
    "Sun": "Sunday",
    "Mon": "Monday",
    "Tue": "Tuesday",
    "Wed": "Wednesday",
    "Thu": "Thursday",
    "Fri": "Friday",
    "Sat": "Saturday",
}

# Derive calendar features for train and test in a single pass.
# Requires "datetime" to already hold timestamps (the .dt accessor fails on
# plain strings).
for dataset in datasets:
    dataset["datetime-year"] = dataset["datetime"].dt.year
    dataset["datetime-hour"] = dataset["datetime"].dt.hour

    # Weekday as text ("Monday", ...); kept as a column of its own and used
    # below as the source for the boolean indicator columns.
    dataset["datetime-dayofweek"] = dataset["datetime"].dt.day_name()

    # One boolean indicator column per weekday, e.g. "datetime-dayofweek_Mon".
    for suffix, day_name in DAY_NAME_BY_SUFFIX.items():
        dataset[f"datetime-dayofweek_{suffix}"] = (
            dataset["datetime-dayofweek"] == day_name
        )

In [50]:
# Fold weather code 4 into code 3 in both frames, so that only the codes
# other than 4 remain in the "weather" column.
for dataset in datasets:
    is_code_four = dataset["weather"] == 4
    dataset.loc[is_code_four, "weather"] = 3

In [51]:
# Columns fed to the model, in the order they appear in the feature matrix.
feature_names = [
    # columns read straight from the CSV files
    "season",
    "holiday",
    "workingday",
    "weather",
    "temp",
    "atemp",
    "humidity",
    "windspeed",
    # columns derived from the "datetime" timestamp
    "datetime-year",
    "datetime-hour",
    # weekday indicator columns
    "datetime-dayofweek_Mon",
    "datetime-dayofweek_Tue",
    "datetime-dayofweek_Wed",
    "datetime-dayofweek_Thu",
    "datetime-dayofweek_Fri",
    "datetime-dayofweek_Sat",
    "datetime-dayofweek_Sun",
]
feature_names

['season',
 'holiday',
 'workingday',
 'weather',
 'temp',
 'atemp',
 'humidity',
 'windspeed',
 'datetime-year',
 'datetime-hour',
 'datetime-dayofweek_Mon',
 'datetime-dayofweek_Tue',
 'datetime-dayofweek_Wed',
 'datetime-dayofweek_Thu',
 'datetime-dayofweek_Fri',
 'datetime-dayofweek_Sat',
 'datetime-dayofweek_Sun']

In [52]:
# Feature matrices: the same column selection for train and test.
x = train[feature_names]
x_test = test[feature_names]

# Raw target, used only for scoring the predictions.
y = train["count"]

# Training targets: the two components are modelled separately on a
# log(value + 1) scale.
y_log_casual = np.log(1 + train["casual"])
y_log_registered = np.log(1 + train["registered"])

In [53]:
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so that repeated runs of the notebook build the same forest.
RANDOM_SEED = 37

model = RandomForestRegressor(random_state=RANDOM_SEED)
model

## model validation

In [54]:
from sklearn.model_selection import cross_val_predict

# Number of cross-validation folds used for both targets.
N_FOLDS = 20

# Cross-validated predictions on the log scale, one run per target.
y_predict_log_casual = cross_val_predict(model, x, y_log_casual, cv=N_FOLDS)
y_predict_log_registered = cross_val_predict(
    model, x, y_log_registered, cv=N_FOLDS
)

# Invert the log(value + 1) transform of the targets, then add the two
# components; the sum is what gets compared against y.
y_predict_casual = np.exp(y_predict_log_casual) - 1
y_predict_registered = np.exp(y_predict_log_registered) - 1
y_predict = y_predict_casual + y_predict_registered
y_predict

array([ 25.21074251,  22.05909323,  20.34299124, ..., 163.3236322 ,
       168.25032187, 113.32342497])

In [55]:
# Display the observed target (train["count"]) for an eyeball comparison with
# the cross-validated y_predict array.
y

0         16
1         40
2         32
3         13
4          1
        ... 
10881    336
10882    241
10883    168
10884    129
10885     88
Name: count, Length: 10886, dtype: int64

In [57]:
# Hand-rolled error metrics on the original (non-log) scale.
residuals = y - y_predict

score_mae = residuals.abs().mean()      # mean absolute error
score_mse = (residuals ** 2).mean()     # mean squared error
score_rmse = score_mse ** 0.5           # root mean squared error

f"score_mae = {score_mae:.1f}, score_mse = {score_mse:.1f}, score_rmse = {score_rmse:.1f}"

'score_mae = 32.4, score_mse = 2903.1, score_rmse = 53.9'

In [59]:
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    mean_squared_log_error,
)

# MAE / MSE / RMSE computed through scikit-learn.
score_mae1 = mean_absolute_error(y, y_predict)
score_mse1 = mean_squared_error(y, y_predict)
score_rmse1 = np.sqrt(score_mse1)

# RMSLE: square root of the mean squared logarithmic error.
score_msle = mean_squared_log_error(y, y_predict)
score_rmsle = np.sqrt(score_msle)

f"score(MAE) = {score_mae1:.1f}, score(MSE) = {score_mse1:.1f}, score(RMSE) = {score_rmse1:.1f}, score(RMSLE) = {score_rmsle:.5f}"

'score(MAE) = 32.4, score(MSE) = 2903.1, score(RMSE) = 53.9, score(RMSLE) = 0.35361'

## fit & predict & submit

In [60]:
# The single estimator is fitted twice in sequence; predictions must be taken
# right after each fit because the second fit replaces the first one.
# ``fit`` returns the estimator itself, which allows the chained call.
log_casual_predlist = model.fit(x, y_log_casual).predict(x_test)
log_registered_predlist = model.fit(x, y_log_registered).predict(x_test)

# Invert the log(value + 1) transform and add both components.
casual_predlist = np.exp(log_casual_predlist) - 1
registered_predlist = np.exp(log_registered_predlist) - 1
predlist = casual_predlist + registered_predlist
predlist

array([ 11.21614845,   4.77576138,   2.68801493, ..., 100.94488838,
        95.1454443 ,  48.13814829])

In [61]:
# Overwrite the template's "count" column with the test-set predictions and
# write the result without the index column.
submit["count"] = predlist
submit.to_csv(path_or_buf="randfore.csv", index=False)