In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

train=pd.read_csv("train_bsd.csv")
test=pd.read_csv("test_bsd.csv")

print(train.shape)
print(test.shape)

datasets=[train,test]

submit=pd.read_csv("sampleSubmission.csv")
print(submit.shape)
submit.head()

for dataset in datasets:
    dataset["datetime"]=pd.to_datetime(dataset["datetime"])

(10886, 12)
(6493, 9)
(6493, 2)


In [9]:
for dataset in datasets:
    dataset["datetime-year"]=dataset["datetime"].dt.year
    dataset["datetime-month"]=dataset["datetime"].dt.month
    dataset["datetime-hour"]=dataset["datetime"].dt.hour

for dataset in datasets:
    dataset["datetime-dayofweek"]=dataset["datetime"].dt.day_name()
    dataset[["datetime-dayofweek"]]

for dataset in datasets:
    dataset["datetime-dayofweek_Sun"]=dataset["datetime-dayofweek"]=="Sunday"
    dataset["datetime-dayofweek_Mon"]=dataset["datetime-dayofweek"]=="Monday"
    dataset["datetime-dayofweek_Tue"]=dataset["datetime-dayofweek"]=="Tuesday"
    dataset["datetime-dayofweek_Wed"]=dataset["datetime-dayofweek"]=="Wednesday"
    dataset["datetime-dayofweek_Thu"]=dataset["datetime-dayofweek"]=="Thursday"
    dataset["datetime-dayofweek_Fri"]=dataset["datetime-dayofweek"]=="Friday"
    dataset["datetime-dayofweek_Sat"]=dataset["datetime-dayofweek"]=="Saturday"

In [10]:
for dataset in datasets:
    dataset.loc[dataset["weather"]==4,"weather"]=3

In [11]:
feature_names = [
    "season",
    "holiday",
    "workingday",
    "weather",
    "temp",
    "atemp",
    "humidity",
    "windspeed",
    "datetime-year",
    "datetime-hour",
    "datetime-dayofweek_Mon",
    "datetime-dayofweek_Tue",
    "datetime-dayofweek_Wed",
    "datetime-dayofweek_Thu",
    "datetime-dayofweek_Fri",
    "datetime-dayofweek_Sat",
    "datetime-dayofweek_Sun",
]
len(feature_names)

17

In [12]:
x=train[feature_names]
y_log_casual=np.log(train["casual"]+1)
y_log_registered=np.log(train["registered"]+1)
y=train["count"]
x_test=test[feature_names]

# Hyperparameters Tuning

In [13]:
n_estimators=300
num_loop=100

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_log_error

min_rmsle=-1
min_data=[]

for loop in range(num_loop):
    max_depth=np.random.randint(2,100)
    max_features=np.random.uniform(0.1,1.0)
    model=RandomForestRegressor(random_state=37,
                                max_depth=max_depth,
                                max_features=max_features,
                                n_jobs=-1,
                                n_estimators=n_estimators)
    y_predict_log_casual=cross_val_predict(model,x,y_log_casual,cv=20)
    y_predict_log_registered=cross_val_predict(model,x,y_log_registered,cv=20)
    y_predict_casual=np.exp(y_predict_log_casual)-1
    y_predict_registered=np.exp(y_predict_log_registered)-1
    y_predict=y_predict_casual+y_predict_registered

    score_msle2=mean_squared_log_error(y,y_predict)
    score_rmsle2=np.sqrt(score_msle2)

    if min_rmsle==-1 or min(score_rmsle2,min_rmsle)==score_rmsle2:
        min_rmsle=score_rmsle2
        min_data=[max_depth,max_features,n_estimators]

    print(f"loop {loop+1} - max_depth:{max_depth} max_features:{max_features:.6f} n_estimators:{n_estimators} score_rmsle2:{score_rmsle2:.5f}")

loop 1 - max_depth:79 max_features:0.506220 n_estimators:300 score_rmsle2:0.36126
loop 2 - max_depth:98 max_features:0.372022 n_estimators:300 score_rmsle2:0.38557
loop 3 - max_depth:13 max_features:0.519680 n_estimators:300 score_rmsle2:0.37066
loop 4 - max_depth:7 max_features:0.105041 n_estimators:300 score_rmsle2:0.92373
loop 5 - max_depth:95 max_features:0.629295 n_estimators:300 score_rmsle2:0.35327
loop 6 - max_depth:6 max_features:0.500718 n_estimators:300 score_rmsle2:0.55964
loop 7 - max_depth:32 max_features:0.222551 n_estimators:300 score_rmsle2:0.49299
loop 8 - max_depth:36 max_features:0.494602 n_estimators:300 score_rmsle2:0.36127
loop 9 - max_depth:78 max_features:0.650100 n_estimators:300 score_rmsle2:0.35002
loop 10 - max_depth:32 max_features:0.236526 n_estimators:300 score_rmsle2:0.44619
loop 11 - max_depth:13 max_features:0.625435 n_estimators:300 score_rmsle2:0.35683
loop 12 - max_depth:94 max_features:0.585608 n_estimators:300 score_rmsle2:0.35520
loop 13 - max_d

loop 100 - max_depth:74 max_features:0.983339 n_estimators:300 score_rmsle2:0.35115


In [18]:
min_rmsle

0.34801359519960473

In [19]:
min_data

[19, 0.7868663982913331, 300]

In [15]:
from sklearn.ensemble import RandomForestRegressor

model=RandomForestRegressor(random_state=123,
                            max_depth=min_data[0],
                            max_features=min_data[1],
                            n_jobs=-1,
                            n_estimators=min_data[2]
                            )

In [16]:
model.fit(x,y_log_casual)
log_casual_predlist=model.predict(x_test)
model.fit(x,y_log_registered)
log_registered_predlist=model.predict(x_test)
casual_predlist=np.exp(log_casual_predlist)-1
registered_predlist=np.exp(log_registered_predlist)-1
predlist=casual_predlist+registered_predlist
predlist

array([11.04327043,  5.07773264,  2.62504078, ..., 99.81516973,
       94.38288835, 50.15274836])

In [17]:
submit["count"]=predlist
submit.to_csv("randfore_hyperparameters_final.csv",index=False)