## Gradient Boosting Machine (Regression) - Seoul Bike Sharing Data

Carico moduli necessari:

In [2]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from utils.pyutils import rmse
from utils.pyutils import round_pred
from utils.pyutils import print_model_scores

import pickle
import pandas as pd
import numpy as np
import itertools

Carico dati:

In [3]:
# Read the pre-split train / test / validation CSV files (in that order).
bike_train, bike_test, bike_valid = map(
    pd.read_csv,
    (
        "data/bike_train_dummy.csv",
        "data/bike_test_dummy.csv",
        "data/bike_valid_dummy.csv",
    ),
)

## GBM - all predictors

Converti i dati in array numpy:

In [4]:
columns = bike_train.columns

# Every column of the training frame except the target is used as a predictor;
# the same column list is applied to the test and validation frames.
target_col = "rented_bike_count"
feature_cols = [name for name in columns if name != target_col]

X_train = np.array(bike_train[feature_cols])
y_train = np.array(bike_train[target_col])

X_test = np.array(bike_test[feature_cols])
y_test = np.array(bike_test[target_col])

X_valid = np.array(bike_valid[feature_cols])
y_valid = np.array(bike_valid[target_col])

### Default model

In [8]:
# Baseline GBM with scikit-learn's default hyperparameters.
# random_state is pinned so that re-running the cell reproduces the same fit;
# without it the fitted trees are not guaranteed to be identical between runs.
bike_GBM1 = GradientBoostingRegressor(random_state=42)
bike_GBM1.fit(X_train, y_train)

GradientBoostingRegressor()

Save / Load model

In [10]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# "models\\..." literal only pointed inside the models directory on Windows.
pkl_path = "models/bike_GBM1.pkl"

# Uncomment to persist the freshly fitted model:
# with open(pkl_path, 'wb') as file:
#     pickle.dump(bike_GBM1, file)

# NOTE: pickle.load can execute arbitrary code -- only load model files that
# were produced by this project.
with open(pkl_path, 'rb') as file:
    bike_GBM1 = pickle.load(file)

#### RMSE

In [6]:
# Print training / validation / testing RMSE for the default model.
print_model_scores(
    bike_GBM1,
    X_train, X_valid, X_test,
    y_train, y_valid, y_test,
)

Training RMSE:  230.04474578701937
Validation RMSE:  251.16560991062354
Testing RMSE:  249.5413898557543


### Hyperparameter Search 1

In [11]:
# Candidate values for the first tuning pass.
max_depth = [2, 3, 4, 5, 6]
# The second entry was 0.7, which broke the otherwise ascending sequence and is
# far outside the range being explored; 0.07 is the intended value.
# NOTE(review): a grid CSV cached from a run with the old value should be
# regenerated.
learning_rate = [0.05, 0.07, 0.1, 0.12, 0.15]
n_estimators = [50, 100, 150, 300, 500]

# Score accumulators, filled by the grid-search loop (one entry per grid row).
train_rmse = list()
valid_rmse = list()

# Cartesian product of (max_depth, learning_rate, n_estimators) tuples.
hyper_grid = list(itertools.product(max_depth, learning_rate, n_estimators))

In [12]:
# Start from empty result lists so that re-running this cell does not append a
# second copy of the scores (which would no longer line up with hyper_grid).
train_rmse = []
valid_rmse = []

for hp in hyper_grid:
    depth, rate, n_trees = hp
    # Fixed seed: every grid point is fitted under the same random state, so
    # score differences come from the hyperparameters and the run is repeatable.
    model = GradientBoostingRegressor(max_depth=depth, learning_rate=rate,
                                      n_estimators=n_trees, random_state=42)
    model.fit(X_train, y_train)

    # round_pred is a project helper (utils.pyutils); presumably it predicts
    # with the model and rounds the result -- confirm there.
    pred_train = round_pred(model, X_train)
    pred_valid = round_pred(model, X_valid)

    train_rmse.append(rmse(y_train, pred_train))
    valid_rmse.append(rmse(y_valid, pred_valid))

In [13]:
# One row per grid point, with the collected RMSE lists attached as columns.
hyper_grid_df = pd.DataFrame(
    hyper_grid,
    columns=["max_depth", "learning_rate", "n_estimators"],
)
hyper_grid_df = hyper_grid_df.assign(train_rmse=train_rmse, valid_rmse=valid_rmse)

Carica hyperparameter grid da csv:

In [3]:
# Uncomment to cache the grid-search results:
# hyper_grid_df.to_csv("models/gbm1_hypergrid.csv")

# Forward-slash path so the notebook also runs outside Windows.
hyper_grid_df = pd.read_csv("models/gbm1_hypergrid.csv")
# Ten best configurations by validation RMSE (lowest first).
hyper_grid_df.sort_values(by = "valid_rmse").head(10)

Unnamed: 0.1,Unnamed: 0,max_depth,learning_rate,n_estimators,train_rmse,valid_rmse
119,119,6,0.12,500,31.717656,159.881075
114,114,6,0.1,500,38.81491,160.051904
118,118,6,0.12,300,53.397335,162.551704
124,124,6,0.15,500,23.864071,162.595653
113,113,6,0.1,300,60.662732,163.241209
123,123,6,0.15,300,44.479624,163.754059
104,104,6,0.05,500,74.510491,163.936948
94,94,5,0.12,500,60.81383,166.807112
99,99,5,0.15,500,50.261946,167.476693
117,117,6,0.12,150,84.424403,167.527768


#### Fit best model

In [15]:
# Refit using the best configuration of the first search (lowest validation RMSE).
# random_state is pinned so the refit is repeatable between runs.
bike_GBM1_tuned = GradientBoostingRegressor(max_depth=6, learning_rate=0.12, n_estimators=500,
                                            random_state=42)
bike_GBM1_tuned.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.12, max_depth=6, n_estimators=500)

Save / Load model 

In [30]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# "models\\..." literal only pointed inside the models directory on Windows.
pkl_path = "models/bike_GBM1_tuned.pkl"

# Uncomment to persist the freshly fitted model:
# with open(pkl_path, 'wb') as file:
#     pickle.dump(bike_GBM1_tuned, file)

# NOTE: pickle.load can execute arbitrary code -- only load model files that
# were produced by this project.
with open(pkl_path, 'rb') as file:
    bike_GBM1_tuned = pickle.load(file)

#### RMSE

In [32]:
# Print training / validation / testing RMSE for the tuned model.
print_model_scores(
    bike_GBM1_tuned,
    X_train, X_valid, X_test,
    y_train, y_valid, y_test,
)

Training RMSE:  31.7176555217003
Validation RMSE:  160.00716930672132
Testing RMSE:  161.63410324969584


#### Cross Validation

In [21]:
# 10-fold cross validation on the training set. The scorer yields negated MSE
# per fold, so the sign is flipped before taking the root to get per-fold RMSE.
scores = cross_val_score(
    bike_GBM1_tuned,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
GBM_rmse_scores = np.sqrt(np.negative(scores))

In [22]:
# GBM1_tuned_cross_val_scores = pd.DataFrame({
#     "cross_val_score": GBM_rmse_scores,
#     "scoring": "rmse",
#     "cv": 10, 
#     "model": "bike_GBM1_tuned", 
#     })
# GBM1_tuned_cross_val_scores.to_csv("models/gbm1_cross_val_scores.csv", index=False)
# GBM1_tuned_cross_val_scores = pd.read_csv("models/gbm1_cross_val_scores.csv")

In [23]:
# Per-fold RMSE followed by its mean and standard deviation (np.std default, ddof=0).
cv_summary = (
    ("Scores: ", GBM_rmse_scores),
    ("Mean: ", np.mean(GBM_rmse_scores)),
    ("Standard Deviation: ", np.std(GBM_rmse_scores)),
)
for label, value in cv_summary:
    print(label, value)

Scores:  [145.80987338 145.08725612 168.93147193 152.06736803 131.89615297
 150.31968226 169.45206528 154.69815485 150.06376914 148.98822315]
Mean:  151.73140171085024
Standard Deviation:  10.505827288004577


#### Regularization 

Each individual learner uses a random subsample of the training data (`subsample`)

##### Hyperparameter Search

In [24]:
# Candidate subsample fractions: 0.40, 0.45, ..., 0.95.
vals = [*range(40, 100, 5)]
subsample = [pct / 100 for pct in vals]

# Score accumulators, filled by the search loop.
train_rmse, valid_rmse = [], []

# The grid is one-dimensional here; hyper_grid refers to the same list object.
hyper_grid = subsample

In [25]:
# Start from empty result lists so that re-running this cell does not append a
# second copy of the scores (which would no longer line up with hyper_grid).
train_rmse = []
valid_rmse = []

for hp in hyper_grid:
    # Row subsampling is random; a fixed seed makes the comparison between
    # subsample fractions repeatable instead of depending on the draw.
    model = GradientBoostingRegressor(max_depth=6, learning_rate=0.12, n_estimators=500,
                                      subsample=hp, random_state=42)
    model.fit(X_train, y_train)

    # round_pred is a project helper (utils.pyutils); presumably it predicts
    # with the model and rounds the result -- confirm there.
    pred_train = round_pred(model, X_train)
    pred_valid = round_pred(model, X_valid)

    train_rmse.append(rmse(y_train, pred_train))
    valid_rmse.append(rmse(y_valid, pred_valid))

In [36]:
# One row per subsample fraction, with the collected RMSE lists as columns.
hyper_grid_df = pd.DataFrame(hyper_grid, columns=["subsample"])
hyper_grid_df = hyper_grid_df.assign(train_rmse=train_rmse, valid_rmse=valid_rmse)

Carica hyperparameter grid da csv:

In [4]:
# Uncomment to cache the subsample search results:
# hyper_grid_df.to_csv("models/gbm1_tuned_reg.csv")

# Forward-slash path so the notebook also runs outside Windows.
hyper_grid_df = pd.read_csv("models/gbm1_tuned_reg.csv")
# Ten best subsample values by validation RMSE (lowest first).
hyper_grid_df.sort_values(by = "valid_rmse").head(10)

Unnamed: 0.1,Unnamed: 0,subsample,train_rmse,valid_rmse
8,8,0.8,24.936218,156.8169
5,5,0.65,25.765767,157.556943
10,10,0.9,25.635831,157.900453
11,11,0.95,25.773286,158.121655
7,7,0.75,24.852229,158.793781
6,6,0.7,24.681594,159.682689
4,4,0.6,27.587042,159.780989
9,9,0.85,24.934577,159.811031
0,0,0.4,32.61597,162.2327
3,3,0.55,27.715396,165.052518


##### Fit best model

In [39]:
# Tuned model plus the best subsample fraction from the search above.
# Subsampling draws rows at random, so random_state is pinned to make the fit
# (and the cross validation that clones this estimator) repeatable.
bike_GBM1_tuned_reg = GradientBoostingRegressor(max_depth=6, learning_rate=0.12, n_estimators=500,
                                                subsample=0.80, random_state=42)
bike_GBM1_tuned_reg.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.12, max_depth=6, n_estimators=500,
                          subsample=0.8)

Save / Load model 

In [40]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# "models\\..." literal only pointed inside the models directory on Windows.
pkl_path = "models/bike_GBM1_tuned_reg.pkl"

# Uncomment to persist the freshly fitted model:
# with open(pkl_path, 'wb') as file:
#     pickle.dump(bike_GBM1_tuned_reg, file)

# NOTE: pickle.load can execute arbitrary code -- only load model files that
# were produced by this project.
with open(pkl_path, 'rb') as file:
    bike_GBM1_tuned_reg = pickle.load(file)

##### RMSE

In [44]:
# Print training / validation / testing RMSE for the subsample-regularized model.
print_model_scores(
    bike_GBM1_tuned_reg,
    X_train, X_valid, X_test,
    y_train, y_valid, y_test,
)

Training RMSE:  24.917329619441194
Validation RMSE:  159.15829451842575
Testing RMSE:  160.95926765722083


##### Cross Validation

In [45]:
# 10-fold cross validation on the training set. The scorer yields negated MSE
# per fold, so the sign is flipped before taking the root to get per-fold RMSE.
scores = cross_val_score(
    bike_GBM1_tuned_reg,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
GBM_rmse_scores = np.sqrt(np.negative(scores))

In [46]:
# GBM1_tuned_reg_cross_val_scores = pd.DataFrame({
#     "cross_val_score": GBM_rmse_scores,
#     "scoring": "rmse",
#     "cv": 10, 
#     "model": "bike_GBM1_tuned_reg", 
#     })
# GBM1_tuned_reg_cross_val_scores.to_csv("models/gbm1_reg_cross_val_scores.csv", index=False)
# GBM1_tuned_reg_cross_val_scores = pd.read_csv("models/gbm1_reg_cross_val_scores.csv")

In [47]:
# Per-fold RMSE followed by its mean and standard deviation (np.std default, ddof=0).
cv_summary = (
    ("Scores: ", GBM_rmse_scores),
    ("Mean: ", np.mean(GBM_rmse_scores)),
    ("Standard Deviation: ", np.std(GBM_rmse_scores)),
)
for label, value in cv_summary:
    print(label, value)

Scores:  [151.61476807 146.56150942 162.12701536 160.58257761 134.68349727
 147.74535559 168.93239268 161.78865221 156.38398593 150.07774697]
Mean:  154.0497501097631
Standard Deviation:  9.419815263151794


### Hyperparameter Search 2

In [48]:
# Candidate values for the second tuning pass.
max_depth = [5, 6, 7, 8]
learning_rate = [0.08, 0.1, 0.12, 0.14]
n_estimators = [400, 500, 600, 800]

# Score accumulators, filled by the grid-search loop.
train_rmse, valid_rmse = [], []

# Full Cartesian product, ordered with n_estimators varying fastest.
hyper_grid = [
    (depth, rate, n_trees)
    for depth in max_depth
    for rate in learning_rate
    for n_trees in n_estimators
]

In [49]:
# Start from empty result lists so that re-running this cell does not append a
# second copy of the scores (which would no longer line up with hyper_grid).
train_rmse = []
valid_rmse = []

for hp in hyper_grid:
    depth, rate, n_trees = hp
    # Fixed seed: every grid point is fitted under the same random state, so
    # score differences come from the hyperparameters and the run is repeatable.
    model = GradientBoostingRegressor(max_depth=depth, learning_rate=rate,
                                      n_estimators=n_trees, random_state=42)

    model.fit(X_train, y_train)

    # round_pred is a project helper (utils.pyutils); presumably it predicts
    # with the model and rounds the result -- confirm there.
    pred_train = round_pred(model, X_train)
    pred_valid = round_pred(model, X_valid)

    train_rmse.append(rmse(y_train, pred_train))
    valid_rmse.append(rmse(y_valid, pred_valid))

In [50]:
# One row per grid point, with the collected RMSE lists attached as columns.
hyper_grid_df = pd.DataFrame(
    hyper_grid,
    columns=["max_depth", "learning_rate", "n_estimators"],
)
hyper_grid_df = hyper_grid_df.assign(train_rmse=train_rmse, valid_rmse=valid_rmse)

Carica hyperparameter grid da csv:

In [5]:
# Uncomment to cache the grid-search results:
# hyper_grid_df.to_csv("models/gbm1_tuned2_hypergrid.csv")

# Forward-slash path so the notebook also runs outside Windows.
hyper_grid_df = pd.read_csv("models/gbm1_tuned2_hypergrid.csv")
# Ten best configurations by validation RMSE (lowest first).
hyper_grid_df.sort_values(by = "valid_rmse").head(10)

Unnamed: 0.1,Unnamed: 0,max_depth,learning_rate,n_estimators,train_rmse,valid_rmse
43,43,7,0.12,800,5.477718,150.53429
42,42,7,0.12,600,10.1246,151.51817
41,41,7,0.12,500,14.314443,152.580093
40,40,7,0.12,400,20.087872,152.870082
39,39,7,0.1,800,8.420475,153.87836
37,37,7,0.1,500,19.513238,154.48377
36,36,7,0.1,400,25.96638,154.684366
38,38,7,0.1,600,14.467286,154.912371
34,34,7,0.08,600,20.131717,156.153998
35,35,7,0.08,800,12.679784,156.65067


#### Fit best model

In [53]:
# Refit using the best configuration of the second search (lowest validation RMSE).
# random_state is pinned so the refit is repeatable between runs.
bike_GBM1_tuned2 = GradientBoostingRegressor(max_depth=7, learning_rate=0.12, n_estimators=800,
                                             random_state=42)
bike_GBM1_tuned2.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.12, max_depth=7, n_estimators=800)

Save / Load model 

In [54]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# "models\\..." literal only pointed inside the models directory on Windows.
pkl_path = "models/bike_GBM1_tuned2.pkl"

In [55]:
# with open(pkl_path, 'wb') as file:
#     pickle.dump(bike_GBM1_tuned2, file)

In [56]:
# Restore the previously saved model from pkl_path, replacing the in-memory fit.
# Only load pickle files that come from a trusted source.
with open(pkl_path, mode='rb') as file:
    bike_GBM1_tuned2 = pickle.load(file)

#### RMSE

In [57]:
# Print training / validation / testing RMSE for the second tuned model.
print_model_scores(
    bike_GBM1_tuned2,
    X_train, X_valid, X_test,
    y_train, y_valid, y_test,
)

Training RMSE:  5.477717666654564
Validation RMSE:  151.7382207446414
Testing RMSE:  162.30323879763125


#### Cross Validation

In [58]:
# 10-fold cross validation on the training set. The scorer yields negated MSE
# per fold, so the sign is flipped before taking the root to get per-fold RMSE.
scores = cross_val_score(
    bike_GBM1_tuned2,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
GBM_rmse_scores = np.sqrt(np.negative(scores))

In [59]:
# GBM1_tuned2_cross_val_scores = pd.DataFrame({
#     "cross_val_score": GBM_rmse_scores,
#     "scoring": "rmse",
#     "cv": 10, 
#     "model": "bike_GBM1_tuned2", 
#     })
# GBM1_tuned2_cross_val_scores.to_csv("models/gbm1_tuned2_cross_val_scores.csv", index=False)
# GBM1_tuned2_cross_val_scores = pd.read_csv("models/gbm1_tuned2_cross_val_scores.csv")

In [60]:
# Per-fold RMSE followed by its mean and standard deviation (np.std default, ddof=0).
cv_summary = (
    ("Scores: ", GBM_rmse_scores),
    ("Mean: ", np.mean(GBM_rmse_scores)),
    ("Standard Deviation: ", np.std(GBM_rmse_scores)),
)
for label, value in cv_summary:
    print(label, value)

Scores:  [150.10842711 149.49591649 158.50303441 163.11327852 130.73205886
 143.9944779  168.95959574 169.37555189 148.1913733  148.21480025]
Mean:  153.06885144913846
Standard Deviation:  11.397382305582923


#### Regularization

Regolarizzazione con grid search dell'iperparametro `subsample`, fissando i valori degli altri iperparametri a quelli trovati in Hyperparameter Search 2.

##### Hyperparameter Search (`subsample`)

In [1]:
# Candidate subsample fractions: 0.40, 0.45, ..., 0.95.
vals = [*range(40, 100, 5)]
subsample = [pct / 100 for pct in vals]

# Score accumulators, filled by the search loop.
train_rmse, valid_rmse = [], []

# The grid is one-dimensional here; hyper_grid refers to the same list object.
hyper_grid = subsample

In [5]:
# Start from empty result lists so that re-running this cell does not append a
# second copy of the scores (which would no longer line up with hyper_grid).
train_rmse = []
valid_rmse = []

for hp in hyper_grid:
    # Row subsampling is random; a fixed seed makes the comparison between
    # subsample fractions repeatable instead of depending on the draw.
    model = GradientBoostingRegressor(max_depth=7, learning_rate=0.12, n_estimators=800,
                                      subsample=hp, random_state=42)
    model.fit(X_train, y_train)

    # round_pred is a project helper (utils.pyutils); presumably it predicts
    # with the model and rounds the result -- confirm there.
    pred_train = round_pred(model, X_train)
    pred_valid = round_pred(model, X_valid)

    train_rmse.append(rmse(y_train, pred_train))
    valid_rmse.append(rmse(y_valid, pred_valid))

In [6]:
# One row per subsample fraction, with the collected RMSE lists as columns.
hyper_grid_df = pd.DataFrame(hyper_grid, columns=["subsample"])
hyper_grid_df = hyper_grid_df.assign(train_rmse=train_rmse, valid_rmse=valid_rmse)

Carica hyperparameter grid da csv:

In [8]:
# Uncomment to cache the subsample search results:
# hyper_grid_df.to_csv("models/gbm1_tuned2_reg.csv")

# Forward-slash path so the notebook also runs outside Windows.
hyper_grid_df = pd.read_csv("models/gbm1_tuned2_reg.csv")
# Ten best subsample values by validation RMSE (lowest first).
hyper_grid_df.sort_values(by = "valid_rmse").head(10)

Unnamed: 0.1,Unnamed: 0,subsample,train_rmse,valid_rmse
8,8,0.8,3.577008,154.311394
10,10,0.9,3.57113,154.604064
4,4,0.6,4.323655,155.70332
5,5,0.65,4.192283,155.793854
11,11,0.95,4.03017,156.008001
6,6,0.7,3.825221,156.041635
7,7,0.75,3.671072,156.810566
3,3,0.55,4.779538,157.616535
1,1,0.45,6.278334,159.390459
2,2,0.5,5.489428,160.299065


##### Fit best model

In [11]:
# Second tuned model plus the best subsample fraction from the search above.
# Subsampling draws rows at random, so random_state is pinned to make the fit
# (and the cross validation that clones this estimator) repeatable.
bike_GBM1_tuned2_reg = GradientBoostingRegressor(max_depth=7, learning_rate=0.12, n_estimators=800,
                                                 subsample=0.80, random_state=42)
bike_GBM1_tuned2_reg.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.12, max_depth=7, n_estimators=800,
                          subsample=0.8)

Save / Load model 

In [13]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# "models\\..." literal only pointed inside the models directory on Windows.
pkl_path = "models/bike_GBM1_tuned2_reg.pkl"

# Uncomment to persist the freshly fitted model:
# with open(pkl_path, 'wb') as file:
#     pickle.dump(bike_GBM1_tuned2_reg, file)

# NOTE: pickle.load can execute arbitrary code -- only load model files that
# were produced by this project.
with open(pkl_path, 'rb') as file:
    bike_GBM1_tuned2_reg = pickle.load(file)

##### RMSE

In [14]:
# Print training / validation / testing RMSE for the regularized second model.
print_model_scores(
    bike_GBM1_tuned2_reg,
    X_train, X_valid, X_test,
    y_train, y_valid, y_test,
)

Training RMSE:  3.4570228257593945
Validation RMSE:  152.2783366730515
Testing RMSE:  151.2023878794797


##### Cross Validation

In [15]:
# 10-fold cross validation on the training set. The scorer yields negated MSE
# per fold, so the sign is flipped before taking the root to get per-fold RMSE.
scores = cross_val_score(
    bike_GBM1_tuned2_reg,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
GBM_rmse_scores = np.sqrt(np.negative(scores))

In [16]:
# GBM1_tuned2_reg_cross_val_scores = pd.DataFrame({
#     "cross_val_score": GBM_rmse_scores,
#     "scoring": "rmse",
#     "cv": 10, 
#     "model": "bike_GBM1_tuned2_reg", 
#     })
# GBM1_tuned2_reg_cross_val_scores.to_csv("models/gbm1_tuned2_reg_cross_val_scores.csv", index=False)
# GBM1_tuned2_reg_cross_val_scores = pd.read_csv("models/gbm1_tuned2_reg_cross_val_scores.csv")

In [17]:
# Per-fold RMSE followed by its mean and standard deviation (np.std default, ddof=0).
cv_summary = (
    ("Scores: ", GBM_rmse_scores),
    ("Mean: ", np.mean(GBM_rmse_scores)),
    ("Standard Deviation: ", np.std(GBM_rmse_scores)),
)
for label, value in cv_summary:
    print(label, value)

Scores:  [153.47251168 147.72753481 159.09276307 159.76651402 137.16457953
 155.25031572 171.61557114 160.32567599 150.9720689  144.03419982]
Mean:  153.94217346791325
Standard Deviation:  9.179217173074942


## GBM - selected predictors

Converti i dati in array numpy:

In [61]:
# Target column and the reduced set of predictors used for the second model family.
y_col = "rented_bike_count"
X_col = [
    "hour",
    "temperature",
    "humidity",
    "functioning_day_Yes",
    "seasons_Winter",
    "dew_point_temperature",
    "solar_radiation",
    "rainfall",
]

# Rebuild the NumPy design matrices / targets; this overwrites the
# all-predictor arrays defined earlier in the notebook.
split_frames = (bike_train, bike_test, bike_valid)
X_train, X_test, X_valid = (np.array(frame[X_col]) for frame in split_frames)
y_train, y_test, y_valid = (np.array(frame[y_col]) for frame in split_frames)

### Default model

In [62]:
# Baseline GBM on the selected predictors, default hyperparameters.
# random_state is pinned so that re-running the cell reproduces the same fit;
# without it the fitted trees are not guaranteed to be identical between runs.
bike_GBM2 = GradientBoostingRegressor(random_state=42)
bike_GBM2.fit(X_train, y_train)

GradientBoostingRegressor()

In [76]:
# Print training / validation / testing RMSE for the selected-predictor baseline.
print_model_scores(
    bike_GBM2,
    X_train, X_valid, X_test,
    y_train, y_valid, y_test,
)

Training RMSE:  248.41140460822723
Validation RMSE:  269.1756056150182
Testing RMSE:  264.42802060710545


### Hyperparameter Search

In [65]:
# Candidate values for tuning the selected-predictor model.
max_depth = [2, 3, 4, 5, 6]
# The second entry was 0.7, which broke the otherwise ascending sequence and is
# far outside the range being explored; 0.07 is the intended value.
# NOTE(review): a grid CSV cached from a run with the old value should be
# regenerated.
learning_rate = [0.05, 0.07, 0.1, 0.12, 0.15]
n_estimators = [50, 100, 150, 300, 500]

# Score accumulators, filled by the grid-search loop (one entry per grid row).
train_rmse = list()
valid_rmse = list()

# Cartesian product of (max_depth, learning_rate, n_estimators) tuples.
hyper_grid = list(itertools.product(max_depth, learning_rate, n_estimators))

In [66]:
# Start from empty result lists so that re-running this cell does not append a
# second copy of the scores (which would no longer line up with hyper_grid).
train_rmse = []
valid_rmse = []

for hp in hyper_grid:
    depth, rate, n_trees = hp
    # Fixed seed: every grid point is fitted under the same random state, so
    # score differences come from the hyperparameters and the run is repeatable.
    model = GradientBoostingRegressor(max_depth=depth, learning_rate=rate,
                                      n_estimators=n_trees, random_state=42)

    model.fit(X_train, y_train)

    # round_pred is a project helper (utils.pyutils); presumably it predicts
    # with the model and rounds the result -- confirm there.
    pred_train = round_pred(model, X_train)
    pred_valid = round_pred(model, X_valid)

    train_rmse.append(rmse(y_train, pred_train))
    valid_rmse.append(rmse(y_valid, pred_valid))

In [67]:
# One row per grid point, with the collected RMSE lists attached as columns.
hyper_grid_df = pd.DataFrame(
    hyper_grid,
    columns=["max_depth", "learning_rate", "n_estimators"],
)
hyper_grid_df = hyper_grid_df.assign(train_rmse=train_rmse, valid_rmse=valid_rmse)

Carica hyperparameter grid da csv:

In [6]:
# Uncomment to cache the grid-search results:
# hyper_grid_df.to_csv("models/gbm2_hypergrid.csv")

# Forward-slash path so the notebook also runs outside Windows.
hyper_grid_df = pd.read_csv("models/gbm2_hypergrid.csv")
# Ten best configurations by validation RMSE (lowest first).
hyper_grid_df.sort_values(by = "valid_rmse").head(10)

Unnamed: 0.1,Unnamed: 0,max_depth,learning_rate,n_estimators,train_rmse,valid_rmse
103,103,6,0.05,300,160.43714,243.37895
111,111,6,0.1,100,178.139419,243.536772
110,110,6,0.1,50,200.218586,243.831362
112,112,6,0.1,150,157.746447,244.354093
102,102,6,0.05,150,187.588261,244.501097
121,121,6,0.15,100,156.84118,244.746954
113,113,6,0.1,300,113.562955,245.02417
101,101,6,0.05,100,199.145751,245.280163
120,120,6,0.15,50,188.136065,245.682556
104,104,6,0.05,500,127.179816,245.692695


#### Fit best model

In [70]:
# Refit using the best configuration of the selected-predictor search
# (lowest validation RMSE). random_state is pinned so the refit is repeatable.
bike_GBM2_tuned = GradientBoostingRegressor(max_depth=6, learning_rate=0.05, n_estimators=300,
                                            random_state=42)
bike_GBM2_tuned.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.05, max_depth=6, n_estimators=300)

Save/Load model

In [71]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# "models\\..." literal only pointed inside the models directory on Windows.
pkl_path = "models/bike_GBM2_tuned.pkl"

# Uncomment to persist the freshly fitted model:
# with open(pkl_path, 'wb') as file:
#     pickle.dump(bike_GBM2_tuned, file)

# NOTE: pickle.load can execute arbitrary code -- only load model files that
# were produced by this project.
with open(pkl_path, 'rb') as file:
    bike_GBM2_tuned = pickle.load(file)

#### RMSE

In [75]:
# Print training / validation / testing RMSE for the tuned selected-predictor model.
print_model_scores(
    bike_GBM2_tuned,
    X_train, X_valid, X_test,
    y_train, y_valid, y_test,
)

Training RMSE:  160.43713968033114
Validation RMSE:  243.28275273967958
Testing RMSE:  245.88486419126363
