### Imports ###

In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

### Loading Datasets ### 

In [2]:
# Training features; the first CSV column becomes the row index.
X = pd.read_csv('../data/x_004.csv', index_col=0)

# Training target, flattened to a 1-D array.
y = pd.read_csv('../data/y_004.csv', index_col=0).values.ravel()

# Feature rows that the submission predictions are made for.
x_test = pd.read_csv('../data/x_test_004.csv', index_col=0)

### Model ###

In [3]:
# Estimator used by every cell below. RandomForestRegressor(random_state=42)
# can be swapped in here instead; LinearRegression would first need to be
# imported and does not accept a random_state argument.
model = GradientBoostingRegressor(random_state=42)

### Train-Test-Split ###

In [4]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
%%time

train = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
rmse

CPU times: user 2.83 s, sys: 31.2 ms, total: 2.86 s
Wall time: 2.96 s


629.5878165984302

In [6]:
%%time

train_final = model.fit(X, y)
y_pred_final = model.predict(x_test)

CPU times: user 4.11 s, sys: 74.8 ms, total: 4.19 s
Wall time: 4.84 s


### Cross Validation ###

In [7]:
%%time

scores = cross_val_score(model, 
                         X, 
                         y, 
                         scoring='neg_root_mean_squared_error', 
                         cv=5,
                         n_jobs=-1)

print(type(model), '\n')
print(scores, '\n')
print(np.mean(-scores), '\n')

<class 'sklearn.ensemble._gb.GradientBoostingRegressor'> 

[-658.94974367 -622.09484858 -615.74882842 -625.48207746 -619.29574149] 

628.3142479245123 

CPU times: user 49.6 ms, sys: 90.5 ms, total: 140 ms
Wall time: 11.4 s


### Grid Search ###

In [8]:
%%time

param_grid = {'n_estimators': [16, 32, 64, 128, 256, 512],
              'max_depth': [2, 4, 8, 16]}

grid_search = GridSearchCV(model,
                           param_grid,
                           cv=5,
                           verbose=3,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_model = GradientBoostingRegressor(**best_params)

#best_model.fit(X_train, y_train)

print('Best hyperparameters: ', grid_search.best_params_, '\n')
print('Best score: ', grid_search.best_score_, '\n')

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best hyperparameters:  {'max_depth': 4, 'n_estimators': 256} 

Best score:  -548.9368058746908 

CPU times: user 9.75 s, sys: 202 ms, total: 9.96 s
Wall time: 9min 30s
[CV 2/5] END ..max_depth=2, n_estimators=16;, score=-1537.220 total time=   0.4s
[CV 2/5] END ..max_depth=2, n_estimators=32;, score=-1132.529 total time=   0.8s
[CV 5/5] END ..max_depth=2, n_estimators=32;, score=-1209.658 total time=   0.8s
[CV 4/5] END ...max_depth=2, n_estimators=64;, score=-861.105 total time=   2.6s
[CV 3/5] END ..max_depth=2, n_estimators=128;, score=-764.859 total time=   4.3s
[CV 2/5] END ..max_depth=2, n_estimators=256;, score=-665.585 total time=   8.5s
[CV 1/5] END ..max_depth=2, n_estimators=512;, score=-666.076 total time=  18.2s
[CV 5/5] END ..max_depth=2, n_estimators=512;, score=-670.586 total time=  18.4s
[CV 3/5] END ..max_depth=4, n_estimators=128;, score=-570.369 total time=   6.5s
[CV 2/5] END ..max_depth=4, n_estimators=

In [9]:
%%time

best_model.fit(X,y)

y_pred_grid = best_model.predict(x_test)

print('Best score: ', grid_search.best_score_, '\n')

Best score:  -548.9368058746908 

CPU times: user 13.5 s, sys: 240 ms, total: 13.7 s
Wall time: 15 s


### Submission ###

In [None]:
# Creating dataframe
# Label each prediction with the index of the row it was predicted for
# (x_test was loaded with its first CSV column as the index) instead of an
# implicit 0..n-1 position, so ids stay correct even if they are not
# contiguous. The output is unchanged when that index already is 0..n-1.
submission = pd.DataFrame(y_pred_grid, columns=['price'], index=x_test.index)
submission = submission.rename_axis('id')

# Final csv
# NOTE(review): 'submmissions' looks misspelled — confirm it matches the
# actual directory name before changing it.
submission.to_csv('../submmissions/sub_021.csv')