In [6]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [7]:
data = pd.read_csv("./train_final.csv")

In [8]:
data.head()

Unnamed: 0.1,Unnamed: 0,carat,depth,table,x,y,z,price,color_num,cut_num,clarity_num
0,0,0.53,63.4,54.0,5.09,5.13,3.24,7.057,4,3,3
1,1,0.41,63.0,56.0,4.8,4.75,3.01,6.824,7,5,3
2,2,0.32,61.6,56.0,4.37,4.39,2.7,6.107,2,5,4
3,3,0.31,61.2,56.0,4.34,4.37,2.66,6.39,3,5,6
4,4,1.35,60.5,56.0,7.19,7.12,4.33,8.741,1,4,4


In [9]:
data.drop("Unnamed: 0", axis=1, inplace=True)

In [10]:
data.head()

Unnamed: 0,carat,depth,table,x,y,z,price,color_num,cut_num,clarity_num
0,0.53,63.4,54.0,5.09,5.13,3.24,7.057,4,3,3
1,0.41,63.0,56.0,4.8,4.75,3.01,6.824,7,5,3
2,0.32,61.6,56.0,4.37,4.39,2.7,6.107,2,5,4
3,0.31,61.2,56.0,4.34,4.37,2.66,6.39,3,5,6
4,1.35,60.5,56.0,7.19,7.12,4.33,8.741,1,4,4


In [11]:
# Split target and predictors
X = data.drop('price', axis=1)
y = data['price']


# split data in train/test 
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=123)

In [12]:
model = GradientBoostingRegressor(n_estimators=3000)

params = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
          'max_depth': [4, 6],
          'min_samples_leaf': [3, 5, 9, 17],
          'max_features': [1, 0.3, 0.1]}

grid_search = GridSearchCV(model,
                           param_grid=params,
                           cv=2,
                           n_jobs=3,
                           verbose=1)

grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 96 candidates, totalling 192 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  3.9min
[Parallel(n_jobs=3)]: Done 192 out of 192 | elapsed: 16.3min finished


GridSearchCV(cv=2, estimator=GradientBoostingRegressor(n_estimators=3000),
             n_jobs=3,
             param_grid={'learning_rate': [0.1, 0.05, 0.02, 0.01],
                         'max_depth': [4, 6], 'max_features': [1, 0.3, 0.1],
                         'min_samples_leaf': [3, 5, 9, 17]},
             verbose=1)

In [14]:
params = {'n_estimators': [6000],
          'learning_rate': [0.013924766500838348],
          'max_depth': [grid_search.best_params_['max_depth']],
          'min_samples_leaf': [grid_search.best_params_['min_samples_leaf']],
          'max_features': [grid_search.best_params_['max_features']]}

grid_search = GridSearchCV(model,
                           param_grid=params,
                           cv=5,
                           n_jobs=3,
                           verbose=1)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  2.2min finished


GridSearchCV(cv=5, estimator=GradientBoostingRegressor(n_estimators=3000),
             n_jobs=3,
             param_grid={'learning_rate': [0.013924766500838348],
                         'max_depth': [6], 'max_features': [0.3],
                         'min_samples_leaf': [3], 'n_estimators': [6000]},
             verbose=1)

In [15]:
# State the final Accuracy of the selected model for the train split
grid_search.best_score_

0.9928744497401795

In [16]:
best_rf = grid_search.best_estimator_
best_rf.score(X_test, y_test)

0.9926998377533265

In [17]:
# Finally, we try the model in the whole test!
best_rf.score(X, y)

0.9955195735152658

In [18]:
# Importing test set:
data_test = pd.read_csv("./test_final.csv")

In [20]:
data_test.drop("Unnamed: 0", axis=1, inplace=True)

In [21]:
data_test.head()

Unnamed: 0,carat,depth,table,x,y,z,color_num,cut_num,clarity_num
0,0.33,61.9,55.0,4.44,4.42,2.74,3,5,8
1,0.41,61.8,54.0,4.79,4.76,2.95,6,5,4
2,0.91,62.5,59.0,6.16,6.23,3.87,6,3,2
3,0.42,62.6,57.0,4.76,4.8,2.99,4,3,4
4,0.54,61.5,56.0,5.28,5.25,3.24,4,5,8


In [23]:
Z_test = data_test

In [25]:
data_test["price"] = best_rf.predict(Z_test)
data_test

Unnamed: 0,carat,depth,table,x,y,z,color_num,cut_num,clarity_num,price
0,0.33,61.9,55.0,4.44,4.42,2.74,3,5,8,6.888326
1,0.41,61.8,54.0,4.79,4.76,2.95,6,5,4,6.937136
2,0.91,62.5,59.0,6.16,6.23,3.87,6,3,2,8.256394
3,0.42,62.6,57.0,4.76,4.80,2.99,4,3,4,6.712405
4,0.54,61.5,56.0,5.28,5.25,3.24,4,5,8,7.766681
...,...,...,...,...,...,...,...,...,...,...
13480,0.55,61.7,56.4,5.26,5.30,3.25,5,5,3,7.310851
13481,1.12,60.6,59.0,6.77,6.70,4.08,3,4,4,8.658846
13482,0.37,61.5,57.0,4.63,4.60,2.84,7,5,3,6.741228
13483,0.54,59.9,63.0,5.25,5.30,3.16,6,2,3,7.278928


In [28]:
data_test.insert(0, 'id', range(len(data_test)))
data_test

Unnamed: 0,id,carat,depth,table,x,y,z,color_num,cut_num,clarity_num,price
0,0,0.33,61.9,55.0,4.44,4.42,2.74,3,5,8,6.888326
1,1,0.41,61.8,54.0,4.79,4.76,2.95,6,5,4,6.937136
2,2,0.91,62.5,59.0,6.16,6.23,3.87,6,3,2,8.256394
3,3,0.42,62.6,57.0,4.76,4.80,2.99,4,3,4,6.712405
4,4,0.54,61.5,56.0,5.28,5.25,3.24,4,5,8,7.766681
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.55,61.7,56.4,5.26,5.30,3.25,5,5,3,7.310851
13481,13481,1.12,60.6,59.0,6.77,6.70,4.08,3,4,4,8.658846
13482,13482,0.37,61.5,57.0,4.63,4.60,2.84,7,5,3,6.741228
13483,13483,0.54,59.9,63.0,5.25,5.30,3.16,6,2,3,7.278928


In [31]:
data_submit = data_test[["id", "price"]]

In [37]:
data_submit

Unnamed: 0,id,price
0,0,6.888326
1,1,6.937136
2,2,8.256394
3,3,6.712405
4,4,7.766681
...,...,...
13480,13480,7.310851
13481,13481,8.658846
13482,13482,6.741228
13483,13483,7.278928


In [38]:
data_submit.set_index('id', inplace=True)

In [39]:
data_submit.to_csv("./submission_gradient.csv")

In [42]:
# Let's tweak this a little bit more!
params = {'n_estimators': [7500, 8000, 9000],
          'learning_rate': np.logspace(-1, -3, num=10) * 3,
          'max_depth': [6],
          'min_samples_leaf': [3],
          'max_features': [0.3]}

grid_search = GridSearchCV(model,
                           param_grid=params,
                           cv=5,
                           n_jobs=3,
                           verbose=1)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed: 28.1min
[Parallel(n_jobs=3)]: Done 150 out of 150 | elapsed: 87.1min finished


GridSearchCV(cv=5, estimator=GradientBoostingRegressor(n_estimators=3000),
             n_jobs=3,
             param_grid={'learning_rate': array([0.3       , 0.17984528, 0.10781441, 0.06463304, 0.03874649,
       0.02322791, 0.01392477, 0.00834768, 0.0050043 , 0.003     ]),
                         'max_depth': [6], 'max_features': [0.3],
                         'min_samples_leaf': [3],
                         'n_estimators': [7500, 8000, 9000]},
             verbose=1)

In [49]:
grid_search.best_estimator_

GradientBoostingRegressor(learning_rate=0.008347678206621377, max_depth=6,
                          max_features=0.3, min_samples_leaf=3,
                          n_estimators=9000)

In [47]:
best_rf = grid_search.best_estimator_
best_rf.score(X_test, y_test)

0.9927270397691542

In [48]:
# Finally, we try the model in the whole test!
best_rf.score(X, y)

0.995388064140965

In [50]:
# Let's tweak this a little bit more!
params = {'n_estimators': [15000],
          'learning_rate': np.logspace(-1, -3, num=10) * 3,
          'max_depth': [6],
          'min_samples_leaf': [3],
          'max_features': [0.3]}

grid_search = GridSearchCV(model,
                           param_grid=params,
                           cv=5,
                           n_jobs=3,
                           verbose=1)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed: 42.4min
[Parallel(n_jobs=3)]: Done  50 out of  50 | elapsed: 48.0min finished


GridSearchCV(cv=5, estimator=GradientBoostingRegressor(n_estimators=3000),
             n_jobs=3,
             param_grid={'learning_rate': array([0.3       , 0.17984528, 0.10781441, 0.06463304, 0.03874649,
       0.02322791, 0.01392477, 0.00834768, 0.0050043 , 0.003     ]),
                         'max_depth': [6], 'max_features': [0.3],
                         'min_samples_leaf': [3], 'n_estimators': [15000]},
             verbose=1)

In [60]:
best_rf = grid_search.best_estimator_
best_rf.score(X_test, y_test)

0.992729575773177

In [61]:
# Our score improved slightly!!
best_rf.score(X, y)

0.9960612796032512

In [58]:
data_test.drop('price', axis=1, inplace=True)

In [62]:
data_test.drop('id', axis=1, inplace=True)

In [63]:
data_test["price"] = best_rf.predict(Z_test)
data_test

Unnamed: 0,carat,depth,table,x,y,z,color_num,cut_num,clarity_num,price
0,0.33,61.9,55.0,4.44,4.42,2.74,3,5,8,6.890797
1,0.41,61.8,54.0,4.79,4.76,2.95,6,5,4,6.946350
2,0.91,62.5,59.0,6.16,6.23,3.87,6,3,2,8.255948
3,0.42,62.6,57.0,4.76,4.80,2.99,4,3,4,6.726772
4,0.54,61.5,56.0,5.28,5.25,3.24,4,5,8,7.777579
...,...,...,...,...,...,...,...,...,...,...
13480,0.55,61.7,56.4,5.26,5.30,3.25,5,5,3,7.310377
13481,1.12,60.6,59.0,6.77,6.70,4.08,3,4,4,8.640663
13482,0.37,61.5,57.0,4.63,4.60,2.84,7,5,3,6.745203
13483,0.54,59.9,63.0,5.25,5.30,3.16,6,2,3,7.279994


In [64]:
data_test.insert(0, 'id', range(len(data_test)))
data_test

Unnamed: 0,id,carat,depth,table,x,y,z,color_num,cut_num,clarity_num,price
0,0,0.33,61.9,55.0,4.44,4.42,2.74,3,5,8,6.890797
1,1,0.41,61.8,54.0,4.79,4.76,2.95,6,5,4,6.946350
2,2,0.91,62.5,59.0,6.16,6.23,3.87,6,3,2,8.255948
3,3,0.42,62.6,57.0,4.76,4.80,2.99,4,3,4,6.726772
4,4,0.54,61.5,56.0,5.28,5.25,3.24,4,5,8,7.777579
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.55,61.7,56.4,5.26,5.30,3.25,5,5,3,7.310377
13481,13481,1.12,60.6,59.0,6.77,6.70,4.08,3,4,4,8.640663
13482,13482,0.37,61.5,57.0,4.63,4.60,2.84,7,5,3,6.745203
13483,13483,0.54,59.9,63.0,5.25,5.30,3.16,6,2,3,7.279994


In [65]:
data_submit_1 = data_test[["id", "price"]]

In [67]:
data_submit_1

Unnamed: 0,id,price
0,0,6.890797
1,1,6.946350
2,2,8.255948
3,3,6.726772
4,4,7.777579
...,...,...
13480,13480,7.310377
13481,13481,8.640663
13482,13482,6.745203
13483,13483,7.279994


In [68]:
data_submit_1.set_index('id', inplace=True)

In [69]:
data_submit.to_csv("./submission_gradient_improved.csv")