In [2]:
from codes import open_file
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.naive_bayes import GaussianNB 
from sklearn.datasets import load_digits 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV


In [3]:
d_train = open_file("../Output/d_train.csv")

In [4]:
d_train.shape
d_train.head()

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,cut_c,color_c,clarity_c,y
0,0,0,1.21,63.0,57.0,4,2,3,6134
1,1,1,0.28,64.0,56.0,2,6,5,532
2,2,2,0.42,61.2,58.0,3,4,4,1103
3,3,3,0.26,61.1,57.0,4,2,7,600
4,4,4,1.1,63.4,57.0,1,3,2,4997


In [5]:
d_test = open_file("../Output/d_test.csv")

In [6]:
d_test.shape

(13449, 8)

In [7]:
d_test.head()

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,cut_c,color_c,clarity_c
0,0,0,0.3,60.0,56.0,4,2,1
1,1,1,0.34,62.1,57.0,4,6,7
2,2,2,1.57,60.3,58.0,2,1,3
3,3,3,0.31,61.8,57.0,4,2,3
4,4,4,1.51,64.0,60.0,1,1,6


# Train

In [8]:
X = d_train.drop(columns=['y'])
y = d_train['y']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

# Model

In [10]:
gbrt = GradientBoostingRegressor(n_estimators=100)

In [11]:
gbrt.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [12]:
y_pred = gbrt.predict(d_test)

# Evaluate

In [13]:
print("Feature Importances") 
print(gbrt.feature_importances_) 

Feature Importances
[0.00000000e+00 0.00000000e+00 9.03702688e-01 7.72002927e-04
 9.36388517e-05 7.93626170e-04 2.85132341e-02 6.61248100e-02]


In [35]:
print("R-squared for Train: %.2f" %gbrt.score(X_train, y_train))
print("R-squared for Test: %.2f" %gbrt.score(X_test, y_test))

R-squared for Train: 0.98
R-squared for Test: 0.98


In [36]:
def GradientBooster(param_grid, n_jobs):
    estimator = GradientBoostingRegressor()
    cv = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
    classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=param_grid, n_jobs=n_jobs)
    classifier.fit(X_train, y_train)
    print("Best Estimator learned through GridSearch")
    print(classifier.best_estimator_)
    return cv, classifier.best_estimator_

In [37]:
param_grid={'n_estimators':[100],'learning_rate':[0.1],'max_depth':[6],'min_samples_leaf':[3],'max_features':[1.0]} 
n_jobs=4 

In [38]:
cv, best_est = GradientBooster(param_grid, n_jobs)

Best Estimator learned through GridSearch
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=6,
                          max_features=1.0, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=3, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)


In [39]:
"n_estimators: %d" %best_est.n_estimators

'n_estimators: 100'

In [40]:
"max_depth: %d" %best_est.max_depth

'max_depth: 6'

In [41]:
"Learning Rate: %.1f" %best_est.learning_rate

'Learning Rate: 0.1'

In [42]:
"min_samples_leaf: %d" %best_est.min_samples_leaf


'min_samples_leaf: 3'

In [43]:
"max_features: %.1f" %best_est.max_features

'max_features: 1.0'

In [44]:
"Train R-squared: %.2f" %best_est.score(X_train,y_train)

'Train R-squared: 0.99'

# New model parameters

In [45]:
model = GradientBoostingRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, min_samples_leaf=3,max_features=1)

In [46]:
model.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=6,
                          max_features=1, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=3, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [49]:
y_pred = model.predict(d_test)

# Export to CSV

In [50]:
result = pd.DataFrame({"id":range(len(y_pred)),"price":y_pred})

In [51]:
print(result.shape)
result.head()

(13449, 2)


Unnamed: 0,id,price
0,0,1590.940811
1,1,2269.633339
2,2,8307.631492
3,3,221.732738
4,4,9470.587271


In [52]:
d_train.y.mean()

3825.6425538406775

In [53]:
result.price.mean()

3950.6825596598765

In [54]:
result.to_csv("../Output/test_m4.csv", index=False)