# Kaggle Competition - Parameter selection & Testing

In [1]:
import pandas as pd
import numpy as np
import math

from process_fun import *
from train_fun import *

from sklearn.model_selection import train_test_split

### Preparing data

In [2]:
d_train = pd.read_csv("../input/diamonds-datamad0320/diamonds_train.csv")
d_test = pd.read_csv("../input/diamonds-datamad0320/diamonds_test.csv")

In [3]:
d_train.drop(columns=['x', 'y', 'z','Unnamed: 0'],inplace=True)

cutlis=['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
to_num_con(d_train, 'cut', cutlis)

colorlis = ['J','I','H','G','F','E','D']
to_num_con(d_train, 'color', colorlis)

clalis = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']
to_num_con(d_train, 'clarity', clalis) 

d_train.head(3)

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,1.21,5,3,4,63.0,57.0,6134
1,0.28,3,7,6,64.0,56.0,532
2,0.42,4,5,5,61.2,58.0,1103


In [4]:
d_test.drop(columns=['x', 'y', 'z','Unnamed: 0'],inplace=True)

to_num_con(d_test, 'cut', cutlis)
to_num_con(d_test, 'color', colorlis)
to_num_con(d_test, 'clarity', clalis) 

d_test.head(3)

Unnamed: 0,carat,cut,color,clarity,depth,table
0,0.3,5,3,2,60.0,56.0
1,0.34,5,7,8,62.1,57.0
2,1.57,3,2,4,60.3,58.0


Split data to train the model:
* Standarized `X` `y` 

In [5]:
X = norm(d_train.drop(columns=['price']))
y = d_train['price']

In [6]:
X.head(3)

Unnamed: 0,carat,cut,color,clarity,depth,table
0,0.880093,0.982962,-0.833658,-0.035196,0.877395,-0.205924
1,-1.095267,-0.809998,1.525915,1.177942,1.579543,-0.65329
2,-0.797901,0.086482,0.346128,0.571373,-0.386471,0.241442


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
d_test = norm(d_test)
d_test.head(3)

Unnamed: 0,carat,cut,color,clarity,depth,table
0,-1.048154,0.97865,-0.805754,-1.238972,-1.203383,-0.65129
1,-0.964994,0.97865,1.521942,2.416515,0.248074,-0.202597
2,1.592168,-0.810376,-1.387678,-0.020476,-0.996032,0.246095


## 4. Select correct parameters

* The more acurated models are:
    - RandomForestRegressor
    - GradientBoostingRegressor
    - DecisionTreeRegressor
    - KNeighborsRegressor

#### DecisionTreeRegressor

In [9]:
dtparams = {'criterion':['mse','friedman_mse','mae'],'splitter':['best','random']}

In [10]:
grid(DecisionTreeRegressor(), dtparams, X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.5min finished


Best params:  {'criterion': 'mae', 'splitter': 'random'}
Best estimator:  DecisionTreeRegressor(ccp_alpha=0.0, criterion='mae', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='random')


'Best score: 0.9650146376483273'

In [12]:
dtmodel = DecisionTreeRegressor().set_params(ccp_alpha=0.0, criterion='mse', min_impurity_decrease=0.0, 
                                             min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, 
                                             presort='deprecated', splitter='best').fit(X_train, y_train)

Checking RMSE

In [13]:
dty_pred = dtmodel.predict(X_test)

In [14]:
math.sqrt(mean_squared_error(dty_pred,y_test))

744.9347873979948

In [15]:
DTReg = final_test(dtmodel, d_test)

#### KNeighborsRegressor

In [16]:
knparamters = {'n_neighbors':[11,12,13],'weights':['distance','uniform']}

In [17]:
grid(KNeighborsRegressor(), knparamters, X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best params:  {'n_neighbors': 12, 'weights': 'distance'}
Best estimator:  KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=12, p=2,
                    weights='distance')


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   12.1s finished


'Best score: 0.960174022680752'

In [18]:
knmodel = KNeighborsRegressor().set_params(algorithm='auto', leaf_size=30, metric='minkowski',
                                         metric_params=None, n_jobs=None, n_neighbors=12,
                                         p=2,weights='distance').fit(X_train, y_train)

Checking RMSE

In [19]:
kny_pred = knmodel.predict(X_test)

In [20]:
math.sqrt(mean_squared_error(kny_pred,y_test))

796.1350045517913

In [21]:
KNReg = final_test(knmodel, d_test)

#### GradientBoostingRegressor (Scaled)

In [22]:
gbparams = {'n_estimators':[100,200,300],'min_samples_leaf':[5,7,9]}

In [23]:
grid(GradientBoostingRegressor(), gbparams, X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   56.0s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.9min finished


Best params:  {'min_samples_leaf': 7, 'n_estimators': 300}
Best estimator:  GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=7, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=300,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)


'Best score: 0.9797032904619922'

In [24]:
gbmodel = GradientBoostingRegressor().set_params(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                                                 learning_rate=0.1, loss='ls', max_depth=3,min_impurity_decrease=0.0, 
                                                 min_samples_leaf=5, min_samples_split=2,min_weight_fraction_leaf=0.0,
                                                 n_estimators=300, subsample=1.0, tol=0.0001,validation_fraction=0.1, 
                                                 verbose=0, warm_start=False).fit(X_train, y_train)

Checking RMSE

In [25]:
gby_pred = gbmodel.predict(X_test)

In [26]:
math.sqrt(mean_squared_error(gby_pred,y_test))

580.5870119629052

In [27]:
GBReg = final_test(gbmodel, d_test)

#### RandomForestRegressor

In [34]:
rfparams = {'n_estimators':[10,55,100],'min_samples_leaf':[8,10,12]}

In [35]:
grid(RandomForestRegressor(), rfparams, X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   34.7s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.1min finished


Best params:  {'min_samples_leaf': 8, 'n_estimators': 55}
Best estimator:  RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=8,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=55, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)


'Best score: 0.9797617915394277'

In [36]:
rfmodel = RandomForestRegressor().set_params(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_features='auto', 
                                             min_impurity_decrease=0.0,min_samples_leaf=12, min_samples_split=2, 
                                             min_weight_fraction_leaf=0.0, n_estimators=55, verbose=1, 
                                             warm_start=False).fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed:    3.4s finished


Checking RMSE

In [37]:
rfy_pred = rfmodel.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed:    0.1s finished


In [38]:
math.sqrt(mean_squared_error(rfy_pred,y_test))

585.0880545195065

In [39]:
RFReg = final_test(rfmodel, d_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed:    0.1s finished


In [40]:
RFReg.head()

Unnamed: 0,id,price
0,0,444.622031
1,1,1270.29129
2,2,9210.0943
3,3,563.171117
4,4,9507.169029


## 4. Test model

Treat the `d_test` dataset as `d_train`

In [41]:
RFReg.to_csv('../output/sample_submission.csv',index=False)

In [41]:
GBReg.to_csv('../output/sample_submission.csv',index=False)