In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import learning_curve, GridSearchCV

In [2]:
# Shell escape: list the working directory to see which data files are available.
!ls

First look.ipynb            test.csv
Gradient Booster_fail.ipynb train.csv
README.md                   train_final.csv
Random Forest_fail.ipynb    train_mod.csv
Random forest_good.ipynb


In [3]:
# Read the prepared training table from the notebook's working directory.
train_csv = "./train_final.csv"
data = pd.read_csv(train_csv)

In [4]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,carat,depth,table,x,y,z,price,color_num,cut_num,clarity_num
0,0,0.53,63.4,54.0,5.09,5.13,3.24,7.057,4,3,3
1,1,0.41,63.0,56.0,4.8,4.75,3.01,6.824,7,5,3
2,2,0.32,61.6,56.0,4.37,4.39,2.7,6.107,2,5,4
3,3,0.31,61.2,56.0,4.34,4.37,2.66,6.39,3,5,6
4,4,1.35,60.5,56.0,7.19,7.12,4.33,8.741,1,4,4


In [5]:
# Remove the redundant "Unnamed: 0" column (in the preview it just repeats the
# row index). Re-assigning instead of using inplace=True, together with
# errors="ignore", keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-removed column.
data = data.drop(columns="Unnamed: 0", errors="ignore")

In [6]:
# Confirm that only the feature columns and `price` remain.
data.head()

Unnamed: 0,carat,depth,table,x,y,z,price,color_num,cut_num,clarity_num
0,0.53,63.4,54.0,5.09,5.13,3.24,7.057,4,3,3
1,0.41,63.0,56.0,4.8,4.75,3.01,6.824,7,5,3
2,0.32,61.6,56.0,4.37,4.39,2.7,6.107,2,5,4
3,0.31,61.2,56.0,4.34,4.37,2.66,6.39,3,5,6
4,1.35,60.5,56.0,7.19,7.12,4.33,8.741,1,4,4


In [7]:
# Separate the regression target from the predictor columns.
target_column = 'price'
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for a final check; the fixed seed makes the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=123
)

In [8]:
# Tune a random forest with an exhaustive grid search.
# A fixed random_state makes the forest's bootstrap samples and feature
# subsets, and therefore the CV scores and the selected parameters,
# repeatable between runs.
model = RandomForestRegressor(random_state=123)

# 3 * 3 * 4 * 3 = 108 candidate combinations.
parameter_space = {'n_estimators': [100, 300, 1000],
                   'max_features': ['sqrt', 0.5, None],
                   'max_depth': [None, 10, 30, 100],
                   'min_samples_leaf': [1, 3, 10]}

# No `scoring` is passed, so candidates are ranked with the estimator's own
# default score (R^2 for a regressor). Each candidate is fitted on 5 folds
# (540 fits in total) using all available cores.
grid_search = GridSearchCV(model,
                           param_grid=parameter_space,
                           verbose=1,
                           n_jobs=-1,
                           cv=5)

# Only the training split is used for tuning; X_test/y_test stay untouched.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 28.1min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 36.4min finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': [None, 10, 30, 100],
                         'max_features': ['sqrt', 0.5, None],
                         'min_samples_leaf': [1, 3, 10],
                         'n_estimators': [100, 300, 1000]},
             verbose=1)

In [9]:
# Mean cross-validated score of the best parameter combination. For a regressor
# with no `scoring` argument this is R^2, not a classification accuracy.
grid_search.best_score_

0.9916912332844878

In [10]:
# Check generalisation: score the best estimator found by the search on the
# 20% hold-out split that took no part in the tuning.
best_rf = grid_search.best_estimator_
best_rf.score(X=X_test, y=y_test)

0.9915556265651752

In [11]:
# Hyperparameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

{'max_depth': 30,
 'max_features': 0.5,
 'min_samples_leaf': 1,
 'n_estimators': 1000}

In [12]:
# Score on the full labelled dataset (X, y). NOTE: this is not a test-set figure:
# X includes the 80% of rows (X_train) that were used to fit the model, so the
# value is optimistic. The hold-out score on X_test is the honest estimate.
best_rf.score(X, y)

0.9974382754129086

In [13]:
# Load the unlabelled test set to predict on (its preview shows no `price` column).
# NOTE(review): test_final.csv does not appear in the directory listing printed at
# the top of the notebook (only test.csv does); presumably it is generated by
# another step. Confirm the file exists before running.
data_test = pd.read_csv("./test_final.csv")

In [14]:
# Remove the redundant "Unnamed: 0" column so the test frame has the same
# predictor columns as the training data. Re-assigning with errors="ignore"
# (instead of inplace=True) makes the cell idempotent: re-running it no longer
# raises a KeyError for the already-removed column.
data_test = data_test.drop(columns="Unnamed: 0", errors="ignore")

In [15]:
# Preview the test predictors after removing the leftover column.
data_test.head()

Unnamed: 0,carat,depth,table,x,y,z,color_num,cut_num,clarity_num
0,0.33,61.9,55.0,4.44,4.42,2.74,3,5,8
1,0.41,61.8,54.0,4.79,4.76,2.95,6,5,4
2,0.91,62.5,59.0,6.16,6.23,3.87,6,3,2
3,0.42,62.6,57.0,4.76,4.8,2.99,4,3,4
4,0.54,61.5,56.0,5.28,5.25,3.24,4,5,8


In [16]:
# Build the prediction matrix from exactly the columns (and column order) the
# model was trained on. Taking an explicit copy, rather than aliasing
# data_test, means columns added to data_test later (e.g. `price`, `id`)
# never leak into the model input, so the prediction cell can be re-run safely.
Z_test = data_test[X.columns].copy()

In [17]:
# Predict with the tuned forest and store the result as a new `price` column
# on data_test (this mutates data_test in place), then display the frame.
data_test["price"] = best_rf.predict(Z_test)
data_test

Unnamed: 0,carat,depth,table,x,y,z,color_num,cut_num,clarity_num,price
0,0.33,61.9,55.0,4.44,4.42,2.74,3,5,8,6.847592
1,0.41,61.8,54.0,4.79,4.76,2.95,6,5,4,6.911770
2,0.91,62.5,59.0,6.16,6.23,3.87,6,3,2,8.251195
3,0.42,62.6,57.0,4.76,4.80,2.99,4,3,4,6.749106
4,0.54,61.5,56.0,5.28,5.25,3.24,4,5,8,7.720795
...,...,...,...,...,...,...,...,...,...,...
13480,0.55,61.7,56.4,5.26,5.30,3.25,5,5,3,7.311305
13481,1.12,60.6,59.0,6.77,6.70,4.08,3,4,4,8.670062
13482,0.37,61.5,57.0,4.63,4.60,2.84,7,5,3,6.692395
13483,0.54,59.9,63.0,5.25,5.30,3.16,6,2,3,7.249153


In [18]:
# Add a sequential 0..n-1 `id` as the first column for the submission file.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell re-runnable.
if 'id' not in data_test.columns:
    data_test.insert(0, 'id', range(len(data_test)))
data_test

Unnamed: 0,id,carat,depth,table,x,y,z,color_num,cut_num,clarity_num,price
0,0,0.33,61.9,55.0,4.44,4.42,2.74,3,5,8,6.847592
1,1,0.41,61.8,54.0,4.79,4.76,2.95,6,5,4,6.911770
2,2,0.91,62.5,59.0,6.16,6.23,3.87,6,3,2,8.251195
3,3,0.42,62.6,57.0,4.76,4.80,2.99,4,3,4,6.749106
4,4,0.54,61.5,56.0,5.28,5.25,3.24,4,5,8,7.720795
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.55,61.7,56.4,5.26,5.30,3.25,5,5,3,7.311305
13481,13481,1.12,60.6,59.0,6.77,6.70,4.08,3,4,4,8.670062
13482,13482,0.37,61.5,57.0,4.63,4.60,2.84,7,5,3,6.692395
13483,13483,0.54,59.9,63.0,5.25,5.30,3.16,6,2,3,7.249153


In [19]:
# Keep only the two columns that go into the submission file.
submission_columns = ["id", "price"]
data_submit = data_test.loc[:, submission_columns]

In [20]:
# Inspect the two-column submission frame (`id`, `price`).
data_submit

Unnamed: 0,id,price
0,0,6.847592
1,1,6.911770
2,2,8.251195
3,3,6.749106
4,4,7.720795
...,...,...
13480,13480,7.311305
13481,13481,8.670062
13482,13482,6.692395
13483,13483,7.249153


In [23]:
# Use `id` as the index so it is written as the leading column on export.
# Re-assign instead of mutating in place, and skip the step when `id` is no
# longer a regular column, so re-running this cell does not raise a KeyError.
if 'id' in data_submit.columns:
    data_submit = data_submit.set_index('id')

In [24]:
# Verify that `id` is now the index and `price` is the only data column.
data_submit

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,6.847592
1,6.911770
2,8.251195
3,6.749106
4,7.720795
...,...
13480,7.311305
13481,8.670062
13482,6.692395
13483,7.249153


In [25]:
# Persist the submission; the frame's index is written as the first CSV column.
submission_path = "./submission_forest.csv"
data_submit.to_csv(submission_path)