## Imports

In [1]:
import numpy as np
import pandas as pd

# scikit-learn
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

## Load data

In [2]:
# Labelled training set (predictors plus the `price` column).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
train_csv = '''/home/david/Documents//learning_repositories/dataptmad-0420-classes/week_14/ml_guided_lesson/data/diamonds_train.csv'''
diamonds = pd.read_csv(train_csv)
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.00
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95
...,...,...,...,...,...,...,...,...,...,...
40450,1.34,Ideal,G,VS1,62.7,57.0,10070,7.10,7.04,4.43
40451,2.02,Good,F,SI2,57.1,60.0,12615,8.31,8.25,4.73
40452,1.01,Ideal,H,SI1,62.7,56.0,5457,6.37,6.42,4.01
40453,0.33,Ideal,J,VS1,61.9,54.3,456,4.45,4.47,2.76


In [3]:
# Unlabelled set to score for the submission (carries an `id` column, no `price`).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
predict_csv = '''/home/david/Documents/learning_repositories/dataptmad-0420-classes/week_14/ml_guided_lesson/data/diamonds_predict.csv'''
diamonds_predict = pd.read_csv(predict_csv)
diamonds_predict

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19
...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45


## Feature engineering

In [4]:
# Column the model learns to predict.
target = 'price'

# Predictors, grouped by the preprocessing branch each group is routed to.
num_features = ['carat', 'depth', 'table', 'x', 'y', 'z']
cat_features = ['cut', 'color', 'clarity']

# Full predictor list: numeric columns first, then categorical ones.
feats = [*num_features, *cat_features]

In [5]:
# Show the column labels of the training frame.
diamonds.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [6]:
# Show the column labels of the frame to be scored, for comparison with the training frame.
diamonds_predict.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z'],
      dtype='object')

## Pre-processing

In [7]:
# Numeric branch: median imputation followed by RobustScaler.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler()),
    ]
)

# Categorical branch: fill gaps with the constant 'missing', then one-hot encode
# with handle_unknown='ignore' so unseen categories do not raise at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [8]:
# Route each feature group through its own sub-pipeline. The branch names
# ('num', 'cat') are the ones the search grid addresses, e.g.
# 'preprocessor__num__imputer__strategy'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
    ]
)


In [9]:
# Display the assembled ColumnTransformer to check its structure.
preprocessor

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', RobustScaler())]),
                                 ['carat', 'depth', 'table', 'x', 'y', 'z']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['cut', 'color', 'clarity'])])

## Train-test split

In [10]:
# Hold out part of the labelled data to compare train and test error later.
# A fixed seed keeps the split (and every number derived from it) identical
# across kernel restarts.
diamonds_train, diamonds_test = train_test_split(diamonds, random_state=42)

## Model

In [11]:
# End-to-end estimator: preprocessing followed by a random forest regressor.
# The forest is seeded so that repeated runs give the same scores and
# predictions.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(random_state=42))])

In [12]:
# Fit the pipeline on the training split only; the trailing `;` suppresses the repr.
model.fit(X=diamonds_train[feats], y=diamonds_train[target]);

In [13]:
# 5-fold cross-validation over the full labelled frame. The scorer yields
# negated RMSE values, so the sign is flipped where the scores are reported.
scores = cross_val_score(
    estimator=model,
    X=diamonds[feats],
    y=diamonds[target],
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

In [14]:
## Optimizing

In [15]:
# Search space, addressed through the pipeline's step names
# (2 imputer strategies x 6 forest sizes x 4 depths = 48 combinations).
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8, 16],
}

# Randomized search: n_iter=32 candidates are sampled from the space above,
# each scored with 5-fold CV. The seed fixes which candidates get sampled so
# the search is repeatable.
# NOTE: despite its name, `grid_search` is a RandomizedSearchCV; the name is
# kept because later cells reference it.
grid_search = RandomizedSearchCV(model,
                                 param_grid,
                                 cv=5,
                                 verbose=10,
                                 scoring='neg_root_mean_squared_error',
                                 n_jobs=-1,
                                 n_iter=32,
                                 random_state=42)

grid_search.fit(diamonds[feats], diamonds[target])

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 154 out of 160 | elapsed:  4.1min remaining:    9.7s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:  5.0min finished


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('scaler',
                                                                                                RobustScaler())]),
                                                                               ['carat',
                                                                                'depth',
                                                                                'table',
                                                                                'x',
                

In [16]:
# Parameter combination the search reports as best.
grid_search.best_params_

{'regressor__n_estimators': 512,
 'regressor__max_depth': 16,
 'preprocessor__num__imputer__strategy': 'median'}

In [17]:
# Score of the best candidate; negative because the search was configured with
# scoring='neg_root_mean_squared_error'.
grid_search.best_score_

-557.5225966899159

In [18]:
# Score the unlabelled set with the fitted search object, bound the predicted
# prices to the [300, 18000] range, and write the submission file.
y_pred = grid_search.predict(diamonds_predict[feats])
submission_df = pd.DataFrame(
    {
        'id': diamonds_predict['id'],
        'price': np.clip(y_pred, 300, 18000),
    }
)

submission_df.to_csv('./submissions/GS.csv', index=False)

## Validation & Metrics

In [19]:
# Flip the sign of the negated-RMSE fold scores and average them.
(-scores).mean()

561.1189675924896

In [20]:
# Model predictions for the hold-out and training splits, in that order.
# Despite their names, `y_test` and `y_train` hold predictions, not true labels.
y_test, y_train = [model.predict(split[feats])
                   for split in (diamonds_test, diamonds_train)]

In [21]:
# RMSE on each split. The square root is taken explicitly because the
# `squared=False` flag of mean_squared_error is deprecated in recent
# scikit-learn releases; np.sqrt of the MSE gives the same value on any version.
print(f"test error: {np.sqrt(mean_squared_error(y_pred=y_test, y_true=diamonds_test[target]))}")
print(f"train error: {np.sqrt(mean_squared_error(y_pred=y_train, y_true=diamonds_train[target]))}")

test error: 564.7055415078697
train error: 210.19695350145506


## Last training

In [22]:
# Re-display the full labelled frame ahead of the final training step.
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.00
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95
...,...,...,...,...,...,...,...,...,...,...
40450,1.34,Ideal,G,VS1,62.7,57.0,10070,7.10,7.04,4.43
40451,2.02,Good,F,SI2,57.1,60.0,12615,8.31,8.25,4.73
40452,1.01,Ideal,H,SI1,62.7,56.0,5457,6.37,6.42,4.01
40453,0.33,Ideal,J,VS1,61.9,54.3,456,4.45,4.47,2.76


In [23]:
# NOTE(review): the refit on the full labelled frame is disabled here, so the
# `model` used for the submission below presumably still carries the fit made
# on `diamonds_train` only — confirm whether this line should be re-enabled.
#model.fit(diamonds[feats], diamonds[target])

## Submission

In [24]:
# Pass exactly the feature columns the pipeline was fitted on, as the
# search-based submission does, instead of the whole frame with `id` included.
y_predict = model.predict(diamonds_predict[feats])



In [25]:
# Pair every `id` from the unlabelled set with its predicted price, then display the frame.
submission = pd.DataFrame(
    {
        'id': diamonds_predict['id'],
        'price': y_predict,
    }
)
submission

Unnamed: 0,id,price
0,0,3029.90
1,1,5379.96
2,2,9040.50
3,3,4112.32
4,4,1703.36
...,...,...
13480,13480,1891.50
13481,13481,2366.51
13482,13482,2937.13
13483,13483,2183.12


In [26]:
# Write the submission to disk without the index column.
submission.to_csv(path_or_buf='./submissions/RFR_Crossval.csv', index=False)