# SCIKIT-LEARN MACHINE LEARNING PIPELINE

## Data loading

In [1]:
import pandas as pd

In [2]:
# Labelled training data (contains the `price` column).
diamonds = pd.read_csv(filepath_or_buffer='../data/raw/diamonds_train.csv')
# Rows to score for the submission (has an `id` column and no `price`).
diamonds_predict = pd.read_csv(filepath_or_buffer='../data/raw/diamonds_predict.csv')

In [3]:
# Column names of the training frame.
diamonds.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [4]:
# (rows, columns) of the training frame.
diamonds.shape

(40455, 10)

In [5]:
# Column names of the frame that will be scored for the submission.
diamonds_predict.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z'],
      dtype='object')

In [6]:
# (rows, columns) of the frame that will be scored for the submission.
diamonds_predict.shape

(13485, 10)

### Identifying features

In [7]:
# Per-column dtype and non-null count of the training frame.
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    40455 non-null  float64
 1   cut      40455 non-null  object 
 2   color    40455 non-null  object 
 3   clarity  40455 non-null  object 
 4   depth    40455 non-null  float64
 5   table    40455 non-null  float64
 6   price    40455 non-null  int64  
 7   x        40455 non-null  float64
 8   y        40455 non-null  float64
 9   z        40455 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.1+ MB


In [8]:
# Per-column dtype and non-null count of the frame to be scored.
diamonds_predict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       13485 non-null  int64  
 1   carat    13485 non-null  float64
 2   cut      13485 non-null  object 
 3   color    13485 non-null  object 
 4   clarity  13485 non-null  object 
 5   depth    13485 non-null  float64
 6   table    13485 non-null  float64
 7   x        13485 non-null  float64
 8   y        13485 non-null  float64
 9   z        13485 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 1.0+ MB


In [9]:
# Column roles used by the preprocessing and modelling cells.
TARGET = 'price'

CAT_FEATS = ['cut', 'color', 'clarity']
NUM_FEATS = ['carat', 'depth', 'table', 'x', 'y', 'z']

# Model inputs: numeric columns first, then categorical ones.
FEATS = [*NUM_FEATS, *CAT_FEATS]

## Machine Learning preprocessing

In [10]:
from sklearn.pipeline import Pipeline

#### Preprocessing numerical features

In [11]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

In [12]:
# Numeric branch: impute missing values with the column mean, then apply RobustScaler.
# The step names ('imputer', 'scaler') are part of parameter paths such as
# 'preprocessor__num__imputer__strategy', so they must stay as they are.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', RobustScaler()),
    ]
)

#### Preprocessing categorical features

In [13]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [14]:
# Categorical branch: replace missing values with the literal 'missing',
# then one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

#### building the full preprocessor

In [15]:
from sklearn.compose import ColumnTransformer

In [16]:
# Route each group of columns to its own sub-pipeline.
# The branch names ('num', 'cat') appear in parameter paths used by the hyper-parameter search.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, NUM_FEATS),
        ('cat', categorical_transformer, CAT_FEATS),
    ]
)

#### Taking a look at the transformed DataFrame to judge how interpretable the preprocessor output is...

In [17]:
# Fit the preprocessor on the full training frame and preview the first transformed rows.
# The resulting columns are unnamed (positional integers).
transformed_preview = preprocessor.fit_transform(diamonds)
pd.DataFrame(data=transformed_preview).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.796875,0.4,0.333333,0.622951,0.593407,0.648889,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.59375,0.8,0.0,-0.73224,-0.730769,-0.684444,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.015625,2.466667,-0.666667,-0.038251,-0.098901,0.115556,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.453125,1.333333,-0.333333,-0.551913,-0.543956,-0.462222,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.5,-0.866667,0.666667,0.469945,0.43956,0.382222,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## Training a simple model

#### Split the labelled dataset (the one with price) into train and test sets

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out part of the labelled data for evaluation (default split proportions).
# A fixed random_state makes the shuffle identical on every Restart & Run All;
# without it the reported train/test errors change from run to run.
diamonds_train, diamonds_test = train_test_split(diamonds, random_state=42)

In [20]:
# Sizes of the two splits, one per line.
print(diamonds_train.shape, diamonds_test.shape, sep='\n')

(30341, 10)
(10114, 10)


#### choosing a model

In [21]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

In [22]:
# Full model: the shared preprocessor followed by a random-forest regressor.
# The step names ('preprocessor', 'regressor') are referenced by the search
# parameter paths ('regressor__n_estimators', ...), so they must stay as they are.
# random_state seeds the forest so repeated fits on the same data give the same
# model; without it every run produces different scores.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_jobs=2, random_state=42)),
    ]
)

In [23]:
# Fit preprocessing + regressor on the training split; the trailing ';' hides the estimator repr.
model.fit(X=diamonds_train[FEATS], y=diamonds_train[TARGET]);

## Check model performance

#### Using RMSE

In [24]:
from sklearn.metrics import mean_squared_error

In [25]:
# NOTE: despite their names, y_train / y_test hold the model's *predictions*
# for the train and test splits, not the true prices.
y_train, y_test = (
    model.predict(split[FEATS]) for split in (diamonds_train, diamonds_test)
)

In [26]:
# RMSE is computed as the square root of the MSE rather than via `squared=False`:
# that keyword is deprecated and later removed in newer scikit-learn releases,
# while this form works on every version.
train_rmse = mean_squared_error(y_true=diamonds_train[TARGET], y_pred=y_train) ** 0.5
test_rmse = mean_squared_error(y_true=diamonds_test[TARGET], y_pred=y_test) ** 0.5

# A train error far below the test error indicates the model overfits the training split.
print(f"train error: {train_rmse}")
print(f"test error: {test_rmse}")

train error: 207.9505496702721
test error: 567.9110948900922


#### Using Cross Validation

In [27]:
from sklearn.model_selection import cross_val_score

In [28]:
# 5-fold cross-validation of the whole pipeline on the full labelled dataset.
# The scorer returns *negated* RMSE values, so higher (closer to 0) is better.
scores = cross_val_score(
    estimator=model,
    X=diamonds[FEATS],
    y=diamonds[TARGET],
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=2,
)

In [29]:
import numpy as np
# Flip the sign of the negated scores to get the mean RMSE across folds.
(-scores).mean()

561.6479608719604

## Optimize model using randomized search

In [30]:
from sklearn.model_selection import RandomizedSearchCV

In [31]:
# Search space, keyed by pipeline parameter paths (<step>__<sub-step>__<param>).
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8, 16],
    'regressor__min_samples_split': [64, 128, 256, 512],
}

# Randomized (not exhaustive) search: only n_iter=32 of the 2*6*4*4 = 192
# combinations are sampled, each evaluated with 5-fold CV (160 fits in total).
# random_state fixes which combinations are sampled, so the set of evaluated
# candidates is the same on every run.
# The name `grid_search` is kept because later cells refer to it.
grid_search = RandomizedSearchCV(model,
                                 param_grid,
                                 cv=5,
                                 verbose=10,
                                 scoring='neg_root_mean_squared_error',
                                 n_jobs=2,
                                 n_iter=32,
                                 random_state=42)

grid_search.fit(diamonds[FEATS], diamonds[TARGET])

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    1.0s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    2.2s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:   29.3s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:   46.5s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   55.6s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  1.3min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  1.7min
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:  2.2min
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:  2.8min
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed:  3.3min
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:  3.6min
[Parallel(n_jobs=2)]: Done 109 tasks      | elapsed:  4.1min
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:  4.7min
[Parallel(n_jobs=2)]: Done 141 tasks      | elapsed:  5.9min
[Parallel(

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer()),
                                                                                               ('scaler',
                                                                                                RobustScaler())]),
                                                                               ['carat',
                                                                                'depth',
                                                                                'table',
                                                                                'x',
                                 

In [32]:
# Parameter combination that achieved the best mean cross-validated score in the search.
grid_search.best_params_

{'regressor__n_estimators': 512,
 'regressor__min_samples_split': 64,
 'regressor__max_depth': 16,
 'preprocessor__num__imputer__strategy': 'median'}

In [33]:
# Best mean CV score under 'neg_root_mean_squared_error', i.e. the negated RMSE.
# NOTE(review): in the recorded run this RMSE (~625.7) is worse than the ~561.6
# from cross-validating the untuned model, so the search space (max_depth <= 16,
# min_samples_split >= 64) presumably excludes better settings - TODO confirm and widen.
grid_search.best_score_

-625.7272086583689

## Submission

In [35]:
# Predict prices for the submission rows with the fitted search object,
# then bound them to [300, 19000].
# NOTE(review): the bounds are presumably the plausible price range - TODO confirm.
y_pred = (
    grid_search
    .predict(diamonds_predict[FEATS])
    .clip(min=300, max=19000)
)

In [36]:
# Two-column submission table: row id and predicted price.
submission_df = pd.DataFrame(
    {
        'id': diamonds_predict['id'],
        'price': y_pred,
    }
)

In [37]:
# Sanity-check the distribution of the predicted prices before writing the file.
submission_df['price'].describe()

count    13485.000000
mean      3951.317449
std       3926.316083
min        405.141324
25%        940.418024
50%       2480.894637
75%       5313.502132
max      17544.693475
Name: price, dtype: float64

In [38]:
# Write the submission to the current working directory, without the index column.
# NOTE(review): the file name says 'GBR', but the fitted regressor is a
# RandomForestRegressor; the name is left unchanged so existing consumers keep working.
submission_df.to_csv(path_or_buf='submission_pipeline_GBR.csv', index=False)