# SCIKIT-LEARN MACHINE LEARNING PIPELINE

## Data loading

In [1]:
import pandas as pd

In [2]:
diamonds = pd.read_csv('../data/raw/diamonds_train.csv')
diamonds_predict = pd.read_csv('../data/raw/diamonds_predict.csv')

In [3]:
diamonds.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [4]:
diamonds['volume'] = diamonds['x']*diamonds['y']*diamonds['z']

In [5]:
diamonds.shape

(40455, 11)

In [6]:
diamonds_predict.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z'],
      dtype='object')

In [7]:
diamonds_predict['volume'] = diamonds_predict['x']*diamonds_predict['y']*diamonds_predict['z']

In [8]:
diamonds_predict.shape

(13485, 11)

### Identifiying features

In [9]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    40455 non-null  float64
 1   cut      40455 non-null  object 
 2   color    40455 non-null  object 
 3   clarity  40455 non-null  object 
 4   depth    40455 non-null  float64
 5   table    40455 non-null  float64
 6   price    40455 non-null  int64  
 7   x        40455 non-null  float64
 8   y        40455 non-null  float64
 9   z        40455 non-null  float64
 10  volume   40455 non-null  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 3.4+ MB


In [10]:
diamonds.corr()

Unnamed: 0,carat,depth,table,price,x,y,z,volume
carat,1.0,0.026528,0.183392,0.921935,0.975688,0.951667,0.96757,0.971851
depth,0.026528,1.0,-0.293114,-0.014864,-0.026348,-0.030966,0.094655,0.006721
table,0.183392,-0.293114,1.0,0.130111,0.196059,0.184673,0.155189,0.168437
price,0.921935,-0.014864,0.130111,1.0,0.885848,0.866163,0.8745,0.898684
x,0.975688,-0.026348,0.196059,0.885848,1.0,0.973712,0.984876,0.952303
y,0.951667,-0.030966,0.184673,0.866163,0.973712,1.0,0.964828,0.97786
z,0.96757,0.094655,0.155189,0.8745,0.984876,0.964828,1.0,0.953983
volume,0.971851,0.006721,0.168437,0.898684,0.952303,0.97786,0.953983,1.0


In [12]:
diamonds['cut']=diamonds['cut'].map({'Ideal':1,'Good':2,'Very Good':3,'Fair':4,'Premium':5})
diamonds['color']=diamonds['color'].map({'E':1,'D':2,'F':3,'G':4,'H':5,'I':6,'J':7})
diamonds['clarity']=diamonds['clarity'].map({'VVS1':1,'IF':2,'VVS2':3,'VS1':4,'I1':5,'VS2':6,'SI1':7,'SI2':8})

In [13]:
diamonds['cut/wt']=diamonds['cut']/diamonds['carat']
diamonds['color/wt']=diamonds['color']/diamonds['carat']
diamonds['clarity/wt']=diamonds['clarity']/diamonds['carat']
diamonds = diamonds.drop(['cut','color','clarity','table','depth'], axis=1)

In [14]:
diamonds.head()

Unnamed: 0,carat,price,x,y,z,volume,cut/wt,color/wt,clarity/wt
0,1.21,4268,6.83,6.79,4.25,197.096725,4.132231,5.785124,4.958678
1,0.32,505,4.35,4.38,2.75,52.39575,9.375,15.625,18.75
2,0.71,2686,5.62,5.53,3.65,113.43689,5.633803,5.633803,5.633803
3,0.41,738,4.68,4.72,3.0,66.2688,4.878049,4.878049,17.073171
4,1.02,4882,6.55,6.51,3.95,168.429975,0.980392,3.921569,6.862745


In [10]:
diamonds_predict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       13485 non-null  int64  
 1   carat    13485 non-null  float64
 2   cut      13485 non-null  object 
 3   color    13485 non-null  object 
 4   clarity  13485 non-null  object 
 5   depth    13485 non-null  float64
 6   table    13485 non-null  float64
 7   x        13485 non-null  float64
 8   y        13485 non-null  float64
 9   z        13485 non-null  float64
 10  volume   13485 non-null  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 1.1+ MB


In [15]:
diamonds_predict['cut']=diamonds_predict['cut'].map({'Ideal':1,'Good':2,'Very Good':3,'Fair':4,'Premium':5})
diamonds_predict['color']=diamonds_predict['color'].map({'E':1,'D':2,'F':3,'G':4,'H':5,'I':6,'J':7})
diamonds_predict['clarity']=diamonds_predict['clarity'].map({'VVS1':1,'IF':2,'VVS2':3,'VS1':4,'I1':5,'VS2':6,'SI1':7,'SI2':8})

In [16]:
diamonds_predict['cut/wt']=diamonds_predict['cut']/diamonds_predict['carat']
diamonds_predict['color/wt']=diamonds_predict['color']/diamonds_predict['carat']
diamonds_predict['clarity/wt']=diamonds_predict['clarity']/diamonds_predict['carat']
diamonds_predict = diamonds_predict.drop(['cut','color','clarity','table','depth'], axis=1)

In [17]:
diamonds_predict.head()

Unnamed: 0,id,carat,x,y,z,volume,cut/wt,color/wt,clarity/wt
0,0,0.79,5.82,5.89,3.67,125.806866,3.797468,3.797468,8.860759
1,1,1.2,6.81,6.89,4.18,196.129362,0.833333,5.833333,3.333333
2,2,1.57,7.38,7.32,4.57,246.878712,3.184713,3.184713,4.458599
3,3,0.9,6.09,6.13,3.9,145.59363,3.333333,3.333333,7.777778
4,4,0.5,5.05,5.09,3.19,81.997355,6.0,6.0,8.0


In [18]:
FEATS = ['carat', 'x', 'y', 'z', 'volume', 'cut/wt', 'color/wt', 'clarity/wt']
#CAT_FEATS = ['cut', 'color', 'clarity']
#FEATS = NUM_FEATS + CAT_FEATS
TARGET = 'price'

## Machine Learning preprocessing

In [19]:
from sklearn.pipeline import Pipeline

#### Preprocessing numerical features

In [14]:
#from sklearn.impute import SimpleImputer
#from sklearn.preprocessing import StandardScaler

In [15]:
#numeric_transformer = \
#Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), 
#                ('scaler', StandardScaler())])

#### Preprocessing categorical features

In [16]:
#from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [17]:
#categorical_transformer = \
#Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#                ('onehot', OneHotEncoder(handle_unknown='ignore'))])

#### building the full preprocessor

In [18]:
#from sklearn.compose import ColumnTransformer

In [19]:
#preprocessor = \
#ColumnTransformer(transformers=[('num', numeric_transformer, NUM_FEATS),
#                                ('cat', categorical_transformer, CAT_FEATS)])

#### taking a look the interpretability of preprocessor about the transformed DataFrame...

In [20]:
#pd.DataFrame(data=preprocessor.fit_transform(diamonds)).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,0.867006,0.452019,0.247981,0.978807,0.921985,1.022657,0.852876,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.004557,0.871099,-0.199745,-1.226738,-1.179816,-1.129259,-0.981034,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.184434,2.617265,-1.095198,-0.097286,-0.176882,0.161891,-0.207411,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.815298,1.429872,-0.647472,-0.933258,-0.883296,-0.770607,-0.805209,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.467458,-0.875068,0.695707,0.729794,0.677793,0.592274,0.489559,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## Training a simple model

#### split the dataset with price

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
diamonds_train, diamonds_test = train_test_split(diamonds)

In [22]:
print(diamonds_train.shape)
print(diamonds_test.shape)

(30341, 9)
(10114, 9)


#### choosing a model

In [23]:
from sklearn.base import TransformerMixin
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge

In [24]:
class RidgeTransformer(Ridge, TransformerMixin):

    def transform(self, X, *_):
        return self.predict(X).reshape(len(X), -1)


class RandomForestTransformer(RandomForestRegressor, TransformerMixin):

    def transform(self, X, *_):
        return self.predict(X).reshape(len(X), -1)


class KNeighborsTransformer(KNeighborsRegressor, TransformerMixin):

    def transform(self, X, *_):
        return self.predict(X).reshape(len(X), -1)

In [25]:
def build_model():
    ridge_transformer = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('poly_feats', PolynomialFeatures()),
        ('ridge', RidgeTransformer())
    ])

    pred_union = FeatureUnion(
        transformer_list=[
            ('ridge', ridge_transformer),
            ('rand_forest', RandomForestTransformer()),
            ('knn', KNeighborsTransformer())
        ],
        n_jobs=2
    )

    model = Pipeline(steps=[
        ('pred_union', pred_union),
        ('lin_regr', LinearRegression())
    ])

    return model

In [27]:
model = build_model()
model.fit(diamonds[FEATS], diamonds[TARGET])

Pipeline(steps=[('pred_union',
                 FeatureUnion(n_jobs=2,
                              transformer_list=[('ridge',
                                                 Pipeline(steps=[('scaler',
                                                                  StandardScaler()),
                                                                 ('poly_feats',
                                                                  PolynomialFeatures()),
                                                                 ('ridge',
                                                                  RidgeTransformer())])),
                                                ('rand_forest',
                                                 RandomForestTransformer()),
                                                ('knn',
                                                 KNeighborsTransformer())])),
                ('lin_regr', LinearRegression())])

## Check model performance

#### Using RMSE

In [31]:
from sklearn.metrics import mean_squared_error

In [40]:
y_train = model.predict(diamonds_train[FEATS])
y_test = model.predict(diamonds_test[FEATS])

In [41]:
print(f"train error: {mean_squared_error(y_pred=y_train, y_true=diamonds_train[TARGET], squared=False)}")
print(f"test error: {mean_squared_error(y_pred=y_test, y_true=diamonds_test[TARGET], squared=False)}")

train error: 200.76633771895357
test error: 197.16256963484668


#### Using Cross Validation

In [42]:
from sklearn.model_selection import cross_val_score

In [43]:
scores = cross_val_score(model, 
                         diamonds[FEATS], 
                         diamonds[TARGET], 
                         scoring='neg_root_mean_squared_error', 
                         cv=5, n_jobs=2)

In [44]:
import numpy as np
np.mean(-scores)

1570.444105105716

## Optimize model using grid search

In [49]:
from sklearn.model_selection import RandomizedSearchCV

In [53]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8, 16],
    'regressor__min_samples_split': [64, 128, 256, 512],
    
}

grid_search = RandomizedSearchCV(model, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=10, 
                                 scoring='neg_root_mean_squared_error', 
                                 n_jobs=2,
                                 n_iter=32)

grid_search.fit(diamonds[FEATS], diamonds[TARGET])

TypeError: __init__() missing 1 required positional argument: 'param_distributions'

In [33]:
grid_search.best_params_

{'regressor__n_estimators': 64,
 'regressor__min_samples_split': 64,
 'regressor__max_depth': 16,
 'preprocessor__num__imputer__strategy': 'median'}

In [34]:
grid_search.best_score_

-626.839971884933

## Submission

In [45]:
y_pred = model.predict(diamonds_predict[FEATS]).clip(300,19000)

In [46]:
submission_df = pd.DataFrame({'id': diamonds_predict.id, 'price': y_pred})

In [47]:
submission_df.price.describe()

count    13485.000000
mean      3951.422517
std       3966.449110
min        300.000000
25%        957.240361
50%       2461.368284
75%       5292.212169
max      19000.000000
Name: price, dtype: float64

In [48]:
submission_df.to_csv('submission_metamodel.csv', index=False)