## 0. python imports & setup

for learning purposes, each library is imported in the section where it is first used...

## 1. data loading

In [61]:
import pandas as pd

* diamonds: labeled data we can use for training and testing
* diamonds_predict: unlabeled diamonds whose price we have to predict; the result is uploaded to Kaggle

In [62]:
# Read the labelled training data and the unlabelled Kaggle prediction data,
# in that order, from the project's data folder.
diamonds, diamonds_predict = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/diamonds_train_ml.csv', '../data/diamonds_test.csv')
)

In [63]:
# Show the first five rows transposed, so each column reads as one line.
diamonds.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
carat,1.21,0.32,0.71,0.41,1.02
cut,Premium,Very Good,Fair,Good,Ideal
color,J,H,G,D,G
clarity,VS2,VS2,VS1,SI1,SI1
depth,62.4,63,65.5,63.8,60.5
table,58,57,55,56,59
price,4268,505,2686,738,4882
x,6.83,4.35,5.62,4.68,6.55
y,6.79,4.38,5.53,4.72,6.51
z,4.25,2.75,3.65,3,3.95


as you can see, there are both categorical and numerical columns...

## 2. eda

Create some new variables (length/width ratio, volume and density) from the existing columns, based on their correlation

In [185]:
# Engineered features for the training frame.

# Length/width ratio. A zero `y` is masked to NaN before dividing so the ratio
# can never become +/-inf (which scikit-learn rejects); the NaN is filled by
# the numeric pipeline's imputer, since 'L/W' is one of the numeric features.
diamonds['L/W'] = diamonds.x / diamonds.y.mask(diamonds.y == 0)

# Bounding-box volume. A zero volume means at least one dimension was recorded
# as 0, so it is replaced by the mean of the valid (non-zero) volumes. The
# plain mean would be pulled down by the very zeros that are being replaced.
diamonds['volume'] = diamonds.x * diamonds.y * diamonds.z
diamonds['volume'] = diamonds.volume.mask(diamonds.volume == 0,
                                          diamonds.volume[diamonds.volume != 0].mean())

# Mass over volume; 0.2 presumably converts carats to grams (1 ct = 0.2 g).
# Zero densities are replaced the same way as zero volumes.
diamonds['density'] = diamonds['carat'] * 0.2 / diamonds['volume']
diamonds['density'] = diamonds.density.mask(diamonds.density == 0,
                                            diamonds.density[diamonds.density != 0].mean())

0.0001041504457771871

In [205]:
# Engineered features for the prediction frame (same recipe as for training).

# Length/width ratio. A zero `y` is masked to NaN before dividing so the ratio
# can never become +/-inf (which scikit-learn rejects); the NaN is filled by
# the numeric pipeline's imputer, since 'L/W' is one of the numeric features.
diamonds_predict['L/W'] = diamonds_predict.x / diamonds_predict.y.mask(diamonds_predict.y == 0)

# Bounding-box volume. A zero volume means at least one dimension was recorded
# as 0, so it is replaced by the mean of the valid (non-zero) volumes. The
# plain mean would be pulled down by the very zeros that are being replaced.
diamonds_predict['volume'] = diamonds_predict.x * diamonds_predict.y * diamonds_predict.z
diamonds_predict['volume'] = diamonds_predict.volume.mask(
    diamonds_predict.volume == 0,
    diamonds_predict.volume[diamonds_predict.volume != 0].mean())

# Mass over volume; 0.2 presumably converts carats to grams (1 ct = 0.2 g).
# Zero densities are replaced the same way as zero volumes.
diamonds_predict['density'] = diamonds_predict['carat'] * 0.2 / diamonds_predict['volume']
diamonds_predict['density'] = diamonds_predict.density.mask(
    diamonds_predict.density == 0,
    diamonds_predict.density[diamonds_predict.density != 0].mean())

## 3. ml preprocessing

in this section I will teach how to use scikit-learn's Pipeline and ColumnTransformer, one of the best practices for composing preprocessing and modeling in a single, elegant class... pay attention, as it is hard to understand...

In [186]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

* https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
* https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

let's identify numerical and categorical features...

In [187]:
# Column groups used by the preprocessing pipeline and the models.
# Order matters: it fixes the column order of the transformed matrix.
NUM_FEATS = [
    'carat', 'depth', 'table',   # raw measurements
    'x', 'y', 'z',               # raw dimensions
    'L/W', 'density', 'volume',  # engineered in the EDA section
]
CAT_FEATS = ['cut', 'color', 'clarity']
FEATS = [*NUM_FEATS, *CAT_FEATS]
TARGET = 'price'

let's define a preprocessing transformer for numerical columns...

In [188]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

let's define a preprocessing transformer for categorical columns...

In [189]:
# Categorical branch: label missing values as 'missing', then one-hot encode.
# Categories not seen during fit are ignored instead of raising an error.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

let's join these transformers using a `ColumnTransformer`:

In [190]:
# Route each column group to its own branch; columns listed in neither group
# are not passed to any branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, NUM_FEATS),
        ('cat', categorical_transformer, CAT_FEATS),
    ]
)

inspecting the full preprocessor:

note that, at least in this case, this convenience comes at the cost of the interpretability of the transformed DataFrame (the column names are lost)...

In [191]:
# Fit the preprocessor on the full labelled frame and preview the first five
# transformed rows. Note that this call also fits `preprocessor` in place.
pd.DataFrame(preprocessor.fit_transform(diamonds)).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0.867006,0.452019,0.247981,0.978807,0.921985,1.022657,0.531739,0.04354,0.852682,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.004557,0.871099,-0.199745,-1.226738,-1.179816,-1.129259,-0.507341,-0.096849,-0.98221,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.184434,2.617265,-1.095198,-0.097286,-0.176882,0.161891,1.378628,0.573531,-0.208173,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.815298,1.429872,-0.647472,-0.933258,-0.883296,-0.770607,-0.639894,0.25491,-0.806292,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.467458,-0.875068,0.695707,0.729794,0.677793,0.592274,0.552404,-0.324271,0.489171,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## 4. train a simple model

first, let's train a simple model using a holdout (train/test) split...

In [192]:
from sklearn.model_selection import train_test_split

In [193]:
# Holdout split with the default proportions. The seed is fixed so that
# re-running the notebook yields the same rows in each split, which makes the
# train/test errors reported below reproducible.
diamonds_train, diamonds_test = train_test_split(diamonds, random_state=42)

In [194]:
# Print (rows, columns) of the train and test splits, one per line.
print(diamonds_train.shape, diamonds_test.shape, sep='\n')

(30341, 13)
(10114, 13)


let's choose a model from scikit-learn cheatsheet: https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [74]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Preprocessing followed by a random forest with default hyper-parameters.
rfr_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor()),
    ]
)

In [75]:
# Fit on the training split; the trailing ';' hides the estimator repr.
rfr_model.fit(X=diamonds_train[FEATS], y=diamonds_train[TARGET]);

### 4.1 train a LightGBM model

In [109]:
from lightgbm import LGBMRegressor

# Preprocessing followed by a LightGBM regressor with default hyper-parameters.
lgbm_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LGBMRegressor()),
    ]
)

In [110]:
# Fit on the training split; the trailing ';' hides the estimator repr.
lgbm_model.fit(X=diamonds_train[FEATS], y=diamonds_train[TARGET]);

### 4.2 train an SVM model

In [18]:
from sklearn.svm import SVR

# Preprocessing followed by a support vector regressor with default settings.
svr_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', SVR()),
    ]
)

In [19]:
# Fit on the training split; the trailing ';' hides the estimator repr.
svr_model.fit(X=diamonds_train[FEATS], y=diamonds_train[TARGET]);

### 4.3 train a combined model

In [200]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor

# Base learners of the ensemble: a default LightGBM model and a random forest
# with 512 trees of depth at most 16.
r1 = LGBMRegressor()
r2 = RandomForestRegressor(max_depth=16, n_estimators=512)

# Preprocessing followed by a voting regressor over both base learners.
combined_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', VotingRegressor(estimators=[('lgbm', r1), ('rf', r2)])),
    ]
)

In [201]:
# Fit on the training split; the trailing ';' hides the estimator repr.
combined_model.fit(X=diamonds_train[FEATS], y=diamonds_train[TARGET]);

## 5. check model performance on test and train data

In [76]:
from sklearn.metrics import mean_squared_error

In [77]:
# Despite their names, `y_test` and `y_train` hold the random forest's
# *predictions* for the test and train splits, not the true prices.
y_test, y_train = (
    rfr_model.predict(split_frame[FEATS])
    for split_frame in (diamonds_test, diamonds_train)
)

In [78]:
# RMSE of the predictions against the true prices of each split. The square
# root is taken explicitly because the `squared=False` flag of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and removed in 1.6;
# this form works on every scikit-learn version.
print(f"test error: {mean_squared_error(y_true=diamonds_test[TARGET], y_pred=y_test) ** 0.5}")
print(f"train error: {mean_squared_error(y_true=diamonds_train[TARGET], y_pred=y_train) ** 0.5}")

test error: 563.6640386906666
train error: 210.1192581069525


## 6. check model performance using cross validation

In [197]:
from sklearn.model_selection import cross_val_score

In [202]:
# 5-fold cross-validation of the voting ensemble on all labelled data, using
# every available core. Scores are negated RMSE values (higher is better).
scores = cross_val_score(
    estimator=combined_model,
    X=diamonds[FEATS],
    y=diamonds[TARGET],
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)

In [203]:
import numpy as np
# Flip the sign back to get the mean RMSE across the folds.
(-scores).mean()

534.5354034579295

## 7. optimize model using randomized search

In [88]:
from sklearn.model_selection import RandomizedSearchCV

In [204]:
# Candidate values for a Pipeline(preprocessor -> LGBMRegressor) search.
# Each entry is a list of discrete candidates to sample from.
# NOTE(review): if the pairs were meant as (low, high) ranges, replace them
# with scipy.stats distributions -- TODO confirm intent.
# The imputer key is addressed through the 'preprocessor' step: the
# preprocessor is a sibling of 'regressor' in the pipeline, not nested in it.
lgbm_param_grid = {
    'regressor__num_leaves': [20, 100],
    'regressor__n_estimators': [20, 500],
    'regressor__learning_rate': [0.05, 0.3],
    'regressor__feature_fraction': [0.1, 0.9],
    'regressor__bagging_fraction': [0.8, 1],
    'regressor__max_depth': [15, 25],
    'regressor__min_split_gain': [0.001, 0.1],
    'regressor__min_child_weight': [10, 50],
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
}

# Single-candidate search space for a Pipeline(preprocessor -> random forest).
rfr_param_grid = dict(
    regressor__n_estimators=[512],
    regressor__max_depth=[16],
    preprocessor__num__imputer__strategy=['mean'],
)

# Search space for the voting ensemble. Only the numeric imputer strategy is
# active; the per-estimator candidates below are kept but disabled.
combined_param_grid = {
    # 'regressor__lgbm__num_leaves': (20, 100),
    # 'regressor__lgbm__n_estimators': (20, 500),
    # 'regressor__lgbm__learning_rate': (0.05, 0.3),
    # 'regressor__lgbm__feature_fraction': (0.1, 0.9),
    # 'regressor__lgbm__bagging_fraction': (0.8, 1),
    # 'regressor__lgbm__max_depth': (15, 25),
    # 'regressor__lgbm__min_split_gain': (0.001, 0.1),
    # 'regressor__lgbm__min_child_weight': (10, 50),
    # 'regressor__rf__n_estimators': [512],
    # 'regressor__rf__max_depth': [16],
    'preprocessor__num__imputer__strategy': ['mean'],
}


# Randomized search (despite the variable name) over `combined_param_grid`,
# scored by negated RMSE with 5-fold cross-validation on all available cores.
grid_search = RandomizedSearchCV(
    estimator=combined_model,
    param_distributions=combined_param_grid,
    n_iter=10,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Search on the full labelled frame; the fitted search object is displayed.
grid_search.fit(diamonds[FEATS], diamonds[TARGET])



Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..........preprocessor__num__imputer__strategy=mean; total time= 1.7min
[CV] END ..........preprocessor__num__imputer__strategy=mean; total time= 1.9min
[CV] END ..........preprocessor__num__imputer__strategy=mean; total time= 1.6min
[CV] END ..........preprocessor__num__imputer__strategy=mean; total time= 1.6min
[CV] END ..........preprocessor__num__imputer__strategy=mean; total time= 1.7min


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('scaler',
                                                                                                StandardScaler())]),
                                                                               ['carat',
                                                                                'depth',
                                                                                'table',
                                                                                'x',
              

In [206]:
# Parameter combination that achieved the best cross-validated score.
grid_search.best_params_

{'preprocessor__num__imputer__strategy': 'mean'}

In [207]:
# Best mean cross-validated score. The search uses
# scoring='neg_root_mean_squared_error', so this is the RMSE with its sign flipped.
grid_search.best_score_

-534.2560032819135

## 8. Prepare submission

In [208]:
# Predict prices for the Kaggle frame with the fitted search object.
y_pred = grid_search.predict(X=diamonds_predict[FEATS])

In [209]:
# Submission frame: the original ids next to their predicted prices.
submission_df = diamonds_predict[['id']].assign(price=y_pred)

In [210]:
# Preview the first five submission rows.
submission_df.head(n=5)

Unnamed: 0,id,price
0,0,2907.639331
1,1,5632.615022
2,2,9471.212337
3,3,4055.583619
4,4,1639.378447


In [211]:
# Summary statistics of the submission, as a sanity check on the predicted price range.
submission_df.describe()

Unnamed: 0,id,price
count,13485.0,13485.0
mean,6742.0,3952.955395
std,3892.928525,3945.681812
min,0.0,339.102748
25%,3371.0,945.682904
50%,6742.0,2466.956301
75%,10113.0,5304.791594
max,13484.0,17899.397882


In [212]:
# Bound the predicted prices to [0, 20000]. The result is assigned back
# explicitly: `inplace=True` on an attribute-accessed column is a chained
# in-place update, which does not modify `submission_df` when pandas
# Copy-on-Write is active.
submission_df['price'] = submission_df['price'].clip(lower=0, upper=20000)

In [213]:
# Write the Kaggle submission file without the DataFrame index column.
submission_df.to_csv(
    path_or_buf='../submissions_kaggle//submission_combined_3.csv',
    index=False,
)