In [1]:
# Install libs
!pip install feature-engine lightgbm xgboost
!pip install --upgrade catboost
!pip uninstall scikit-learn feature-engine -y
!pip install scikit-learn==1.2.2
!pip install feature-engine==1.3.0

Collecting catboost
  Downloading catboost-1.2.7-cp39-cp39-macosx_11_0_universal2.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp39-cp39-macosx_11_0_universal2.whl (27.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.1/27.1 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: catboost
  Attempting uninstall: catboost
    Found existing installation: catboost 0.25.1
    Uninstalling catboost-0.25.1:
      Successfully uninstalled catboost-0.25.1
Successfully installed catboost-1.2.7
Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2
Found existing installation: feature-engine 1.3.0
Uninstalling feature-engine-1.3.0:
  Successfully uninstalled feature-engine-1.3.0
Collecting scikit-learn==1.2.2
  Using cached scikit_learn-1.2.2-cp39-cp39-macosx_10_9_x86_64.whl.metadata (11 kB)
Using cached scikit_learn-1.2.2-cp39-cp39-macosx_10_9_x86_64.whl

Using cached feature_engine-1.3.0-py2.py3-none-any.whl (260 kB)
Installing collected packages: feature-engine
Successfully installed feature-engine-1.3.0


In [2]:
# Standard libs Imports
import os
import sys
import time

# Data Manipulation and Numerical libs
import numpy as np
import pandas as pd

# Data Visualization libs
import seaborn as sns
import matplotlib.pyplot as plt

# Scikit-Learn libs
from sklearn.model_selection import (
    cross_validate,
    StratifiedKFold,
    GridSearchCV,
    RandomizedSearchCV
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import (
    RFE,
    SelectFromModel,
    VarianceThreshold
)
from scipy.stats import loguniform

# Scikit-Learn Models libs
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    VotingClassifier
)

# External Machine Learning Models libs
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Feature-Engine libs
from feature_engine.imputation import (
    ArbitraryNumberImputer,
    MeanMedianImputer,
    CategoricalImputer
)
from feature_engine.encoding import OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper

# Metrics libs
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Model libs
import joblib

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [3]:
# Read the dataset
df_abt = pd.read_csv('/Users/dellacorte/py-projects/data-science/supervised-learning-pipeline-reference/databases/propensao_revenda_abt.csv')

# get the training base
df_train = df_abt.query('data_ref_safra < "2018-03-01"')

# get the evaluation base (out of time)
df_oot   = df_abt.query('data_ref_safra == "2018-03-01"')

key_vars = ['data_ref_safra', 'seller_id']
num_vars = ['tot_orders_12m', 'tot_items_12m', 'tot_items_dist_12m', 'receita_12m', 'recencia']
cat_vars = ['uf']
target = 'nao_revendeu_next_6m'

features = cat_vars + num_vars

# training data
X_train = df_train[features]
y_train = df_train[target]

# test data
X_oot = df_oot[features]
y_oot = df_oot[target]

## Defining pipelines and models

In [4]:
random_state = 42

linear_models = [
    ('logistic_regression', LogisticRegression(random_state=random_state, max_iter=1000)),
    ('svm', SVC(random_state=random_state))
]

steps_linear_models = [
    ('numeric_imputer', MeanMedianImputer(variables=num_vars, imputation_method='mean')), 
    ('numeric_scaler', SklearnTransformerWrapper(variables=num_vars, transformer=StandardScaler())), 
    ('categoric_imputer', CategoricalImputer(variables=cat_vars, fill_value='missing')), 
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
]


tree_models = [
    ('decision_tree', DecisionTreeClassifier(random_state=random_state)),
    ('random_forest', RandomForestClassifier(random_state=random_state)),
    ('gb', GradientBoostingClassifier(random_state=random_state)),
    ('xgb', XGBClassifier(random_state=random_state)),
    ('lgbm', LGBMClassifier(random_state=random_state)),
    ('catboost', CatBoostClassifier(random_state=random_state))
]

steps_tree_models = [
    ('numeric_imputer', MeanMedianImputer(variables=num_vars, imputation_method='mean')),
    ('categoric_imputer', CategoricalImputer(variables=cat_vars, fill_value='missing')), 
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    
]

## Defining the metrics

In [5]:
df_results = pd.DataFrame(columns=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
df_results

Unnamed: 0,accuracy,precision,recall,f1,roc_auc


## Automating the entire process

In [6]:
def train_model(model, steps, X_train, y_train, cv, random_state, n_jobs=-1):
    pipeline = Pipeline(steps=steps + [model])
    CV_result = cross_validate(
                            estimator=pipeline, 
                            X=X_train,
                            y=y_train,
                            scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
                            cv=cv,
                            n_jobs=-1)

    df_aux = pd.DataFrame(CV_result)
    dfar = df_aux.mean()

    return [
            dfar.loc['test_accuracy'], 
            dfar.loc['test_precision'], 
            dfar.loc['test_recall'], 
            dfar.loc['test_f1'], 
            dfar.loc['test_roc_auc']
    ]

In [7]:
%%time
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

for model in linear_models:
    model_name = model[0]
    print(f'Trainning {model_name} ...', end=' ')
    aux = train_model(model, steps_linear_models, X_train, y_train, skf, random_state)
    df_results.loc[model_name] = aux
    print('OK')

for model in tree_models:
    model_name = model[0]
    print(f'Trainning {model_name} ...', end=' ')
    aux = train_model(model, steps_tree_models, X_train, y_train, skf, random_state)
    df_results.loc[model_name] = aux
    print('OK')

Trainning logistic_regression ... 

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


OK
Trainning svm ... 

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


OK
Trainning decision_tree ... 

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


OK
Trainning random_forest ... OK
Trainning gb ... 

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


OK
Trainning xgb ... OK
Trainning lgbm ... OK
Trainning catboost ... [LightGBM] [Info] Number of positive: 1065, number of negative: 1731
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 864
[LightGBM] [Info] Number of data points in the train set: 2796, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.380901 -> initscore=-0.485724
[LightGBM] [Info] Start training from score -0.485724
Learning rate set to 0.015981
0:	learn: 0.6806939	total: 64ms	remaining: 1m 3s
1:	learn: 0.6694835	total: 65.6ms	remaining: 32.8s
2:	learn: 0.6575877	total: 67.1ms	remaining: 22.3s
3:	learn: 0.6472518	total: 68.7ms	remaining: 17.1s
4:	learn: 0.6380224	total: 70.6ms	remaining: 14.1s
5:	learn: 0.6281276	total: 72.6ms	remaining: 12s
6:	learn: 0.6186280	total: 75ms	remaining: 10.6s
7:	learn: 0.6089636	total: 76.9ms	remaining: 9.54s
8:	learn:

Learning rate set to 0.015981
0:	learn: 0.6818280	total: 62.3ms	remaining: 1m 2s
1:	learn: 0.6715497	total: 64.6ms	remaining: 32.2s
2:	learn: 0.6608124	total: 66ms	remaining: 21.9s
3:	learn: 0.6508268	total: 68ms	remaining: 16.9s
4:	learn: 0.6408444	total: 69.5ms	remaining: 13.8s
5:	learn: 0.6313787	total: 71.2ms	remaining: 11.8s
6:	learn: 0.6227147	total: 73ms	remaining: 10.3s
7:	learn: 0.6142639	total: 74.3ms	remaining: 9.21s
8:	learn: 0.6052867	total: 76ms	remaining: 8.37s
9:	learn: 0.5971384	total: 77.4ms	remaining: 7.67s
10:	learn: 0.5888878	total: 78.8ms	remaining: 7.08s
11:	learn: 0.5812634	total: 80.7ms	remaining: 6.64s
12:	learn: 0.5737095	total: 83ms	remaining: 6.3s
13:	learn: 0.5673936	total: 84.4ms	remaining: 5.94s
14:	learn: 0.5595742	total: 85.9ms	remaining: 5.64s
15:	learn: 0.5535240	total: 87.2ms	remaining: 5.36s
16:	learn: 0.5481366	total: 88.4ms	remaining: 5.11s
17:	learn: 0.5419552	total: 89.8ms	remaining: 4.9s
18:	learn: 0.5359492	total: 91.3ms	remaining: 4.71s
19:	

306:	learn: 0.3364707	total: 608ms	remaining: 1.37s
307:	learn: 0.3364407	total: 610ms	remaining: 1.37s
308:	learn: 0.3364075	total: 612ms	remaining: 1.37s
309:	learn: 0.3363429	total: 613ms	remaining: 1.36s
310:	learn: 0.3361840	total: 615ms	remaining: 1.36s
311:	learn: 0.3361389	total: 616ms	remaining: 1.36s
312:	learn: 0.3359778	total: 618ms	remaining: 1.35s
313:	learn: 0.3359164	total: 619ms	remaining: 1.35s
314:	learn: 0.3358573	total: 621ms	remaining: 1.35s
315:	learn: 0.3356702	total: 623ms	remaining: 1.35s
316:	learn: 0.3354590	total: 624ms	remaining: 1.34s
317:	learn: 0.3354211	total: 625ms	remaining: 1.34s
318:	learn: 0.3353267	total: 627ms	remaining: 1.34s
319:	learn: 0.3352098	total: 629ms	remaining: 1.34s
320:	learn: 0.3350626	total: 630ms	remaining: 1.33s
321:	learn: 0.3348761	total: 632ms	remaining: 1.33s
322:	learn: 0.3347986	total: 633ms	remaining: 1.33s
323:	learn: 0.3347308	total: 635ms	remaining: 1.32s
324:	learn: 0.3346159	total: 637ms	remaining: 1.32s
325:	learn: 

316:	learn: 0.3458087	total: 601ms	remaining: 1.29s
317:	learn: 0.3457371	total: 603ms	remaining: 1.29s
318:	learn: 0.3456807	total: 604ms	remaining: 1.29s
319:	learn: 0.3456120	total: 606ms	remaining: 1.29s
320:	learn: 0.3455478	total: 607ms	remaining: 1.28s
321:	learn: 0.3455066	total: 609ms	remaining: 1.28s
322:	learn: 0.3453315	total: 611ms	remaining: 1.28s
323:	learn: 0.3452775	total: 612ms	remaining: 1.28s
324:	learn: 0.3452336	total: 615ms	remaining: 1.28s
325:	learn: 0.3451253	total: 616ms	remaining: 1.27s
326:	learn: 0.3449622	total: 618ms	remaining: 1.27s
327:	learn: 0.3447692	total: 619ms	remaining: 1.27s
328:	learn: 0.3446663	total: 620ms	remaining: 1.26s
329:	learn: 0.3446343	total: 621ms	remaining: 1.26s
330:	learn: 0.3445328	total: 623ms	remaining: 1.26s
331:	learn: 0.3443920	total: 625ms	remaining: 1.26s
332:	learn: 0.3443714	total: 626ms	remaining: 1.25s
333:	learn: 0.3441635	total: 627ms	remaining: 1.25s
334:	learn: 0.3440046	total: 629ms	remaining: 1.25s
335:	learn: 

OK
CPU times: user 175 ms, sys: 200 ms, total: 375 ms
Wall time: 12.2 s
[LightGBM] [Info] Number of positive: 1066, number of negative: 1730
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000718 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 866
[LightGBM] [Info] Number of data points in the train set: 2796, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.381259 -> initscore=-0.484208
[LightGBM] [Info] Start training from score -0.484208
[LightGBM] [Info] Number of positive: 1065, number of negative: 1731
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001106 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 871
[LightGBM] [Info] Number of data points in the train set: 2796, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.380901 -> initscore=-0.485724
[L

620:	learn: 0.2990842	total: 1.18s	remaining: 720ms
621:	learn: 0.2990241	total: 1.18s	remaining: 719ms
622:	learn: 0.2989174	total: 1.18s	remaining: 717ms
623:	learn: 0.2986236	total: 1.19s	remaining: 715ms
624:	learn: 0.2985167	total: 1.19s	remaining: 713ms
625:	learn: 0.2983903	total: 1.19s	remaining: 711ms
626:	learn: 0.2982246	total: 1.19s	remaining: 709ms
627:	learn: 0.2980257	total: 1.19s	remaining: 707ms
628:	learn: 0.2976983	total: 1.2s	remaining: 705ms
629:	learn: 0.2975439	total: 1.2s	remaining: 704ms
630:	learn: 0.2975241	total: 1.2s	remaining: 701ms
631:	learn: 0.2975007	total: 1.2s	remaining: 699ms
632:	learn: 0.2974583	total: 1.2s	remaining: 697ms
633:	learn: 0.2973928	total: 1.2s	remaining: 695ms
634:	learn: 0.2973498	total: 1.21s	remaining: 693ms
635:	learn: 0.2971193	total: 1.21s	remaining: 691ms
636:	learn: 0.2970602	total: 1.21s	remaining: 689ms
637:	learn: 0.2969428	total: 1.21s	remaining: 687ms
638:	learn: 0.2967422	total: 1.21s	remaining: 685ms
639:	learn: 0.2966

619:	learn: 0.2951207	total: 1.19s	remaining: 728ms
620:	learn: 0.2950452	total: 1.19s	remaining: 726ms
621:	learn: 0.2948541	total: 1.19s	remaining: 725ms
622:	learn: 0.2947092	total: 1.19s	remaining: 723ms
623:	learn: 0.2944981	total: 1.2s	remaining: 721ms
624:	learn: 0.2943691	total: 1.2s	remaining: 719ms
625:	learn: 0.2942607	total: 1.2s	remaining: 717ms
626:	learn: 0.2941253	total: 1.2s	remaining: 715ms
627:	learn: 0.2940157	total: 1.2s	remaining: 713ms
628:	learn: 0.2939210	total: 1.21s	remaining: 711ms
629:	learn: 0.2939040	total: 1.21s	remaining: 709ms
630:	learn: 0.2936253	total: 1.21s	remaining: 707ms
631:	learn: 0.2935578	total: 1.21s	remaining: 705ms
632:	learn: 0.2933851	total: 1.21s	remaining: 703ms
633:	learn: 0.2931804	total: 1.21s	remaining: 701ms
634:	learn: 0.2931407	total: 1.22s	remaining: 699ms
635:	learn: 0.2930853	total: 1.22s	remaining: 697ms
636:	learn: 0.2929643	total: 1.22s	remaining: 696ms
637:	learn: 0.2927448	total: 1.22s	remaining: 694ms
638:	learn: 0.292

630:	learn: 0.2978196	total: 1.22s	remaining: 715ms
631:	learn: 0.2976557	total: 1.22s	remaining: 713ms
632:	learn: 0.2975294	total: 1.23s	remaining: 711ms
633:	learn: 0.2974357	total: 1.23s	remaining: 709ms
634:	learn: 0.2972868	total: 1.23s	remaining: 707ms
635:	learn: 0.2971281	total: 1.23s	remaining: 705ms
636:	learn: 0.2968636	total: 1.23s	remaining: 703ms
637:	learn: 0.2965755	total: 1.24s	remaining: 701ms
638:	learn: 0.2962559	total: 1.24s	remaining: 699ms
639:	learn: 0.2960130	total: 1.24s	remaining: 697ms
640:	learn: 0.2957957	total: 1.24s	remaining: 695ms
641:	learn: 0.2956355	total: 1.25s	remaining: 696ms
642:	learn: 0.2954742	total: 1.25s	remaining: 693ms
643:	learn: 0.2954373	total: 1.25s	remaining: 691ms
644:	learn: 0.2952909	total: 1.25s	remaining: 690ms
645:	learn: 0.2951249	total: 1.25s	remaining: 688ms
646:	learn: 0.2949641	total: 1.26s	remaining: 685ms
647:	learn: 0.2948215	total: 1.26s	remaining: 683ms
648:	learn: 0.2946157	total: 1.26s	remaining: 681ms
649:	learn: 

In [8]:
df_results.sort_values(by='roc_auc', ascending=False)

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
catboost,0.844635,0.808709,0.776278,0.791764,0.913226
random_forest,0.836624,0.798771,0.76353,0.780582,0.912172
xgb,0.840629,0.799122,0.777762,0.787895,0.91018
lgbm,0.835479,0.791366,0.772535,0.781464,0.907614
gb,0.832904,0.789713,0.765771,0.777198,0.907262
logistic_regression,0.825179,0.850291,0.65692,0.741007,0.898444
svm,0.829757,0.82212,0.707209,0.759838,0.891748
decision_tree,0.776252,0.703995,0.713213,0.708302,0.764132
