In [5]:
# Import libs
import os
import time
import pandas as pd

# Pipeline lib
from sklearn.pipeline import Pipeline

# feature-engine libs
from feature_engine.encoding import OneHotEncoder
from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import ArbitraryNumberImputer

# machine learning models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.utils.fixes import loguniform
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# training models
from sklearn.model_selection import GridSearchCV

In [6]:
# Read the dataset (analytical base table).
# The CSV location can be overridden with the ABT_CSV_PATH environment
# variable so the notebook is not tied to a single machine; when the variable
# is not set, the original location is used.
abt_csv_path = os.environ.get(
    'ABT_CSV_PATH',
    '/Users/dellacorte/py-projects/data-science/supervised-learning-pipeline-reference/databases/propensao_revenda_abt.csv',
)
df_abt = pd.read_csv(abt_csv_path)

# Training base: every reference period before 2018-03-01.
# NOTE(review): data_ref_safra is compared against a string literal, which
# orders correctly only if the column holds ISO (YYYY-MM-DD) dates — confirm.
df_train = df_abt.query('data_ref_safra < "2018-03-01"')

# Evaluation base (out of time): only the 2018-03-01 reference period
df_oot   = df_abt.query('data_ref_safra == "2018-03-01"')

# Column roles: identifier columns, numeric features, categorical features, target
key_vars = ['data_ref_safra', 'seller_id']
num_vars = ['tot_orders_12m', 'tot_items_12m', 'tot_items_dist_12m', 'receita_12m', 'recencia']
cat_vars = ['uf']
target = 'nao_revendeu_next_6m'

# Model inputs: categorical features first, then the numeric ones
features = cat_vars + num_vars

# Training data
X_train = df_train[features]
y_train = df_train[target]

# Evaluation data (out of time)
X_oot = df_oot[features]
y_oot = df_oot[target]

In [7]:
# Preprocessing steps shared by the model pipelines built further down.
# Each transformer is created on its own line and then listed in the order in
# which the pipeline applies it.

# Numeric features: missing values are replaced by the arbitrary number -999
numeric_imputer = ArbitraryNumberImputer(arbitrary_number=-999, variables=num_vars)

# Categorical features: missing values are imputed with the imputer's defaults
categoric_imputer = CategoricalImputer(return_object=True, variables=cat_vars)

# Categorical features: one-hot encoded after imputation
one_hot_encoder = OneHotEncoder(variables=cat_vars)

datapipe = [
    ('numeric_imputer', numeric_imputer),
    ('categoric_imputer', categoric_imputer),
    ('one_hot_encoder', one_hot_encoder),
]

### Decision Tree

The main hyperparameters that we can use in GridSearch:

* `max_depth`: Maximum depth of the tree. With the default (`None`), nodes are expanded until all leaves are pure or contain fewer samples than the value defined in `min_samples_split`. Possible values: `2, 3, 4, 5, 6, 7, ...`.
* `criterion`: Function used to measure the quality of a split. Possible values: `gini` and `entropy`.
* `class_weight`: Weights associated with the classes, given as a dictionary. The `balanced` option sets the weights in inverse proportion to the class frequencies. Possible values: `balanced` and `None`.
* `min_samples_split`: Minimum number of samples required to split a node. Default value is 2.

In [8]:
# Grid search over decision tree hyperparameters, run on top of the shared
# preprocessing steps in `datapipe`.
t1 = time.time()

# random_state is fixed so that ties between equally good splits are always
# broken the same way; without it the fitted trees (and therefore the selected
# hyperparameters) can differ between runs on identical data.
pipeline = Pipeline(steps=datapipe + [('decision_tree', DecisionTreeClassifier(random_state=42))])

# 10 depths x 2 criteria x 2 class weightings x 1 split size = 40 candidates
parametros = {
  'decision_tree__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
  'decision_tree__criterion': ['gini', 'entropy'],
  'decision_tree__class_weight': ['balanced', None],
  'decision_tree__min_samples_split': [2],
}

# 5-fold cross-validation scored by ROC AUC, using all available cores
grid_search = GridSearchCV(pipeline, parametros, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Report the best hyperparameter combination and the elapsed wall-clock time
print()
print('='*100)
print(grid_search.best_params_)
print('\nIt took {} seconds for GridSearch to Decision Tree'.format(time.time() - t1))
print('='*100)
print()

Fitting 5 folds for each of 40 candidates, totalling 200 fits

{'decision_tree__class_weight': None, 'decision_tree__criterion': 'entropy', 'decision_tree__max_depth': 4, 'decision_tree__min_samples_split': 2}

It took 3.8964040279388428 seconds for GridSearch to Decision Tree



### Random Forest

The main hyperparameters that we can use in GridSearch:

* `n_estimators`: Number of trees in the forest. Possible values: `120, 300, 500, 800, 1200`.
* `max_depth`: Maximum depth of each tree. With the default (`None`), nodes are expanded until all leaves are pure or contain fewer samples than the value defined in `min_samples_split`. Possible values: `5, 8, 15, 25, 30, None`.
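* `max_features`: Number of attributes (features) to consider when looking for the best split. For a random forest classifier the default is the square root of the number of features (`sqrt`, called `auto` in older scikit-learn versions); `None` uses all features.
Possible values: `log2, sqrt, None`.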
* `min_samples_split`: Minimum number of samples required to split a node. Default value is 2. Possible values: `2, 5, 10, 15, 100`.

* `min_samples_leaf`: Minimum number of samples in each leaf. Default value is 1. Possible values: `1, 2, 5, 10`.

* `class_weight`: Weights associated with the classes, given as a dictionary. The `balanced` option sets the weights in inverse proportion to the class frequencies. Possible values: `balanced` and `None`.