In [1]:
# installing libs
!pip install --force-reinstall numpy pandas
!pip install numpy==1.21 pandas==1.3
#!pip install feature-engine

Collecting numpy
  Using cached numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl (21.2 MB)
Collecting pandas
  Using cached pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl (12.6 MB)
Collecting tzdata>=2022.7
  Using cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Collecting pytz>=2020.1
  Using cached pytz-2024.2-py2.py3-none-any.whl (508 kB)
Collecting python-dateutil>=2.8.2
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)
Collecting six>=1.5
  Using cached six-1.17.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: pytz, tzdata, six, numpy, python-dateutil, pandas
  Attempting uninstall: pytz
    Found existing installation: pytz 2024.2
    Uninstalling pytz-2024.2:
      Successfully uninstalled pytz-2024.2
  Attempting uninstall: tzdata
    Found existing installation: tzdata 2024.2
    Uninstalling tzdata-2024.2:
      Successfully uninstalled tzdata-2024.2
  Attempting uninstall: six
    Found existing installation: six 1.17.0
    Uninstalling six-

In [9]:
# importing libs
import os
import pandas as pd

# pipeline libs
from sklearn.pipeline import Pipeline

# feature-engine libs
from feature_engine.encoding import OneHotEncoder
from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import ArbitraryNumberImputer

# trainning models
from sklearn.model_selection import GridSearchCV

# machine learning libs
from sklearn.tree import DecisionTreeClassifier

# machine learning metrics
from sklearn.metrics import roc_auc_score

In [3]:
# Get the current working directory
current_directory = os.getcwd()
print("Current Directory:", current_directory)

Current Directory: /Users/dellacorte/py-projects/data-science/supervised-learning-pipeline-reference/classification-II


In [4]:
# Read the dataset
df_abt = pd.read_csv('/Users/dellacorte/py-projects/data-science/supervised-learning-pipeline-reference/databases/propensao_revenda_abt.csv')
df_abt.head() 

# get the training base
df_train = df_abt.query('data_ref_safra < "2018-03-01"')

# get the evaluation base (out of time)
df_oot = df_abt.query('data_ref_safra == "2018-03-01"')

key_vars = ['data_ref_safra', 'seller_id']
num_vars = ['tot_orders_12m', 'tot_items_12m', 'tot_items_dist_12m', 'receita_12m', 'recencia']
cat_vars = ['uf']
target = 'nao_revendeu_next_6m'

features = cat_vars + num_vars

# training data
X_train = df_train[features]
y_train = df_train[target]

# evaluation data (out of time)
X_oot = df_oot[features]
y_oot = df_oot[target]

In [5]:
dt = Pipeline(steps=[
    ('numeric_imputer', ArbitraryNumberImputer(variables=num_vars, arbitrary_number=-999)),
    ('categoric_imputer', CategoricalImputer(variables=cat_vars, return_object=True)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('Decision_Tree', DecisionTreeClassifier(random_state=42,)) 
])

In [6]:
dt.fit(X_train, y_train)

Pipeline(steps=[('numeric_imputer',
                 ArbitraryNumberImputer(arbitrary_number=-999,
                                        variables=['tot_orders_12m',
                                                   'tot_items_12m',
                                                   'tot_items_dist_12m',
                                                   'receita_12m',
                                                   'recencia'])),
                ('categoric_imputer',
                 CategoricalImputer(return_object=True, variables=['uf'])),
                ('one_hot_encoder', OneHotEncoder(variables=['uf'])),
                ('Decision_Tree', DecisionTreeClassifier(random_state=42))])

In [7]:
print('ROC_AUC for training with default parameters:', roc_auc_score(y_train, dt.predict(X_train)))
print('ROC_AUC for oot with default parameters:', roc_auc_score(y_oot, dt.predict(X_oot)))

ROC_AUC for training with default parameters: 0.9996246246246246
ROC_AUC for oot with default parameters: 0.7831532768042844


In [10]:
parameters = {
    'Decision_Tree__max_depth': [None, 3, 4, 5, 6, 7, 9, 11],
    'Decision_Tree__criterion': ['gini', 'entropy'],
}

grid_search = GridSearchCV(estimator=dt, param_grid=paramerts, scoring='roc_auc', cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
grid_search.best_params_

NameError: name 'paramerts' is not defined