In [2]:
import os
import numpy as np
import pandas as pd

# Silence NumPy floating-point warnings (divide, overflow, underflow, invalid) for the session.
np.seterr(all='ignore')
# Seed NumPy's *global* random state. `np.random.RandomState(seed=11)` on its own would only
# build a separate generator object that is immediately discarded, leaving the global state
# unseeded.
# NOTE(review): PyCaret manages its own seed through `setup(session_id=...)`; confirm that is
# set as well if full run-to-run reproducibility is required.
np.random.seed(11)

from dotenv import load_dotenv
from typing import Iterable, TypeVar
from pycaret.classification import ClassificationExperiment, tune_model

# Show every column when a DataFrame is displayed (None disables the column limit).
pd.set_option('display.max_columns', None)

In [None]:
# Load environment variables with python-dotenv (presumably from a local `.env` file — confirm).
# The next cell reads the dataset paths from the DATA_MODEL_TRAIN and DATA_MODEL_TEST variables.
load_dotenv(verbose=False)

In [4]:
# Dataset locations are taken from the DATA_MODEL_TRAIN / DATA_MODEL_TEST environment variables
# (point them at a path on your PC or at a cloud-storage location).
# `os.environ[...]` is used instead of `os.getenv(...)` so that a missing variable fails
# immediately with a KeyError naming it, rather than handing None to `pd.read_csv`.
# Both files are semicolon-separated.
Xy_train = pd.read_csv(os.environ['DATA_MODEL_TRAIN'], sep=';')
X_test = pd.read_csv(os.environ['DATA_MODEL_TEST'], sep=';')

### Workflow for Classification Model using PyCaret
---
We aim to develop a classification model to predict a target class based on various features in a dataset (see step 1). The `PyCaret` library, known for simplifying machine learning workflows in Python, will be used to streamline the modeling process and enable quick experimentation. This project will include the following steps:

<input type="checkbox" checked> Data imputation, fixing and transformation

<input type="checkbox" checked> Feature engineering using `feature_engine`

<input type="checkbox" checked> Save final `train` and `test` datasets 👉🏾 *to step 2...*

In [5]:
# Ordinal categorical columns and, for each one, its categories in the order listed here.
# The resulting mapping is passed to `experiment.setup(ordinal_features=...)`.

# Both parents' education columns use the same scale, so it is written once and copied.
_education_levels = (
    'postgrado',
    'educación profesional completa',
    'técnica o tecnológica completa',
    'educación profesional incompleta',
    'técnica o tecnológica incompleta',
    'secundaria (bachillerato) completa',
    'secundaria (bachillerato) incompleta',
    'primaria completa',
    'primaria incompleta',
    'ninguno',
    'no sabe',
    'no aplica',
)

order_keys = [
    'fami_educacionmadre',
    'fami_educacionpadre',
    'fami_cuartoshogar',
    'fami_personashogar',
    'fami_estratovivienda',
]

# One category list per entry of `order_keys`, in the same position.
order_values = [
    list(_education_levels),
    list(_education_levels),
    ['seis o mas', 'cinco', 'cuatro', 'tres', 'dos', 'uno'],
    ['9 o más', '7 a 8', '5 a 6', '3 a 4', '1 a 2'],
    ['estrato 6', 'estrato 5', 'estrato 4', 'estrato 3', 'estrato 2', 'estrato 1', 'sin estrato'],
]

# column name -> ordered categories
ordinal_features = dict(zip(order_keys, order_values))

In [None]:
# Create the PyCaret experiment and configure it on the training frame:
#   - 'target' is the label column,
#   - the ordinal columns are encoded using the category orders in `ordinal_features`,
#   - features correlated above 0.90 with another feature are dropped,
#   - 70% of the rows are used for training, with 20-fold cross-validation.
experiment = ClassificationExperiment()

experiment.setup(
    Xy_train,
    target='target',
    ordinal_features=ordinal_features,
    preprocess=True,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.90,
    train_size=0.70,
    fold=20,
    # Fix the experiment seed so the train/hold-out split, CV folds and model fits are
    # repeatable across kernel restarts; without it PyCaret picks a random seed on each run.
    session_id=11,
    verbose=True
)

In [None]:
# Compare the column names of PyCaret's transformed dataset with the raw training columns.
# NOTE(review): the comparison is by name only, so a raw column that preprocessing renames or
# expands into several columns would also be reported as "removed" — confirm when reading this.
transformed_columns = experiment.get_config('dataset_transformed').columns
cols_selected = set(transformed_columns)
cols_removed = set(Xy_train.columns).difference(cols_selected)

print('columnas seleccionadas:', cols_selected)
print('columnas removidas:', cols_removed)

In [None]:
# Cross-validate the available classifiers and rank them by F1.
# errors='raise' stops on the first estimator that fails instead of skipping it.
best_model = experiment.compare_models(errors='raise', sort='F1')

In [None]:
# Train the 'gbc' estimator (gradient boosting classifier) with the experiment's CV settings.
model_gbc = experiment.create_model('gbc')

In [None]:
# Diagnostic plots for the base gradient boosting model: the 'auc' plot first, then the
# 'feature' plot, with an 80-character rule printed between them in the cell output.
separator = '=' * 80

experiment.plot_model(model_gbc, plot='auc')
print(separator)
experiment.plot_model(model_gbc, plot='feature')

In [None]:
# Open PyCaret's evaluation view for the base (untuned) gradient boosting model.
experiment.evaluate_model(model_gbc)

In [None]:
# Hyperparameter search for the gradient boosting model, limited to 20 iterations.
# NOTE(review): no `optimize` metric is passed here, while `compare_models` sorts by 'F1' —
# confirm the metric being tuned is the intended one.
tuned_model = experiment.tune_model(model_gbc, n_iter=20)

In [None]:
# Same evaluation view as for the base model, now for the tuned model, so the two can be compared.
experiment.evaluate_model(tuned_model)

In [None]:
# Save the model
# experiment.save_model(tuned_model, 'model/tuned_gradient_boosting_classifier.gz')

In [None]:
# Score X_test (the frame loaded from DATA_MODEL_TEST) with the tuned model.
# `threshold` is the probability cut-off handed to PyCaret for assigning the predicted label.
# NOTE(review): the notebook does not show how 0.44 was chosen — document the rationale.
threshold = 0.44

pred_test = experiment.predict_model(
    tuned_model,
    X_test,
    probability_threshold=threshold,
    # also return the per-class score columns alongside the predicted label
    raw_score=True,
)

In [25]:
# Build the submission frame: the 'estu_id' column plus the predicted label, with the label
# column renamed to 'target'.
# `rename` is chained so it returns a new frame; renaming the column subset with
# inplace=True makes pandas emit a SettingWithCopyWarning.
pred_test_final = (
    pred_test[['estu_id', 'prediction_label']]
    .rename(columns={'prediction_label': 'target'})
)

# Create the output folder if needed; `to_csv` raises when the target directory is missing.
os.makedirs('out', exist_ok=True)
pred_test_final.to_csv('out/out.csv', sep=',', index=False)