In [19]:
import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE


Importamos los datos:

In [20]:
X_test = pd.read_csv("../data/processed/X_test.csv")
X_train = pd.read_csv("../data/processed/X_train.csv")
y_train = pd.read_csv("../data/processed/y_train.csv")
y_test = pd.read_csv("../data/processed/y_test.csv")

In [21]:
y_train = y_train.squeeze()
y_test = y_test.squeeze()

Primera versión (con parámetros en default):

In [22]:
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)

# Evaluación -------------------------------------------------------------------------------------------

y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

print("MATRIZ DE CONFUSIÓN:")
print(confusion_matrix(y_test, y_pred))
print("CLASIFICACIÓN:")
print(classification_report(y_test, y_pred))
print("AUC-ROC:", round(roc_auc_score(y_test, y_proba),4))

MATRIZ DE CONFUSIÓN:
[[1541   52]
 [ 235  172]]
CLASIFICACIÓN:
              precision    recall  f1-score   support

           0       0.87      0.97      0.91      1593
           1       0.77      0.42      0.55       407

    accuracy                           0.86      2000
   macro avg       0.82      0.69      0.73      2000
weighted avg       0.85      0.86      0.84      2000

AUC-ROC: 0.8486


Segunda versión (Con balanceo de clases):

In [24]:
# Calcular el peso según la proporción de clases

from collections import Counter
counter = Counter(y_train)
scale = counter[0] / counter[1]
print(scale)


# Usando SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)


# Entrenamiento del modelo ----------------------------------------------------------------------------

rf_res = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_res.fit(X_train_res, y_train_res)

# Evaluación -------------------------------------------------------------------------------------------

y_pred = rf_res.predict(X_test)
y_proba = rf_res.predict_proba(X_test)[:, 1]

print("MATRIZ DE CONFUSIÓN:")
print(confusion_matrix(y_test, y_pred))
print("CLASIFICACIÓN:")
print(classification_report(y_test, y_pred))
print("AUC-ROC:", round(roc_auc_score(y_test, y_proba),4))

3.9079754601226995
MATRIZ DE CONFUSIÓN:
[[1448  145]
 [ 171  236]]
CLASIFICACIÓN:
              precision    recall  f1-score   support

           0       0.89      0.91      0.90      1593
           1       0.62      0.58      0.60       407

    accuracy                           0.84      2000
   macro avg       0.76      0.74      0.75      2000
weighted avg       0.84      0.84      0.84      2000

AUC-ROC: 0.8441


Tercera Versión (Cross-Validation): 

In [25]:
# Cross Validation
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

param_dist_rf = {
    'n_estimators': randint(200, 600),
    'max_depth': randint(4, 12),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 6),
    'max_features': ['sqrt', 'log2']
}

search_rf = RandomizedSearchCV(
    rf, param_distributions=param_dist_rf, n_iter=30,
    scoring='roc_auc', cv=5, random_state=42, n_jobs=-1, verbose=1
)

search_rf.fit(X_train_res, y_train_res)

print("Mejor Random Forest:")
print(search_rf.best_params_)
print("AUC en CV:", search_rf.best_score_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Mejor Random Forest:
{'max_depth': 11, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 7, 'n_estimators': 589}
AUC en CV: 0.9411757674933275


In [27]:
# Entrenar el mejor modelo con todos los datos balanceados
best_rf = search_rf.best_estimator_
best_rf.fit(X_train_res, y_train_res)

# Evaluación -------------------------------------------------------------------------------------------
y_pred = best_rf.predict(X_test)
y_proba = best_rf.predict_proba(X_test)[:, 1]

print("MATRIZ DE CONFUSIÓN:")
print(confusion_matrix(y_test, y_pred))
print("CLASIFICACIÓN:")
print(classification_report(y_test, y_pred))
print("AUC-ROC:", round(roc_auc_score(y_test, y_proba),4))

MATRIZ DE CONFUSIÓN:
[[1394  199]
 [ 140  267]]
CLASIFICACIÓN:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      1593
           1       0.57      0.66      0.61       407

    accuracy                           0.83      2000
   macro avg       0.74      0.77      0.75      2000
weighted avg       0.84      0.83      0.83      2000

AUC-ROC: 0.8523
