### Imported libreries

In [1]:
# Data manipulation

import pandas as pd
import numpy as np

# Modeling

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import matplotlib.pyplot as plt


# Warning manage

import warnings
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DataConversionWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", message="Inconsistent values: penalty=l1 with l1_ratio=0.0")

### Selection of the best dataset

In [2]:
X_train_WITH_outliers = pd.read_excel("../data/processed/X_train_WITH_outliers.xlsx")
X_train_WITH_outliers_norm = pd.read_excel("../data/processed/X_train_WITH_outliers_norm.xlsx")
X_train_WITH_outliers_scal = pd.read_excel("../data/processed/X_train_WITH_outliers_scal.xlsx")
X_train_WITHOUT_outliers = pd.read_excel("../data/processed/X_train_WITHOUT_outliers.xlsx")
X_train_WITHOUT_outliers_norm = pd.read_excel("../data/processed/X_train_WITHOUT_outliers_norm.xlsx")
X_train_WITHOUT_outliers_scal = pd.read_excel("../data/processed/X_train_WITHOUT_outliers_scal.xlsx")

X_test_WITH_outliers = pd.read_excel("../data/processed/X_test_WITH_outliers.xlsx")
X_test_WITH_outliers_norm = pd.read_excel("../data/processed/X_test_WITH_outliers_norm.xlsx")
X_test_WITH_outliers_scal = pd.read_excel("../data/processed/X_test_WITH_outliers_scal.xlsx")
X_test_WITHOUT_outliers = pd.read_excel("../data/processed/X_test_WITHOUT_outliers.xlsx")
X_test_WITHOUT_outliers_norm = pd.read_excel("../data/processed/X_test_WITHOUT_outliers_norm.xlsx")
X_test_WITHOUT_outliers_scal = pd.read_excel("../data/processed/X_test_WITHOUT_outliers_scal.xlsx")


y_train = pd.read_excel("../data/processed/y_train.xlsx")
y_test = pd.read_excel("../data/processed/y_test.xlsx")

y_train = y_train.squeeze()
y_test = y_test.squeeze()

In [3]:
train_datasets = {
    "WITH_outliers": X_train_WITH_outliers,
    "WITH_outliers_norm": X_train_WITH_outliers_norm,
    "WITH_outliers_scal": X_train_WITH_outliers_scal,
    "WITHOUT_outliers": X_train_WITHOUT_outliers,
    "WITHOUT_outliers_norm": X_train_WITHOUT_outliers_norm,
    "WITHOUT_outliers_scal": X_train_WITHOUT_outliers_scal
}

test_datasets = {
    "WITH_outliers": X_test_WITH_outliers,
    "WITH_outliers_norm": X_test_WITH_outliers_norm,
    "WITH_outliers_scal": X_test_WITH_outliers_scal,
    "WITHOUT_outliers": X_test_WITHOUT_outliers,
    "WITHOUT_outliers_norm": X_test_WITHOUT_outliers_norm,
    "WITHOUT_outliers_scal": X_test_WITHOUT_outliers_scal
}

models = {}
metrics = {}

for name, X_train  in train_datasets.items():
    X_test = test_datasets[name]
    model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=10)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)

    acc = accuracy_score(y_train, y_pred)
    f1 = f1_score(y_train, y_pred)
    prec = precision_score(y_train, y_pred)
    rec = recall_score(y_train, y_pred)
    cm = confusion_matrix(y_train, y_pred)

    models[name] = model
    metrics[name] = {
        "Accuracy": acc,
        "F1": f1,
        "Precision": prec,
        "Recall": rec,
        "Confusion_Matrix": cm
    }

    print(
        f"{name}: "
        f"Accuracy={acc:.4f}, "
        f"F1={f1:.4f}, "
        f"Precision={prec:.4f}, "
        f"Recall={rec:.4f}"
    )
    print("Confusion Matrix:\n", cm)
    print("-" * 50)

    
best_dataset_name = max(metrics, key=lambda k: metrics[k]["Accuracy"])
best_model = models[best_dataset_name]
best_X_train = train_datasets[best_dataset_name]
best_X_test = test_datasets[best_dataset_name]


print("\n================ CONCLUSION ================")
print(f"Best dataset: {best_dataset_name}")
print(f"Accuracy: {metrics[best_dataset_name]['Accuracy']:.4f}")
print(f"F1-score: {metrics[best_dataset_name]['F1']:.4f}")
print(f"Precision: {metrics[best_dataset_name]['Precision']:.4f}")
print(f"Recall: {metrics[best_dataset_name]['Recall']:.4f}")

WITH_outliers: Accuracy=1.0000, F1=1.0000, Precision=1.0000, Recall=1.0000
Confusion Matrix:
 [[405   0]
 [  0 209]]
--------------------------------------------------
WITH_outliers_norm: Accuracy=1.0000, F1=1.0000, Precision=1.0000, Recall=1.0000
Confusion Matrix:
 [[405   0]
 [  0 209]]
--------------------------------------------------
WITH_outliers_scal: Accuracy=1.0000, F1=1.0000, Precision=1.0000, Recall=1.0000
Confusion Matrix:
 [[405   0]
 [  0 209]]
--------------------------------------------------
WITHOUT_outliers: Accuracy=1.0000, F1=1.0000, Precision=1.0000, Recall=1.0000
Confusion Matrix:
 [[405   0]
 [  0 209]]
--------------------------------------------------
WITHOUT_outliers_norm: Accuracy=1.0000, F1=1.0000, Precision=1.0000, Recall=1.0000
Confusion Matrix:
 [[405   0]
 [  0 209]]
--------------------------------------------------
WITHOUT_outliers_scal: Accuracy=1.0000, F1=1.0000, Precision=1.0000, Recall=1.0000
Confusion Matrix:
 [[405   0]
 [  0 209]]
--------------

It appears we have overfitting, so let's optimize and see if we can reduce it and improve the model.

In [4]:
params = {
    "n_estimators": [100, 200, 300],
    "bootstrap": [True, False],
    "max_depth": [None, 5, 10, 20],
    "min_samples_leaf": [1, 5, 10],
    "max_features": ["sqrt", "log2"]
}

In [5]:
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=10),
    params,
    scoring = "f1",
    cv= 5,
    n_jobs = -1)

grid_search.fit(best_X_train, y_train) # Entreno el optimizador con el dataset GANADOR

print("\n================ FINAL CONCLUSION ================")
print("Best params:", grid_search.best_params_)
best_model_grid = grid_search.best_estimator_
print("The best model is: ", best_model_grid)
print("The score for this model is: ", grid_search.best_score_)




Best params: {'bootstrap': False, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 5, 'n_estimators': 200}
The best model is:  RandomForestClassifier(bootstrap=False, max_features='log2', min_samples_leaf=5,
                       n_estimators=200, random_state=10)
The score for this model is:  0.6615318784766794


In [6]:
final_model = RandomForestClassifier(**grid_search.best_params_, random_state=10)
final_model.fit(best_X_train, y_train)

# =========================
# TRAIN
# =========================
y_pred_train = final_model.predict(best_X_train)

acc_train = accuracy_score(y_train, y_pred_train)
f1_train = f1_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)
cm_train = confusion_matrix(y_train, y_pred_train)

print("========== TRAIN METRICS ==========")
print(f"Accuracy: {acc_train:.4f}")
print(f"F1-score: {f1_train:.4f}")
print(f"Precision: {prec_train:.4f}")
print(f"Recall: {rec_train:.4f}")
print("Confusion Matrix:\n", cm_train)
print("\nClassification Report:\n", classification_report(y_train, y_pred_train))

# =========================
# TEST
# =========================
y_pred_test = final_model.predict(best_X_test)

acc_test = accuracy_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)
cm_test = confusion_matrix(y_test, y_pred_test)

print("========== TEST METRICS ==========")
print(f"Accuracy: {acc_test:.4f}")
print(f"F1-score: {f1_test:.4f}")
print(f"Precision: {prec_test:.4f}")
print(f"Recall: {rec_test:.4f}")
print("Confusion Matrix:\n", cm_test)
print("\nClassification Report:\n", classification_report(y_test, y_pred_test))

Accuracy: 0.9609
F1-score: 0.9423
Precision: 0.9469
Recall: 0.9378
Confusion Matrix:
 [[394  11]
 [ 13 196]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97       405
           1       0.95      0.94      0.94       209

    accuracy                           0.96       614
   macro avg       0.96      0.96      0.96       614
weighted avg       0.96      0.96      0.96       614

Accuracy: 0.7597
F1-score: 0.6263
Precision: 0.7750
Recall: 0.5254
Confusion Matrix:
 [[86  9]
 [28 31]]

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.91      0.82        95
           1       0.78      0.53      0.63        59

    accuracy                           0.76       154
   macro avg       0.76      0.72      0.72       154
weighted avg       0.76      0.76      0.75       154



We have improved "f1" with respect to the decisiontree from 0.7287 to 0.9423, which is equivalent to 0.2136 points more (29.3% better)