# Explore here

In [37]:
# Basics
import pandas as pd
import numpy as np

# VISUALIZACION
import matplotlib.pyplot as plt
import seaborn as sns

# ESCALAMIENTO
from sklearn.preprocessing import MinMaxScaler

# SELECCION DE PARAMETROS
from sklearn.feature_selection import chi2, SelectKBest

# MODELOS
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# METRICAS
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

# GUARDADO DEL MODELO
from pickle import dump

In [38]:
train_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/sergio-roque-decision-tree/refs/heads/main/data/processed/train_data.csv")
test_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/sergio-roque-decision-tree/refs/heads/main/data/processed/test_data.csv")

In [39]:
X_train = train_data.drop("Outcome", axis=1)
y_train = train_data["Outcome"]

X_test = test_data.drop("Outcome", axis=1)
y_test = test_data["Outcome"]

In [40]:
# Escalado
# Instancio el escalador
scaler = MinMaxScaler()

# Entreno el escalador con los datos de entrenamiento
scaler.fit(X_train)

# Aplico el escalador en ambos
X_train_scal = scaler.transform(X_train)
X_train_scal = pd.DataFrame(X_train_scal, index = X_train.index, columns = X_train.columns)

X_test_scal = scaler.transform(X_test)
X_test_scal = pd.DataFrame(X_test_scal, index = X_test.index, columns = X_test.columns)

X_train_scal.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.529412,0.696774,0.633333,0.293478,0.326403,0.503145,0.346318,0.2
1,0.117647,0.845161,0.8,0.23913,0.224532,0.147799,0.103596,0.016667
2,0.117647,0.541935,0.633333,0.326087,0.349272,0.789308,0.488014,0.166667
3,0.470588,0.406452,0.666667,0.23913,0.224532,0.201258,0.330479,0.216667
4,0.235294,0.509677,0.666667,0.086957,0.336798,0.433962,0.153682,0.216667


In [41]:
# Selección de parámetros
selection_model = SelectKBest(chi2, k = 5)
selection_model.fit(X_train_scal, y_train)
ix = selection_model.get_support()
X_train_sel = pd.DataFrame(selection_model.transform(X_train_scal), columns = X_train_scal.columns.values[ix])
X_test_sel = pd.DataFrame(selection_model.transform(X_test_scal), columns = X_test_scal.columns.values[ix])

X_train_sel.head()

Unnamed: 0,Pregnancies,Glucose,Insulin,BMI,Age
0,0.529412,0.696774,0.326403,0.503145,0.2
1,0.117647,0.845161,0.224532,0.147799,0.016667
2,0.117647,0.541935,0.349272,0.789308,0.166667
3,0.470588,0.406452,0.224532,0.201258,0.216667
4,0.235294,0.509677,0.336798,0.433962,0.216667


In [42]:
# modelo
# default n_estimators=100
model = RandomForestClassifier(random_state = 42)

# entrenamiento
model.fit(X_train_sel, y_train)

In [43]:
# predicción
y_pred_test = model.predict(X_test_sel)
y_pred_test

array([1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1])

In [44]:
y_pred_train = model.predict(X_train_sel)
y_pred_train

array([1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,

In [45]:
# metricas
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)

f1_score_test = f1_score(y_test, y_pred_test, average='micro')
f1_score_train = f1_score(y_train, y_pred_train, average='micro')

precision_test = precision_score(y_test, y_pred_test, average='micro')
precision_train = precision_score(y_train, y_pred_train, average='micro')

recall_test = recall_score(y_test, y_pred_test, average='micro')
recall_train = recall_score(y_train, y_pred_train, average='micro')

print("Accuracy Test: ", accuracy_test)
print("F1 score Test: ", f1_score_test)
print("Precision Test: ", precision_test)
print("Recall Test: ", recall_test)

print("Accuracy Train: ", accuracy_train)
print("F1 score Train: ", f1_score_train)
print("Precision Train: ", precision_train)
print("Recall Train: ", recall_train)

Accuracy Test:  0.7647058823529411
F1 score Test:  0.7647058823529411
Precision Test:  0.7647058823529411
Recall Test:  0.7647058823529411
Accuracy Train:  1.0
F1 score Train:  1.0
Precision Train:  1.0
Recall Train:  1.0


In [61]:
param_grid = {
    'n_estimators': [100, 200, 300, 500],  # evita valores demasiado bajos, 100+ es razonable
    'max_depth': [None, 10, 20, 30, 50],   # explora sin límite y con límites razonables
    'min_samples_split': [2, 5, 10],       # mantener
    'min_samples_leaf': [1, 2, 4, 10],     # añadimos 10 para bosques más suaves
    'max_features': ['sqrt', 'log2'],     # quitar None (ya no es tan recomendable desde 1.1+)
    'bootstrap': [True, False],           # importante: si False → overfitting, pero puede mejorar
    'criterion': ['gini', 'entropy']      # podrías agregar 'log_loss' si usas probs y sklearn >=1.1
}

In [62]:
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,  # validación cruzada de 5 folds
    scoring='accuracy'  # Métrica a optimizar
)

In [63]:
grid_search.fit(X_train_sel, y_train)

KeyboardInterrupt: 

In [56]:
print("Mejores parámetros:", grid_search.best_params_)

Mejores parámetros: {'n_estimators': np.int64(42)}


In [59]:
# modelo
model = RandomForestClassifier(n_estimators=42, criterion="entropy", max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=10, random_state = 42)

# entrenamiento
model.fit(X_train_sel, y_train)

In [60]:
# predicción
y_pred_test = model.predict(X_test_sel)
y_pred_train = model.predict(X_train_sel)

# metricas
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)

f1_score_test = f1_score(y_test, y_pred_test, average='micro')
f1_score_train = f1_score(y_train, y_pred_train, average='micro')

precision_test = precision_score(y_test, y_pred_test, average='micro')
precision_train = precision_score(y_train, y_pred_train, average='micro')

recall_test = recall_score(y_test, y_pred_test, average='micro')
recall_train = recall_score(y_train, y_pred_train, average='micro')

print("Accuracy Test: ", accuracy_test)
print("F1 score Test: ", f1_score_test)
print("Precision Test: ", precision_test)
print("Recall Test: ", recall_test)

print("Accuracy Train: ", accuracy_train)
print("F1 score Train: ", f1_score_train)
print("Precision Train: ", precision_train)
print("Recall Train: ", recall_train)

Accuracy Test:  0.7908496732026143
F1 score Test:  0.7908496732026143
Precision Test:  0.7908496732026143
Recall Test:  0.7908496732026143
Accuracy Train:  0.8360655737704918
F1 score Train:  0.8360655737704918
Precision Train:  0.8360655737704918
Recall Train:  0.8360655737704918
