RANDOM FOREST

In [69]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from pickle import dump


In [70]:
# Locations used by the notebook: input dataset, pickled model, and the
# CSV export of the test-set predictions.
# NOTE(review): sav_path keeps a "decision_tree" file name although the
# estimator pickled into it later in this notebook is a
# RandomForestClassifier — confirm whether the name is intentional.
csv_path = "/workspaces/Random_Forest_Santi_Izquierdo/data/raw/decision_tree_diabetes_optimized.csv"
sav_path = "/workspaces/Random_Forest_Santi_Izquierdo/models/decision_tree_diabetes_optimized.sav"
pred_csv_path = "/workspaces/Random_Forest_Santi_Izquierdo/models/random_forest_predictions.csv"

# Make sure the parent directory of every path exists. The predictions
# directory is included explicitly so the export does not silently depend on
# it being the same directory as the model file.
for _path in (csv_path, sav_path, pred_csv_path):
    os.makedirs(os.path.dirname(_path), exist_ok=True)

In [71]:
# Load the dataset from csv_path into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer=csv_path)
print("Dataset cargado correctamente", df.head(), sep="\n")

Dataset cargado correctamente
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [72]:
# Separate the feature columns from the target column ("Outcome").
X = df.drop(columns="Outcome")
y = df["Outcome"]

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Tamaño train: {X_train.shape}, Tamaño test: {X_test.shape}")

Tamaño train: (614, 8), Tamaño test: (154, 8)


In [73]:
# Build a 200-tree forest capped at depth 8 (seeded for reproducibility) and
# fit it on the training partition; fit() returns the estimator itself.
best_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    random_state=42,
).fit(X_train, y_train)

print("Modelo Random Forest entrenado correctamente.")

Modelo Random Forest entrenado correctamente.


In [74]:
# Predict on the held-out test partition.
y_pred_best = best_model.predict(X_test)

# Headline metrics comparing the predictions with the true test labels.
acc = accuracy_score(y_test, y_pred_best)
prec = precision_score(y_test, y_pred_best)
rec = recall_score(y_test, y_pred_best)
f1 = f1_score(y_test, y_pred_best)

print("\n📊 Resultados del modelo Random Forest:")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1-score:  {f1:.4f}")

# Per-class breakdown of the same predictions.
print("\nReporte completo:")
print(classification_report(y_test, y_pred_best))


📊 Resultados del modelo Random Forest:
Accuracy:  0.7403
Precision: 0.6522
Recall:    0.5556
F1-score:  0.6000

Reporte completo:
              precision    recall  f1-score   support

           0       0.78      0.84      0.81       100
           1       0.65      0.56      0.60        54

    accuracy                           0.74       154
   macro avg       0.71      0.70      0.70       154
weighted avg       0.73      0.74      0.73       154



In [75]:
# Pickle the fitted model to sav_path. The context manager guarantees the
# file handle is flushed and closed even if serialization raises.
with open(sav_path, "wb") as model_file:
    dump(best_model, model_file)
print(f"\nModelo guardado correctamente en: {sav_path}")


Modelo guardado correctamente en: /workspaces/Random_Forest_Santi_Izquierdo/models/decision_tree_diabetes_optimized.sav


In [76]:
# Build a new frame from the test features with the true and predicted labels
# appended as two extra columns (X_test itself is left untouched).
resultados_df = X_test.assign(
    Real_Outcome=y_test.values,
    Predicted_Outcome=y_pred_best,
)

# Write the frame without the index column, then show a short preview.
resultados_df.to_csv(pred_csv_path, index=False)
print(f"Archivo CSV con predicciones guardado en: {pred_csv_path}")
print("\nPrimeras filas del CSV:", resultados_df.head(), sep="\n")

Archivo CSV con predicciones guardado en: /workspaces/Random_Forest_Santi_Izquierdo/models/random_forest_predictions.csv

Primeras filas del CSV:
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
44             7      159             64              0        0  27.4   
672           10       68            106             23       49  35.5   
700            2      122             76             27      200  35.9   
630            7      114             64              0        0  27.4   
81             2       74              0              0        0   0.0   

     DiabetesPedigreeFunction  Age  Real_Outcome  Predicted_Outcome  
44                      0.294   40             0                  1  
672                     0.285   47             0                  0  
700                     0.483   26             0                  0  
630                     0.732   34             1                  0  
81                      0.102   22             0           