# 04_RandomForest_Diabetes

**Proyecto:** MLY0100 — Clasificación de Riesgo de Diabetes  
**Modelo:** Random Forest Classifier  
**Autor:** Antonio Sepúlveda  
**Fecha:** 2025


## 1. Conexión con Kedro y carga de datos
En este notebook entrenaremos un **Random Forest Classifier** utilizando el dataset limpio del pipeline Kedro.

**Dataset:** `diabetes_cleaned`


In [None]:
# Load Kedro's IPython extension, then run its reload magic.
%load_ext kedro.ipython
%reload_kedro

# List datasets
# NOTE(review): `catalog` is not defined in this notebook; presumably it is
# injected by the Kedro extension loaded above — confirm.
# NOTE(review): `catalog.list()` may be deprecated or removed in newer Kedro
# releases — confirm against the installed version.
catalog.list()

In [None]:
# Load the dataset registered in the catalog under 'diabetes_cleaned'.
df_diabetes = catalog.load("diabetes_cleaned")

# Preview the first five rows (assumes a pandas DataFrame, whose head()
# defaults to n=5). Last expression, so the notebook displays it.
df_diabetes.head(n=5)

## 2. Distribución del Target (Outcome)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart with the number of rows per value of the 'Outcome' column.
outcome_ax = sns.countplot(x="Outcome", data=df_diabetes)
outcome_ax.set_title("Distribución de Outcome")
plt.show()

# Relative frequency of each 'Outcome' value (last expression, so the
# notebook displays it).
df_diabetes["Outcome"].value_counts(normalize=True)

## 3. Importaciones del modelo y métricas

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, classification_report,
    roc_curve, auc, precision_recall_curve,
    average_precision_score
)

## 4. Selección de variables

In [None]:
# Target vector: the 'Outcome' column.
y = df_diabetes["Outcome"]

# Feature matrix: every column except 'Outcome'.
X = df_diabetes.drop(columns=["Outcome"])

# Preview the features (displayed by the notebook).
X.head()

## 5. Split de entrenamiento y prueba (80/20)

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# uses a fixed seed so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.20,
)

# Shapes of the training and test feature matrices (displayed by the notebook).
(X_train.shape, X_test.shape)

## 6. Modelo Random Forest — Versión Base

In [None]:
# Baseline forest: 200 trees, no depth limit, fixed seed.
rf_base = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42)
rf_base.fit(X_train, y_train)

# Scores taken from column index 1 of predict_proba, used later for the
# ROC and precision-recall curves.
y_proba_base = rf_base.predict_proba(X_test)[:, 1]

# Hard class predictions on the held-out set.
y_pred_base = rf_base.predict(X_test)

## 7. Métricas del modelo base

In [None]:
# Headline metrics for the baseline model on the test set.
acc = accuracy_score(y_test, y_pred_base)
prec = precision_score(y_test, y_pred_base)
rec = recall_score(y_test, y_pred_base)
f1 = f1_score(y_test, y_pred_base)

# One print call; sep="\n" reproduces the line-per-metric layout, and the
# trailing "\n" in the last entry leaves a blank line before the report.
print(
    f"Accuracy : {acc:.4f}",
    f"Precision: {prec:.4f}",
    f"Recall   : {rec:.4f}",
    f"F1-score : {f1:.4f}\n",
    sep="\n",
)

# Per-class breakdown.
print(classification_report(y_test, y_pred_base))

### 7.1 Matriz de confusión

In [None]:
# Confusion matrix of the baseline predictions.
cm = confusion_matrix(y_test, y_pred_base)

# Annotated heatmap with integer cell labels.
cm_ax = sns.heatmap(cm, cmap='Blues', fmt='d', annot=True)
cm_ax.set_title("Matriz de Confusión — Random Forest Base")
plt.show()

# Raw matrix (displayed by the notebook).
cm

### 7.2 Sensitivity & Specificity

In [None]:
# Unpack the 2x2 matrix row by row: first row is (tn, fp), second is (fn, tp).
(tn, fp), (fn, tp) = cm

# Sensitivity = true-positive rate; specificity = true-negative rate.
specificity = tn / (fp + tn)
sensitivity = tp / (fn + tp)

print(
    f"Sensitivity: {sensitivity:.4f}",
    f"Specificity: {specificity:.4f}",
    sep="\n",
)

## 8. Curvas ROC y Precision-Recall

In [None]:
# --- ROC curve ---
fpr, tpr, _ = roc_curve(y_test, y_proba_base)
auc_val = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(6, 5))
roc_ax.plot(fpr, tpr, label=f"AUC = {auc_val:.2f}")
# Dashed diagonal as a visual reference line.
roc_ax.plot([0, 1], [0, 1], '--')
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("Curva ROC — Random Forest Base")
roc_ax.legend()
roc_ax.grid(True)
plt.show()

# --- Precision-recall curve ---
precision, recall, _ = precision_recall_curve(y_test, y_proba_base)
ap = average_precision_score(y_test, y_proba_base)

pr_fig, pr_ax = plt.subplots(figsize=(6, 5))
pr_ax.plot(recall, precision, label=f"AP = {ap:.2f}")
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("Curva Precision-Recall — Random Forest Base")
pr_ax.grid(True)
pr_ax.legend()
plt.show()

## 9. Importancia de características

In [None]:
# pandas is needed for pd.Series below, and no earlier cell shown in this
# notebook imports it; without this import the cell fails with NameError.
import pandas as pd

# Pair each importance score of the baseline forest with its feature name
# and sort from most to least important.
importances = pd.Series(
    rf_base.feature_importances_, index=X.columns
).sort_values(ascending=False)

# Horizontal bar chart: one bar per feature, in the sorted order.
plt.figure(figsize=(8, 5))
sns.barplot(x=importances.values, y=importances.index)
plt.title("Importancia de Características — Random Forest Base")
plt.show()

# Sorted importances (displayed by the notebook).
importances

## 10. Optimización de hiperparámetros — GridSearchCV

In [None]:
# Search space: 3 * 4 * 3 * 3 = 108 hyperparameter combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 4, 6, 8],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Seeded estimator used as the template for every candidate.
rf = RandomForestClassifier(random_state=42)

# Exhaustive search with 5-fold cross-validation, scored by accuracy,
# with n_jobs=-1 for parallel execution.
grid_rf = GridSearchCV(rf, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_rf.fit(X_train, y_train)

# Best parameter combination and its cross-validated score, one per line.
print(grid_rf.best_params_, grid_rf.best_score_, sep="\n")

## 11. Evaluación del mejor modelo

In [None]:
# Estimator selected by the grid search.
best_rf = grid_rf.best_estimator_

# Scores from column index 1 of predict_proba, and hard predictions, on the test set.
y_proba_best = best_rf.predict_proba(X_test)[:, 1]
y_pred_best = best_rf.predict(X_test)

# Evaluate the four headline metrics in a fixed order and unpack them.
acc_b, prec_b, rec_b, f1_b = (
    metric(y_test, y_pred_best)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# sep="\n" keeps one metric per line; the trailing "\n" in the last entry
# leaves a blank line before the report.
print(
    f"Accuracy  (Best): {acc_b:.4f}",
    f"Precision (Best): {prec_b:.4f}",
    f"Recall    (Best): {rec_b:.4f}",
    f"F1-score  (Best): {f1_b:.4f}\n",
    sep="\n",
)

# Per-class breakdown.
print(classification_report(y_test, y_pred_best))

### 11.1 Matriz de Confusión — Best Model

In [None]:
# Confusion matrix of the tuned model's predictions.
cm_b = confusion_matrix(y_test, y_pred_best)

# Annotated heatmap with integer cell labels.
cm_b_ax = sns.heatmap(cm_b, cmap='Greens', fmt='d', annot=True)
cm_b_ax.set_title("Matriz de Confusión — Random Forest (Best Model)")
plt.show()

# Raw matrix (displayed by the notebook).
cm_b

### 11.2 Sensitivity & Specificity — Best Model

In [None]:
# Flatten the 2x2 matrix in row-major order: tn, fp, fn, tp.
tn, fp, fn, tp = cm_b.flatten()

# Sensitivity = true-positive rate; specificity = true-negative rate.
specificity_b = tn / (fp + tn)
sensitivity_b = tp / (fn + tp)

print(
    f"Sensitivity (Best): {sensitivity_b:.4f}",
    f"Specificity (Best): {specificity_b:.4f}",
    sep="\n",
)

### 11.3 Curvas ROC y PR — Best Model

In [None]:
# ROC curve for the tuned model.
fpr_b, tpr_b, _ = roc_curve(y_test, y_proba_best)
auc_b = auc(fpr_b, tpr_b)

plt.figure(figsize=(6,5))
plt.plot(fpr_b, tpr_b, label=f"AUC = {auc_b:.2f}")
# Dashed diagonal as a visual reference line.
plt.plot([0,1],[0,1],'--')
# Axis labels added so this figure matches the baseline ROC plot.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Curva ROC — Random Forest Best Model")
plt.legend()
plt.grid(True)
plt.show()

# Precision-recall curve for the tuned model.
precision_b, recall_b, _ = precision_recall_curve(y_test, y_proba_best)
ap_b = average_precision_score(y_test, y_proba_best)

plt.figure(figsize=(6,5))
plt.plot(recall_b, precision_b, label=f"AP = {ap_b:.2f}")
# Axis labels added so this figure matches the baseline precision-recall plot.
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Curva Precision-Recall — Random Forest Best Model")
plt.legend()
plt.grid(True)
plt.show()