# Explore here

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import joblib
import json

# URL alternativa confiable del dataset de diabetes
url = "https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv"
df = pd.read_csv(url)

# Exploración inicial
print("Primeras filas:")
print(df.head())
print("\nResumen estadístico:")
print(df.describe())
print("\nValores faltantes:")
print(df.isnull().sum())

# Histogramas
df.hist(figsize=(12, 10))
plt.tight_layout()
plt.savefig('histograms.png')
plt.close()

# Correlación
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title('Matriz de Correlación')
plt.savefig('correlation_matrix.png')
plt.close()

# Preprocesamiento
# Reemplazar ceros con la mediana en columnas clave
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in zero_cols:
    df[col] = df[col].replace(0, np.nan)
    df[col].fillna(df[col].median(), inplace=True)

# Dividir datos
X = df.drop('Outcome', axis=1)
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print(f"\nDatos de entrenamiento: {X_train.shape[0]} muestras")
print(f"Datos de prueba: {X_test.shape[0]} muestras")
print(f"Proporción de diabetes en entrenamiento: {y_train.mean():.2f}")
print(f"Proporción de diabetes en prueba: {y_test.mean():.2f}")

Primeras filas:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Resumen estadístico:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      1

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [10]:


# Entrenar árbol de decisión
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Evaluar
y_pred_dt = dt.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"\n[ÁRBOL DE DECISIÓN] Precisión: {accuracy_dt:.2f}")

# Matriz de confusión
cm = confusion_matrix(y_test, y_pred_dt)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicho')
plt.ylabel('Real')
plt.title('Matriz de Confusión - Árbol de Decisión')
plt.savefig('confusion_matrix_dt.png')
plt.close()


[ÁRBOL DE DECISIÓN] Precisión: 0.69


In [11]:
# Configurar parámetros para búsqueda
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2']
}

# Crear y entrenar modelo con búsqueda de hiperparámetros
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Mejores parámetros
best_params = grid_search.best_params_
print(f"\n[MEJORES HIPERPARÁMETROS] {best_params}")

# Modelo final con mejores parámetros
best_rf = grid_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"[RANDOM FOREST] Precisión: {accuracy_rf:.2f}")

# Comparación de modelos
print("\nCOMPARACIÓN DE MODELOS:")
print(f"- Árbol de Decisión: {accuracy_dt:.2f}")
print(f"- Random Forest: {accuracy_rf:.2f}")
print(f"Mejora: {(accuracy_rf - accuracy_dt)*100:.1f}%")

# Importancia de características
feature_importances = pd.Series(best_rf.feature_importances_, index=X.columns)
plt.figure(figsize=(10, 6))
feature_importances.sort_values().plot(kind='barh')
plt.title('Importancia de Características (Random Forest)')
plt.savefig('feature_importance.png')
plt.close()


[MEJORES HIPERPARÁMETROS] {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 50}
[RANDOM FOREST] Precisión: 0.74

COMPARACIÓN DE MODELOS:
- Árbol de Decisión: 0.69
- Random Forest: 0.74
Mejora: 4.3%


In [12]:
# Guardar modelo
joblib.dump(best_rf, 'random_forest_model.pkl')

# Guardar resultados
results = {
    'model': 'Random Forest',
    'accuracy': accuracy_rf,
    'best_params': best_params,
    'feature_importances': feature_importances.to_dict()
}

with open('model_results.json', 'w') as f:
    json.dump(results, f, indent=4)

print("\nModelo guardado como 'random_forest_model.pkl'")
print("Resultados guardados en 'model_results.json'")


Modelo guardado como 'random_forest_model.pkl'
Resultados guardados en 'model_results.json'
