# Decision Tree Classifier - Diabetes Dataset

## 1. Importar librerías

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

## 2. Cargar y preparar datos

In [None]:
# Read the diabetes CSV into a DataFrame, report its shape and preview the first rows
csv_path = "diabetes_prediction_dataset.csv"
df_diabetes = pd.read_csv(csv_path)
print(f"Shape del dataset: {df_diabetes.shape}")
df_diabetes.head()

In [None]:
# Print pandas' built-in summary of the DataFrame (DataFrame.info)
df_diabetes.info()

In [None]:
# Distribution of the target column: counts per value, then the column mean
# expressed as a percentage (the positive rate, assuming a 0/1-coded target)
target_col = df_diabetes['diabetes']
print("Distribución de diabetes:")
print(target_col.value_counts())
positive_pct = target_col.mean() * 100
print(f"\nPorcentaje con diabetes: {positive_pct:.2f}%")

## 3. Preprocesamiento - One-Hot Encoding

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so the remaining dummies are not redundant
categorical_cols = ['smoking_history', 'gender']
df_diabetes_encoded = pd.get_dummies(
    df_diabetes,
    columns=categorical_cols,
    drop_first=True,
)
print(f"Shape después de encoding: {df_diabetes_encoded.shape}")
df_diabetes_encoded.head()

## 4. Separar features y target

In [None]:
# Split the encoded frame into the target vector y and the feature matrix X
# (every column except 'diabetes')
y = df_diabetes_encoded['diabetes']
X = df_diabetes_encoded.drop(columns=['diabetes'])

column_names = list(X.columns)
print(f"Features (X): {X.shape}")
print(f"Target (y): {y.shape}")
print(f"\nNombres de features:\n{column_names}")

## 5. Dividir en train y test

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# stratify=y keeps the class proportions of the target the same in both
# partitions, which matters when the classes are imbalanced: an unstratified
# split can leave the test set with a different positive rate than the train set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Train set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

## 6. Entrenar Decision Tree Classifier

In [None]:
# Build the decision tree from an explicit hyperparameter mapping and fit it
# on the training partition
tree_params = {
    'max_depth': 5,            # cap the depth so the tree stays plottable
    'min_samples_split': 20,   # minimum samples required to split a node
    'min_samples_leaf': 10,    # minimum samples required in a leaf
    'random_state': 42,
}
dtree = DecisionTreeClassifier(**tree_params)

dtree.fit(X_train, y_train)
print("Modelo entrenado exitosamente")

## 7. Evaluación del modelo

In [None]:
# Predict on both partitions with the fitted tree
y_pred_train, y_pred_test = (dtree.predict(part) for part in (X_train, X_test))

# Accuracy on each partition; comparing the two shows how well the tree generalizes
test_accuracy = accuracy_score(y_test, y_pred_test)
train_accuracy = accuracy_score(y_train, y_pred_train)

print(f"Accuracy en Train: {train_accuracy:.4f}")
print(f"Accuracy en Test: {test_accuracy:.4f}")

In [None]:
# Text report of the per-class metrics on the test partition
test_report = classification_report(y_test, y_pred_test)
print("\nClassification Report (Test):")
print(test_report)

In [None]:
# Confusion matrix on the test partition, drawn as an annotated heatmap
# (y axis labelled as the real class, x axis as the predicted class)
cm = confusion_matrix(y_test, y_pred_test)
class_labels = ['No Diabetes', 'Diabetes']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Matriz de Confusión')
ax.set_ylabel('Real')
ax.set_xlabel('Predicho')
plt.show()

## 8. Importancia de features

In [None]:
# Pair every feature column with the importance the fitted tree assigned to
# it, then order the table from most to least important
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': dtree.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("Importancia de características:")
print(feature_importance)

In [None]:
# Horizontal bar chart of the ten most important features
top_features = feature_importance.head(10)

plt.figure(figsize=(12, 6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the palette
# is mapped through `hue` on the same column as the y axis. dodge=False keeps
# one full-width bar per feature on seaborn versions that would otherwise
# offset bars by hue level.
ax = sns.barplot(
    data=top_features,
    x='importance',
    y='feature',
    hue='feature',
    palette='viridis',
    dodge=False,
)
# The hue legend only repeats the y-axis labels; drop it if seaborn drew one
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.title('Top 10 Features más importantes')
plt.xlabel('Importancia')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

## 9. Visualización del árbol de decisión

In [None]:
# Plot the complete fitted tree
plt.figure(figsize=(25, 15))
plot_tree(
    dtree,
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # strictly and reject a pandas Index
    feature_names=list(X.columns),
    class_names=['No Diabetes', 'Diabetes'],
    filled=True,
    rounded=True,
    fontsize=10
)
plt.title('Decision Tree - Diabetes Prediction', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Plot only the top of the tree so the nodes stay readable
plt.figure(figsize=(20, 12))
plot_tree(
    dtree,
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # strictly and reject a pandas Index
    feature_names=list(X.columns),
    class_names=['No Diabetes', 'Diabetes'],
    filled=True,
    rounded=True,
    fontsize=12,
    max_depth=3  # draw only the first 3 levels
)
plt.title('Decision Tree - Primeros 3 niveles', fontsize=16)
plt.tight_layout()
plt.show()

## 10. Predicción de ejemplo

In [None]:
# Pick one patient from the test set
sample_idx = 0
sample = X_test.iloc[sample_idx]           # Series, used only for display
sample_frame = X_test.iloc[[sample_idx]]   # one-row DataFrame, used for prediction
real_value = y_test.iloc[sample_idx]

# Predict on the one-row DataFrame rather than on `[sample]`: wrapping the
# Series in a list discards the column names the model was fitted with, which
# makes scikit-learn warn about missing feature names, and it also collapses
# the per-column dtypes into a single object-typed row.
prediction = dtree.predict(sample_frame)[0]
probabilities = dtree.predict_proba(sample_frame)[0]

print("Datos del paciente:")
print(sample)
print(f"\nValor real: {'Diabetes' if real_value == 1 else 'No Diabetes'}")
print(f"Predicción: {'Diabetes' if prediction == 1 else 'No Diabetes'}")
print(f"\nProbabilidades:")
print(f"  No Diabetes: {probabilities[0]:.2%}")
print(f"  Diabetes: {probabilities[1]:.2%}")