# 1.1 Data Cleaning — Python (paso a paso)
# Carga del archivo desde la carpeta input

In [7]:
import pandas as pd
import numpy as np

# 0) Cargar datos
# Ajusta la ruta a la ubicación real del archivo en tu repo (p. ej. input/processed.cleveland.data); la celda siguiente lo lee desde la misma carpeta que el notebook.

In [15]:
import pandas as pd

# The relative path assumes the data file sits next to this notebook.
# header=None: the file has no header row, so pandas numbers the columns.
df = pd.read_csv(
    "processed.cleveland.data",
    header=None,
)

# Plain-text preview of the first five rows (head()'s default).
print(df.head(5))


     0    1    2      3      4    5    6      7    8    9    10   11   12  13
0  63.0  1.0  1.0  145.0  233.0  1.0  2.0  150.0  0.0  2.3  3.0  0.0  6.0   0
1  67.0  1.0  4.0  160.0  286.0  0.0  2.0  108.0  1.0  1.5  2.0  3.0  3.0   2
2  67.0  1.0  4.0  120.0  229.0  0.0  2.0  129.0  1.0  2.6  2.0  2.0  7.0   1
3  37.0  1.0  3.0  130.0  250.0  0.0  0.0  187.0  0.0  3.5  3.0  0.0  3.0   0
4  41.0  0.0  2.0  130.0  204.0  0.0  2.0  172.0  0.0  1.4  1.0  0.0  3.0   0


# 1) Renombrar variables

In [17]:
# Column names, in the same left-to-right order as the 14 unnamed columns
# of the raw file. Assigning to df.columns raises if the count differs.
cols_order = [
    'age',
    'sex',
    'cp',
    'restbp',
    'chol',
    'fbs',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal',
    'hd',
]
df.columns = cols_order

# 2) Reemplazar los “?” por NaN y convertir a numérico

In [25]:
# Missing values arrive as the string '?', which leaves the affected columns
# as text. errors="coerce" converts every column to a numeric dtype and turns
# '?' (or any other unparseable token) into NaN, so a failed conversion can no
# longer be swallowed silently and leave a text column behind for the model.
# Rows holding those NaN values are removed by the dropna step that follows.
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors="coerce")


# 3) Eliminar filas con valores faltantes

In [28]:
# Keep complete cases only (any NaN in a row removes it) ...
df = df.dropna(axis=0, how="any")
# ... and renumber the surviving rows 0..n-1, discarding the old index.
df = df.reset_index(drop=True)

# 4) Especificar variables categóricas

In [33]:
# Columns holding codes rather than quantities; they are cast to pandas'
# categorical dtype so the dummy-encoding step treats them as factors.
cat_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# One astype call with a per-column mapping instead of a column-by-column loop.
df = df.astype({name: 'category' for name in cat_cols})


# 5) Crear variable binaria y (1 = tiene enfermedad)

In [40]:
# Binary target: 1 when 'hd' is greater than zero, 0 otherwise.
df['y'] = df['hd'].gt(0).astype(int)

# 6) Crear variables dummy

In [42]:
# Expand each categorical column into indicator columns; drop_first=True
# omits the first level of every factor so it acts as the reference category.
df_dum = pd.get_dummies(
    data=df,
    columns=cat_cols,
    drop_first=True,
)

# 7) Separar X e y

In [46]:
# Target vector: the binary indicator built from 'hd'.
y = df_dum['y']
# Feature matrix: everything except the original outcome 'hd' and its
# binary version 'y', so neither can leak into the predictors.
X = df_dum.drop(['hd', 'y'], axis="columns")

# 8) Guardar archivos limpios

In [48]:
# DataFrame.to_csv does not create missing directories, so make sure the
# destination folder exists before writing; otherwise a fresh-kernel run
# fails here when "output/" has not been created yet.
import os

os.makedirs("output", exist_ok=True)

# Full encoded table (features plus 'hd' and 'y').
df_dum.to_csv("output/heart_clean_with_dummies.csv", index=False)
# Features and target on their own; section 1.2 reloads these two files when
# X and y are not already in memory.
X.to_csv("output/X_features.csv", index=False)
y.to_csv("output/y_binary.csv", index=False)

# 9) Resumen

In [50]:
# Quick sanity report: size of the encoded table, share of positive cases,
# and the first ten feature names.
print("Filas finales:", len(df_dum))
print("Columnas finales:", len(df_dum.columns))
print("Proporción de y=1:", round(y.mean(), 3))
print("Primeras columnas de X:", X.columns[:10].tolist())

Filas finales: 297
Columnas finales: 22
Proporción de y=1: 0.461
Primeras columnas de X: ['age', 'restbp', 'chol', 'thalach', 'oldpeak', 'sex_1.0', 'cp_2.0', 'cp_3.0', 'cp_4.0', 'fbs_1.0']


In [52]:
# Complete-case filter once more (a no-op when the earlier dropna step has
# already run on this frame), followed by a fresh 0..n-1 index.
df = df.dropna(axis=0, how="any")
df = df.reset_index(drop=True)

# Show the shape and the first rows of the cleaned (non-dummy) frame.
print("Tamaño después de dropna:", df.shape)
df.head(5)

Tamaño después de dropna: (297, 15)


Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,hd,y
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0,0


# 1.2 — Árbol de clasificación con CV de ccp_alpha (paso a paso)
# Requiere: X, y (de la sección 1.1). Si no están en memoria, se cargan desde la carpeta output/.

In [57]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score
)


# 0) Cargar X,y si no están en memoria (usa tus archivos de la sección 1.1)

In [66]:
# Fall back to the CSVs written in section 1.1 when either X or y is not
# defined in the current kernel session.
if not {'X', 'y'}.issubset(globals()):
    X = pd.read_csv("output/X_features.csv")
    # The target file has a single column; squeeze it into a 1-D Series.
    y = pd.read_csv("output/y_binary.csv").squeeze("columns")

# Figures and result files produced below are written into this folder.
os.makedirs("output", exist_ok=True)

# 1) Split train/test y árbol base (random_state=123)

In [68]:
# Hold out 30% of the rows for testing; stratify=y keeps the class balance
# the same in both parts, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=123,
    stratify=y,
)

# Unpruned tree with default settings: the baseline that pruning is compared to.
tree_base = DecisionTreeClassifier(random_state=123).fit(X_train, y_train)

# Draw the baseline tree on an explicit figure/axes pair and save it to disk.
fig, ax = plt.subplots(figsize=(12, 7))
plot_tree(
    tree_base,
    max_depth=3,  # only the top levels are drawn, to keep the figure readable
    feature_names=list(X.columns),
    class_names=["Does not have HD", "Has HD"],
    filled=True,
    rounded=True,
    ax=ax,
)
ax.set_title("Classification Tree — Base model")
fig.tight_layout()
fig.savefig("output/tree_base_PY.png", dpi=150)
# Close the figure so it is not rendered inline and its memory is released.
plt.close(fig)

# Matriz de confusión del modelo base 

In [70]:
# Score the baseline tree on the held-out rows.
y_pred_base = tree_base.predict(X_test)
# labels=[0, 1] fixes the row/column order so it matches display_labels below.
cm_base = confusion_matrix(y_test, y_pred_base, labels=[0, 1])

fig, ax = plt.subplots(figsize=(5, 5))
disp_base = ConfusionMatrixDisplay(
    confusion_matrix=cm_base,
    display_labels=["Does not have HD", "Has HD"],
)
# Integer cell counts, no colour bar.
disp_base.plot(ax=ax, values_format="d", colorbar=False)
ax.set_title("Confusion Matrix — Base model")
fig.tight_layout()
fig.savefig("output/cm_base_PY.png", dpi=150)
plt.close(fig)

print(f"[BASE] Accuracy: {accuracy_score(y_test, y_pred_base):.3f}")

[BASE] Accuracy: 0.789


# 2) Cross-validation para corregir overfitting con ccp_alpha  
#    - 50 valores de α en escala log entre e^{-10} y 0.05
#    - CV 4-fold (particiones sin barajar); random_state=123 se aplica al árbol

In [72]:
# 50 candidate pruning strengths, evenly spaced in log space:
# exponents run from -10 up to log(0.05), so alpha runs from e^-10 to 0.05.
log_alphas = np.linspace(-10, np.log(0.05), num=50)
alpha_grid = np.exp(log_alphas)

# Single hyper-parameter searched by the cross-validation.
param_grid = {"ccp_alpha": alpha_grid}
# Same tree configuration as the baseline; only ccp_alpha varies in the search.
tree_for_cv = DecisionTreeClassifier(random_state=123)

# 4-fold cross-validated search over the alpha grid, scored by accuracy.
cv = GridSearchCV(
    tree_for_cv,
    param_grid,
    cv=4,
    scoring="accuracy",
    refit=True,  # refit on the whole training set with the best alpha
    return_train_score=True,
    n_jobs=-1,  # use all available cores
)
cv.fit(X_train, y_train)

# NOTE(review): the recorded run picked 0.05, the largest value in the grid;
# consider widening the upper bound to confirm the optimum is not beyond it.
best_alpha = cv.best_params_["ccp_alpha"]
print(f"[CV] Best ccp_alpha: {best_alpha:.6g}")
print(f"[CV] Best CV accuracy: {cv.best_score_:.3f}")

[CV] Best ccp_alpha: 0.05
[CV] Best CV accuracy: 0.705


# 3) Curva de Inaccuracy = 1 - Accuracy contra α 

In [74]:
# Mean validation accuracy per candidate alpha (same order as alpha_grid),
# turned into an error rate.
mean_val_acc = cv.cv_results_["mean_test_score"]
inaccuracy = 1.0 - mean_val_acc

# Error rate against alpha, with a log-scaled x axis because the grid is
# spaced logarithmically.
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(alpha_grid, inaccuracy, marker="o")
ax.set_xscale("log")
ax.set_xlabel("ccp_alpha (log scale)")
ax.set_ylabel("Inaccuracy = 1 - Accuracy")
ax.set_title("Inaccuracy vs ccp_alpha (4-fold CV)")
ax.grid(True, which="both", linestyle=":")
fig.tight_layout()
fig.savefig("output/inaccuracy_vs_alpha_PY.png", dpi=150)
plt.close(fig)

# 4) Reentrenar con α óptimo y evaluar de nuevo  
#    - Árbol y matriz de confusión con α óptimo

In [76]:
# Refit on the training set using the alpha selected by cross-validation.
tree_opt = DecisionTreeClassifier(
    ccp_alpha=best_alpha,
    random_state=123,
).fit(X_train, y_train)

# Draw the pruned tree with the same layout settings as the baseline figure.
fig, ax = plt.subplots(figsize=(12, 7))
plot_tree(
    tree_opt,
    max_depth=3,
    feature_names=list(X.columns),
    class_names=["Does not have HD", "Has HD"],
    filled=True,
    rounded=True,
    ax=ax,
)
ax.set_title(f"Classification Tree — Optimal ccp_alpha={best_alpha:.2e}")
fig.tight_layout()
fig.savefig("output/tree_opt_PY.png", dpi=150)
plt.close(fig)

# Score the pruned tree on the same held-out rows as the baseline.
y_pred_opt = tree_opt.predict(X_test)
# labels=[0, 1] fixes the row/column order so it matches display_labels below.
cm_opt = confusion_matrix(y_test, y_pred_opt, labels=[0, 1])

fig, ax = plt.subplots(figsize=(5, 5))
disp_opt = ConfusionMatrixDisplay(
    confusion_matrix=cm_opt,
    display_labels=["Does not have HD", "Has HD"],
)
# Integer cell counts, no colour bar.
disp_opt.plot(ax=ax, values_format="d", colorbar=False)
ax.set_title("Confusion Matrix — Optimal ccp_alpha")
fig.tight_layout()
fig.savefig("output/cm_opt_PY.png", dpi=150)
plt.close(fig)

print(f"[OPT ] Accuracy: {accuracy_score(y_test, y_pred_opt):.3f}")

[OPT ] Accuracy: 0.789


# 5) Guardar objetos auxiliares

In [78]:
# Table of candidate alphas and their mean cross-validated accuracy.
cv_table = pd.DataFrame(
    {
        "ccp_alpha": alpha_grid,
        "cv_mean_accuracy": mean_val_acc,
    }
)
cv_table.to_csv("output/cv_results_alpha_PY.csv", index=False)

# Selected alpha as plain text (no trailing newline).
with open("output/best_alpha_PY.txt", mode="w") as fh:
    fh.write(str(best_alpha))