<a href="https://colab.research.google.com/github/AinaHerrera/ejerciciosIA/blob/main/Nivelacion_Modelo_de_Clasificacion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import os, sys, json, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, cohen_kappa_score
)
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import joblib

In [75]:
import kagglehub

# Download latest version
FILE_PATH = kagglehub.dataset_download("shashanknecrothapa/ames-housing-dataset")

print("Path to dataset files:", FILE_PATH)

Using Colab cache for faster access to the 'ames-housing-dataset' dataset.
Path to dataset files: /kaggle/input/ames-housing-dataset


In [76]:
CUTPOINT = 5
if "Rank" not in df.columns:
    assert "Overall Cond" in df.columns, "Falta 'Overall Cond' para construir Rank."
    df["Rank"] = np.where(df["Overall Cond"] <= CUTPOINT, "Low", "High")


print("Distribución Rank:\n", df["Rank"].value_counts().round(3) / len(df))

Distribución Rank:
 Rank
Low     0.621843
High    0.378157
Name: count, dtype: float64


In [78]:
TARGET_COL = "Rank"
POSITIVE_CLASS = "High"
RANDOM_STATE = 42
TEST_SIZE = 0.30

In [82]:
print("Columns in the DataFrame:", df.columns.tolist())

Columns in the DataFrame: ['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area', 'Garage Qual', 'Garage Cond', 

In [83]:
# Ensure leak_cols is defined before this cell
X = df.drop(columns=[TARGET_COL] + [c for c in leak_cols if c in df.columns])
y = df[TARGET_COL].astype(str)

In [84]:
df = df.dropna(subset=[TARGET_COL]).copy()

In [85]:
num_features = X.select_dtypes(include=["number"]).columns.tolist()
cat_features = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

print(f"Numéricas ({len(num_features)}): {num_features[:10]}{'...' if len(num_features)>10 else ''}")
print(f"Categóricas ({len(cat_features)}): {cat_features[:10]}{'...' if len(cat_features)>10 else ''}")

Numéricas (38): ['Order', 'PID', 'MS SubClass', 'Lot Frontage', 'Lot Area', 'Overall Qual', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'BsmtFin SF 1']...
Categóricas (43): ['MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1']...


In [86]:
num_transformer = make_pipeline(SimpleImputer(strategy="median"))
cat_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False)
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ],
    remainder="drop"
)

In [87]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

In [88]:
base_clf = DecisionTreeClassifier(random_state=RANDOM_STATE)
base_pipe = Pipeline(steps=[("prep", preprocessor), ("clf", base_clf)])
base_pipe.fit(X_train, y_train)
y_pred_base = base_pipe.predict(X_test)

In [63]:
def per_class_specificity(y_true, y_pred):
    """
    Calcula Specificity (TN / (TN + FP)) para cada clase.
    """
    labels = np.unique(y_true)
    spec = {}
    for lbl in labels:
        y_true_bin = (y_true == lbl).astype(int)
        y_pred_bin = (y_pred == lbl).astype(int)
        tn = np.sum((y_true_bin == 0) & (y_pred_bin == 0))
        fp = np.sum((y_true_bin == 0) & (y_pred_bin == 1))
        spec[lbl] = tn / (tn + fp) if (tn + fp) > 0 else np.nan
    return spec

def print_eval_block(title, y_true, y_pred):
    print("\n" + "="*70)
    print(title)
    print("="*70)
    acc = accuracy_score(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    print(f"Accuracy: {acc:.3f} | Cohen's kappa: {kappa:.3f}\n")

    print("Reporte por clase (Precision / Recall / F1):")
    print(classification_report(y_true, y_pred, digits=3))

    cm = confusion_matrix(y_true, y_pred, labels=np.unique(y_true))
    print("Matriz de confusión (filas=verdad, columnas=predicción):")
    print(pd.DataFrame(cm, index=[f"true_{l}" for l in np.unique(y_true)],
                          columns=[f"pred_{l}" for l in np.unique(y_true)]))

    spec = per_class_specificity(y_true, y_pred)
    print("\nSpecificity por clase:")
    print(pd.Series(spec).round(3))


print_eval_block("EVALUACIÓN - MODELO BASE", y_test, y_pred_base)


disp = ConfusionMatrixDisplay.from_predictions(y_test, y_pred_base)
plt.title("Confusion Matrix - Base")
plt.tight_layout()
plt.savefig("confusion_matrix_base.png", dpi=150)
plt.close()


EVALUACIÓN - MODELO BASE
Accuracy: 1.000 | Cohen's kappa: 1.000

Reporte por clase (Precision / Recall / F1):
              precision    recall  f1-score   support

        High      1.000     1.000     1.000       332
         Low      1.000     1.000     1.000       547

    accuracy                          1.000       879
   macro avg      1.000     1.000     1.000       879
weighted avg      1.000     1.000     1.000       879

Matriz de confusión (filas=verdad, columnas=predicción):
           pred_High  pred_Low
true_High        332         0
true_Low           0       547

Specificity por clase:
High    1.0
Low     1.0
dtype: float64


In [89]:
param_grid = {
    "clf__criterion": ["gini", "entropy", "log_loss"],
    "clf__max_depth": [None, 3, 5, 10],
    "clf__min_samples_split": [2, 10, 20],
    "clf__min_samples_leaf": [1, 5, 10],
    "clf__class_weight": [None, "balanced"]
}
pipe = Pipeline(steps=[("prep", preprocessor),
                      ("clf", DecisionTreeClassifier(random_state=RANDOM_STATE))])

scoring = {
    "accuracy": "accuracy",
    "precision_macro": "precision_macro",
    "recall_macro": "recall_macro",
    "f1_macro": "f1_macro"
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=scoring,
    refit="f1_macro",
    cv=5,
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)
print("\nMejores hiperparámetros (refit=f1_macro):")
print(grid.best_params_)
print(f"Mejor puntaje CV (f1_macro): {grid.best_score_:.3f}")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits

Mejores hiperparámetros (refit=f1_macro):
{'clf__class_weight': None, 'clf__criterion': 'gini', 'clf__max_depth': None, 'clf__min_samples_leaf': 5, 'clf__min_samples_split': 20}
Mejor puntaje CV (f1_macro): 0.769


In [90]:
cv_df = pd.DataFrame(grid.cv_results_)
cols_keep = [c for c in cv_df.columns if c.startswith("mean_test_")] + \
            [c for c in cv_df.columns if c.startswith("param_")]
cv_top = cv_df[cols_keep].sort_values("mean_test_f1_macro", ascending=False).head(10)
cv_top.to_csv("cv_results_top10.csv", index=False)
print("\nTop 10 combinaciones (por f1_macro):")
print(cv_top)


Top 10 combinaciones (por f1_macro):
     mean_test_accuracy  mean_test_precision_macro  mean_test_recall_macro  \
5              0.782534                   0.769155                0.769088   
65             0.776697                   0.766182                0.765668   
101            0.776697                   0.766182                0.765668   
79             0.776691                   0.763350                0.763395   
44             0.776691                   0.763350                0.763395   
43             0.776691                   0.763350                0.763395   
42             0.776691                   0.763350                0.763395   
80             0.776691                   0.763350                0.763395   
78             0.776691                   0.763350                0.763395   
7              0.775223                   0.761355                0.764222   

     mean_test_f1_macro param_clf__class_weight param_clf__criterion  \
5              0.768920        

In [91]:
best_model = grid.best_estimator_
y_pred_best = best_model.predict(X_test)
print_eval_block("EVALUACIÓN - MEJOR MODELO (GridSearchCV)", y_test, y_pred_best)


EVALUACIÓN - MEJOR MODELO (GridSearchCV)
Accuracy: 0.766 | Cohen's kappa: 0.496

Reporte por clase (Precision / Recall / F1):
              precision    recall  f1-score   support

        High      0.701     0.663     0.681       332
         Low      0.802     0.828     0.815       547

    accuracy                          0.766       879
   macro avg      0.751     0.745     0.748       879
weighted avg      0.764     0.766     0.764       879

Matriz de confusión (filas=verdad, columnas=predicción):
           pred_High  pred_Low
true_High        220       112
true_Low          94       453

Specificity por clase:
High    0.828
Low     0.663
dtype: float64


In [92]:
disp = ConfusionMatrixDisplay.from_estimator(best_model, X_test, y_test)
plt.title("Confusion Matrix - Best")
plt.tight_layout()
plt.savefig("confusion_matrix_best.png", dpi=150)
plt.close()

In [93]:
labels_unique = np.unique(y_train)
if len(labels_unique) == 2 and hasattr(best_model.named_steps["clf"], "predict_proba"):
    proba = best_model.predict_proba(X_test)
    classes_ = best_model.named_steps["clf"].classes_
    if POSITIVE_CLASS not in classes_:
        print(f"\n[Aviso] POSITIVE_CLASS '{POSITIVE_CLASS}' no está en {classes_}. Se usará classes_[1].")
        pos_idx = 1
        pos_label = classes_[pos_idx]
    else:
        pos_idx = list(classes_).index(POSITIVE_CLASS)
        pos_label = POSITIVE_CLASS

    other_label = [c for c in classes_ if c != pos_label][0]
    thresholds = np.linspace(0.20, 0.80, 13)

    rows = []
    for thr in thresholds:
        y_pred_thr = np.where(proba[:, pos_idx] >= thr, pos_label, other_label)
        # Métricas para la clase positiva
        p, r, f1, _ = precision_recall_fscore_support(
            y_test, y_pred_thr, labels=[pos_label], average="binary", pos_label=pos_label, zero_division=0
        )
        acc = accuracy_score(y_test, y_pred_thr)
        rows.append({"threshold": round(thr,3), "accuracy": acc, "precision_pos": p, "recall_pos": r, "f1_pos": f1})

    thr_df = pd.DataFrame(rows).sort_values(["f1_pos","recall_pos"], ascending=False)
    thr_df.to_csv("threshold_tuning.csv", index=False)
    print("\nAjuste de umbral para la clase positiva "
          f"('{pos_label}') - ordenado por F1 y Recall (top 8):")
    print(thr_df.head(8).round(3))
else:
    print("\n[Aviso] Salto ajuste de umbral (no binario o el clasificador no expone predict_proba).")


Ajuste de umbral para la clase positiva ('High') - ordenado por F1 y Recall (top 8):
   threshold  accuracy  precision_pos  recall_pos  f1_pos
0       0.20     0.741          0.619       0.816   0.704
1       0.25     0.745          0.630       0.786   0.700
3       0.35     0.761          0.669       0.729   0.697
4       0.40     0.758          0.669       0.711   0.689
2       0.30     0.743          0.635       0.750   0.688
5       0.45     0.765          0.690       0.684   0.687
6       0.50     0.766          0.701       0.663   0.681
7       0.55     0.763          0.705       0.642   0.672


In [94]:
joblib.dump(best_model, "decision_tree_best.joblib")
print("\nArtefactos guardados:")
for fp in ["confusion_matrix_base.png", "confusion_matrix_best.png",
           "cv_results_top10.csv", "threshold_tuning.csv",
           "decision_tree_best.joblib"]:
    if os.path.exists(fp):
        print(" -", fp)

print("\nFIN ✅")


Artefactos guardados:
 - confusion_matrix_base.png
 - confusion_matrix_best.png
 - cv_results_top10.csv
 - threshold_tuning.csv
 - decision_tree_best.joblib

FIN ✅
