1. MÉTODOS BÁSICOS. (KNN Y ARBOLES DE DECISION)

In [4]:
#importamos todas las librerías necesarias. 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import balanced_accuracy_score, accuracy_score, confusion_matrix

In [5]:

# Cargamos todos los datos
available_data_path = f"attrition_availabledata_03.csv"
df = pd.read_csv(available_data_path)

# Eliminamos todas las columnas irrelevantes
drop_columns = ["EmployeeID", "Over18", "StandardHours", "EmployeeCount"]
df = df.drop(columns=drop_columns, errors='ignore')

# Convertimos las variables categóricas a numéricas
categorical_columns = df.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

#Separamos en características (X) y variable objetivo (y)
X = df.drop(columns=["Attrition"])
y = df["Attrition"]

# Dividimos en conjunots: Train (2/3) y Test (1/3)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=3, stratify=y)

# Escalamos e imputamos usando KNN como referencia
scalers = {
    "StandardScaler": StandardScaler(),
    "MinMaxScaler": MinMaxScaler(),
    "RobustScaler": RobustScaler()
}

imputers = {
    "Mean": SimpleImputer(strategy="mean"),
    "Median": SimpleImputer(strategy="median")
}

# Evaluamos con KNN (k=5) para decidir el mejor método
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
best_score = 0
best_scaler = None
best_imputer = None

for scaler_name, scaler in scalers.items():
    for imputer_name, imputer in imputers.items():
        X_train_imputed = imputer.fit_transform(X_train)
        X_train_scaled = scaler.fit_transform(X_train_imputed)
        
        knn = KNeighborsClassifier(n_neighbors=5)
        score = np.mean(cross_val_score(knn, X_train_scaled, y_train, cv=kfold, scoring="balanced_accuracy"))
        
        print(f"Scaler: {scaler_name}, Imputer: {imputer_name}, Score: {score:.4f}")
        
        if score > best_score:
            best_score = score
            best_scaler = scaler
            best_imputer = imputer

# Aplicamos el mejor preprocesamiento encontrado
X_train_imputed = best_imputer.fit_transform(X_train)
X_test_imputed = best_imputer.transform(X_test)
X_train_scaled = best_scaler.fit_transform(X_train_imputed)
X_test_scaled = best_scaler.transform(X_test_imputed)

print(f"\nMejor método de escalado: {best_scaler}, Mejor método de imputación: {best_imputer}")


Scaler: StandardScaler, Imputer: Mean, Score: 0.5922
Scaler: StandardScaler, Imputer: Median, Score: 0.5928
Scaler: MinMaxScaler, Imputer: Mean, Score: 0.5922
Scaler: MinMaxScaler, Imputer: Median, Score: 0.5903
Scaler: RobustScaler, Imputer: Mean, Score: 0.6000
Scaler: RobustScaler, Imputer: Median, Score: 0.5987

Mejor método de escalado: RobustScaler(), Mejor método de imputación: SimpleImputer()


2. Entrenar y evaluar modelos base (KNN y Árboles de Decisión)

In [6]:

# definimos modelos base
knn = KNeighborsClassifier(n_neighbors=5)
tree = DecisionTreeClassifier(random_state=3)

# evaluamos con Cross-Validation
knn_score = np.mean(cross_val_score(knn, X_train_scaled, y_train, cv=kfold, scoring="balanced_accuracy"))
tree_score = np.mean(cross_val_score(tree, X_train_scaled, y_train, cv=kfold, scoring="balanced_accuracy"))

print(f"\nBalanced Accuracy KNN: {knn_score:.4f}")
print(f"Balanced Accuracy Árbol de Decisión: {tree_score:.4f}")



Balanced Accuracy KNN: 0.6000
Balanced Accuracy Árbol de Decisión: 0.8180


3. Optimización de hiperparámetros (HPO) con GridSearchCV

In [7]:

# definimos grids de hiperparámetros
knn_params = {"n_neighbors": [3, 5, 7, 9, 11]}
tree_params = {"max_depth": [3, 5, 10, None], "min_samples_split": [2, 5, 10]}

# optimizamos con GridSearchCV
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, cv=kfold, scoring="balanced_accuracy", n_jobs=-1)
tree_grid = GridSearchCV(DecisionTreeClassifier(random_state=3), tree_params, cv=kfold, scoring="balanced_accuracy", n_jobs=-1)

# entrenamos GridSearchCV
knn_grid.fit(X_train_scaled, y_train)
tree_grid.fit(X_train_scaled, y_train)

# obtenemos mejores parámetros y scores
best_knn = knn_grid.best_estimator_
best_tree = tree_grid.best_estimator_

print(f"\nMejor KNN: {knn_grid.best_params_}, Balanced Accuracy: {knn_grid.best_score_:.4f}")
print(f"Mejor Árbol: {tree_grid.best_params_}, Balanced Accuracy: {tree_grid.best_score_:.4f}")



Mejor KNN: {'n_neighbors': 3}, Balanced Accuracy: 0.6589
Mejor Árbol: {'max_depth': None, 'min_samples_split': 2}, Balanced Accuracy: 0.8180


4.  Evaluación final en el conjunto de Test (Outer)

In [8]:

# predecimos y evaluamos
y_pred_knn = best_knn.predict(X_test_scaled)
y_pred_tree = best_tree.predict(X_test_scaled)

# Métricas para cada modelo
def evaluar_modelo(y_test, y_pred, model_name):
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = conf_matrix.ravel()
    TPR = tp / (tp + fn)
    TNR = tn / (tn + fp)
    
    print(f"\n🔹 {model_name} - Evaluación en Test Set")
    print(f"Balanced Accuracy: {balanced_acc:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"True Positive Rate (TPR): {TPR:.4f}")
    print(f"True Negative Rate (TNR): {TNR:.4f}")
    print(f"Matriz de Confusión:\n{conf_matrix}")

evaluar_modelo(y_test, y_pred_knn, "KNN")
evaluar_modelo(y_test, y_pred_tree, "Árbol de Decisión")


🔹 KNN - Evaluación en Test Set
Balanced Accuracy: 0.6814
Accuracy: 0.8643
True Positive Rate (TPR): 0.4114
True Negative Rate (TNR): 0.9513
Matriz de Confusión:
[[782  40]
 [ 93  65]]

🔹 Árbol de Decisión - Evaluación en Test Set
Balanced Accuracy: 0.8559
Accuracy: 0.9255
True Positive Rate (TPR): 0.7532
True Negative Rate (TNR): 0.9586
Matriz de Confusión:
[[788  34]
 [ 39 119]]
