VERSIÓN PRELIMINAR DE LA TAREA OPCIONAL DE CONCEPTO ABIERTO - HEMOS SELECCIONADO SMOTE PARA TRATAR EL DESBALANCEO

In [None]:
# Import every library the notebook uses in a single place.
import time
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, accuracy_score
from imblearn.over_sampling import SMOTE
# The alias must be spelled `ImbPipeline`: that is the name the SMOTE section
# uses when it builds its pipeline (it was previously misspelled `ImbPipelin`,
# which made that section fail with a NameError).
from imblearn.pipeline import Pipeline as ImbPipeline

# Directory holding the input files; change it here to run the notebook elsewhere.
DATA_DIR = Path("/mnt/data/attrition_datasets/train_test")

# Load the data: the labelled set used for training/evaluation and the
# competition set for which predictions are produced later on.
available_df = pd.read_csv(DATA_DIR / "attrition_availabledata_03.csv.gz")
competition_df = pd.read_csv(DATA_DIR / "attrition_competition_03.csv.gz")

El siguiente paso es llevar a cabo la preparación de los datos para el procesado. 

In [None]:
# Separate the features from the target; the target is encoded as "Yes" -> 1, "No" -> 0.
X = available_df.drop("Attrition", axis=1)
y = available_df["Attrition"].map({"Yes": 1, "No": 0})

# `map` silently yields NaN for any label other than "Yes"/"No" (including
# missing values). Fail loudly here instead of letting NaN targets reach the
# split and the models.
if y.isna().any():
    unexpected = available_df.loc[y.isna(), "Attrition"].unique().tolist()
    raise ValueError(f"Unexpected values in 'Attrition': {unexpected}")

# Columns excluded from modelling. Dropping them first means the column lists
# below never contain them, so no second filtering pass is needed.
irrelevant = ["EmployeeID", "EmployeeCount", "Over18", "StandardHours"]
X = X.drop(columns=irrelevant)

# Identify numeric and categorical columns by dtype; these lists drive the
# ColumnTransformer defined in the preprocessing section.
num_cols = X.select_dtypes(include=["float64", "int64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

# Finally, split into training and test data (one third held out), stratified
# on the target so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=3, stratify=y)


Continuamos llevando a cabo el preprocesado usando pipelines

In [None]:
# Numeric columns: fill missing values with the median, then standardise.
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time instead of raising.
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Route each group of columns to its transformer.
# The closing `])` was missing, which made this cell a SyntaxError.
preprocessor = ColumnTransformer([
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols)
])

Generamos los modelos

In [None]:
# Candidate classifiers, keyed by the label shown in the results table.
models = {
    "LogReg (default)": LogisticRegression(max_iter=1000, random_state=3),
    "LogReg (L1)": LogisticRegression(penalty="l1", solver="liblinear", max_iter=1000, random_state=3),
    "SVM (linear)": SVC(kernel="linear", random_state=3),
    "SVM (rbf)": SVC(kernel="rbf", random_state=3)
}

# One entry per model: metric name -> value.
results = {}

for name, model in models.items():
    pipe = Pipeline([
        ("pre", preprocessor),
        ("clf", model)
    ])

    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    pipe.fit(X_train, y_train)
    end = time.perf_counter()

    y_pred = pipe.predict(X_test)

    # Pin the label order so the matrix is always 2x2 (tn, fp, fn, tp), even if
    # one class happens to be absent from both y_test and y_pred.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

    results[name] = {
        "Balanced Acc": balanced_accuracy_score(y_test, y_pred),
        "Accuracy": accuracy_score(y_test, y_pred),
        "TPR": tp / (tp + fn),  # true positive rate (sensitivity)
        "TNR": tn / (tn + fp),  # true negative rate (specificity)
        "Time (s)": end - start  # training time only; prediction is not timed
    }

# Transpose so each row is a model and each column a metric.
results_df = pd.DataFrame(results).T
print("\nEvaluaciones preliminares:\n")
print(results_df)

Seleccionamos el mejor modelo y construimos el modelo final

In [None]:
# Step 5: choose the best model (here: L1-penalised logistic regression).
best_model = LogisticRegression(penalty="l1", solver="liblinear", max_iter=1000, random_state=3)
final_pipeline = Pipeline([
    ("pre", preprocessor),
    ("clf", best_model)
])

# Outer evaluation: fit on the training split only and score on the held-out
# test split, so the reported metrics come from data the model has not seen.
final_pipeline.fit(X_train, y_train)
y_final_pred = final_pipeline.predict(X_test)
print("\nEvaluación final del modelo elegido (LogReg L1):")
print("Balanced Accuracy:", balanced_accuracy_score(y_test, y_final_pred))
print("Accuracy:", accuracy_score(y_test, y_final_pred))

# Final model: refit on all available data (train + test). This must happen
# AFTER the outer evaluation above; previously the code only ever fitted on the
# training split, although the comment stated that train and test were used.
final_pipeline.fit(X, y)

# Save the final model.
joblib.dump(final_pipeline, "modelo_final.pkl")

A continuación, predecimos el conjunto de competición

# Remove the same excluded columns from the competition data as from the
# training features.
X_competition = competition_df.drop(irrelevant, axis=1)

# Predict with the final pipeline and write the result as a single-column CSV.
# NOTE(review): predictions are written with the 0/1 encoding used for training,
# not the original "Yes"/"No" labels; confirm which format is expected.
predictions = final_pipeline.predict(X_competition)
submission = pd.DataFrame({"Attrition": predictions})
submission.to_csv("predicciones.csv", index=False)

print("\nModelo y predicciones guardadas.")

 A PARTIR DE AQUÍ, TAREA DE ELECCIÓN ABIERTA (PUNTO 5)
 HEMOS ELEGIDO APLICAR SMOTE PARA TRATAR EL DESBALANCEO ENTRE CLASES

print("\nAplicando SMOTE para tratar el desbalanceo:")

# Same classifier configuration as the chosen model, now trained on
# SMOTE-resampled data.
smote_model = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    max_iter=1000,
    random_state=3,
)

# Steps run in order: preprocessing, SMOTE oversampling, classifier.
smote_steps = [
    ("pre", preprocessor),
    ("smote", SMOTE(random_state=3)),
    ("clf", smote_model),
]
smote_pipeline = ImbPipeline(smote_steps)

# Fit on the training split and predict the held-out split.
smote_pipeline.fit(X_train, y_train)
y_smote_pred = smote_pipeline.predict(X_test)

# Unpack the flattened confusion matrix into its four cells.
smote_cm = confusion_matrix(y_test, y_smote_pred)
tn, fp, fn, tp = smote_cm.ravel()

# Compute every metric first, then print them in a fixed order.
smote_metrics = {
    "Balanced Accuracy:": balanced_accuracy_score(y_test, y_smote_pred),
    "Accuracy:": accuracy_score(y_test, y_smote_pred),
    "TPR (Sensibilidad):": tp / (tp + fn),
    "TNR (Especificidad):": tn / (tn + fp),
}
for metric_label, metric_value in smote_metrics.items():
    print(metric_label, metric_value)