POSIBILIDAD DE MODELO FINAL

In [None]:
#IMPORTAMOS TODAS LAS LIBRERIAS NECESARIAS
import pandas as pd
import numpy as np
import time
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, accuracy_score

In [None]:
# Load both datasets: the one that contains the "Attrition" target column
# (used for training and evaluation) and the competition one (used later to
# produce the final predictions).
available_path = "/mnt/data/attrition_datasets/train_test/attrition_availabledata_03.csv.gz"
competition_path = "/mnt/data/attrition_datasets/train_test/attrition_competition_03.csv.gz"

available_df = pd.read_csv(available_path)
competition_df = pd.read_csv(competition_path)

In [None]:
# Prepare the data for training: build the feature matrix and target vector,
# group the feature columns by dtype, and split into train/test partitions.

# Columns excluded from the feature set.
irrelevant = ["EmployeeID", "EmployeeCount", "Over18", "StandardHours"]

# Binary target: "Yes" -> 1, "No" -> 0 (any other value becomes NaN).
y = available_df["Attrition"].map({"Yes": 1, "No": 0})

# Features: every column except the target and the excluded ones.
X = available_df.drop(columns=["Attrition", *irrelevant])

# Column groups by dtype; each group gets its own preprocessing pipeline.
num_cols = X.select_dtypes(include=["float64", "int64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

# Stratified split: 2/3 for training, 1/3 held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1 / 3,
    stratify=y,
    random_state=3,
)

In [None]:
# Preprocessing: one pipeline per column group, combined in a ColumnTransformer.

# Numeric columns: fill missing values with the median, then standardize.
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform() from raising
# on categories that were not seen during fit.
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its pipeline. The closing "])" was missing in the
# original cell, which made it a SyntaxError.
preprocessor = ColumnTransformer([
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols),
])

In [None]:
# Fit every candidate classifier inside the shared preprocessing pipeline and
# collect its metrics on the test split.
#
# NOTE(review): candidates are compared on X_test, the same split that is later
# used for the "final" evaluation, so that final score is optimistically
# biased. Consider selecting the model with cross-validation on X_train.
models = {
    "LogReg (default)": LogisticRegression(max_iter=1000, random_state=3),
    "LogReg (L1)": LogisticRegression(penalty="l1", solver="liblinear", max_iter=1000, random_state=3),
    "SVM (linear)": SVC(kernel="linear", random_state=3),
    "SVM (rbf)": SVC(kernel="rbf", random_state=3),
}

results = {}

for name, model in models.items():
    pipe = Pipeline([
        ("pre", preprocessor),
        ("clf", model),
    ])

    # perf_counter() is monotonic and high-resolution, unlike time.time(),
    # which can jump if the system clock is adjusted. Only fit() is timed.
    start = time.perf_counter()
    pipe.fit(X_train, y_train)
    end = time.perf_counter()

    y_pred = pipe.predict(X_test)

    # Pin the label order so the matrix is always 2x2 and unpacks as
    # (tn, fp, fn, tp), even if a class is absent from y_test and y_pred.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

    results[name] = {
        "Balanced Acc": balanced_accuracy_score(y_test, y_pred),
        "Accuracy": accuracy_score(y_test, y_pred),
        "TPR": tp / (tp + fn),  # true positive rate
        "TNR": tn / (tn + fp),  # true negative rate
        "Time (s)": end - start,
    }

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print("\nEvaluaciones preliminares:\n")
print(results_df)

In [None]:
# Step 5: take the model chosen from the preliminary results (L1-penalized
# logistic regression) and wrap it with the shared preprocessor.
best_model = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    max_iter=1000,
    random_state=3,
)
final_pipeline = Pipeline(
    steps=[
        ("pre", preprocessor),
        ("clf", best_model),
    ]
)

# Fit the final pipeline on the whole training partition (X_train / y_train).
final_pipeline.fit(X_train, y_train)


In [None]:
# Final evaluation of the chosen model on the held-out (outer) test split.
y_final_pred = final_pipeline.predict(X_test)

print("\nEvaluación final del modelo elegido (LogReg L1):")
for label, metric in (
    ("Balanced Accuracy:", balanced_accuracy_score),
    ("Accuracy:", accuracy_score),
):
    print(label, metric(y_test, y_final_pred))

# Step 6: persist the fitted pipeline (preprocessing + classifier) to disk.
joblib.dump(final_pipeline, "modelo_final.pkl")

In [None]:

# Predict on the competition dataset and write the predictions to a CSV file.
# The same irrelevant columns that were removed from the training features are
# removed here as well.
X_competition = competition_df.drop(columns=irrelevant)
predictions = final_pipeline.predict(X_competition)

# NOTE(review): the pipeline was trained on the 1/0 encoding of "Attrition",
# so the file contains 1/0 rather than "Yes"/"No" - confirm the expected format.
submission = pd.DataFrame({"Attrition": predictions})
submission.to_csv("predicciones.csv", index=False)

print("\nModelo y predicciones guardadas.")
