5. AVANZADOS (MÉTODOS BÁSICOS Y SVMs)

In [1]:
#Primero importamos todas las librerías necesarias para esta sección. 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, accuracy_score
import time

In [2]:
# Load the dataset from the CSV file located in the working directory.
df_available = pd.read_csv(filepath_or_buffer="attrition_availabledata_03.csv")

In [4]:
# Split the raw table into the feature matrix X and the binary target y
# (1 = "Yes", 0 = "No").
X = df_available.drop(columns="Attrition")
y = df_available["Attrition"].map({"Yes": 1, "No": 0})

# Series.map leaves NaN for every value that is not a key of the mapping, so a
# missing or unexpected label would otherwise only surface later as an obscure
# error in the split or in model fitting. Fail here with a clear message instead.
if y.isna().any():
    unexpected = df_available.loc[y.isna(), "Attrition"].unique().tolist()
    raise ValueError(f"Unexpected values in 'Attrition' column: {unexpected}")

In [6]:
# Identify which feature columns are numeric and which are categorical.
# "number" selects every numeric dtype (int32, float32, unsigned, ...) rather
# than only float64/int64, so a column is not silently left out of both lists
# just because it was loaded with a different width.
num_cols = X.select_dtypes(include="number").columns.tolist()
# NOTE(review): only object-dtype columns are treated as categorical; columns
# stored with another non-numeric dtype would not be picked up here -- confirm
# this matches how the data is loaded.
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

print(f"Num cols: {num_cols}")
print(f"Cat cols: {cat_cols}")

Num cols: ['hrs', 'absences', 'JobInvolvement', 'PerformanceRating', 'EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance', 'Age', 'DistanceFromHome', 'Education', 'EmployeeCount', 'EmployeeID', 'JobLevel', 'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
Cat cols: ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18']


In [None]:
# Remove the columns that are not used as predictors, both from the column
# lists consumed by the preprocessor and from the feature matrix itself.
irrelevant = ["EmployeeID", "EmployeeCount", "Over18", "StandardHours"]
num_cols = [col for col in num_cols if col not in irrelevant]
cat_cols = [col for col in cat_cols if col not in irrelevant]
# errors="ignore" keeps this cell idempotent: X is reassigned in place, so a
# second execution would otherwise raise KeyError because the columns are
# already gone. Trade-off: a misspelled name in `irrelevant` is not reported.
X = X.drop(columns=irrelevant, errors="ignore")

In [None]:
# Hold out one third of the rows for testing. The split is stratified on the
# target and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    stratify=y,
    random_state=3,
)

In [None]:
# Preprocessing: one sub-pipeline per column type, combined column-wise.

# Numeric columns: impute missing values with the median, then standardise.
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: impute with the most frequent value, then one-hot
# encode. handle_unknown="ignore" avoids an error at predict time when a
# category was not seen during fitting.
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns to its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols),
])

In [None]:
# Fit each candidate classifier on the training split and evaluate it on the
# held-out test split. Every model is wrapped in a pipeline with the shared
# preprocessor, so preprocessing is fitted on X_train only.
models = {
    "LogReg (default)": LogisticRegression(max_iter=1000, random_state=3),
    "LogReg (L1)": LogisticRegression(penalty="l1", solver="liblinear", max_iter=1000, random_state=3),
    "SVM (linear)": SVC(kernel="linear", random_state=3),
    "SVM (rbf)": SVC(kernel="rbf", random_state=3)
}

# Maps model name -> dict of evaluation metrics plus training time.
results = {}

for name, model in models.items():
    pipe = Pipeline([
        ("pre", preprocessor),
        ("clf", model)
    ])
    # perf_counter() is the clock intended for measuring elapsed intervals;
    # time.time() follows the system clock, which can be adjusted mid-run.
    start = time.perf_counter()
    pipe.fit(X_train, y_train)
    fit_seconds = time.perf_counter() - start
    y_pred = pipe.predict(X_test)
    # labels=[0, 1] pins the matrix to 2x2 in (negative, positive) order, so
    # the four-way unpacking stays valid even if only one class shows up in
    # y_test / y_pred.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    results[name] = {
        "Balanced Acc": balanced_accuracy_score(y_test, y_pred),
        "Accuracy": accuracy_score(y_test, y_pred),
        "TPR": tp / (tp + fn),  # true positive rate (recall of class 1)
        "TNR": tn / (tn + fp),  # true negative rate (recall of class 0)
        "Time (s)": fit_seconds  # training time only; prediction is not timed
    }

In [None]:
# Show the results as a table with one row per model and one column per metric.
results_df = pd.DataFrame(results).transpose()
print(results_df)