In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from tensorflow.keras import layers, models, callbacks
from sklearn.pipeline import Pipeline

In [None]:
# Load the raw HR employee-attrition CSV into a DataFrame.
# NOTE(review): the "/content/..." location looks like a Colab upload path -- confirm.
csv_path = "/content/WA_Fn-UseC_-HR-Employee-Attrition.csv"
df = pd.read_csv(csv_path)

In [None]:
# Quick overview of the raw frame: its (rows, columns) shape, then the column names.
raw_shape = df.shape
column_names = df.columns.tolist()
print("Shape bruto", raw_shape)
print("Cols", column_names)

Shape bruto (1470, 35)
Cols ['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']


In [None]:
# Missing values per column; the bare expression is rendered as the cell output.
df.isna().sum()

Unnamed: 0,0
Age,0
Attrition,0
BusinessTravel,0
DailyRate,0
Department,0
DistanceFromHome,0
Education,0
EducationField,0
EmployeeCount,0
EmployeeNumber,0


In [None]:
# Encode the target as integers: "Yes" -> 1, "No" -> 0.
attrition_codes = {"Yes": 1, "No": 0}
y = df["Attrition"].map(attrition_codes).astype(int).to_numpy()

# Feature frame: everything except the target and four columns left out of modelling
# (presumably identifier / constant-valued fields -- confirm against the data).
excluded_cols = ["Attrition", "EmployeeNumber", "Over18", "EmployeeCount", "StandardHours"]
X = df.drop(columns=excluded_cols)


In [None]:
# Split the feature columns by dtype so each group gets its own preprocessing.
# The dtypes are passed as a single `include` list on purpose: the second
# positional parameter of `select_dtypes` is `exclude`, so writing
# `select_dtypes("int64", "float64")` would drop float64 columns instead of
# selecting them.
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

In [None]:
# Show which columns ended up in each preprocessing group.
for group_label, group_cols in (("Númericas: ", num_cols), ("Categóricas: ", cat_cols)):
    print(group_label, group_cols)

Númericas:  ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
Categóricas:  ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']


In [None]:
# Numeric features: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories not seen during fit are ignored
# at transform time (handle_unknown="ignore").
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own pipeline.
column_routes = [
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
]
preprocess = ColumnTransformer(transformers=column_routes)

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio
# the same in both partitions, and the fixed random_state makes the split repeatable.
X_train_df, X_test_df, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Fit the preprocessing on the training rows only, then apply the fitted
# transformer to the test rows so no test statistics leak into training.
X_train = preprocess.fit_transform(X_train_df)
X_test = preprocess.transform(X_test_df)

# Cast both matrices to float32 for the network.
X_train, X_test = X_train.astype("float32"), X_test.astype("float32")

In [None]:
# Number of columns in the preprocessed matrix, i.e. the network's input width.
n_features = X_train.shape[1]
print("input dims: ", n_features)

input dims:  51


In [None]:
def build_model(input_dim: int) -> tf.keras.Model:
  """Create and compile the feed-forward binary classifier.

  Layer stack: Input(input_dim) -> Dense(64, relu) -> Dropout(0.20)
  -> Dense(32, relu) -> Dense(1, sigmoid).

  Args:
    input_dim: Number of features in each input row.

  Returns:
    A Sequential model compiled with the "adam" optimizer, binary
    cross-entropy loss, and accuracy plus an AUC metric named "auc".
  """
  layer_stack = [
      layers.Input(shape=(input_dim,)),
      layers.Dense(64, activation="relu"),
      layers.Dropout(0.20),
      layers.Dense(32, activation="relu"),
      # Single sigmoid unit: the output is a value in [0, 1] per row.
      layers.Dense(1, activation="sigmoid"),
  ]
  net = models.Sequential(layer_stack)

  # The metric name "auc" is what makes "val_auc" available during validation.
  tracked_metrics = ["accuracy", tf.keras.metrics.AUC(name="auc")]
  net.compile(optimizer="adam", loss="binary_crossentropy", metrics=tracked_metrics)
  return net


In [None]:
# Seed Python, NumPy and TensorFlow before the model is created so that weight
# initialisation, dropout masks and batch shuffling repeat from run to run
# (same seed value as the train/test split). Exact repeatability can still
# depend on hardware and library versions.
tf.keras.utils.set_random_seed(42)

# Input width is taken from the preprocessed training matrix.
model = build_model(X_train.shape[1])
model.summary()

In [None]:
# Training callbacks, all driven by validation AUC (higher is better).
# mode="max" is set explicitly on every callback: with the default mode="auto",
# ReduceLROnPlateau treats any metric whose name does not contain "acc" as
# something to minimise, which would cut the learning rate while AUC is
# still improving.
cbs = [
    # Stop after 12 epochs without a better val_auc and roll back to the best weights.
    callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=12, restore_best_weights=True),
    # Keep only the best-scoring model on disk.
    callbacks.ModelCheckpoint(filepath="attrition_best.keras", monitor="val_auc", mode="max", save_best_only=True),
    # Halve the learning rate after 6 epochs without a better val_auc.
    callbacks.ReduceLROnPlateau(monitor="val_auc", mode="max", factor=0.5, patience=6)
]

In [None]:
# Train on the preprocessed matrix. validation_split makes Keras hold out the
# last 20% of the rows, which produces the val_* metrics the callbacks monitor.
fit_options = dict(
    validation_split=0.2,
    batch_size=32,
    epochs=200,
    callbacks=cbs,
    verbose=1,
)
hist = model.fit(X_train, y_train, **fit_options)

Epoch 1/200
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.8242 - auc: 0.5314 - loss: 0.4894 - val_accuracy: 0.8305 - val_auc: 0.7383 - val_loss: 0.4120 - learning_rate: 0.0010
Epoch 2/200
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8440 - auc: 0.7115 - loss: 0.3943 - val_accuracy: 0.8305 - val_auc: 0.8078 - val_loss: 0.3842 - learning_rate: 0.0010
Epoch 3/200
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8570 - auc: 0.7435 - loss: 0.3635 - val_accuracy: 0.8305 - val_auc: 0.8407 - val_loss: 0.3653 - learning_rate: 0.0010
Epoch 4/200
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8478 - auc: 0.8217 - loss: 0.3524 - val_accuracy: 0.8390 - val_auc: 0.8505 - val_loss: 0.3514 - learning_rate: 0.0010
Epoch 5/200
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8587 - auc: 0.8460 - loss: 0.326

In [None]:
# Score the held-out test set. This cell needs `model`, `X_test` and `y_test`
# from the cells above to exist in the kernel; run those cells first.
decision_threshold = 0.5
y_proba = model.predict(X_test).ravel()
# Predicted class is 1 when the predicted probability reaches the threshold.
y_pred = (y_proba >= decision_threshold).astype(int)

test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred, digits=4)
# AUC is computed from the raw probabilities, not the thresholded labels.
test_auc = roc_auc_score(y_test, y_proba)

print("\nMatriz de confusión:\n", test_confusion)
print("\nReporte de clasificación:\n", test_report)
print("\nAUC: ", test_auc)

NameError: name 'model' is not defined