In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from tensorflow.keras import layers, models, callbacks
from sklearn.pipeline import Pipeline

In [None]:
# Dataset: Medical Insurance Cost dataset

df = pd.read_csv("/content/insurance.csv")

# Quick sanity check of the raw frame: dimensions and column names.
print("Shape bruto", df.shape)
print("Cols", df.columns.tolist())

# Missing-value count per column (shown as the cell's output).
df.isna().sum()

Shape bruto (1338, 7)
Cols ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']


Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


In [None]:
# Target to predict: smoker (whether the person smokes), encoded as 1 = "yes", 0 = "no".
smoker_encoded = df["smoker"].map({"yes": 1, "no": 0})
if smoker_encoded.isna().any():
    # Any label outside {"yes", "no"} maps to NaN; fail with a clear message
    # instead of the opaque NaN-to-int cast error.
    raise ValueError("Column 'smoker' contains values other than 'yes'/'no'")
y = smoker_encoded.astype(int).values
X = df.drop("smoker", axis=1)

# Stratify on the target so the (imbalanced) smoker ratio is the same in the
# train and test partitions; random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (1070, 6)
X_test shape: (268, 6)
y_train shape: (1070,)
y_test shape: (268,)


In [None]:
# Columns, grouped by dtype: numeric ones are standardised, object (string) ones are one-hot encoded.
num_cols = X_train.select_dtypes(include=np.number).columns
cat_cols = X_train.select_dtypes(include=['object']).columns

numeric_step = ('num', StandardScaler(), num_cols)
# handle_unknown='ignore': categories not seen during fit do not raise at transform time.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='passthrough'  # columns in neither group are forwarded untouched
)

# Fit on the training split only, then apply the fitted transform to the test split.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

print(f"X_train_preprocessed shape: {X_train_preprocessed.shape}")
print(f"X_test_preprocessed shape: {X_test_preprocessed.shape}")

X_train_preprocessed shape: (1070, 10)
X_test_preprocessed shape: (268, 10)


In [None]:
def build_model(input_dim: int) -> tf.keras.Model:
    """Build and compile a small feed-forward binary classifier.

    Architecture: Input(input_dim) -> Dense(16, relu) -> Dropout(0.2)
    -> Dense(8, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

    Args:
        input_dim: Number of features in each input row.

    Returns:
        A Sequential model compiled with the Adam optimizer, binary
        cross-entropy loss, and the "accuracy" and "auc" metrics.
    """
    layer_stack = [
        layers.Input(shape=(input_dim,)),
        # 16 units: chosen because the dataset is small (< 1400 rows) and
        # roughly in line with the number of input features; a power of 2
        # that is sufficient in this context while helping avoid overfitting.
        layers.Dense(16, activation="relu"),
        layers.Dropout(0.20),
        # 8 units to refine the representation.
        layers.Dense(8, activation="relu"),
        # Given the small amount of data, drop 20% of activations so the
        # model learns rather than memorises.
        layers.Dropout(0.20),
        layers.Dense(1, activation="sigmoid"),
    ]
    model = models.Sequential(layer_stack)

    tracked_metrics = ["accuracy", tf.keras.metrics.AUC(name="auc")]
    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=tracked_metrics,
    )
    return model


In [None]:
# Network input width = number of columns produced by the preprocessing step.
model = build_model(input_dim=X_train_preprocessed.shape[1])
model.summary()

In [None]:
# Training callbacks. All three watch validation AUC, which is a higher-is-better metric.
cbs = [
    # Stop after 12 epochs without a val_auc improvement and restore the best weights seen.
    callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=12, restore_best_weights=True),
    # Save the model to disk only when val_auc improves.
    # NOTE(review): the "attrition" file name looks carried over from another notebook;
    # left unchanged so anything that loads this path keeps working.
    callbacks.ModelCheckpoint(filepath="attrition_best.keras", monitor="val_auc", mode="max", save_best_only=True),
    # Halve the learning rate after 6 epochs without a val_auc improvement.
    # mode="max" must be explicit: in Keras releases where mode="auto" infers the direction
    # from the metric name, only names containing "acc" are treated as higher-is-better, so
    # "val_auc" would be minimised and the LR cut while AUC was still rising.
    callbacks.ReduceLROnPlateau(monitor="val_auc", mode="max", factor=0.5, patience=6)
]

In [None]:
# Train for up to 50 epochs, holding out 20% of the training data for validation;
# the callbacks in `cbs` are applied during training.
hist = model.fit(
    x=X_train_preprocessed,
    y=y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=cbs,
    verbose=1,
)

Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.7914 - auc: 0.7605 - loss: 0.4873 - val_accuracy: 0.8271 - val_auc: 0.8769 - val_loss: 0.3951 - learning_rate: 0.0010
Epoch 2/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8189 - auc: 0.8517 - loss: 0.4272 - val_accuracy: 0.8271 - val_auc: 0.9461 - val_loss: 0.3409 - learning_rate: 0.0010
Epoch 3/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8278 - auc: 0.8808 - loss: 0.3869 - val_accuracy: 0.8505 - val_auc: 0.9702 - val_loss: 0.2988 - learning_rate: 0.0010
Epoch 4/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8417 - auc: 0.8853 - loss: 0.3583 - val_accuracy: 0.8692 - val_auc: 0.9765 - val_loss: 0.2647 - learning_rate: 0.0010
Epoch 5/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8255 - auc: 0.9191 - loss: 0.3415 - v

In [None]:
# Held-out evaluation; the three values are unpacked as loss, accuracy and AUC.
loss, accuracy, auc_metric = model.evaluate(
    X_test_preprocessed,
    y_test,
    verbose=0,
)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test AUC: {auc_metric:.4f}")

# Predicted probabilities, then hard 0/1 labels using a 0.5 decision threshold.
y_pred_prob = model.predict(X_test_preprocessed)
y_pred = (y_pred_prob > 0.5).astype(np.int32)

print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# ROC AUC recomputed with scikit-learn from the predicted probabilities.
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"\nAUC: {roc_auc:.4f}")

Test Loss: 0.1340
Test Accuracy: 0.9291
Test AUC: 0.9877
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       214
           1       0.87      0.76      0.81        54

    accuracy                           0.93       268
   macro avg       0.91      0.87      0.88       268
weighted avg       0.93      0.93      0.93       268


Confusion Matrix:
[[208   6]
 [ 13  41]]

AUC: 0.9876
