In [11]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.impute import SimpleImputer # Import SimpleImputer

from tensorflow.keras import layers, models, callbacks
from sklearn.pipeline import Pipeline

In [12]:
SEED = 43
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [13]:
URL = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(URL)


In [14]:
print("Shape bruto: ", df.shape)
print("Cols: ", list(df.columns))


Shape bruto:  (891, 12)
Cols:  ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [15]:
y = df["Survived"].astype(int).values

In [16]:
X = df[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]].copy()

In [17]:
X["FamilySize"] = X["SibSp"].fillna(0) + X["Parch"].fillna(0) + 1
X["IsAlone"] = (X["FamilySize"] == 1).astype(int)

In [18]:
num_cols = ["Age", "SibSp", "Parch", "Fare", "FamilySize"]
cat_cols = ["Pclass", "Sex", "Embarked"]



In [19]:
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy= "median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy= "most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
]
)

In [20]:
from sklearn.pipeline import Pipeline
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ], remainder = "drop"
)


In [21]:
X_train_df, X_test_df, y_train, y_test = train_test_split(
    X,y, test_size=0.2, random_state=SEED, stratify=y
)


In [22]:
X_train = preprocess.fit_transform(X_train_df)
X_test = preprocess.transform(X_test_df)

X_train = X_train.astype("float32")
X_test = X_test.astype("float32")

In [23]:
def build_model(input_dim: int) -> tf.keras.Model:
    model = models.Sequential(
        [layers.Input(shape=(input_dim,)),
        layers.Dense(32, activation="relu"),
        layers.Dropout(0.15),
        layers.Dense(32, activation="relu"),
        layers.Dropout(0.15),
        layers.Dense(1, activation="sigmoid")

         ]

    )
    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )
    return model


model = build_model(X_train.shape[1])
model.summary()

In [24]:
cbs = [
    callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=12, restore_best_weights=True),
    callbacks.ModelCheckpoint("titanic_best_keras.keras", monitor="val_auc", mode="max", save_best_only=True),
    callbacks.ReduceLROnPlateau(monitor="val_loaa", factor=0.5, patience=6)
]

In [25]:
hist = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=200,
)

Epoch 1/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.6540 - loss: 0.6838 - val_accuracy: 0.5455 - val_loss: 0.6910
Epoch 2/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6459 - loss: 0.6371 - val_accuracy: 0.5594 - val_loss: 0.6338
Epoch 3/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6969 - loss: 0.5775 - val_accuracy: 0.6154 - val_loss: 0.5995
Epoch 4/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6952 - loss: 0.5698 - val_accuracy: 0.6853 - val_loss: 0.5723
Epoch 5/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7512 - loss: 0.5347 - val_accuracy: 0.7203 - val_loss: 0.5492
Epoch 6/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7862 - loss: 0.5289 - val_accuracy: 0.7343 - val_loss: 0.5308
Epoch 7/200
[1m18/18[0m [32m

In [26]:


y_proba = model.predict(X_test).ravel()
y_pred = (y_proba >= 0.5).astype(int)

print("\nMatriz de confusion:\n", confusion_matrix(y_test, y_pred))
print("\nReporte de clasificacion:\n", classification_report(y_test, y_pred, digits=4))
print("\nAUC: ", roc_auc_score(y_test, y_proba))

def predict_one(sample: dict) -> float:
  """
  Recibe un diccionario 'crudo' con las llaves esperadas:
  Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
  (FamilySize e IsAlone se calculan internamente).
  Devuelve probabilidad de supervivencia.
  """

  s = pd.DataFrame([sample])

  s["FamilySize"] = s["SibSp"].fillna(0) + s["Parch"].fillna(0) + 1
  s["IsAlone"] = (s["FamilySize"] == 1).astype(int)

  s_proc = preprocess.transform(s[X.columns])
  s_proc = s_proc.astype("float32")
  proba = model.predict(s_proc).item()
  return proba

  sample = {
      "Pclass": 3, #1, 2 o3
      "Sex": "male",#male / female
      "Age": 25,
      "SibSp": 1,
      "Parch": 0,
      "Fare": 7.25,
      "Embarked": "S" #S, C o Q
  }
  proba = predict_one(sample)
  print(f"Probabilidad de supervivencia: {proba:.4f}")
  print("Sobrevive" if proba >= 0.5 else "No sobrevive")


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step

Matriz de confusion:
 [[103   7]
 [ 23  46]]

Reporte de clasificacion:
               precision    recall  f1-score   support

           0     0.8175    0.9364    0.8729       110
           1     0.8679    0.6667    0.7541        69

    accuracy                         0.8324       179
   macro avg     0.8427    0.8015    0.8135       179
weighted avg     0.8369    0.8324    0.8271       179


AUC:  0.8587615283267457
