In [1]:
import os
import urllib.request

TITANIC_PATH = os.path.join("datasets", "titanic")
DOWNLOAD_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/titanic/"

def fetch_titanic_data(url=DOWNLOAD_URL, path=TITANIC_PATH):
    if not os.path.isdir(path):
        os.makedirs(path)
    for filename in ("train.csv", "test.csv"):
        filepath = os.path.join(path, filename)
        if not os.path.isfile(filepath):
            print("Downloading", filename)
            urllib.request.urlretrieve(url + filename, filepath)

fetch_titanic_data()

import pandas as pd

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")


Downloading train.csv
Downloading test.csv


The attributes have the following meaning:
* **PassengerId**: a unique identifier for each passenger
* **Survived**: that's the target, 0 means the passenger did not survive, while 1 means he/she survived.
* **Pclass**: passenger class.
* **Name**, **Sex**, **Age**: self-explanatory
* **SibSp**: how many siblings & spouses of the passenger aboard the Titanic.
* **Parch**: how many children & parents of the passenger aboard the Titanic.
* **Ticket**: ticket id
* **Fare**: price paid (in pounds)
* **Cabin**: passenger's cabin number
* **Embarked**: where the passenger embarked the Titanic


In [2]:
#set the `PassengerId` column as the index column
train_data = train_data.set_index("PassengerId")
test_data = test_data.set_index("PassengerId")


In [5]:
#Let'sbuild the pipeline for the categorical attributes
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("cat_encoder", OneHotEncoder(sparse_output=False)),
])

In [6]:
# Let's join the numerical and categorical pipelines:
from sklearn.compose import ColumnTransformer

num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attribs = ["Pclass", "Sex", "Embarked"]

preprocess_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

In [7]:
#Preprocessing pipeline that takes the raw data and outputs numerical input features that we can feed to any Machine Learning model we want:
X_train = preprocess_pipeline.fit_transform(
    train_data[num_attribs + cat_attribs])
X_train

array([[-0.56573582,  0.43279337, -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.6638609 ,  0.43279337, -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       [-0.25833664, -0.4745452 , -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.10463705,  0.43279337,  2.00893337, ...,  0.        ,
         0.        ,  1.        ],
       [-0.25833664, -0.4745452 , -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.20276213, -0.4745452 , -0.47367361, ...,  0.        ,
         1.        ,  0.        ]])

In [8]:
#Labeling
y_train = train_data["Survived"]

X_test = preprocess_pipeline.transform(test_data[num_attribs + cat_attribs])


In [9]:
!pip install tensorflow



In [14]:
print(X_train.shape)

(891, 12)


In [18]:
#  Preprocessing on the full labelled set
X_full = preprocess_pipeline.fit_transform(
    train_data[num_attribs + cat_attribs]
)
y_full = train_data["Survived"].values

# Hold-out 20 % of the rows for validation
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full,
    test_size=0.20,          # 80 / 20 split
    stratify=y_full,         # keep class balance
    random_state=42
)


In [19]:
# Build and compile the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(100, activation='relu'),
    Dense(1,   activation='sigmoid')
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [20]:
#Training, while Keras tracks validation accuracy each epoch
history = model.fit(
    X_train, y_train,
    epochs=40,
    batch_size=32,
    validation_data=(X_val, y_val),
    verbose=2
)

#Final score on the held-out set
val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
print(f"\nValidation accuracy: {val_acc:.4f}")

Epoch 1/40
23/23 - 1s - 58ms/step - accuracy: 0.6433 - loss: 0.6570 - val_accuracy: 0.7095 - val_loss: 0.6048
Epoch 2/40
23/23 - 0s - 20ms/step - accuracy: 0.7542 - loss: 0.5607 - val_accuracy: 0.7430 - val_loss: 0.5520
Epoch 3/40
23/23 - 0s - 13ms/step - accuracy: 0.7949 - loss: 0.5111 - val_accuracy: 0.7709 - val_loss: 0.5161
Epoch 4/40
23/23 - 0s - 7ms/step - accuracy: 0.8062 - loss: 0.4770 - val_accuracy: 0.7933 - val_loss: 0.4919
Epoch 5/40
23/23 - 0s - 6ms/step - accuracy: 0.8301 - loss: 0.4544 - val_accuracy: 0.7933 - val_loss: 0.4762
Epoch 6/40
23/23 - 0s - 7ms/step - accuracy: 0.8272 - loss: 0.4402 - val_accuracy: 0.8045 - val_loss: 0.4634
Epoch 7/40
23/23 - 0s - 6ms/step - accuracy: 0.8230 - loss: 0.4309 - val_accuracy: 0.8101 - val_loss: 0.4571
Epoch 8/40
23/23 - 0s - 6ms/step - accuracy: 0.8272 - loss: 0.4240 - val_accuracy: 0.8156 - val_loss: 0.4533
Epoch 9/40
23/23 - 0s - 12ms/step - accuracy: 0.8216 - loss: 0.4193 - val_accuracy: 0.8268 - val_loss: 0.4488
Epoch 10/40
23/

In [21]:
# Detailed metrics
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

y_val_pred = (model.predict(X_val, verbose=0) >= 0.5).astype(int).ravel()
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred, digits=3))


[[102   8]
 [ 29  40]]
              precision    recall  f1-score   support

           0      0.779     0.927     0.846       110
           1      0.833     0.580     0.684        69

    accuracy                          0.793       179
   macro avg      0.806     0.753     0.765       179
weighted avg      0.800     0.793     0.784       179

