# Titanic
## Predict survival on the Titanic using Gradient Boost


# Load data into Pandas DataFrame

In [None]:
import pandas as pd

# Read the Titanic CSV files into DataFrames
train = pd.read_csv("titanic/train.csv")
test = pd.read_csv("titanic/test.csv")

# First look at the training DataFrame (the cell previously displayed `test`,
# contradicting this comment and the train-focused checks that follow)
train

# Check for `null` values

In [None]:
# Count the missing values in each column of the training set
train.isnull().sum()

# Null values found in `Age: 177`, `Cabin: 687`, `Embarked: 2`

# Check data types

In [None]:
# Print the column names, non-null counts and dtypes of the training set
train.info()

# Check distribution of `Age` before imputing null values

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (with a KDE overlay) of the raw Age column, missing values included
age_values = train["Age"]
plt.figure(figsize=(10, 6))
sns.histplot(age_values, bins=30, kde=True)
plt.xlabel('Age')
plt.ylabel('Count')
plt.title("Age distribution")
plt.show()

# Impute null values for `Age` variable where distribution is right-skewed

### When a numerical variable has a right-skewed distribution, it is better practice to impute missing values with the `median`. The `mean` does not represent the central tendency of such a distribution well, because it is pulled toward the long right tail.
### This time, however, I will use MICE to impute those values, because this technique works especially well for non-normally distributed or skewed data.

In [None]:
import miceforest as mf

# Encode Sex as 0/1 so it can be used as a numeric predictor during imputation.
# NOTE: this mapping is not idempotent -- re-running the cell on already
# encoded data turns every value into NaN, so reload the CSV files first.
mapping = {'male': 1, 'female': 0}
train["Sex"] = train["Sex"].map(mapping)
test["Sex"] = test["Sex"].map(mapping)

# Fixed seed so that the imputed values (and everything trained on them) are
# reproducible between notebook runs.
imputation_seed = 9

# Create kernel for train dataset
# NOTE(review): `Survived` is a predictor here but is not available to the
# test kernel below, so train and test values are imputed by different models.
kernel = mf.ImputationKernel(
    data=train[["Age", "Survived", "Fare", "Sex"]],
    datasets=1,
    save_all_iterations=True,
    random_state=imputation_seed,
)

# Launch the MICE imputation for train dataset (10 iterations)
kernel.mice(10)

# Kernel for test dataset
kernel_test = mf.ImputationKernel(
    data=test[["Age", "Fare", "Sex"]],
    datasets=1,
    save_all_iterations=True,
    random_state=imputation_seed,
)

# Launch the kernel_test (10 iterations)
kernel_test.mice(10)

# Get imputed data for test and train datasets (dataset index 0, the only one)
imputed_data_train = kernel.complete_data(0)
imputed_data_test = kernel_test.complete_data(0)

# Replace old columns with new columns
train[["Age", "Survived", "Fare", "Sex"]] = imputed_data_train
test[["Age", "Fare", "Sex"]] = imputed_data_test

# Two vertically stacked panels: imputed Age for train (top) and test (bottom)
_, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 12))

panels = (
    (train, "Train Dataset. Imputed Age"),
    (test, "Test Dataset. Imputed Age"),
)
for axis, (frame, title) in zip(axes, panels):
    sns.histplot(frame['Age'], bins=30, kde=True, ax=axis)
    axis.set_title(title)
    axis.set_xlabel("Age")
    axis.set_ylabel("Count")

plt.tight_layout()
plt.show()

In [None]:
# Re-count the missing values per column after the imputation step
train.isnull().sum()

# Cabin 
### Which passengers had assigned cabins and which did not?  

In [None]:
# Label passengers without a recorded cabin. The result is assigned back to the
# column instead of calling fillna(inplace=True) on the selected Series: the
# chained in-place form emits a FutureWarning in pandas 2.x and does not modify
# `train` once Copy-on-Write is enabled.
train['Cabin'] = train['Cabin'].fillna('Missing')

# Collapse the cabin identifiers into a two-level indicator
cabin = train['Cabin'].apply(lambda x: 'Has Cabin' if x != 'Missing' else 'Missing')

# Passenger counts per ticket class (rows) and cabin indicator (columns)
contingency_table = pd.crosstab(train['Pclass'], cabin)
_, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 6))

# Same contingency table drawn twice: stacked bars on top, grouped bars below
layouts = (
    (True, 'Stacked Bar Plot of Pclass vs Cabin'),
    (False, 'Grouped Bar Plot of Pclass vs Cabin'),
)
for axis, (is_stacked, title) in zip(axes, layouts):
    contingency_table.plot(kind='bar', stacked=is_stacked, colormap='viridis', ax=axis)
    axis.set_title(title)
    axis.set_xlabel('Pclass')
    axis.set_ylabel('Count')

plt.tight_layout()
plt.show()


# There are a lot of missing values for Third and Second class tickets, so it's better to just drop the `Cabin` column.


In [None]:
# Remove the Cabin column from both datasets, modifying them in place
for frame in (train, test):
    frame.drop(columns=['Cabin'], inplace=True)

# Impute `Embarked` variable with mode value

In [None]:
# Impute with the mode (most frequent) value. The mode is computed on the
# training set only and reused for the test set, so both datasets receive the
# same fill value and no statistic is derived from the test data.
embarked_mode = train['Embarked'].mode()[0]

# Assign the result back rather than using fillna(inplace=True) on the selected
# column: the chained in-place form warns in pandas 2.x and has no effect on the
# DataFrame once Copy-on-Write is enabled.
train['Embarked'] = train['Embarked'].fillna(embarked_mode)
test['Embarked'] = test['Embarked'].fillna(embarked_mode)

# Drop `PassengerId`

In [None]:
# PassengerId is removed from the training data only; `test` keeps the column
# because the submission step reads test["PassengerId"].
del train['PassengerId']

# Tokenize `Name` and `Ticket`


In [None]:
def tokenize(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a cleaned ``Name`` and a split ``Ticket``.

    The input frame is not modified. In the returned copy:

    * ``Name`` has surrounding punctuation stripped from every word, with the
      words still separated by single spaces;
    * ``Ticket_number`` holds the last space-separated part of ``Ticket``;
    * ``Ticket_item`` holds everything before that last part, or ``"NONE"``
      when the ticket consists of a single part.

    Args:
        df: DataFrame with string columns ``Name`` and ``Ticket``.

    Returns:
        A new DataFrame with ``Name`` replaced and the two ticket columns added.
    """
    df = df.copy()

    def name(x: str) -> str:
        """Strip leading/trailing punctuation from each space-separated word."""
        # Words are re-joined with a space (not an empty string) so that the
        # name can still be split into separate tokens afterwards.
        return " ".join(v.strip(",()[].\"'") for v in x.split(" "))

    def ticket_number(x: str) -> str:
        """Return the last space-separated part of the ticket string."""
        return x.split(" ")[-1]

    def ticket_item(x: str) -> str:
        """Return the ticket prefix (all parts but the last), or "NONE"."""
        item = x.split(" ")
        if len(item) == 1:
            # A single part means the ticket has a number but no prefix.
            return "NONE"
        return " ".join(item[0:-1])

    df["Name"] = df["Name"].apply(name)
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)
    return df


# Run the same tokenization over the train and test frames
preprocessed_train, preprocessed_test = map(tokenize, (train, test))

# Create input features list

In [None]:
# Every preprocessed column is a model input except the raw ticket string
# (replaced by Ticket_number / Ticket_item) and the label.
excluded_columns = ("Ticket", "Survived")
input_features = preprocessed_train.columns.tolist()
for column in excluded_columns:
    input_features.remove(column)
input_features

# Convert Pandas DataFrame to TensorFlow Dataset


In [None]:
import tensorflow as tf

# List the physical devices TensorFlow can see. This call only prints them; it
# does not configure any device or plugin.
print(tf.config.list_physical_devices())
import tensorflow_decision_forests as tfdf


def tokenize_name(features, labels=None):
    """Replace the "Name" feature with its tf.strings.split tokens.

    The ``features`` mapping is updated in place and returned together with
    ``labels``, which is passed through unchanged (``None`` when absent).
    """
    name_tokens = tf.strings.split(features["Name"])
    features["Name"] = name_tokens
    return features, labels

# Convert Pandas DataFrame into tensorflow dataset

In [None]:
# Labelled training dataset, with the Name feature tokenized
train_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_train, label="Survived")
tf_train = train_dataset.map(tokenize_name)

# Unlabelled test dataset, tokenized the same way
test_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_test)
tf_test = test_dataset.map(tokenize_name)

# Train Gradient Boost model with default parameters

In [None]:
# Restrict the model to the selected input columns
feature_usages = [tfdf.keras.FeatureUsage(feature_name) for feature_name in input_features]

model = tfdf.keras.GradientBoostedTreesModel(
    features=feature_usages,
    exclude_non_specified_features=True,
    random_seed=9,
    verbose=0,
)
model.fit(tf_train)

# Report the evaluation exposed by the model inspector
# (presumably computed on TF-DF's internal validation split -- TODO confirm)
inspector = model.make_inspector()
self_eval = inspector.evaluation()
print(f"Accuracy={self_eval.accuracy}, Loss={self_eval.loss}")

In [None]:
# Print the summary of the baseline model trained above
model.summary()

# Training a model with hyperparameter tuning

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
# Create a Random Search tuner with 100 trials.
tuner = tfdf.tuner.RandomSearch(num_trials=100)

# Define the search space.
#
# Adding more parameters generally improves the quality of the model, but makes
# the tuning last longer.

tuner.choice("min_examples", [2, 5, 7, 10])
tuner.choice("categorical_algorithm", ["CART", "RANDOM"])

# Some hyper-parameters are only valid for specific values of other
# hyper-parameters. For example, the "max_depth" parameter is mostly useful when
# "growing_strategy=LOCAL" while "max_num_nodes" is better suited when
# "growing_strategy=BEST_FIRST_GLOBAL".

local_search_space = tuner.choice("growing_strategy", ["LOCAL"])
local_search_space.choice("max_depth", [3, 4, 5, 6, 8])

# merge=True indicates that the parameter (here "growing_strategy") is already
# defined, and that new values are added to it.
global_search_space = tuner.choice("growing_strategy", ["BEST_FIRST_GLOBAL"], merge=True)
global_search_space.choice("max_num_nodes", [16, 32, 64, 128, 256])

# Disabled: "use_hessian_gain" is currently left out of the search space.
#tuner.choice("use_hessian_gain", [True, False])
tuner.choice("shrinkage", [0.02, 0.05, 0.10, 0.15])
tuner.choice("num_candidate_attributes_ratio", [0.2, 0.5, 0.9, 1.0])

# Split-axis options are part of the search: axis-aligned splits, or sparse
# oblique splits together with their dependent hyper-parameters. The number of
# trials should be kept in proportion to the size of this search space.

tuner.choice("split_axis", ["AXIS_ALIGNED"])
oblique_space = tuner.choice("split_axis", ["SPARSE_OBLIQUE"], merge=True)
oblique_space.choice("sparse_oblique_normalization",
                     ["NONE", "STANDARD_DEVIATION", "MIN_MAX"])
oblique_space.choice("sparse_oblique_weights", ["BINARY", "CONTINUOUS"])
oblique_space.choice("sparse_oblique_num_projections_exponent", [1.0, 1.5])
# Running record of the best candidate seen across all seeds
best_model = None
best_accuracy = 0
best_loss = float('inf')

# NOTE: an explicit train/validation split (train_test_split plus separate
# tf_train / tf_val datasets) was tried here and is currently disabled;
# candidates are ranked by each model's own self-evaluation instead.
with tf.device('/GPU:0'):
    for seed in range(100):
        print(f"SEED: {seed}\n")
        tuned_model = tfdf.keras.GradientBoostedTreesModel(
            task=tfdf.keras.Task.CLASSIFICATION,
            features=[tfdf.keras.FeatureUsage(n) for n in input_features],
            exclude_non_specified_features=True,
            tuner=tuner,
            random_seed=seed,
            sampling_method="GOSS",
            goss_alpha=0.15,
            goss_beta=0.15,
            verbose=0,
        )
        tuned_model.fit(tf_train, verbose=0)

        tuned_self_evaluation = tuned_model.make_inspector().evaluation()
        accuracy = tuned_self_evaluation.accuracy
        loss = tuned_self_evaluation.loss
        print(f"Accuracy: {accuracy} Loss:{loss}")

        # Higher accuracy wins; equal accuracy is broken by the lower loss
        improves_accuracy = accuracy > best_accuracy
        ties_with_lower_loss = accuracy == best_accuracy and loss < best_loss
        if improves_accuracy or ties_with_lower_loss:
            best_model, best_accuracy, best_loss = tuned_model, accuracy, loss

        print(f"Seed {seed}: Accuracy = {accuracy:.4f}, Loss = {loss:.4f}")
        print(f"\nBest model so far: Accuracy = {best_accuracy:.4f}, Loss = {best_loss:.4f}")
print(f"\nBest model: Accuracy = {best_accuracy:.4f}, Loss = {best_loss:.4f}")


In [None]:
# Report the self-evaluation of the model selected by the seed search
best_inspector = best_model.make_inspector()
tuned_self_evaluation = best_inspector.evaluation()
print(f"Accuracy: {tuned_self_evaluation.accuracy} Loss:{tuned_self_evaluation.loss}")

In [None]:

def prediction_to_kaggle_format(model, threshold=0.5):
    """Build the Kaggle submission table from ``model``'s test predictions.

    Predictions are made on the module-level ``tf_test`` dataset; the first
    output column is compared against ``threshold`` (values >= threshold
    become 1, the rest 0) and paired with ``test["PassengerId"]``.

    Returns:
        DataFrame with the columns "PassengerId" and "Survived".
    """
    predictions = model.predict(tf_test, verbose=0)
    proba_survive = predictions[:, 0]
    survived = (proba_survive >= threshold).astype(int)
    return pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": survived})

def make_submission(kaggle_predictions):
    """Write ``kaggle_predictions`` to ./submission.csv without the index column
    and print the export location."""
    destination = "./submission.csv"
    kaggle_predictions.to_csv(destination, index=False)
    print(f"Submission exported to {destination}")

# Predict on the test set with the selected model and export the submission
kaggle_predictions = prediction_to_kaggle_format(best_model)
make_submission(kaggle_predictions)
# IPython shell escape (not plain Python): show the first lines of the file
!head ./submission.csv