In [3]:
# Cell 1: Setup environment
# Quiets the notebook down and writes the runtime configuration to os.environ
# before TensorFlow is imported in the next cell.
import warnings
# Install a global "ignore" filter: from here on no Python warning is shown.
warnings.filterwarnings('ignore')
import os
# NOTE(review): '3' is presumably meant to silence TensorFlow's native (C++)
# logging and has to be set before `import tensorflow` to take effect — confirm.
# The captured training output further down still contains Yggdrasil
# `I0000 ... kernel.cc` lines, so this setting does not cover those messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): '-1' presumably hides every CUDA device so that training runs
# on CPU only — confirm this is intended.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import numpy as np
import pandas as pd

In [9]:
# Cell 2: Import libraries
# TensorFlow is imported only after the setup cell has written its
# environment variables to os.environ.
import tensorflow as tf
# Set the level of the logger returned by tf.get_logger() to ERROR.
tf.get_logger().setLevel('ERROR')
import tensorflow_decision_forests as tfdf

In [10]:
# Cell 3: Load data
# Directory that holds the competition CSV files. Change this single constant
# to run the notebook somewhere other than Kaggle.
DATA_DIR = "/kaggle/input/titanic"

# Split that carries the "Survived" label used for training.
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
# Split the ensemble predicts on for the submission file.
serving_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [11]:
# Cell 4: Preprocessing
def preprocess(df):
    """Return a copy of ``df`` with a cleaned ``Name`` and two ticket columns.

    The input frame is not modified. On the returned copy:

    * ``Name``: every space-separated token has punctuation (comma,
      parentheses, square brackets, period, double and single quotes)
      stripped from both ends; tokens are re-joined with single spaces.
    * ``Ticket_number``: the part of ``Ticket`` after its last space (the
      whole ticket when it contains no space).
    * ``Ticket_item``: the part of ``Ticket`` before its last space, with
      remaining spaces replaced by ``_``; ``"NONE"`` when the ticket
      contains no space.

    Args:
        df: DataFrame with string columns ``Name`` and ``Ticket``.

    Returns:
        A new DataFrame with ``Name`` rewritten and the two ticket columns
        appended.
    """

    def clean_name(raw_name):
        # Punctuation is only removed at token boundaries, so an apostrophe
        # inside a token (e.g. O'Dwyer) is preserved.
        tokens = (token.strip(",()[].\"'") for token in raw_name.split(" "))
        return " ".join(tokens)

    def split_ticket(ticket):
        # The separator is empty exactly when the ticket has no space at all.
        prefix, separator, number = ticket.rpartition(" ")
        item = prefix.replace(" ", "_") if separator else "NONE"
        return number, item

    result = df.copy()
    result["Name"] = result["Name"].apply(clean_name)
    result["Ticket_number"] = result["Ticket"].apply(lambda ticket: split_ticket(ticket)[0])
    result["Ticket_item"] = result["Ticket"].apply(lambda ticket: split_ticket(ticket)[1])
    return result

# Run the same feature engineering over the training and the serving split,
# in that order.
preprocessed_train_df, preprocessed_serving_df = (
    preprocess(split_df) for split_df in (train_df, serving_df)
)

In [12]:
# Cell 5: Feature selection
# Columns the models are allowed to use, in the order they are declared.
input_features = [
    # Columns read straight from the CSV files.
    'Pclass',
    'Name',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Cabin',
    'Embarked',
    # Columns derived from "Ticket" during preprocessing.
    'Ticket_number',
    'Ticket_item',
]

In [15]:
# Cell 6: Create datasets
def tokenize_names(features, labels=None):
    """Replace the ``"Name"`` feature with its tokens.

    ``tf.strings.split`` is called with its default separator on
    ``features["Name"]`` and the result is stored back under the same key,
    so the passed-in mapping is updated in place.

    Args:
        features: Mapping of feature name to value; must contain ``"Name"``.
        labels: Optional labels, passed through untouched.

    Returns:
        The tuple ``(features, labels)``.
    """
    name_tokens = tf.strings.split(features["Name"])
    features["Name"] = name_tokens
    return features, labels

# `warnings` is already imported in the setup cell at the top of the notebook,
# so it is not imported a second time here.
with warnings.catch_warnings():
    # Keep the "ignore" filter scoped to the DataFrame -> tf.data conversion.
    warnings.simplefilter("ignore")
    # Training dataset: "Survived" is the label; names are tokenized per element.
    train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_train_df, label="Survived").map(tokenize_names)
    # Serving dataset: built without a label, same name tokenization.
    serving_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_serving_df).map(tokenize_names)


In [16]:
# Cell 7: Train ensemble
# Train 100 gradient-boosted-tree models whose only difference is the random
# seed, and average their outputs on the serving dataset.
predictions = None
num_predictions = 0

for seed in range(100):
    # Restrict each model to the explicitly listed input features.
    feature_usages = [
        tfdf.keras.FeatureUsage(name=feature_name) for feature_name in input_features
    ]
    model = tfdf.keras.GradientBoostedTreesModel(
        verbose=0,
        features=feature_usages,
        exclude_non_specified_features=True,
        random_seed=seed,
        honest=True,
    )
    model.fit(train_ds)

    # Only the first output column of the prediction matrix is kept.
    sub_predictions = model.predict(serving_ds, verbose=0)[:, 0]
    # Running sum over models; the first model's output seeds the sum.
    predictions = sub_predictions if predictions is None else predictions + sub_predictions
    num_predictions += 1

# Turn the running sum into the mean over all trained models.
predictions /= num_predictions

I0000 00:00:1766672029.529177   17899 kernel.cc:782] Start Yggdrasil model training
I0000 00:00:1766672029.530249   17899 kernel.cc:783] Collect training examples
I0000 00:00:1766672029.530272   17899 kernel.cc:795] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
column_guides {
  column_name_pattern: "^Pclass$"
}
column_guides {
  column_name_pattern: "^Name$"
}
column_guides {
  column_name_pattern: "^Sex$"
}
column_guides {
  column_name_pattern: "^Age$"
}
column_guides {
  column_name_pattern: "^SibSp$"
}
column_guides {
  column_name_pattern: "^Parch$"
}
column_guides {
  column_name_pattern: "^Fare$"
}
column_guides {
  column_name_pattern: "^Cabin$"
}
column_guides {
  column_name_pattern: "^Embarked$"
}
column_guides {
  column_name_pattern: "^Ticket_number$"
}
column_guides {
  column_name_pattern: "^Ticket_item$"
}
default_column_guide {
  categorial {
    max_vocab_

In [17]:
# Cell 8: Create submission
# Start from the passenger ids of the serving split, then attach the 0/1
# label obtained by thresholding the averaged ensemble output at 0.5.
submission = serving_df[["PassengerId"]].copy()
submission["Survived"] = (predictions >= 0.5).astype(int)

# Write the two-column file without the DataFrame index.
submission.to_csv("/kaggle/working/submission.csv", index=False)
print("Done")

Done
