<h2>Titanic competition w/ TensorFlow Decision Forests

This notebook will take you through the steps needed to train a baseline Gradient Boosted Trees Model using TensorFlow Decision Forests and creating a submission on the Titanic competition.

This notebook shows:

1. How to do some basic pre-processing. For example, the passenger names will be tokenized, and ticket names will be splitted in parts.
2. How to train a Gradient Boosted Trees (GBT) with default parameters
3. How to train a GBT with improved default parameters
4. How to tune the parameters of a GBTs
5. How to train and ensemble many GBTs

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

In [3]:
# Load the training and test data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
gender_submission = pd.read_csv('gender_submission.csv')

df1 = [train_data, test_data]
df = pd.concat(df1)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [4]:
df.drop(['PassengerId','Name','SibSp','Ticket','Cabin','Embarked'], axis='columns', inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare
0,0.0,3,male,22.0,0,7.25
1,1.0,1,female,38.0,0,71.2833
2,1.0,3,female,26.0,0,7.925
3,1.0,1,female,35.0,0,53.1
4,0.0,3,male,35.0,0,8.05


In [5]:
target = df.Survived
inputs = df.drop('Survived', axis='columns')

In [6]:
dummies = pd.get_dummies(inputs.Sex)
dummies.head(5)

Unnamed: 0,female,male
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True


In [7]:
inputs = pd.concat([inputs,dummies], axis='columns')
inputs.head(5)

Unnamed: 0,Pclass,Sex,Age,Parch,Fare,female,male
0,3,male,22.0,0,7.25,False,True
1,1,female,38.0,0,71.2833,True,False
2,3,female,26.0,0,7.925,True,False
3,1,female,35.0,0,53.1,True,False
4,3,male,35.0,0,8.05,False,True


In [8]:
inputs.drop('Sex', axis='columns', inplace=True)
inputs

Unnamed: 0,Pclass,Age,Parch,Fare,female,male
0,3,22.0,0,7.2500,False,True
1,1,38.0,0,71.2833,True,False
2,3,26.0,0,7.9250,True,False
3,1,35.0,0,53.1000,True,False
4,3,35.0,0,8.0500,False,True
...,...,...,...,...,...,...
413,3,,0,8.0500,False,True
414,1,39.0,0,108.9000,True,False
415,3,38.5,0,7.2500,False,True
416,3,,0,8.0500,False,True


In [9]:
inputs.columns[inputs.isna().any()]

Index(['Age', 'Fare'], dtype='object')

In [10]:
inputs.Age = inputs.Age.fillna(inputs.Age.mean())
inputs.Fare = inputs.Fare.fillna(inputs.Fare.mean())
inputs.head(10)

Unnamed: 0,Pclass,Age,Parch,Fare,female,male
0,3,22.0,0,7.25,False,True
1,1,38.0,0,71.2833,True,False
2,3,26.0,0,7.925,True,False
3,1,35.0,0,53.1,True,False
4,3,35.0,0,8.05,False,True
5,3,29.881138,0,8.4583,False,True
6,1,54.0,0,51.8625,False,True
7,3,2.0,1,21.075,False,True
8,3,27.0,2,11.1333,True,False
9,2,14.0,0,30.0708,True,False


In [11]:
inputs.columns[inputs.isna().any()]

Index([], dtype='object')

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2)

In [13]:

print(np.isnan(y_train).sum())
print(np.isnan(y_test).sum())

345
73


In [14]:
import pandas as pd

X_train = X_train[~pd.isna(y_train)]
y_train = y_train[~pd.isna(y_train)]
X_test = X_test[~pd.isna(y_test)]
y_test = y_test[~pd.isna(y_test)]

In [15]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [16]:
model.fit(X_train, y_train)

In [17]:
model.score(X_test, y_test)

0.7724867724867724

In [25]:
inputs['Pclass'] 
inputs['Fare']

0        7.2500
1       71.2833
2        7.9250
3       53.1000
4        8.0500
         ...   
413      8.0500
414    108.9000
415      7.2500
416      8.0500
417     22.3583
Name: Fare, Length: 1309, dtype: float64

In [None]:
def preprocess(df):
    df = df.copy()
    
    def normalize_name(x):
        #The strip() method removes any leading, and trailing whitespaces.
        # You can specify which character(s) to remove, if not, any whitespaces will be removed.
        return " ".join([v.strip(",()[].\"'") for v in x.split(" ")])
    
    def ticket_number(x):
        return x.split(" ")[-1]
        
    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])
    
    df["Name"] = df["Name"].apply(normalize_name)
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)                     
    return df
preprocessed_train_df = preprocess(train_data)
preprocessed_serving_df = preprocess(test_data)

preprocessed_train_df.head(5)

In [None]:
preprocessed_train_df['Age'] = preprocessed_train_df['Age'].fillna(preprocessed_train_df['Age'].median())
preprocessed_train_df['Embarked'] = preprocessed_train_df['Embarked'].fillna(preprocessed_train_df['Embarked'].mode()[0])

In [None]:
preprocessed_train_df.head(15)

In [None]:
input_features = list(preprocessed_train_df.columns)
input_features.remove("Ticket")
input_features.remove("PassengerId")
#input_features.remove("Survived")
input_features


In [None]:
def tokenize_names(features, labels=None):
    """Tokenize the names into tokens."""
    features["Name"] = tf.strings.split(features["Name"])
    return features, labels

def preprocess_data(df, categorical_columns):
    """Preprocess the dataset to handle categorical data."""
    # Initialize a dictionary to store the encoders for each categorical column
    label_encoders = {}
    
    # Convert categorical columns using Label Encoding
    for col in categorical_columns:
        le = LabelEncoder()
        df[col] = df[col].astype(str)  # Ensure all values are strings before encoding
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le  # Save the encoder for inverse transform if needed

    return df, label_encoders

# Define the categorical columns in your dataset
categorical_columns = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Ticket_number', 'Ticket_item']

# Assuming preprocessed_train_df and preprocessed_serving_df are Pandas DataFrames
# Split the dataset into features and labels
X_train = preprocessed_train_df.drop(columns=["Survived"])
y_train = preprocessed_train_df["Survived"]
X_test = preprocessed_serving_df

# Preprocess categorical columns (including the tokenized "Name")
X_train, label_encoders = preprocess_data(X_train, categorical_columns)
X_test, _ = preprocess_data(X_test, categorical_columns)

# Convert the DataFrames to DMatrix (specific to XGBoost)
train_data = xgb.DMatrix(X_train, label=y_train)
test_data = xgb.DMatrix(X_test)

# Initialize the XGBoost Gradient Boosted Trees model
model = xgb.XGBClassifier(
    verbosity=0,  # Very few logs
    random_state=1234,
    use_label_encoder=False
)

# Train the model on the dataset
model.fit(X_train, y_train)

# Make predictions on the test dataset
y_pred = model.predict(X_test)

# Print predictions
print(f"Predictions: {y_pred}")




In [None]:
# Assuming preprocessed_train_df and preprocessed_serving_df are Pandas DataFrames
# Split the dataset into features and labels
X_train = preprocessed_train_df.drop(columns=["Survived"])
y_train = preprocessed_train_df["Survived"]
X_test = preprocessed_serving_df  # Or use another dataset if available

# Encode categorical columns
label_encoders = {}
for column in ['Sex', 'Embarked', 'Ticket', 'Cabin']:
    le = LabelEncoder()
    
    # Fit on training data
    X_train[column] = le.fit_transform(X_train[column].astype(str))  # Convert to str and encode
    
    # Store the fitted encoder for later use
    label_encoders[column] = le
    
    # Transform test data, handling unseen labels
    unseen_label_value = len(le.classes_)  # Set unseen labels to a new value
    le_classes = list(le.classes_)  # Get the list of seen categories
    le_classes.append('unseen')  # Add a placeholder for unseen labels
    le.classes_ = le_classes  # Update classes with the placeholder
    
    # Apply the transformation, mapping unseen labels to a default value
    X_test[column] = X_test[column].apply(lambda x: le.transform([x])[0] if x in le.classes_ else unseen_label_value)

# For columns like 'Name' and 'Ticket' (if they are not useful for modeling), consider dropping them
X_train = X_train.drop(columns=["Name", "Ticket_number", "Ticket_item"])
X_test = X_test.drop(columns=["Name", "Ticket_number", "Ticket_item"])

# Convert the DataFrames to DMatrix (specific to XGBoost)
train_data = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
test_data = xgb.DMatrix(X_test, enable_categorical=True)

# Initialize the XGBoost Gradient Boosted Trees model with custom parameters
model = xgb.XGBClassifier(
    n_estimators=2000,
    max_depth=4,
    learning_rate=0.05,  # shrinkage
    subsample=0.8,  # control the ratio of the dataset randomly sampled each iteration
    colsample_bytree=0.8,  # control the ratio of features considered by each tree
    random_state=1234,
    verbosity=0,  # Minimal logging
    use_label_encoder=False
)

# Train the model on the dataset
model.fit(X_train, y_train)

# Make predictions on the test dataset
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Get the probability of class 1 (Survived)

# Export predictions in Kaggle format
def prediction_to_kaggle_format(model, X_test, threshold=0.5):
    proba_survive = model.predict_proba(X_test)[:, 1]  # Probability of Survived == 1
    return pd.DataFrame({
        "PassengerId": preprocessed_serving_df["PassengerId"],
        "Survived": (proba_survive >= threshold).astype(int)
    })

def make_submission(kaggle_predictions):
    path = "submission.csv"
    kaggle_predictions.to_csv(path, index=False)
    print(f"Submission exported to {path}")

# Generate the Kaggle predictions and create a submission file
kaggle_predictions = prediction_to_kaggle_format(model, X_test)
make_submission(kaggle_predictions)


# Print evaluation results (assuming you have a test set with labels)
# For demonstration, here you would typically use a validation set instead
# Evaluate accuracy on training data
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Train Accuracy: {train_accuracy:.4f}")



# model.summary()