In [None]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix
!pip install scikeras
from scikeras.wrappers import KerasClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Up to 20GB written to the current directory (/kaggle/working/) is preserved as output when a version is created with "Save & Run All".
# Temporary files can also be written to /kaggle/temp/, but they are not saved outside of the current session.
print("Setup Complete.")

In [None]:
# Report the versions of the two TensorFlow libraries in use.
print(f"TensorFlow v{tf.__version__}")
print(f"TensorFlow Decision Forests v{tfdf.__version__}")

# Load Datasets

In [None]:
# Read the training CSV into a pandas DataFrame and report its shape.
dataset_df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
print(f"Full train dataset shape is {dataset_df.shape}")

# Read the test CSV the same way.
test_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

# Organize Data

I work on copies of the datasets so the originals stay intact and I don't have to revert to an earlier version every time I want to make changes.

Splitting with train_test_split so I can utilize accuracy_score by sklearn after making predictions.

In [None]:
# Work on copies so the loaded DataFrames stay untouched.
X = dataset_df.copy()
X_test = test_df.copy()

# Target column, taken from the training copy.
y = X["Transported"]

# Explore Data
Most of this section is taken from the starter code notebook, slightly modified to use my copied datasets.

In [None]:
# Preview the first five rows (head() defaults to n=5).
X.head()

In [None]:
# Summary statistics of the training copy.
X.describe()

In [None]:
# Column dtypes and non-null counts of the training copy.
X.info()

In [None]:
# Bar chart of how many rows fall into each Transported class.
plot_df = X["Transported"].value_counts()
plot_df.plot.bar()

# Numerical data distribution
Plot all numerical columns with value counts

In [None]:
# One 50-bin histogram per column, stacked vertically in a single figure.
hist_columns = ['Age', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
fig, ax = plt.subplots(5, 1, figsize=(10, 10))
plt.subplots_adjust(top=2)

for axis, column in zip(ax, hist_columns):
    sns.histplot(X[column], color='b', bins=50, ax=axis)

# Prepare Data
Dropping unnecessary columns, filling missing values with 0, and converting boolean fields to int because TF-DF does not support boolean fields.

There's no need to encode categorical variables as TF-DF handles them natively.

In [None]:
# Remove the identifier and name columns, then preview the result.
X = X.drop(columns=['PassengerId', 'Name'])
X.head()

In [None]:
# Missing-value count per column, largest first.
X.isna().sum().sort_values(ascending=False)

In [None]:
# Replace missing entries in the listed columns with 0, then re-check what is still missing.
zero_fill_cols = ['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
X[zero_fill_cols] = X[zero_fill_cols].fillna(0)
X.isna().sum().sort_values(ascending=False)

In [None]:
# Name of the target column; also passed as `label=` when the TF datasets are built.
label = "Transported"
# Cast the target column to int.
X[label] = X[label].astype(int)

In [None]:
# Cast the VIP and CryoSleep columns to int.
for flag_column in ('VIP', 'CryoSleep'):
    X[flag_column] = X[flag_column].astype(int)

Replacing Cabin with individual features deck, Cabin number, and side. 

Then removing the now-unnecessary Cabin feature from the dataset.

In [None]:
# Break "Cabin" on "/" into three pieces and store them as separate columns.
cabin_parts = X["Cabin"].str.split("/", expand=True)
X[["Deck", "Cabin_num", "Side"]] = cabin_parts

In [None]:
# Drop the original Cabin column if it is still present; otherwise just say so.
if 'Cabin' in X.columns:
    X = X.drop(columns='Cabin')
else:
    print("Field does not exist")

In [None]:
# Show the first five rows again after the changes above.
X.head()

# Splitting dataset for training and evaluation

In [None]:
# Hold out 20% of the rows as a validation set; the fixed seed keeps the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Making training and validation sets compatible with Tensorflow

In [None]:
# Convert the training and validation frames into TF datasets, using `label` as the target column.
train_ds, valid_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label=label)
    for frame in (train_X, val_X)
)

# Configure GBT Model with best parameters manually

In [None]:
# Create model function for Keras Classifier wrapper
def create_model():
    """Build a default GradientBoostedTreesModel compiled with the accuracy metric.

    Returns:
        The compiled TF-DF model instance.
    """
    gbt = tfdf.keras.GradientBoostedTreesModel()
    gbt.compile(metrics=["accuracy"])
    return gbt

In [None]:
# Wrapper for scikit-learn + keras compatibility.
# `model=` is the current SciKeras argument; `build_fn=` is deprecated.
model = KerasClassifier(model=create_model, verbose=0)

# Candidate values to cycle through when searching for the best fit.
# NOTE(review): create_model() takes no arguments, so it is unclear how these
# values would reach GradientBoostedTreesModel. Confirm the parameter names
# and how they are routed to the model before enabling the fit below.
params_grid = {'n_estimators': [15, 25, 50, 100, 200, 400],
               'learning_rate': [0.1, 0.01, 0.001, 0.0001],
               'subsample': [1.0, 0.5, 0.65],
               'sampling_method': ['CART', 'RANDOM'],
               'max_depth': [1, 5]}

# Grid search object: scores every combination by accuracy, using all available cores.
search_gbt = GridSearchCV(estimator=model,
                          param_grid=params_grid,
                          scoring='accuracy',
                          n_jobs=-1)

# Perform grid search
# NOTE(review): the fit is still disabled, so the search is never trained;
# anything that reads best_estimator_ or calls predict() on it needs this to run first.
# search_gbt = search_gbt.fit(X,y) -- DEBUG this line

In [None]:
# Best model
# NOTE(review): best_estimator_ only exists after the grid search has been fitted,
# and the fit call is currently commented out.
search_gbt.best_estimator_

# Evaluate best model

In [None]:
# Predictions on validation holdout set first
y_pred = search_gbt.predict(valid_ds)

# classification_report expects the ground truth first and the predictions second;
# swapping them would mislabel precision and recall in the report.
print(classification_report(val_y, y_pred))

In [None]:
# Confusion matrix of validation labels (rows) against predictions (columns),
# normalised over the true labels so each row sums to 1.
cf = confusion_matrix(val_y, y_pred, normalize='true')
display_cf = ConfusionMatrixDisplay(confusion_matrix=cf, display_labels=search_gbt.classes_)

# ConfusionMatrixDisplay draws itself through .plot(); it has no .plt() method.
display_cf.plot(values_format='.2f')
plt.show()