In [None]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split # For validation holdout set
!pip install sweetviz
import sweetviz as sv
from pandas_profiling import ProfileReport # Profiling for initial EDA

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# Kaggle notes:
# - Up to 20GB can be written to the current directory (/kaggle/working/); it is preserved as output
#   when a version is created using "Save & Run All".
# - Temporary files can be written to /kaggle/temp/, but they are not saved outside the current session.
print("Setup Complete.")

In [None]:
# Report the library versions in use so that runs can be compared later.
print(f"TensorFlow v{tf.__version__}")
print(f"TensorFlow Decision Forests v{tfdf.__version__}")

# Load Datasets

In [None]:
# Read the competition training CSV into a pandas DataFrame and report its shape.
dataset_df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
print(f"Full train dataset shape is {dataset_df.shape}")

# Read the competition test CSV the same way.
test_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

# Organize Data

Using copies of datasets to maintain originals so I don't have to revert version every time I want to make changes. 

Splitting with train_test_split so I can utilize accuracy_score by sklearn after making predictions.

In [None]:
# Work on copies so the DataFrames loaded from CSV stay untouched.
X, X_test = dataset_df.copy(), test_df.copy()
# Target column, captured from the copy before X is modified by later cells.
y = X["Transported"]

# Explore Data
Took majority from starter code notebook, slightly modified for my copied datasets.

In [None]:
# Preview the first five rows (five is the default for head()).
X.head()

In [None]:
# Summary statistics for X using pandas' default describe() settings.
X.describe()

In [None]:
# Column names, non-null counts and dtypes for X.
X.info()

In [None]:
# Count the rows per target class and draw the counts as a bar chart.
plot_df = X["Transported"].value_counts()
plot_df.plot.bar()

The above graph reveals the target is balanced.

# Numerical data distribution
Plot all numerical columns with value counts

In [None]:
# Numeric features to plot, one histogram per row.
# RoomService is included so that every numeric column is covered, as the section text states.
numeric_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

fig, ax = plt.subplots(len(numeric_cols), 1, figsize=(10, 10))
# Stretch the subplot area vertically so the stacked histograms do not overlap.
plt.subplots_adjust(top=2)

# One loop replaces the copy-pasted histplot calls; each column gets its own axis.
for col, axis in zip(numeric_cols, ax):
    sns.histplot(X[col], color='b', bins=50, ax=axis)

# Prepare Data
Dropping unnecessary columns, filling missing values with 0, and converting boolean fields to int because TF-DF does not support boolean fields.

There's no need to encode categorical variables as TF-DF handles them natively.

## Drop extraneous columns

In [None]:
# Remove the columns this section treats as extraneous.
X = X.drop(columns=['PassengerId', 'Name'])
# Display updated data
X.head()

## Impute missing values with 0
Initially I imputed most numeric and boolean columns with 0, as shown in the provided starter project code; however, I amended this to apply only to the boolean fields so that TF-DF can handle the imputation of the numeric features natively as well.

In [None]:
# Number of missing values per column, largest first.
X.isna().sum().sort_values(ascending=False)

In [None]:
# Zero-fill only the boolean fields, so a missing value becomes 0 (False once cast to int below).
# The numeric spend columns keep their NaNs so that TF-DF handles their imputation natively,
# as described in the text above this cell.
bool_cols = ['VIP', 'CryoSleep']
X[bool_cols] = X[bool_cols].fillna(value=0)

# Re-check the remaining missing-value counts, largest first.
X.isnull().sum().sort_values(ascending=False)

## Convert boolean fields to int for TF compatibility

In [None]:
# Name of the target column; later cells reuse this variable.
label = "Transported"
# Cast the target column to integers, leaving every other column as it is.
X = X.astype({label: int})

In [None]:
# Cast each boolean feature to int.
for bool_col in ('VIP', 'CryoSleep'):
    X[bool_col] = X[bool_col].astype(int)

## Simple feature engineering
Replacing Cabin with individual features deck, Cabin number, and side. 

Then removing the unnecessary Cabin feature from the dataset.

Will do more local feature engineering after further EDA.

In [None]:
# Split Cabin on "/" into separate columns and store the pieces
# as Deck, Cabin_num and Side.
cabin_parts = X["Cabin"].str.split("/", expand=True)
X[["Deck", "Cabin_num", "Side"]] = cabin_parts

In [None]:
# Drop Cabin now that it has been split into separate columns. The membership check
# lets this cell be re-run after the column is already gone.
if 'Cabin' in X.columns:
    X = X.drop(columns='Cabin')
else:
    print("Field does not exist")

In [None]:
# Show the first five rows again to confirm the new columns.
X.head()

## Visualize rest of column distributions

In [None]:
# Seaborn scatterplot matrix of the training data, coloured by the target.
sns.set_theme(style="ticks")
sns.pairplot(data=X, hue=label)

Not removing outliers or duplicates as this may affect results.

# Profile with sweetviz to explore variables w.r.t. target

In [None]:
# Sweetviz report of every feature against the target, used to understand the categorical
# variables better and to decide on local feature engineering steps.
my_report = sv.analyze(X, target_feat=label)
my_report.show_notebook()

## Profile with Pandas profiling to further explore correlations

In [None]:
# Build the pandas-profiling report in explorative mode and display it as the cell output.
profile_report = ProfileReport(X, explorative=True)
profile_report

# Using insights from above profiling reports to inform:
- the rest of my manual EDA
- local feature engineering steps to come.
    - Could create new AgeGroups feature
    - LuxuryAmenities feature: RoomService, FoodCourt, ShoppingMall, VRDeck costs and/or VIP status

## Explore relationship between CryoSleep and Cabin w.r.t. Target
The question: why were 82% of passengers in cryo transported vs the 32% that were not in cryo? Especially as passengers not in cryo were the majority, at 65%. Any correlations with other features?

Instead of exploring this directly I'm going to instead investigate correlated variables i.e. Deck and HomePlanet below:

## Visualizing relationship between Deck and HomePlanet w.r.t. Target - Strongest correlation
 High correlation with Deck and HomePlanet as seen in profiling reports - could also help answer above question about Cryosleep and Transported.

In [None]:
# Deck counts, split by the target.
sns.histplot(X, x="Deck", hue=label)

In [None]:
# HomePlanet counts, split by the target, drawn on the darkgrid style.
sns.set_theme(style="darkgrid")
sns.histplot(X, x="HomePlanet", hue=label)

In [None]:
# One facet per Deck, each showing HomePlanet counts coloured by the target.
g = sns.FacetGrid(data=X, col='Deck', hue=label)
g.map_dataframe(sns.histplot, x='HomePlanet')

## Visualizing relationship between Cryosleep and Luxury Amenities w.r.t. Target
Luxury amenities being VIP status, as well as RoomService, FoodCourt, ShoppingMall, VRDeck costs

In [None]:
# Create new LuxuryAmenities feature

# Visualizing Age (and/or AgeGroups) w.r.t. Target
Even though the majority of passengers fell within the middle age range (most were in their 20s and 30s), the probability of being transported was negatively correlated with Age: the youngest passengers were the most likely to be transported, whereas seniors were the least likely.

In [None]:
# Could create new AgeGroups feature -- just use Age for now

## Visualizing relationship between Destination and Target
Although most passengers (69%) were traveling to TRAPPIST-1e, there was a higher transported rate, of 61%, for those going to 55 Cancri e - which constituted only 21% of passengers. Figure out why this is.

## Visualizing relationships between Deck and Side w.r.t. Target
- Deck shows most passengers on B and C decks were transported vs much lower transported rates for all other decks, even though the majority of people stayed in F and G decks. 
- Side is balanced, 50/50 split for passengers on Starboard and Port sides, although there is a slightly higher transported rate for S side. 

# Splitting dataset for training and evaluation

In [None]:
# Hold out 20% of the training data as a validation set before the local feature engineering steps.
# random_state is fixed so the split is the same on every run.
# X still contains the label column here; the TF dataset conversion below reads it by name.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

# Making training and validation sets compatible with Tensorflow

In [None]:
# Convert the training and validation DataFrames into TensorFlow datasets,
# passing the name of the label column to the converter.
train_ds, valid_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label=label)
    for frame in (train_X, val_X)
)