# Overview

In this notebook, we will train a baseline Random Forest model using TensorFlow Decision Forests on the Spaceship Titanic dataset.

In [None]:
import os
import warnings

# Silence library warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Publish the Kaggle input paths under short keys; the loading cell reads them
# back with os.getenv. Each key must point at the file of the same name
# (previously 'train' and 'test' were swapped).
os.environ['train']='/kaggle/input/spaceship-titanic/train.csv'
os.environ['test']='/kaggle/input/spaceship-titanic/test.csv'
os.environ['submission']='/kaggle/input/spaceship-titanic/sample_submission.csv'

# Loading the Data

In [None]:
import numpy as np
import pandas as pd

# Display settings: no cap on the number of columns or on the output width,
# and floats rendered with three decimals.
display_settings = {
    'display.max_columns': None,
    'display.width': None,
    'display.float_format': lambda value: '%.3f' % value,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

# The file locations are read from the environment variables set earlier.
train = pd.read_csv(os.getenv('train'))
test = pd.read_csv(os.getenv('test'))

# Stack both frames into one; ignore_index=True already renumbers the rows
# 0..n-1, so no separate reset_index is needed.
df = pd.concat([train, test], ignore_index=True)
df.info()

# Basic Exploration of the Dataset

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
def check_df(df):
    """Print a quick structural overview of a DataFrame.

    The report contains, in order: shape, column dtypes, the first and last
    three rows, the number of missing values per column, and selected
    quantiles of every numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is not modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("############# Shape #############")
    print(df.shape)
    print("############# Types #############")
    print(df.dtypes)
    print("############# Head #############")
    print(df.head(3))
    print("############# Tail #############")
    print(df.tail(3))
    print("############# NA #############")
    print(df.isnull().sum())
    print("############# Quantiles #############")
    numeric_columns=df.select_dtypes(include=['number']).columns
    # Quantiles are taken per column (default axis=0). The trailing 1 is the
    # 100% quantile (the maximum); it must sit inside the list, otherwise it
    # is interpreted as axis=1 and quantiles are computed across each row.
    # Transposed so that each numeric column becomes one output row.
    print(df[numeric_columns].quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

# Print the overview for the combined train + test frame.
check_df(df)

# Visualizing the Dataset

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Group the columns of ``dataframe`` into categorical, cardinal and numeric names.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        A non-object column with fewer than ``cat_th`` distinct values is
        counted as categorical instead of numeric.
    car_th : int, default 20
        An object column with more than ``car_th`` distinct values is counted
        as cardinal and left out of the categorical list.

    Returns
    -------
    tuple of list
        ``(cat_cols, cat_but_car, num_cols)``. ``cat_cols`` lists the object
        columns first, followed by the low-cardinality non-object columns.

    Notes
    -----
    A short summary with the size of each group is printed as a side effect.
    """
    object_cols = []
    non_object_cols = []
    num_but_cat = []
    cat_but_car = []

    # One pass over the columns: classify by dtype first, then by cardinality.
    for name in dataframe.columns:
        column = dataframe[name]
        distinct = column.nunique()
        if column.dtypes == "O":
            object_cols.append(name)
            if distinct > car_th:
                cat_but_car.append(name)
        else:
            non_object_cols.append(name)
            if distinct < cat_th:
                num_but_cat.append(name)

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in non_object_cols if name not in num_but_cat]

    n_rows, n_cols = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_cols}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, cat_but_car, num_cols

# grab_col_names returns (cat_cols, cat_but_car, num_cols); only the numeric list is kept.
_, _, num_cols=grab_col_names(df)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between the columns listed in num_cols.
corr=df[num_cols].corr()
# sns.set applies seaborn's theme; the rc entry overrides the default figure size.
sns.set(rc={'figure.figsize': (10,6)})
# annot/fmt write each coefficient into its cell, rounded to two decimals.
sns.heatmap(corr, cmap="RdBu", annot=True, fmt=".2f")
plt.show()

## Bar Chart for Column: Transported

In [None]:
# Class balance of the target column; value_counts leaves out missing values by default.
plot_df = df['Transported'].value_counts()
plot_df.plot(kind="bar")

## Numerical Data Distribution

In [None]:
# Columns whose distributions are plotted, one histogram per row of the figure.
hist_columns = ['Age', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# The number of subplot rows follows the column list instead of being hard-coded.
fig, ax = plt.subplots(len(hist_columns), 1, figsize=(10, 10))
# top=2 places the top edge of the subplot area at twice the figure height,
# which gives the stacked panels more vertical room.
plt.subplots_adjust(top=2)

# One loop replaces the five copy-pasted histplot calls.
for hist_ax, hist_column in zip(ax, hist_columns):
    sns.histplot(df[hist_column], color='b', bins=50, ax=hist_ax)

# Example for Decision Forests

Decision Forests are a family of tree-based models including Random Forests and Gradient Boosted Trees. They are the best place to start when working with tabular data, and will often outperform (or provide a strong baseline) before you begin experimenting with neural networks. Roughly, the code will look as follows:

```python
import tensorflow_decision_forests as tfdf
import pandas as pd

dataset=pd.read_csv("dataset.csv")
tf_dataset=tfdf.keras.pd_dataframe_to_tf_dataset(dataset, label="my_label")

model=tfdf.keras.RandomForestModel()
model.fit(tf_dataset)

print(model.summary())
```

# Prepare the Dataset

We will drop both the `PassengerId` and `Name` columns, as they are not necessary for model training, and then check for missing values using pandas' built-in functions. This dataset contains a mix of numeric, categorical and missing features. TensorFlow Decision Forests supports all of these feature types natively, so no preprocessing is required for them. However, this dataset also has boolean fields with missing values, and TF-DF doesn't support boolean fields yet, so we need to convert those fields to int. To account for the missing values in the boolean fields, we will replace them with zero.

In [None]:
# Remove the two columns that are not used as model features.
df = df.drop(columns=['PassengerId', 'Name'])
# Missing-value count per remaining column, largest first.
df.isna().sum().sort_values(ascending=False)

We will replace null entries with zero for the numerical columns as well, and only let TF-DF handle the missing values in the categorical columns. Since TF-DF cannot handle boolean columns, we will have to adjust the labels in the `Transported`, `VIP` and `CryoSleep` columns to convert them into the integer format that TF-DF expects.

In [None]:
# Columns whose missing entries are replaced by 0; naming the list once keeps
# the left- and right-hand side selections in sync.
zero_fill_columns = ['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df[zero_fill_columns] = df[zero_fill_columns].fillna(value=0)
# Missing-value count per column after the fill, largest first.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Cast the boolean-valued columns to integers (True -> 1, False -> 0).
# NOTE(review): df is built by concatenating the train and test files; if the
# test rows carry no Transported value, this cast raises on the NaN entries --
# confirm whether the label should be cast on the labelled rows only.
df['Transported']=df['Transported'].astype(int)
df['VIP']=df['VIP'].astype(int)
# The source column was misspelled as 'CrypSleep', which raised a KeyError.
df['CryoSleep']=df['CryoSleep'].astype(int)