In [None]:
# Weekly project, week of Sept 5

# Data science libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ML libraries
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestClassifier

# DL libraries

# Standard libraries
from pathlib import Path

### Base model

In [None]:
# Root folder of the competition files; every CSV below is read relative to it.
BASE_PATH = Path('/kaggle/input/spaceship-titanic')

# Raw training table, followed by summary statistics over all column types.
dataset_df = pd.read_csv(BASE_PATH.joinpath('train.csv'))
dataset_df.describe(include='all')

In [None]:
# Column dtypes and non-null counts of the raw training frame.
dataset_df.info()

In [None]:
# Number of missing values in each column.
dataset_df.isnull().sum()

In [None]:
def split_dataset(dataset, test_ratio=0.2, seed=None):
    """Randomly partition ``dataset`` into a training part and a held-out part.

    Each row is assigned to the held-out part independently with probability
    ``test_ratio``, so the held-out share is only approximately ``test_ratio``.

    Args:
        dataset: Object supporting ``len()`` and boolean-mask indexing
            (e.g. a pandas DataFrame).
        test_ratio: Probability that a row lands in the held-out part.
        seed: Optional seed. When given, the split is reproducible and NumPy's
            global random state is left untouched. When ``None`` the draw
            comes from the global ``np.random`` state.

    Returns:
        Tuple ``(train, test)`` of disjoint row subsets that together cover
        ``dataset``.
    """
    if seed is None:
        draws = np.random.rand(len(dataset))
    else:
        draws = np.random.default_rng(seed).random(len(dataset))
    test_indices = draws < test_ratio
    return dataset[~test_indices], dataset[test_indices]

def preprocess(df):
    """Turn a raw passenger frame into a one-hot encoded feature table.

    Steps: drop the ``Name`` and ``PassengerId`` columns, split ``Cabin`` on
    "/" into ``Deck``, ``Cabin_num`` and ``Side``, fill missing values (0 for
    the numeric-like columns, the column mode for the categorical ones), cast
    ``CryoSleep``, ``VIP`` and ``Cabin_num`` to ``int`` and one-hot encode
    the remaining text columns with ``pd.get_dummies``.

    Args:
        df: Frame containing ``Name``, ``PassengerId``, ``Cabin`` and the
            columns listed in ``numerical_cols`` / ``categorical_cols`` below.
            It is not modified.

    Returns:
        A new DataFrame. Columns not listed here are not filled or cast; they
        are only passed through ``pd.get_dummies``.
    """
    # Remove and reshape features. drop() returns a new frame, so every
    # assignment below happens on a copy and the caller's frame stays intact.
    df = df.drop(labels=['Name', 'PassengerId'], axis=1)

    df[['Deck', 'Cabin_num', 'Side']] = df['Cabin'].str.split("/", expand=True)
    df = df.drop('Cabin', axis=1)

    # Remove NaNs
    numerical_cols = ['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    categorical_cols = ['HomePlanet', 'Destination', 'Deck', 'Side']
    df[numerical_cols] = df[numerical_cols].fillna(value=0)
    # Cabin_num is still text after the split, so fill it with the text "0"
    # rather than mixing an integer into a string column; the int cast below
    # turns it into the same 0.
    df['Cabin_num'] = df['Cabin_num'].fillna('0')

    for column in categorical_cols:
        column_modes = df[column].mode()
        # mode() is empty when the whole column is missing; nothing to fill with.
        if not column_modes.empty:
            # Assign the result back: an in-place fillna on df[column] is a
            # chained assignment and has no effect under pandas Copy-on-Write.
            df[column] = df[column].fillna(column_modes.iloc[0])

    # Converts columns to proper formats
    df[['CryoSleep', 'VIP', 'Cabin_num']] = df[['CryoSleep', 'VIP', 'Cabin_num']].astype(int)

    # Handle categorical variables
    return pd.get_dummies(df)
    
# Keep the raw frame in `dataset_df` and store the features under their own
# name, so re-running this cell does not feed already-preprocessed data back
# into preprocess() (which would fail on the missing 'Name' column).
# preprocess() already one-hot encodes, so no second get_dummies pass is needed.
one_hot_encoded_training_predictors = preprocess(dataset_df)

# split_dataset draws from NumPy's global random state; seed it so the
# train/validation partition is the same on every run.
np.random.seed(42)
train_set, test_set = split_dataset(one_hot_encoded_training_predictors)

# Separate the target from the predictors in both parts.
train_set_y = train_set['Transported'].astype(int)
train_set = train_set.drop(labels='Transported', axis=1)
test_set_y = test_set['Transported'].astype(int)
test_set = test_set.drop(labels='Transported', axis=1)

In [None]:
# Creating a random forest model from scikit-learn

# A fixed random_state makes the fitted forest, and therefore the importances
# and predictions derived from it, reproducible across kernel restarts.
random_forest = RandomForestClassifier(n_estimators=300, max_depth=16, random_state=42)
random_forest = random_forest.fit(train_set, train_set_y)

In [None]:
# Impurity-based importance of every feature in the fitted forest, with the
# standard deviation across the individual trees drawn as error bars.
feature_names = list(train_set.columns)
feature_importances = pd.Series(random_forest.feature_importances_, index=feature_names)

per_tree_importances = [tree.feature_importances_ for tree in random_forest.estimators_]
std = np.std(per_tree_importances, axis=0)

fig, ax = plt.subplots()
feature_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

In [None]:
# It seems Cabin number, RoomService, FoodCourt and other features (which predict economic status)
# are more important than other factors such as location-based features (home planet).
# VIP has a very low importance, but we have to remember that most passengers have False for it, so
# it isn't a very useful feature as far as we can tell.

# Let's predict!

# Score the forest on the held-out rows: the share of predictions that equal
# the true label.
preds = random_forest.predict(test_set)
hits = preds == test_set_y
accuracy = hits.sum() / len(preds)
accuracy

In [None]:
# Let's use this as a baseline model
final_test_df = pd.read_csv(BASE_PATH / 'test.csv')
final_test_df = preprocess(final_test_df)
# preprocess() already one-hot encodes. Encoding the two files separately can
# produce different column sets or orders (a category absent from one file
# gets no dummy column), so align to the columns the forest was trained on:
# missing dummies become 0 and columns unseen during training are dropped.
one_hot_encoded_testing_predictors = final_test_df.reindex(columns=train_set.columns, fill_value=0)
preds = random_forest.predict(one_hot_encoded_testing_predictors)
preds

#### Preparation of submission file

In [None]:
# sample_submissions = pd.read_csv(BASE_PATH / 'sample_submission.csv')
# sample_submissions['Transported'] = preds.astype('bool')
# sample_submissions.to_csv('/kaggle/working/submission.csv', index=False)

### TabularLearners from Fast.AI

In [None]:
# Start again from the raw CSV for the fastai experiments. The read always
# yields a fresh frame, so the identifier columns are normally present;
# errors='ignore' keeps the drop harmless if they are not. A failing read
# (e.g. a wrong path) now surfaces as an error instead of being swallowed and
# reported as "already deleted".
dataset_df = pd.read_csv(BASE_PATH / 'train.csv')
dataset_df = dataset_df.drop(columns=['PassengerId', 'Name'], errors='ignore')

In [None]:
# Column dtypes and non-null counts after dropping the identifier columns.
dataset_df.info()

In [None]:
# Cast the two flag columns and the target to float64 in a single assignment.
flag_columns = ['CryoSleep', 'VIP', 'Transported']
dataset_df[flag_columns] = dataset_df[flag_columns].astype('float64')

In [None]:
# Partition the columns by dtype: object columns are treated as categorical,
# float64 columns as continuous. The target is float64 too, so it is removed
# from the continuous set.
categorical = dataset_df.select_dtypes(include='object')
float_columns = dataset_df.select_dtypes(include='float64')
numerical = float_columns.drop(columns='Transported')

In [None]:
# Random split of the row indices with 20% held out for validation.
row_indices = range_of(dataset_df)
splits = RandomSplitter(valid_pct=0.2)(row_indices)

# Tabular wrapper that applies the listed procs to the categorical and
# continuous columns and treats 'Transported' as a category target.
to = TabularPandas(
    dataset_df,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=list(categorical.columns),
    cont_names=list(numerical.columns),
    y_names='Transported',
    y_block=CategoryBlock(),
    splits=splits,
)

In [None]:
# Build the dataloaders from the tabular object, using a batch size of 64.
dls = to.dataloaders(bs=64)

In [None]:
# Display a sample batch to sanity-check what the model will be fed.
dls.show_batch()

In [None]:
# Dtypes and non-null counts of `to.xs` (presumably the processed predictor
# columns -- confirm against the fastai TabularPandas docs).
to.xs.info()

In [None]:
# Small tabular network: three hidden layers of 10 units each, tracked with
# the error rate and trained with weight decay 0.1.
hidden_layers = [10, 10, 10]
learn = tabular_learner(dls, layers=hidden_layers, metrics=error_rate, wd=0.1)

In [None]:
# Train for 5 epochs using fastai's one-cycle schedule.
learn.fit_one_cycle(5)

In [None]:
# Show a few sample rows together with the model's predictions.
learn.show_results()

In [None]:
# We are overfitting quite a lot, clearly! How can we fix this? Let's try feature engineering!