<a href="https://www.kaggle.com/cameron858/spaceship-titanic-various-models-fe-80?scriptVersionId=89298267" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

Load training data into pandas dataframe

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the competition files: the labelled training set, the unlabelled test
# set, and the sample submission that is reused later as the output template.
train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
submission = pd.read_csv("/kaggle/input/spaceship-titanic/sample_submission.csv")

# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own instead of being interpolated into an f-string
# (which would print a stray "None" after the preview).
print(train.head())
train.info()

# Initial EDA # 

## Dealing with NaNs ##

In [None]:
# Count the missing values per column in each split, then across both splits
train_missing = train.isnull().sum()
test_missing = test.isnull().sum()
print(f'Training NaNs:\n{train_missing}\n\nTesting NaNs:\n{test_missing}')

total_missing = train_missing.sum() + test_missing.sum()
print(f'\nThe data contains {total_missing} NaNs')

There are a lot of NaNs in this dataset.

In [None]:
# Class balance of the categorical features; NaN is kept as its own slice

fig1, ax = plt.subplots(2, 2, figsize=(10, 10))
pie_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

# ax.flat walks the 2x2 grid row by row, so features land in list order
for panel, feat in zip(ax.flat, pie_features):
    train[feat].value_counts(dropna=False).plot(kind='pie', ax=panel)

In [None]:
# Histograms of the numerical features as they come in the raw data
original_numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# one panel per feature: 2 rows x 3 columns
fig2, ax = plt.subplots(nrows=2, ncols=3, figsize=(10, 10))
numeric_view = train[original_numerical_features]
numeric_view.hist(ax=ax)

The numerical features are very skewed (apart from Age)

We need to impute the missing values; the question is which method is best.
One way would be to impute with the most common value for each feature, as follows:
``` python
# extract columns with NaNs
contains_nans = train.columns[train.isnull().any()]

# fills with most common for all cols. Very basic.
for col in contains_nans:
    temp = train[col].value_counts().index[0]
    train[col] = train[col].fillna(temp)
    test[col] = test[col].fillna(temp)
```
However, we can attempt a slightly more focused imputation for each feature.

In [None]:
# Impute every column that train and test have in common. The fill value is
# always computed from train so both frames receive the same value.
# Selecting columns by membership in test (instead of slicing off the last
# train column) leaves out the 'Transported' target wherever it is positioned.
shared_cols = [col for col in train.columns if col in test.columns]

for col in shared_cols:
    if train[col].dtype == 'float64':
        # numeric column: use the median; deciding on dtype alone also covers
        # a column whose gaps appear only in test
        fill_value = train[col].median()
    else:
        # any other column: most frequent value (value_counts skips NaN)
        fill_value = train[col].value_counts().index[0]

    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

print(f'Training NaNs:\n{train.isnull().sum()}\n\nTesting NaNs:\n{test.isnull().sum()}')
print(f'\nThe data contains {train.isnull().sum().sum() + test.isnull().sum().sum()} NaNs')

Let's plot the distributions of the features again.

In [None]:
# Same pie charts as before, redrawn from the current state of `train`

fig3, ax = plt.subplots(2, 2, figsize=(10, 10))
pie_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

# pair each feature with the next panel of the 2x2 grid (row-major order)
for panel, feat in zip(ax.flat, pie_features):
    train[feat].value_counts(dropna=False).plot(kind='pie', ax=panel)

In [None]:
# Histograms of the numerical features, redrawn from the current `train`
original_numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# 2 rows x 3 columns, one panel per feature
fig4, ax = plt.subplots(nrows=2, ncols=3, figsize=(10, 10))
numeric_view = train[original_numerical_features]
numeric_view.hist(ax=ax)

<h1><center> Feature creation </center></h1>

Several features can be split into further features. The PassengerId has the format 'XXXX_XX'; we can split the first 4 digits and the last 2 into separate features. The Cabin format is "deck/number/side" (where side is P for port or S for starboard), so we can split it into three separate columns. It is a reasonable assumption that families travel together and stay in the same rooms, so first-name and family-name features are created from the original Name feature. The 3 original features are dropped afterwards.

In [None]:
# Derived features, built identically for train and test.

# (source column, separator, new column names). Each source is split into
# exactly as many parts as there are new names.
split_specs = [
    ('PassengerId', '_', ['PassengerId_0', 'PassengerId_1']),
    ('Cabin', '/', ['Deck', 'Number', 'Side']),
    ('Name', ' ', ['First name', 'Family name']),
]

# Shift used by the log transform of Age. A single offset shared by both
# frames maps the same age to the same LogAge in train and test, and taking
# the smaller of the two minima keeps the argument of log >= 1 everywhere.
age_offset = min(train['Age'].min(), test['Age'].min())

for frame in (train, test):
    for source_col, sep, parts in split_specs:
        # `n` must be passed by keyword: pandas deprecated positional
        # arguments after `pat` in Series.str.split and later made them keyword-only
        frame[parts] = frame[source_col].str.split(sep, n=len(parts) - 1, expand=True)

    # drop old features
    frame.drop(['PassengerId', 'Cabin', 'Name'], axis=1, inplace=True)

    # snacks
    frame['Food/drink'] = frame['RoomService'] + frame['FoodCourt']

    # Entertainment
    frame['Entertainment'] = frame['Spa'] + frame['VRDeck']

    # Total spent
    frame['TotalSpent'] = (
        frame['RoomService'] + frame['FoodCourt'] + frame['ShoppingMall']
        + frame['Spa'] + frame['VRDeck']
    )

    # log transform of age
    frame['LogAge'] = np.log(frame['Age'] - age_offset + 1)

# idea (not implemented) - baller: CryoSleep = False and TotalSpent > TotalSpent.median()

In [None]:
# TODO: derive a passenger 'Class' feature (work in progress)

# show the first rows of both frames
train_preview = train.head()
test_preview = test.head()
print(f'Training:\n{train_preview}\nTesting:\n{test_preview}')

Scale the continuous features (missing values were already imputed above).

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Min-max scale the continuous features.
continuous_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Food/drink', 'Entertainment', 'TotalSpent', 'LogAge']
min_max_scaler = MinMaxScaler()

# Fit on train only, then apply that same fitted mapping to test. Re-fitting
# on test would rescale it by its own min/max, so identical raw values would
# end up as different model inputs in the two frames. Test values outside the
# train range simply fall outside [0, 1].
train[continuous_cols] = min_max_scaler.fit_transform(train[continuous_cols])
test[continuous_cols] = min_max_scaler.transform(test[continuous_cols])

In [None]:
from sklearn.preprocessing import LabelEncoder


def encode_df_cols(df, columns, encoders=None):
    """Replace each listed column of ``df`` with integer label codes.

    Values are cast to ``str`` before encoding. ``df`` is modified in place
    and also returned.

    Args:
        df: DataFrame whose columns are encoded.
        columns: Names of the columns to encode.
        encoders: Optional mapping of column name to an already fitted
            ``LabelEncoder``. A column found in the mapping is transformed
            with that encoder; any other column gets a fresh encoder fitted
            on ``df`` alone (the behaviour when ``encoders`` is omitted).

    Returns:
        The same ``df`` object, with the listed columns encoded.
    """
    for col in columns:
        values = df[col].astype('str')
        encoder = None if encoders is None else encoders.get(col)
        if encoder is None:
            df[col] = LabelEncoder().fit_transform(values)
        else:
            df[col] = encoder.transform(values)
    return df


cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Number', 'Side', 'First name', 'Family name', 'PassengerId_0', 'PassengerId_1']

# Fit one encoder per column on the values of BOTH frames. Encoding train and
# test with independently fitted encoders would give the same category a
# different integer in each frame, and it also guarantees that every test
# value is known to the encoder.
shared_encoders = {
    col: LabelEncoder().fit(pd.concat([train[col], test[col]]).astype('str'))
    for col in cat_cols
}
train = encode_df_cols(train, cat_cols, shared_encoders)
test = encode_df_cols(test, cat_cols, shared_encoders)

print(f'Training:\n{train.head()}\nTesting:\n{test.head()}')

In [None]:
# sanity check: per-column count of missing values still present in train
train.isna().sum()

# Feature Selections #

This section aims to evaluate each feature in relation to the target ('Transported'). This should assist our models' decisions. The numerical and categorical features will be evaluated separately.

<h2> Numerical Features </h2>

In [None]:
# display the list of continuous (numerical) feature names defined in the
# scaling step, for reference in the plots below
continuous_cols

In [None]:
import seaborn as sns

# One KDE panel per continuous feature, coloured by the target.
# The grid is sized from the feature list so that no feature is silently
# left out: a fixed 3x3 grid has room for only nine of the ten features.
n_grid_cols = 3
n_grid_rows = -(-len(continuous_cols) // n_grid_cols)  # ceiling division
fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(15, 5 * n_grid_rows), squeeze=False)
panels = ax.ravel()

for panel, feat in zip(panels, continuous_cols):
    sns.kdeplot(data=train, x=feat, hue="Transported", ax=panel)

# hide the panels left over when the features do not fill the grid exactly
for panel in panels[len(continuous_cols):]:
    panel.set_visible(False)

In [None]:
fig, ax = plt.subplots(figsize=(15, 10))

# Correlation matrix of the continuous features together with the target.
# Everything is cast to float first (noted by the author as needed to avoid
# an empty-dataframe error).
corr_input = train[continuous_cols + ['Transported']].astype(float)
numerical_corrs = corr_input.corr()
sns.heatmap(numerical_corrs, annot=True, ax=ax)

In [None]:
# rank the numerical features by |correlation with the target|, strongest first
numerical_ranking = numerical_corrs['Transported'].abs().sort_values(ascending=False)
print(numerical_ranking)

## Categorical features ##

In [None]:
fig, ax = plt.subplots(figsize=(15, 10))

# Correlation matrix of the label-encoded categorical features and the target
categorical_view = train[cat_cols + ['Transported']]
categorical_corrs = categorical_view.corr()
sns.heatmap(categorical_corrs, annot=True)

In [None]:
# rank the categorical features by |correlation with the target|, strongest first
categorical_ranking = categorical_corrs['Transported'].abs().sort_values(ascending=False)
print(categorical_ranking)

## Dropping redundant features ##
# Sometimes less is more #

It is clear from above that multiple features provide almost no tangible input to the models.

In [None]:
# Stack both correlation-with-target columns and rank them by magnitude.
# 'Transported' appears twice in the result, once from each matrix.
target_corrs = [numerical_corrs['Transported'], categorical_corrs['Transported']]
total_corrs = pd.concat(target_corrs).abs().sort_values(ascending=False)
print(total_corrs)

In [None]:
# keep only entries with |corr| above 0.1, then skip the first row, which is
# one of the two duplicated 'Transported' entries
above_threshold = total_corrs[total_corrs > 0.1]
above_threshold.iloc[1:]

In [None]:
# Keep only the features whose |correlation with the target| exceeds 0.1.
# The target is filtered out by label rather than by skipping the first one or
# two rows, so the selection does not depend on where the duplicated
# 'Transported' entries happen to sort.
strong_corrs = total_corrs[total_corrs > 0.1]
selected_features = [name for name in strong_corrs.index if name != 'Transported']

# train keeps the target (first column) plus the selected features;
# test has no target, so it gets the selected features only
train = train[['Transported'] + selected_features]
test = test[selected_features]

print(f'Training:\n{train.head()}\nTesting:\n{test.head()}')

Split train into feature and target dataframes

In [None]:
# target column on its own, and every remaining column as the feature matrix
Y_train = train['Transported']
X_train = train.drop(columns='Transported')

# Models #

In [None]:
from sklearn.neural_network import MLPClassifier

# Four hidden layers; training stops early based on a held-out 20% of the
# training data, and progress is printed while fitting.
mlp_settings = {
    'hidden_layer_sizes': (400, 200, 100, 20),
    'early_stopping': True,
    'validation_fraction': 0.2,
    'verbose': True,
}
neural_net = MLPClassifier(**mlp_settings)
neural_net.fit(X_train, Y_train)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# cross-validated search over the number of neighbours; the best setting is
# refitted on the full training data
knn_params = {'n_neighbors': [3, 5, 7]}
knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    n_jobs=4,
    refit=True,
    verbose=1,
    return_train_score=True,
)
knn.fit(X_train, Y_train)
print(knn.best_score_)

In [None]:
from sklearn.svm import SVC

# disabled kernel grid search, kept for reference:
#svc_params = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
#svc = GridSearchCV(SVC(), svc_params, n_jobs=4, verbose=2)

# single RBF-kernel support vector classifier; fit() returns the estimator
svc_model = SVC(kernel='rbf')
svc = svc_model.fit(X_train, Y_train)

Initial SVC run scored 54%

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier

# Gaussian process classifier with its default settings; fit() returns the
# fitted estimator itself
gpc_model = GaussianProcessClassifier()
gpc = gpc_model.fit(X_train, Y_train)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# cross-validated search over split criterion and tree depth (1 to 9)
dtc_params = {'criterion': ['gini', 'entropy'], 'max_depth': range(1,10)}
clf_dtc = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=dtc_params,
    n_jobs=4,
    verbose=1,
)
clf_dtc.fit(X_train, Y_train)

# keep the refitted winner and report its cross-validation score
dtc = clf_dtc.best_estimator_
print(f'{clf_dtc.best_score_}\n{dtc}')

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Boost the best decision tree found by the tree grid search.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in later scikit-learn releases, but it is
# the first positional parameter under both names, so this form works on
# either side of the rename.
ada_params = {'n_estimators': [100, 300, 500]}
clf = GridSearchCV(AdaBoostClassifier(clf_dtc.best_estimator_), ada_params, n_jobs=4, verbose=1)
clf.fit(X_train, Y_train)

# keep the refitted winner and report its cross-validation score
ada = clf.best_estimator_
print(f'{clf.best_score_}\n{clf.best_estimator_}')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# cross-validated search over the number of trees in the forest
rfc_params = {'n_estimators': [100, 300, 500]}
clf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=rfc_params,
    n_jobs=4,
    verbose=1,
)
clf.fit(X_train, Y_train)

# keep the refitted winner and report its cross-validation score
rfc = clf.best_estimator_
print(f'{clf.best_score_}\n{rfc}')

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Search over loss, number of boosting stages and tree depth.
# The logistic loss is the classifier's default, but its explicit name changed
# between scikit-learn releases ('deviance' was renamed and later removed).
# The first grid therefore leaves `loss` unset to get that default, and the
# second grid adds 'exponential'. The search space is the same as naming both
# losses explicitly, without depending on the installed version's spelling.
gbc_shared_grid = {'n_estimators': list(range(100, 301, 100)),
                   'max_depth': [1, 3, 5]}
gbc_params = [
    gbc_shared_grid,
    {'loss': ['exponential'], **gbc_shared_grid},
]
clf = GridSearchCV(GradientBoostingClassifier(), gbc_params, n_jobs=4, verbose=1)
clf.fit(X_train, Y_train)

# keep the refitted winner and report its cross-validation score
gbc = clf.best_estimator_
print(f'{clf.best_score_}\n{clf.best_estimator_}')

In [None]:
def create_submission(model, features=None, template=None,
                      output_path="/kaggle/working/submission.csv"):
    """Predict with ``model`` and write the predictions to a CSV file.

    Args:
        model: Fitted estimator exposing ``predict``.
        features: Feature frame passed to ``model.predict``. Defaults to the
            notebook-level ``test`` frame.
        template: DataFrame that receives the predictions in its
            'Transported' column (modified in place) and is then written
            out. Defaults to the notebook-level ``submission`` frame.
        output_path: Destination of the CSV file; written without the index.
    """
    # the globals are looked up only when the caller did not supply a frame
    features = test if features is None else features
    template = submission if template is None else template

    template['Transported'] = model.predict(features)
    template.to_csv(output_path, index=False)
    
# write the submission file from the best gradient-boosting estimator (gbc)
create_submission(gbc)