In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer
import copy
from sklearn import svm

# Read data

In [33]:
train_data = pd.read_csv("spaceship-titanic/train.csv")
test_data = pd.read_csv("spaceship-titanic/test.csv")
example_submission = pd.read_csv("spaceship-titanic/sample_submission.csv")

train_labels = train_data["Transported"]
train_data = train_data.drop(columns=["Transported"], axis=1)

## Exploratory analysis

In [3]:
print(f"Train data length: {len(train_data)}")
print(f"Test data length: {len(test_data)}")

Train data length: 8693
Test data length: 4277


In [4]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


In [5]:
print(train_data.dtypes)

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
dtype: object


In [6]:
train_data[["HomePlanet", "CryoSleep", "Cabin", "Destination", "VIP"]].describe()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,VIP
count,8492,8476,8494,8511,8490
unique,3,2,6560,3,2
top,Earth,False,G/734/S,TRAPPIST-1e,False
freq,4602,5439,8,5915,8291


In [7]:
train_data.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


## Encode labels and get dummies

In [8]:
train_data_fixed = copy.deepcopy(train_data)

In [9]:
train_data_fixed.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
dtype: int64

In [10]:
def split_cabin(df):
    for idx, row in df.iterrows():
        if pd.isnull(row["Cabin"]):
            df.at[idx, 'CabinDeck'] = np.NAN
            df.at[idx, 'CabinSide'] = np.NAN
        else:
            df.at[idx, 'CabinDeck'] = row["Cabin"].split("/")[0]
            df.at[idx, 'CabinSide'] = row["Cabin"].split("/")[2]
            
split_cabin(train_data_fixed)

In [11]:
train_data_fixed = train_data_fixed.drop(columns=["Cabin", "PassengerId", "Name"], axis=1)

In [12]:
train_data_dummies = pd.get_dummies(train_data_fixed, columns=["CryoSleep", "VIP", "HomePlanet", "Destination", "CabinDeck", "CabinSide"])

In [13]:
train_data_dummies.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CryoSleep_False,CryoSleep_True,VIP_False,VIP_True,...,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinSide_P,CabinSide_S
0,39.0,0.0,0.0,0.0,0.0,0.0,1,0,1,0,...,0,1,0,0,0,0,0,0,1,0
1,24.0,109.0,9.0,25.0,549.0,44.0,1,0,1,0,...,0,0,0,0,0,1,0,0,0,1
2,58.0,43.0,3576.0,0.0,6715.0,49.0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
3,33.0,0.0,1283.0,371.0,3329.0,193.0,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
4,16.0,303.0,70.0,151.0,565.0,2.0,1,0,1,0,...,0,0,0,0,0,1,0,0,0,1


## Decision Tree

Max score: 0.8124782154060648 for num_est: 25, max_depth: 5, learning rate: 0.3750000000000001

In [149]:
x_train, x_test, y_train, y_test = train_test_split(train_data_dummies, train_labels, test_size=0.33)

In [157]:
n_estimators = [3, 5, 10, 15, 20, 25, 50, 75, 100, 150, 200, 250, 300]
max_depth = [i for i in range(3, 16)]
learning_rate = [i for i in np.arange(0.05, 0.8, 0.025)]

max_n_est: int
max_m_depth: int
max_l_rate: float
max_score = np.NINF
best_tree: xgb.XGBClassifier
    
for n_est in n_estimators:
    for m_depth in max_depth:
        for l_rate in learning_rate:
            DecisionTree = xgb.XGBClassifier(
                eval_metric='logloss',
                n_estimators=n_est,
                max_depth=m_depth,
                learning_rate=l_rate,
            )
            DecisionTree.fit(x_train, y_train)
            score = DecisionTree.score(x_test, y_test)
            
            if score > max_score:
                max_n_est = n_est
                max_m_depth = m_depth
                max_l_rate = l_rate
                max_score = score
                best_tree = DecisionTree

print(f"Max score: {max_score} for num_est: {max_n_est}, max_depth: {max_m_depth}, learning rate: {max_l_rate}")

Max score: 0.8124782154060648 for num_est: 25, max_depth: 5, learning rate: 0.3750000000000001


## Random Forest

Max score: 0.780411293133496 for num_trees: 201

In [143]:
x_train, x_test, y_train, y_test = train_test_split(train_data_dummies, train_labels, test_size=0.33)

In [144]:
max_score, max_score_num_trees = np.NINF, np.NINF
best_forest: xgb.XGBRFClassifier

for num_trees in range(101, 1001, 20):
    RandomForest = xgb.XGBRFClassifier(
        n_estimators=num_trees,
        max_leaves=0,
        use_label_encoder=False,
        eval_metric='error',
    )
    RandomForest.fit(x_train, y_train)
    score = RandomForest.score(x_test, y_test)
    if score > max_score:
        max_score = score
        max_score_num_trees = num_trees
        best_forest = RandomForest
        
print(f"Max score: {max_score} for num_trees: {max_score_num_trees}")

Max score: 0.780411293133496 for num_trees: 201


## Scale data for K-nn

In [40]:
train_data_dummies_scaled = copy.deepcopy(train_data_dummies)

def divide_by_std(df, *columns):
    scaled_df = df
    for column in columns:
        scaled_df[column] = df[column] / df[column].std()
    return scaled_df

train_data_dummies_scaled = divide_by_std(train_data_dummies_scaled, 
                                          "Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck")

## K-nn

Max score: 0.7999302892994075 for knn_neighbors: 23

In [41]:
imputer = KNNImputer(n_neighbors=7)

train_data_dummies_scaled_for_knn = imputer.fit_transform(train_data_dummies_scaled)

In [42]:
x_train, x_test, y_train, y_test = train_test_split(train_data_dummies_scaled_for_knn, train_labels, test_size=0.33)

In [218]:
knn_neighbors = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 51, 75, 101, 125, 151]

max_knn_neigh = np.NINF
max_score = np.NINF
knn_classifier: KNeighborsClassifier()

for knn_neigh in knn_neighbors:   
        knn = KNeighborsClassifier(n_neighbors=knn_neigh)
        knn.fit(x_train, y_train)
        score = knn.score(x_test, y_test)

        if score > max_score:
            max_score = score
            max_knn_neigh = knn_neigh
            knn_classifier = knn
            
print(f"Max score: {max_score} for knn_neighbors: {max_knn_neigh}")

Max score: 0.7999302892994075 for knn_neighbors: 23


## SVC

SVC, Kernel: poly, score: 0.6158940397350994  
SVC, Kernel: rbf, score: 0.786336702683862  
SVC, Kernel: sigmoid, score: 0.7302195887068665  

In [14]:
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

train_data_dummies_nan_to_median = imputer.fit_transform(train_data_dummies)

In [15]:
x_train, x_test, y_train, y_test = train_test_split(train_data_dummies_nan_to_median, train_labels, test_size=0.33)

In [16]:
kernels = ['poly', 'rbf', 'sigmoid']

for kernel in kernels:
    clf = svm.SVC(kernel=kernel)
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    print(f"SVC, Kernel: {kernel}, score: {score}")

SVC, Kernel: poly, score: 0.6158940397350994
SVC, Kernel: rbf, score: 0.786336702683862
SVC, Kernel: sigmoid, score: 0.7302195887068665


# Submit test data

In [16]:
test_data_fixed = copy.deepcopy(test_data)
split_cabin(test_data_fixed)
test_data_fixed = test_data_fixed.drop(columns=["Cabin", "PassengerId", "Name"], axis=1)
test_data_dummies = pd.get_dummies(test_data_fixed, columns=["CryoSleep", "VIP", "HomePlanet", "Destination", "CabinDeck", "CabinSide"])

### DecisionTree

In [30]:
DecisionTree = xgb.XGBClassifier(
    eval_metric='logloss',
    n_estimators=25,
    max_depth=5,
    learning_rate=0.375,
    use_label_encoder=False
)

In [35]:
DecisionTree.fit(train_data_dummies, train_labels)
res = DecisionTree.predict(test_data_dummies)

example_submission["Transported"] = res
example_submission = example_submission.astype({'Transported': bool})
example_submission.to_csv("submit_decision_tree.csv", index=False)

### RandomForest

In [38]:
RandomForest = xgb.XGBRFClassifier(
    n_estimators=201,
    max_leaves=0,
    use_label_encoder=False,
    eval_metric='error',
)
RandomForest.fit(train_data_dummies, train_labels)
res = RandomForest.predict(test_data_dummies)

example_submission["Transported"] = res
example_submission = example_submission.astype({'Transported': bool})
example_submission.to_csv("submit_random_forest.csv", index=False)

### K-nn

In [71]:
test_data_dummies_scaled = copy.deepcopy(test_data_dummies)

def divide_by_train_std(df_test, df_train, *columns):
    scaled_df = df_test
    for column in columns:
        scaled_df[column] = scaled_df[column] / df_train[column].std()
    return scaled_df

test_data_dummies_scaled = divide_by_train_std(test_data_dummies_scaled,
                                               train_data_dummies,
                                               "Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck")

imputer = KNNImputer(n_neighbors=7)
train_data_dummies_scaled_for_knn = imputer.fit_transform(train_data_dummies_scaled)

test_data_dummies_scaled_for_knn = imputer.transform(test_data_dummies_scaled)

In [72]:
knn = KNeighborsClassifier(n_neighbors=23)
knn.fit(train_data_dummies_scaled_for_knn, train_labels)
res = knn.predict(test_data_dummies_scaled_for_knn)

example_submission["Transported"] = res
example_submission = example_submission.astype({'Transported': bool})
example_submission.to_csv("submit_knn.csv", index=False)

### SVC

In [78]:
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

train_data_dummies_nan_to_median = imputer.fit_transform(train_data_dummies)
test_data_dummies_nan_to_median = imputer.transform(test_data_dummies)

In [79]:
clf = svm.SVC(kernel='rbf')
clf.fit(train_data_dummies_nan_to_median, train_labels)
res = clf.predict(test_data_dummies_nan_to_median)

example_submission["Transported"] = res
example_submission = example_submission.astype({'Transported': bool})
example_submission.to_csv("submit_svc.csv", index=False)