#Spaceship Titanic Project

[Spaceship Titanic Kaggle](https://www.kaggle.com/competitions/spaceship-titanic)

##Exploratory Data Analysis

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier

In [None]:
from google.colab import drive
drive.mount("/content/drive")

df_train = pd.read_csv("/content/drive/MyDrive/BLAST AI/train.csv", index_col = 0)

df_test = pd.read_csv("/content/drive/MyDrive/BLAST AI/test.csv", index_col = 0)

MessageError: ignored

In [None]:
print("TRAIN SET")
df_train

In [None]:
print("TEST SET")
df_test

In [None]:
df_train.info()

all columns contain null values except "Transported"

In [None]:
df_train.describe()

In [None]:
df_test.info()

In [None]:
#distribution of train set transported column
df_train['Transported'].value_counts()

In [None]:
#unique home planet categories
df_train['HomePlanet'].unique()

In [None]:
#number of unique values for each column
df_train.nunique()

In [None]:
#train set data types
df_train.dtypes

###Data Visualization

In [None]:
plt.figure(figsize=(3,3))
df_train['Transported'].value_counts().plot.pie(explode=[0.1,0.1], autopct='%1.1f%%', shadow=True, textprops={'fontsize':16}).set_title("Target distribution")

There is a roughly 50-50 split between true and false outcomes for the transported column of the training data

In [None]:
plt.figure(figsize = (10,5))
sns.histplot(x = df_train["Age"],hue="Transported",data = df_train,kde=True,palette= "husl")
plt.title("Age Feature Distribution");

The age distribution in the training data is roughly right skewed. Between the ages of 0 and 18, it appears that there is a higher chance of being transported compared to the ages over 18

In [None]:
cols = ["HomePlanet","CryoSleep","Destination","VIP"]

plt.figure(figsize=(4,10))
for idx,column in enumerate(cols):
    plt.subplot(4,1,idx+1)
    sns.countplot(x=column, hue="Transported", data=df_train, palette="Paired")
    plt.title(f"{column} Distribution")
    plt.tight_layout()

For the home planet Mars, there is a close to equal likelihood of being transported or not transported. It appears that people who elect to go into CryoSleep have a higher likelihood of being transported.The distribution of VIP vs. transported suggests that VIP is may not be that useful for predicting transported as the distribution of transported vs. not transported for both VIP and non-VIP are roughly equal. However, there could be some other confounding variables that contribute to this distribution

In [None]:
sns.heatmap(df_train.corr())

The correlation heatmap (only for numerical columns) suggests that food court may have a stronger correlation with "Transported" compared to other features




##Data Processing and Cleaning

In [None]:
#print number of null values
print("TRAIN SET NULL VALUES:\n")
df_train.isnull().sum().sort_values(ascending=False)

TRAIN SET NULL VALUES:



CryoSleep       217
ShoppingMall    208
VIP             203
HomePlanet      201
Name            200
Cabin           199
VRDeck          188
FoodCourt       183
Spa             183
Destination     182
RoomService     181
Age             179
Transported       0
dtype: int64

In [None]:
print("TEST SET NULL VALUES:\n")
df_test.isnull().sum().sort_values(ascending=False)

TEST SET NULL VALUES:



FoodCourt       106
Spa             101
Cabin           100
ShoppingMall     98
Name             94
CryoSleep        93
VIP              93
Destination      92
Age              91
HomePlanet       87
RoomService      82
VRDeck           80
dtype: int64

In [None]:
#drop 'Name' column
df_train.drop(columns = ['Name'],  inplace = True)
df_test.drop(columns = ['Name'],  inplace = True)

In [None]:
#split cabin by deck, side, and cabin numer
df_train[['Deck', 'CabinNum', 'Side']] = df_train['Cabin'].str.split("/", expand = True)
df_test[['Deck', 'CabinNum', 'Side']] = df_test['Cabin'].str.split("/", expand = True)

In [None]:
df_test

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,CabinNum,Side
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,3,S
0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,4,S
0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,0,S
0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,1,S
0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,5,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,G,1496,S
9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,,,
9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,D,296,P
9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,D,297,P


In [None]:
df_train

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,CabinNum,Side
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,A,98,P
9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,G,1499,S
9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,G,1500,S
9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,E,608,S


In [None]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0001_01 to 9280_02
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Transported   8693 non-null   bool   
 12  Deck          8494 non-null   object 
 13  CabinNum      8494 non-null   object 
 14  Side          8494 non-null   object 
dtypes: bool(1), float64(6), object(8)
memory usage: 1.3+ MB


In [None]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4277 entries, 0013_01 to 9277_01
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4190 non-null   object 
 1   CryoSleep     4184 non-null   object 
 2   Cabin         4177 non-null   object 
 3   Destination   4185 non-null   object 
 4   Age           4186 non-null   float64
 5   VIP           4184 non-null   object 
 6   RoomService   4195 non-null   float64
 7   FoodCourt     4171 non-null   float64
 8   ShoppingMall  4179 non-null   float64
 9   Spa           4176 non-null   float64
 10  VRDeck        4197 non-null   float64
 11  Deck          4177 non-null   object 
 12  CabinNum      4177 non-null   object 
 13  Side          4177 non-null   object 
dtypes: float64(6), object(8)
memory usage: 501.2+ KB


In [None]:
#drop 'Cabin' and 'CabinNum' columns because too many unique values
df_train.drop(['CabinNum', 'Cabin'], axis = 1, inplace = True)
df_test.drop(['CabinNum', 'Cabin'], axis = 1, inplace = True)

In [None]:
df_train

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,P
0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,S
0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,S
0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,S
0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,A,P
9278_01,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,G,S
9279_01,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,G,S
9280_01,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,E,S


In [None]:
#fill null values with median or mode
df_train['HomePlanet'].fillna(value='Earth', inplace=True)
df_train['CryoSleep'].fillna(value=False, inplace=True)
df_train['Destination'].fillna(value=df_train.Destination.mode(), inplace=True)
df_train['Age'].fillna(value=df_train.Age.median(), inplace=True)
df_train['VIP'].fillna(value=False, inplace=True)
df_train['RoomService'].fillna(value=df_train.RoomService.median(), inplace=True)
df_train['FoodCourt'].fillna(value=df_train.FoodCourt.median(), inplace=True)
df_train['ShoppingMall'].fillna(value=df_train.ShoppingMall.median(), inplace=True)
df_train['Spa'].fillna(value=df_train.Spa.median(), inplace=True)
df_train['VRDeck'].fillna(value=df_train.VRDeck.median(), inplace=True)
df_train['Deck'].fillna(value=df_train.Deck.mode(), inplace=True)
df_train['Side'].fillna(value=df_train.Side.mode(), inplace=True)

In [None]:
df_test['HomePlanet'].fillna(value='Earth', inplace=True)
df_test['CryoSleep'].fillna(value=False, inplace=True)
df_test['Destination'].fillna(value=df_train.Destination.mode(), inplace=True)
df_test['Age'].fillna(value=df_train.Age.median(), inplace=True)
df_test['VIP'].fillna(value=False, inplace=True)
df_test['RoomService'].fillna(value=df_train.RoomService.median(), inplace=True)
df_test['FoodCourt'].fillna(value=df_train.FoodCourt.median(), inplace=True)
df_test['ShoppingMall'].fillna(value=df_train.ShoppingMall.median(), inplace=True)
df_test['Spa'].fillna(value=df_train.Spa.median(), inplace=True)
df_test['VRDeck'].fillna(value=df_train.VRDeck.median(), inplace=True)
df_test['Deck'].fillna(value=df_test.Deck.mode(), inplace=True)
df_test['Side'].fillna(value=df_test.Side.mode(), inplace=True)

In [None]:
df_train

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,P
0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,S
0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,S
0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,S
0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,A,P
9278_01,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,G,S
9279_01,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,G,S
9280_01,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,E,S


In [None]:
df_test

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Side
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0013_01,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,S
0018_01,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,S
0019_01,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,S
0021_01,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,S
0023_01,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,Earth,True,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,G,S
9269_01,Earth,False,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,,
9271_01,Mars,True,55 Cancri e,27.0,False,0.0,0.0,0.0,0.0,0.0,D,P
9273_01,Europa,False,,27.0,False,0.0,2680.0,0.0,0.0,523.0,D,P


One-hot Encoding

https://www.geeksforgeeks.org/python-pandas-get_dummies-method/



In [None]:
#convert non numeric categorical columns to numeric using one-hot encoding
df_train = pd.get_dummies(df_train, columns=['HomePlanet', 'Destination', 'VIP', 'Side', 'Deck'])
df_test = pd.get_dummies(df_test, columns=['HomePlanet', 'Destination', 'VIP', 'Side', 'Deck'])

In [None]:
df_train

Unnamed: 0_level_0,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Earth,HomePlanet_Europa,...,Side_P,Side_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,False,39.0,0.0,0.0,0.0,0.0,0.0,False,0,1,...,1,0,0,1,0,0,0,0,0,0
0002_01,False,24.0,109.0,9.0,25.0,549.0,44.0,True,1,0,...,0,1,0,0,0,0,0,1,0,0
0003_01,False,58.0,43.0,3576.0,0.0,6715.0,49.0,False,0,1,...,0,1,1,0,0,0,0,0,0,0
0003_02,False,33.0,0.0,1283.0,371.0,3329.0,193.0,False,0,1,...,0,1,1,0,0,0,0,0,0,0
0004_01,False,16.0,303.0,70.0,151.0,565.0,2.0,True,1,0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,False,41.0,0.0,6819.0,0.0,1643.0,74.0,False,0,1,...,1,0,1,0,0,0,0,0,0,0
9278_01,True,18.0,0.0,0.0,0.0,0.0,0.0,False,1,0,...,0,1,0,0,0,0,0,0,1,0
9279_01,False,26.0,0.0,0.0,1872.0,1.0,0.0,True,1,0,...,0,1,0,0,0,0,0,0,1,0
9280_01,False,32.0,0.0,1049.0,0.0,353.0,3235.0,False,0,1,...,0,1,0,0,0,0,1,0,0,0


In [None]:
df_test

Unnamed: 0_level_0,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,Side_P,Side_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,True,27.0,0.0,0.0,0.0,0.0,0.0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
0018_01,False,19.0,0.0,9.0,0.0,2823.0,0.0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
0019_01,True,31.0,0.0,0.0,0.0,0.0,0.0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
0021_01,False,38.0,0.0,6652.0,0.0,181.0,585.0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
0023_01,False,20.0,10.0,0.0,635.0,0.0,0.0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,True,34.0,0.0,0.0,0.0,0.0,0.0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
9269_01,False,42.0,0.0,847.0,17.0,10.0,144.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9271_01,True,27.0,0.0,0.0,0.0,0.0,0.0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
9273_01,False,27.0,0.0,2680.0,0.0,0.0,523.0,0,1,0,...,1,0,0,0,0,1,0,0,0,0


In [None]:
#change T/F to 0/1
df_train["Transported"].replace({False:0,True:1},inplace=True)

In [None]:
df_train.nunique()

CryoSleep                       2
Age                            80
RoomService                  1273
FoodCourt                    1507
ShoppingMall                 1115
Spa                          1327
VRDeck                       1306
Transported                     2
HomePlanet_Earth                2
HomePlanet_Europa               2
HomePlanet_Mars                 2
Destination_55 Cancri e         2
Destination_PSO J318.5-22       2
Destination_TRAPPIST-1e         2
VIP_False                       2
VIP_True                        2
Side_P                          2
Side_S                          2
Deck_A                          2
Deck_B                          2
Deck_C                          2
Deck_D                          2
Deck_E                          2
Deck_F                          2
Deck_G                          2
Deck_T                          2
dtype: int64

In [None]:
df_test.nunique()

CryoSleep                      2
Age                           79
RoomService                  842
FoodCourt                    902
ShoppingMall                 715
Spa                          833
VRDeck                       796
HomePlanet_Earth               2
HomePlanet_Europa              2
HomePlanet_Mars                2
Destination_55 Cancri e        2
Destination_PSO J318.5-22      2
Destination_TRAPPIST-1e        2
VIP_False                      2
VIP_True                       2
Side_P                         2
Side_S                         2
Deck_A                         2
Deck_B                         2
Deck_C                         2
Deck_D                         2
Deck_E                         2
Deck_F                         2
Deck_G                         2
Deck_T                         2
dtype: int64

In [None]:
#no more null values!
df_train.isnull().sum().sort_values(ascending=False)

CryoSleep                    0
Age                          0
Deck_G                       0
Deck_F                       0
Deck_E                       0
Deck_D                       0
Deck_C                       0
Deck_B                       0
Deck_A                       0
Side_S                       0
Side_P                       0
VIP_True                     0
VIP_False                    0
Destination_TRAPPIST-1e      0
Destination_PSO J318.5-22    0
Destination_55 Cancri e      0
HomePlanet_Mars              0
HomePlanet_Europa            0
HomePlanet_Earth             0
Transported                  0
VRDeck                       0
Spa                          0
ShoppingMall                 0
FoodCourt                    0
RoomService                  0
Deck_T                       0
dtype: int64

In [None]:
df_test.isnull().sum().sort_values(ascending=False)

CryoSleep                    0
VIP_False                    0
Deck_G                       0
Deck_F                       0
Deck_E                       0
Deck_D                       0
Deck_C                       0
Deck_B                       0
Deck_A                       0
Side_S                       0
Side_P                       0
VIP_True                     0
Destination_TRAPPIST-1e      0
Age                          0
Destination_PSO J318.5-22    0
Destination_55 Cancri e      0
HomePlanet_Mars              0
HomePlanet_Europa            0
HomePlanet_Earth             0
VRDeck                       0
Spa                          0
ShoppingMall                 0
FoodCourt                    0
RoomService                  0
Deck_T                       0
dtype: int64

##Scale and Split Training Data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
X = df_train.drop("Transported", axis = 1)
y = df_train[["Transported"]]

In [None]:
#scale values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_X_scaled = pd.DataFrame(X_scaled, columns = X.columns)

test_scaled = scaler.fit_transform(df_test)
df_test_scaled = pd.DataFrame(test_scaled, columns = df_test.columns)

In [None]:
df_X_scaled

Unnamed: 0,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,Side_P,Side_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T
0,-0.732770,0.711945,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,-1.111173,1.754795,-0.503664,...,1.032865,-0.986630,-0.174191,3.187347,-0.30661,-0.241218,-0.334759,-0.688215,-0.645897,-0.02399
1,-0.732770,-0.334037,-0.168073,-0.275387,-0.241771,0.217158,-0.224205,0.899950,-0.569867,-0.503664,...,-0.968181,1.013551,-0.174191,-0.313741,-0.30661,-0.241218,-0.334759,1.453035,-0.645897,-0.02399
2,-0.732770,2.036857,-0.268001,1.959998,-0.283579,5.695623,-0.219796,-1.111173,1.754795,-0.503664,...,-0.968181,1.013551,5.740821,-0.313741,-0.30661,-0.241218,-0.334759,-0.688215,-0.645897,-0.02399
3,-0.732770,0.293552,-0.333105,0.523010,0.336851,2.687176,-0.092818,-1.111173,1.754795,-0.503664,...,-0.968181,1.013551,5.740821,-0.313741,-0.30661,-0.241218,-0.334759,-0.688215,-0.645897,-0.02399
4,-0.732770,-0.891895,0.125652,-0.237159,-0.031059,0.231374,-0.261240,0.899950,-0.569867,-0.503664,...,-0.968181,1.013551,-0.174191,-0.313741,-0.30661,-0.241218,-0.334759,1.453035,-0.645897,-0.02399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,-0.732770,0.851410,-0.333105,3.992336,-0.283579,1.189173,-0.197751,-1.111173,1.754795,-0.503664,...,1.032865,-0.986630,5.740821,-0.313741,-0.30661,-0.241218,-0.334759,-0.688215,-0.645897,-0.02399
8689,1.364685,-0.752431,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,0.899950,-0.569867,-0.503664,...,-0.968181,1.013551,-0.174191,-0.313741,-0.30661,-0.241218,-0.334759,-0.688215,1.548235,-0.02399
8690,-0.732770,-0.194573,-0.333105,-0.281027,2.846999,-0.269737,-0.263003,0.899950,-0.569867,-0.503664,...,-0.968181,1.013551,-0.174191,-0.313741,-0.30661,-0.241218,-0.334759,-0.688215,1.548235,-0.02399
8691,-0.732770,0.223820,-0.333105,0.376365,-0.283579,0.043013,2.589576,-1.111173,1.754795,-0.503664,...,-0.968181,1.013551,-0.174191,-0.313741,-0.30661,-0.241218,2.987225,-0.688215,-0.645897,-0.02399


In [None]:
#assign X to the scaled values
X = df_X_scaled

In [None]:
#split training  data - 70 : 30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 101)

##Machine Learning Models

Train and fit Random Forest Classifier

In [None]:
#train and fit random forest model
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

NameError: ignored

In [None]:
rfc_pred = rfc.predict(X_test)

In [None]:
print(accuracy_score(y_test, rfc_pred))

In [None]:
print("RANDOM FOREST CLASSIFICATION REPORT:\n", classification_report(y_test, rfc_pred))

In [None]:
print("RANDOM FOREST CONFUSION MATRIX:\n", confusion_matrix(y_test, rfc_pred))

XGBoost

In [None]:
#XGBoost
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

In [None]:
xgb_pred = xgb.predict(X_test)

In [None]:
print(accuracy_score(y_test, xgb_pred))

In [None]:
print("XGBOOST CLASSIFICATION REPORT:\n", classification_report(y_test, xgb_pred))

In [None]:
print("XGBOOST CONFUSION MATRIX:\n", confusion_matrix(y_test, xgb_pred))

Logistic Regression

In [None]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
lr_pred = lr.predict(X_test)
print(accuracy_score(y_test, lr_pred))

In [None]:
print("LOGISTIC REGRESSION CLASSIFICATION REPORT:\n", classification_report(y_test, lr_pred))

In [None]:
print("LOGISTIC REGRESSION CONFUSION MATRIX:\n", confusion_matrix(y_test, lr_pred))

KNN

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [None]:
knn_pred = knn.predict(X_test)
print(accuracy_score(y_test, knn_pred))

In [None]:
print("KNN CLASSIFICATION REPORT:\n", classification_report(y_test, knn_pred))

##Hyperparameter Fine Tuning

Using Bayesian Optimization and hyperparameters that I googled through model documentation

Random Forest Fine Tuning

In [None]:
pip install scikit-optimize

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from skopt import BayesSearchCV

In [None]:
search_space = {
    'max_depth' : (1, 200),
    'min_samples_split' : (1, 10),
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'n_estimators' : (80, 550),
    'min_samples_leaf' : (1, 10),
}

In [None]:
opt = BayesSearchCV(
    estimator = rfc,
    search_spaces = search_space,
    scoring = 'neg_mean_squared_error', #Bayes Search minimizes mean squared error to get best params
    n_iter = 10,
    random_state = 100,
    verbose = 1,
)

In [None]:
opt.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


In [None]:
bay_preds = opt.predict(X_test)

In [None]:
opt.best_params_

OrderedDict([('criterion', 'gini'),
             ('max_depth', 14),
             ('min_samples_leaf', 8),
             ('min_samples_split', 7),
             ('n_estimators', 278)])

In [None]:
print(accuracy_score(y_test, bay_preds))

0.8131109833237493


In [None]:
#fit rfc model with best params
rfc.set_params(**opt.best_params_)

Logistic Regression Fine Tuning

In [None]:
search_space = {
    'C' : (0, 100),
    'multi_class' : ['auto', 'ovr', 'multinomial'],
    'max_iter' : (80, 2000),
}

In [None]:
opt = BayesSearchCV(
    estimator = lr,
    search_spaces = search_space,
    scoring = 'neg_mean_squared_error', #Bayes Search minimizes mean squared error to get best params
    n_iter = 10,
    random_state = 100,
    verbose = 1,
)

In [None]:
opt.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
bay_preds = opt.predict(X_test)

In [None]:
print(accuracy_score(y_test, bay_preds))

0.7952846463484762


In [None]:
#fit lr model with best params
lr.set_params(**opt.best_params_)

XGBoost Fine Tuning

In [None]:
search_space = {
    'max_depth' : (1, 200),
    'subsample' : (0, 1),
    'reg_lambda' : (0, 1),
    'reg_alpha' : (0, 200),
    'colsample_bytree' : (0, 1),
    'min_child_weight' : (0, 10),
    'n_estimators' : (80, 550),
    'max_delta_step' : (0, 500),
}

In [None]:
opt = BayesSearchCV(
    estimator = xgb,
    search_spaces = search_space,
    scoring = 'neg_mean_squared_error', #Bayes Search minimizes mean squared error to get best params
    n_iter = 10,
    random_state = 100,
    verbose = 1,
)

In [None]:
opt.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [None]:
bay_preds = opt.predict(X_test)

In [None]:
print(accuracy_score(y_test, bay_preds))

0.7935595169637722


In [None]:
#fit xgb model with best params
xgb.set_params(**opt.best_params_)

##Ensembling: Stacking

In [None]:
from sklearn.ensemble import StackingClassifier

In [None]:
estimators = [('lr', lr), ('xgb', xgb)]

In [None]:
stcl = StackingClassifier(estimators = estimators, final_estimator = rfc, cv = 10)

In [None]:
#fit stacking classifier wtih XGBoost, Logistic Regression, and final estimator Random Forest
stcl.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [None]:
print(f"STACKING CLASSIFIER ACCURACY:  {stcl.score(X_train, y_train):0.2f}")

STACKING CLASSIFIER ACCURACY:  0.85


##Neural Networks

In [None]:
#pytorch
import torch
from torch import nn
import torch.optim as optim

In [None]:
X_train = torch.tensor(X_train.values, dtype=torch.float)
y_train = torch.tensor(y_train.values, dtype=torch.long)
X_test = torch.tensor(X_test.values, dtype=torch.float)
y_test = torch.tensor(y_test.values, dtype=torch.long)

y_train = torch.flatten(y_train)
y_test = torch.flatten(y_test)

In [None]:
#class  NeuralNetwork inherits from nn.Module
class NeuralNetwork(nn.Module):
  def __init__(self):
    super(NeuralNetwork, self).__init__()
    #hidden1 and hidden2 are instance variables
    #(input nodes, output nodes)
    self.hidden1 = nn.Linear(X_train.shape[1], 32)
    self.hidden2 = nn.Linear(32, 32)
    self.hidden3 = nn.Linear(32, 32)
    self.hidden4 = nn.Linear(32, 2)

  def forward(self, x):
    x = torch.relu(self.hidden1(x))
    x = self.hidden2(x)
    return x

In [None]:
model = NeuralNetwork()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr = 00.1)
print(model)

NeuralNetwork(
  (hidden1): Linear(in_features=25, out_features=32, bias=True)
  (hidden2): Linear(in_features=32, out_features=32, bias=True)
  (hidden3): Linear(in_features=32, out_features=32, bias=True)
  (hidden4): Linear(in_features=32, out_features=2, bias=True)
)


In [None]:
from torch.utils.data import DataLoader, TensorDataset

In [None]:
train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data, batch_size = 72)

In [None]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7f83ea8db700>

In [None]:
epochs = 500
loss_list = np.zeros((epochs,))

for epoch in range(epochs):
  running_loss = 0
  for inputs, targets in train_loader:
    optimizer.zero_grad() # reset optimizer gradients
    outputs = model(inputs) # forward pass data through nodes/layers
    loss = loss_fn(outputs, targets) # compute loss
    loss_list[epoch] = loss.item()
    running_loss += loss.item()
    loss.backward() # backprop
    optimizer.step() # update weighst

  if epoch % 10 == 0:
    print(f'Epoch [{epoch}/100], Loss: {running_loss/len(train_loader)}')

Epoch [0/100], Loss: 1.054935046855141
Epoch [10/100], Loss: 0.4121796415132635
Epoch [20/100], Loss: 0.4056164219098933
Epoch [30/100], Loss: 0.40189062427071964
Epoch [40/100], Loss: 0.39905804676168105
Epoch [50/100], Loss: 0.3967639842454125
Epoch [60/100], Loss: 0.3948203388382407
Epoch [70/100], Loss: 0.3933348280542037
Epoch [80/100], Loss: 0.3918896222815794
Epoch [90/100], Loss: 0.3908099493559669
Epoch [100/100], Loss: 0.3898110480869518
Epoch [110/100], Loss: 0.3888195023817175
Epoch [120/100], Loss: 0.38778084130848156
Epoch [130/100], Loss: 0.3867568605086383
Epoch [140/100], Loss: 0.3859422504901886
Epoch [150/100], Loss: 0.38505974587272196
Epoch [160/100], Loss: 0.38444137748549967
Epoch [170/100], Loss: 0.3837129978572621
Epoch [180/100], Loss: 0.3829592890599195
Epoch [190/100], Loss: 0.38226720199865455
Epoch [200/100], Loss: 0.38165616463212404
Epoch [210/100], Loss: 0.38095105921520905
Epoch [220/100], Loss: 0.38032556526801164
Epoch [230/100], Loss: 0.379637011009

In [None]:
with torch.no_grad():
  out = model(X_test)
  _, predicted = torch.max(out.data, 1)
  total = y_test.size(0)
  correct = (predicted == y_test).sum().item()
  print("Accuracy of the model based on X_test: {}%".format(100 * correct/total))

Accuracy of the model based on y_test: 77.2622699386503%


##Submission


In [None]:
df_test = torch.tensor(df_test.values)

TypeError: ignored

In [None]:
with torch.no_grad():
  out = model(df_test)
  _, predicted = torch.max(out.data, 1)
  total = y_test.size(0)
  correct = (predicted == y_test).sum().item()
  print("Accuracy of the model based on y_test: {}%".format(100 * correct/total))

TypeError: ignored

In [None]:
df_test['Transported'] = predicted

ValueError: ignored

In [None]:
predict_test = stcl.predict(df_test)
df_test['Transported'] = pd.Series(predict_test).map({0:False, 1:True}) # change 0/1 to F/T

AttributeError: ignored

In [None]:
df_test

Unnamed: 0_level_0,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,Side_P,Side_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0013_01,True,27.0,0.0,0.0,0.0,0.0,0.0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
0018_01,False,19.0,0.0,9.0,0.0,2823.0,0.0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
0019_01,True,31.0,0.0,0.0,0.0,0.0,0.0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
0021_01,False,38.0,0.0,6652.0,0.0,181.0,585.0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
0023_01,False,20.0,10.0,0.0,635.0,0.0,0.0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,True,34.0,0.0,0.0,0.0,0.0,0.0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
9269_01,False,42.0,0.0,847.0,17.0,10.0,144.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9271_01,True,27.0,0.0,0.0,0.0,0.0,0.0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
9273_01,False,27.0,0.0,2680.0,0.0,0.0,523.0,0,1,0,...,1,0,0,0,0,1,0,0,0,0


In [None]:
#drop everything except passenger ID
submission_df = df_test.drop(df_test.iloc[:-1], axis = 1)

In [None]:
submission_df['Transported'] = predict_test
submission_df['Transported'] = submission_df['Transported'].astype(bool)

NameError: ignored

In [None]:
submission_df

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,False
0018_01,False
0019_01,True
0021_01,False
0023_01,False
...,...
9266_02,False
9269_01,False
9271_01,False
9273_01,False


In [None]:
submission_df.value_counts()

Transported
False          3555
True            722
dtype: int64

In [None]:
submission_df.to_csv('titanic_submission.csv')

My best accuracy score on the Kaggle competition (~73%) was using XGBoost alone without any hyperparameter tuning. I tried optimizing the performance through hyperparameter tuning, different models, stacking combinations, and different train-test-splits; however these modifications did not improve the
accuracy score.