# Spaceship Titanic

## Import

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

## Loading the Data

In [2]:
train = pd.read_csv("/Users/andrei/Downloads/spaceship-titanic/train.csv")

In [3]:
X = train.drop(columns = ["Transported", "PassengerId", "Name"])

In [4]:
y = train[["Transported"]]

In [5]:
y = y.Transported.astype(int)

In [6]:
y

0       0
1       1
2       0
3       0
4       1
       ..
8688    0
8689    0
8690    1
8691    0
8692    1
Name: Transported, Length: 8693, dtype: int64

## Preprocessing

### Imputing

In [7]:
X[['Deck','Room', 'Side']] = X.Cabin.str.split("/",expand=True)

In [8]:
X.drop(columns = ["Room"], inplace = True)

In [9]:
cat_imputer = SimpleImputer(strategy = "most_frequent")

In [10]:
age_imputer = SimpleImputer()

In [11]:
mean_imputer = SimpleImputer(fill_value = 0)

In [12]:
X[["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"]] = cat_imputer.fit_transform(X[["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"]])

In [13]:
X[["Age"]] = age_imputer.fit_transform(X[["Age"]])

In [14]:
X[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]] = mean_imputer.fit_transform(X[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]])

In [15]:
X.isna().sum()

HomePlanet        0
CryoSleep         0
Cabin           199
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Deck              0
Side              0
dtype: int64

### Encoding

In [16]:
X["CryoSleep"], X["VIP"] = X["CryoSleep"].astype(int), X["VIP"].astype(int)

In [17]:
X["isPort"] = np.where(X["Side"] == "P", 0, 1)

In [18]:
X.drop(columns = ["Side"], inplace = True)

In [19]:
X.drop(columns = ["Cabin"], inplace = True)

In [20]:
ohe = OneHotEncoder(sparse=False)

#One-hot-encode the categorical columns.
#Unfortunately outputs an array instead of dataframe.
array_hot_encoded = ohe.fit_transform(X[['HomePlanet','Destination','Deck']])
cols = ohe.get_feature_names_out(['HomePlanet','Destination','Deck'])
#Convert it to df
data_hot_encoded = pd.DataFrame(array_hot_encoded, index=X.index, columns=cols)

#Extract only the columns that didnt need to be encoded
data_other_cols = X.drop(columns=['HomePlanet','Destination','Deck'])

#Concatenate the two dataframes : 
X_ohe = pd.concat([data_hot_encoded, data_other_cols], axis=1)

In [21]:
X = X_ohe

### Normalization

In [22]:
import matplotlib.pyplot as plt

In [23]:
from sklearn.preprocessing import StandardScaler, RobustScaler

In [24]:
standard = StandardScaler()
robust = RobustScaler()

In [25]:
X["Age"] = standard.fit_transform(X[["Age"]])

In [26]:
X["RoomService"] = robust.fit_transform(X[["RoomService"]])

In [27]:
X["VRDeck"] = robust.fit_transform(X[["VRDeck"]])

In [28]:
X["FoodCourt"] = robust.fit_transform(X[["FoodCourt"]])

In [29]:
X["ShoppingMall"] = robust.fit_transform(X[["ShoppingMall"]])

In [30]:
X["Spa"] = robust.fit_transform(X[["Spa"]])

In [31]:
X.head()

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_A,Deck_B,Deck_C,Deck_D,...,Deck_T,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,isPort
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0,0.709437,0,0.0,0.0,0.0,0.0,0.0,0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,-0.336717,0,1.397436,0.076271,0.555556,6.168539,0.619718,1
2,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0,2.034566,1,0.551282,30.305085,0.0,75.449438,0.690141,1
3,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0,0.290975,0,0.0,10.872881,8.244444,37.404494,2.71831,1
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,-0.894666,0,3.884615,0.59322,3.355556,6.348315,0.028169,1


### Split

In [32]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [33]:
X_train

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_A,Deck_B,Deck_C,Deck_D,...,Deck_T,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,isPort
2333,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,-0.057743,0,0.000000,0.466102,0.000000,7.370787,0.000000,1
2589,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,-0.824923,0,0.000000,10.127119,0.688889,0.000000,0.000000,0
8302,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1,-0.057743,0,0.000000,0.000000,0.000000,0.000000,0.000000,1
8177,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,-0.615692,0,2.880610,0.016949,6.422222,10.966292,0.000000,0
500,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1,0.500206,0,0.000000,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,-0.755179,0,0.179487,0.016949,3.200000,6.853933,0.000000,1
5191,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,1.476617,0,8.846154,0.000000,0.666667,8.561798,6.028169,1
5390,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,-0.476205,0,2.025641,0.000000,10.577778,0.000000,0.366197,0
860,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,0.360719,0,4.858974,0.000000,36.133333,0.000000,0.000000,0


## The Model

In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

In [44]:
model = Sequential([
    Dense(10, input_shape = (23,), activation = "relu"),
    Dense(20, activation = "relu"),
    Dense(20, activation = "relu"),
    Dense(20, activation = "relu"),
    Dense(20, activation = "relu"),
    Dense(1, activation = "sigmoid")
]
)

In [49]:
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])

In [50]:
es = EarlyStopping(monitor = "val_accuracy", patience = 20, restore_best_weights = True)

In [51]:
model.fit(X_train, y_train, validation_data = [X_val, y_val], callbacks = es, batch_size = 32, epochs = 200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200


Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200


<keras.callbacks.History at 0x180dccbe0>

## Predicting

In [52]:
test = pd.read_csv("/Users/andrei/Downloads/spaceship-titanic/test.csv")

In [91]:
passenger_id = test[["PassengerId"]]

In [62]:
X_test = test.drop(columns = ["PassengerId", "Name"])

In [63]:
X_test[['Deck','Room', 'Side']] = X_test.Cabin.str.split("/",expand=True)

In [64]:
X_test.drop(columns = ["Room"], inplace = True)

In [65]:
X_test[["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"]] = cat_imputer.fit_transform(X_test[["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"]])

In [66]:
X_test[["Age"]] = age_imputer.fit_transform(X_test[["Age"]])

In [67]:
X_test[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]] = mean_imputer.fit_transform(X_test[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]])

In [68]:
X_test["CryoSleep"], X_test["VIP"] = X_test["CryoSleep"].astype(int), X_test["VIP"].astype(int)

In [69]:
X_test["isPort"] = np.where(X_test["Side"] == "P", 0, 1)

In [70]:
X_test.drop(columns = ["Side", "Cabin"], inplace = True)

In [71]:
ohe = OneHotEncoder(sparse=False)

#One-hot-encode the categorical columns.
#Unfortunately outputs an array instead of dataframe.
array_hot_encoded = ohe.fit_transform(X_test[['HomePlanet','Destination','Deck']])
cols = ohe.get_feature_names_out(['HomePlanet','Destination','Deck'])
#Convert it to df
data_hot_encoded = pd.DataFrame(array_hot_encoded, index=X_test.index, columns=cols)

#Extract only the columns that didnt need to be encoded
data_other_cols = X_test.drop(columns=['HomePlanet','Destination','Deck'])

#Concatenate the two dataframes : 
X_test = pd.concat([data_hot_encoded, data_other_cols], axis=1)

In [72]:
X_test["Age"] = standard.fit_transform(X_test[["Age"]])

In [73]:
X_test["RoomService"] = robust.fit_transform(X_test[["RoomService"]])

In [74]:
X_test["VRDeck"] = robust.fit_transform(X_test[["VRDeck"]])

In [75]:
X_test["FoodCourt"] = robust.fit_transform(X_test[["FoodCourt"]])

In [76]:
X_test["ShoppingMall"] = robust.fit_transform(X_test[["ShoppingMall"]])

In [77]:
X_test["Spa"] = robust.fit_transform(X_test[["Spa"]])

In [80]:
X_test

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_A,Deck_B,Deck_C,Deck_D,...,Deck_T,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,isPort
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1,-1.182216e-01,0,0.000000,0.000000,0.000000,0.000000,0.000000,1
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,-6.886014e-01,0,0.000000,0.062937,0.000000,34.012048,0.000000,1
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1,1.669682e-01,0,0.000000,0.000000,0.000000,0.000000,0.000000,1
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0,6.660505e-01,0,0.000000,46.517483,0.000000,2.180723,11.037736,1
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,-6.173039e-01,0,0.126582,0.000000,12.450980,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1,3.808606e-01,0,0.000000,0.000000,0.000000,0.000000,0.000000,1
4273,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,9.512404e-01,0,0.000000,5.923077,0.333333,0.120482,2.716981,1
4274,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1,-2.532995e-16,0,0.000000,0.000000,0.000000,0.000000,0.000000,0
4275,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0,-2.532995e-16,0,0.000000,18.741259,0.000000,0.000000,9.867925,0


In [78]:
model.predict(X_test)

array([[0.750672  ],
       [0.00581422],
       [0.99979657],
       ...,
       [0.9942739 ],
       [0.3538838 ],
       [0.5139967 ]], dtype=float32)

In [92]:
pred = pd.DataFrame(np.where(model.predict(X_test)<0.5, False, True))

In [95]:
submission = pd.concat([passenger_id, pred], axis = 1)

In [96]:
submission

Unnamed: 0,PassengerId,0
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,False


In [101]:
submission = submission.rename(columns = {0 : "Transported"})

In [107]:
submission.to_csv("/Users/andrei/Downloads/spaceship-titanic/submission.csv", header = True, index = False)