In [19]:
# Load all necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 200)

from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer

In [37]:
# Load dataset
titanic_train = pd.read_csv('train.csv', index_col='PassengerId')
titanic_test = pd.read_csv('test.csv', index_col='PassengerId')

In [21]:
# Fill missing data
titanic_train[['HomePlanet', 'Destination']] = titanic_train[['HomePlanet', 'Destination']].fillna('Missing')
titanic_train['Cabin'] = titanic_train['Cabin'].fillna('Missing/0/Missing')

In [22]:
# Split cabin column into three
titanic_train[['Cabin_deck', 'Cabin_num', 'Cabin_side']] = titanic_train['Cabin'].str.split('/', expand=True)

In [23]:
# Get new columns, based on its values
titanic_train = titanic_train.join(pd.get_dummies(titanic_train['HomePlanet'], prefix='HomePlanet'))
titanic_train = titanic_train.join(pd.get_dummies(titanic_train['Destination'], prefix='Destination'))
titanic_train = titanic_train.join(pd.get_dummies(titanic_train['Cabin_deck'], prefix='Cabin_deck'))
titanic_train = titanic_train.join(pd.get_dummies(titanic_train['Cabin_side'], prefix='Cabin_side'))
# Change type of cabin numbers column
titanic_train['Cabin_num'] = titanic_train['Cabin_num'].astype('int64')
# Change all boolean type to int
titanic_train['CryoSleep'] = np.where(titanic_train['CryoSleep']==True, 1, 0)
titanic_train['VIP'] = np.where(titanic_train['VIP']==True, 1, 0)
titanic_train['Transported'] = np.where(titanic_train['Transported']==True, 1, 0)

In [24]:
# Show firs 5 rows
titanic_train.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Cabin_deck,Cabin_num,Cabin_side,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_Missing,Destination_55 Cancri e,Destination_Missing,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,Cabin_deck_G,Cabin_deck_Missing,Cabin_deck_T,Cabin_side_Missing,Cabin_side_P,Cabin_side_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
0001_01,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,B,0,P,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0
0002_01,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,F,0,S,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1
0003_01,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,A,0,S,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1
0003_02,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,A,0,S,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1
0004_01,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,F,1,S,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1


In [25]:
# Check how many data is missing
titanic_train.isnull().sum()

HomePlanet                     0
CryoSleep                      0
Cabin                          0
Destination                    0
Age                          179
VIP                            0
RoomService                  181
FoodCourt                    183
ShoppingMall                 208
Spa                          183
VRDeck                       188
Name                         200
Transported                    0
Cabin_deck                     0
Cabin_num                      0
Cabin_side                     0
HomePlanet_Earth               0
HomePlanet_Europa              0
HomePlanet_Mars                0
HomePlanet_Missing             0
Destination_55 Cancri e        0
Destination_Missing            0
Destination_PSO J318.5-22      0
Destination_TRAPPIST-1e        0
Cabin_deck_A                   0
Cabin_deck_B                   0
Cabin_deck_C                   0
Cabin_deck_D                   0
Cabin_deck_E                   0
Cabin_deck_F                   0
Cabin_deck

In [26]:
# Fill missing data with zeros
titanic_train[['Cabin_num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] = titanic_train[['Cabin_num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].fillna(0)
# Fill missing data with mean value
titanic_train['Age'] = titanic_train['Age'].fillna(titanic_train['Age'].mean())

In [27]:
# Get X and y
X = titanic_train.drop(['HomePlanet', 'Cabin', 'Destination', 'Name', 'Cabin_deck', 'Cabin_side', 'Transported'], axis=1)
y = titanic_train['Transported']

In [28]:
# Split training data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [29]:
# Initalize XBboost model and fit it
xgb_model = XGBClassifier(objective='binary:logistic', n_jobs=8, random_state=0, n_estimators=95, max_depth=10,
learning_rate=0.05, early_stopping_rounds=7).fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

In [30]:
# Check models accuracy
accuracy_score(y_test, xgb_model.predict(X_test))

0.8044614848379226

In [38]:
# Fill missing data
titanic_test[['HomePlanet', 'Destination']] = titanic_test[['HomePlanet', 'Destination']].fillna('Missing')
titanic_test['Cabin'] = titanic_test['Cabin'].fillna('Missing/0/Missing')
# Split cabin column into three
titanic_test[['Cabin_deck', 'Cabin_num', 'Cabin_side']] = titanic_test['Cabin'].str.split('/', expand=True)

In [39]:
# Get new columns, based on its values
titanic_test = titanic_test.join(pd.get_dummies(titanic_test['HomePlanet'], prefix='HomePlanet'))
titanic_test = titanic_test.join(pd.get_dummies(titanic_test['Destination'], prefix='Destination'))
titanic_test = titanic_test.join(pd.get_dummies(titanic_test['Cabin_deck'], prefix='Cabin_deck'))
titanic_test = titanic_test.join(pd.get_dummies(titanic_test['Cabin_side'], prefix='Cabin_side'))
# Change type of cabin numbers column
titanic_test['Cabin_num'] = titanic_test['Cabin_num'].astype('int64')
# Change all boolean type to int
titanic_test['CryoSleep'] = np.where(titanic_test['CryoSleep']==True, 1, 0)
titanic_test['VIP'] = np.where(titanic_test['VIP']==True, 1, 0)

In [40]:
# Fill missing data with zeros
titanic_test[['Cabin_num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] = titanic_test[['Cabin_num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].fillna(0)
# Fill missing data with mean value
titanic_test['Age'] = titanic_test['Age'].fillna(titanic_test['Age'].mean())

In [42]:
# Get X_val
X_val = titanic_test.drop(['HomePlanet', 'Cabin', 'Destination', 'Name', 'Cabin_deck', 'Cabin_side'], axis=1)

In [43]:
# Predict on test.csv
pred = xgb_model.predict(X_val)

In [46]:
# Read sapmle submission data
submission = pd.read_csv('sample_submission.csv')

In [48]:
# Set Transporteds column values to the predicted values
submission.Transported = pred

In [50]:
# Change Transporteds column values from int to boolean
submission.Transported = np.where(submission.Transported==1, True, False)

In [52]:
# Save submission into a .csv file
submission.to_csv('submissions.csv', header=True, index=False)