In [139]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 200)

from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer

In [140]:
# Load dataset
titanic_train = pd.read_csv('train.csv', index_col='PassengerId')
titanic_test = pd.read_csv('test.csv', index_col='PassengerId')

In [141]:
# Fill missing data
titanic_train[['HomePlanet', 'Destination']] = titanic_train[['HomePlanet', 'Destination']].fillna('Missing')
titanic_train['Cabin'] = titanic_train['Cabin'].fillna('Missing/0/Missing')

In [142]:
# Split cabin column into three
titanic_train[['Cabin_deck', 'Cabin_num', 'Cabin_side']] = titanic_train['Cabin'].str.split('/', expand=True)

In [143]:
# Get new columns, based on its values
titanic_train = titanic_train.join(pd.get_dummies(titanic_train['HomePlanet'], prefix='HomePlanet'))
titanic_train = titanic_train.join(pd.get_dummies(titanic_train['Destination'], prefix='Destination'))
titanic_train = titanic_train.join(pd.get_dummies(titanic_train['Cabin_deck'], prefix='Cabin_deck'))
titanic_train = titanic_train.join(pd.get_dummies(titanic_train['Cabin_side'], prefix='Cabin_side'))
# Change type of cabin numbers column
titanic_train['Cabin_num'] = titanic_train['Cabin_num'].astype('int64')
# Change all boolean type to int
titanic_train['CryoSleep'] = np.where(titanic_train['CryoSleep']==True, 1, 0)
titanic_train['VIP'] = np.where(titanic_train['VIP']==True, 1, 0)
titanic_train['Transported'] = np.where(titanic_train['Transported']==True, 1, 0)

In [144]:
# Show firs 5 rows
titanic_test.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [145]:
# Check how many data is missing
titanic_train.isnull().sum()

HomePlanet                     0
CryoSleep                      0
Cabin                          0
Destination                    0
Age                          179
VIP                            0
RoomService                  181
FoodCourt                    183
ShoppingMall                 208
Spa                          183
VRDeck                       188
Name                         200
Transported                    0
Cabin_deck                     0
Cabin_num                      0
Cabin_side                     0
HomePlanet_Earth               0
HomePlanet_Europa              0
HomePlanet_Mars                0
HomePlanet_Missing             0
Destination_55 Cancri e        0
Destination_Missing            0
Destination_PSO J318.5-22      0
Destination_TRAPPIST-1e        0
Cabin_deck_A                   0
Cabin_deck_B                   0
Cabin_deck_C                   0
Cabin_deck_D                   0
Cabin_deck_E                   0
Cabin_deck_F                   0
Cabin_deck

In [146]:
# Fill missing data with zeros
titanic_train[['Cabin_num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] = titanic_train[['Cabin_num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].fillna(0)
# Fill missing data with mean value
titanic_train['Age'] = titanic_train['Age'].fillna(titanic_train['Age'].mean())

In [147]:
# Get X and y
X = titanic_train.drop(['HomePlanet', 'Cabin', 'Destination', 'Name', 'Cabin_deck', 'Cabin_side', 'Transported'], axis=1)
y = titanic_train['Transported']

In [148]:
# Split training data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [155]:
# Initalize XBboost model and fit it
xgb_model = XGBClassifier(objective='binary:logistic', n_jobs=8, random_state=0, n_estimators=95, max_depth=10,
learning_rate=0.05, early_stopping_rounds=7).fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

In [156]:
# Check models accuracy
accuracy_score(y_test, xgb_model.predict(X_test))

0.8044614848379226