In [170]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import xgboost as xgb
import catboost as cb

Firstly, we will need to load the available data.

In [163]:
training_data = pd.read_csv('../data/train.csv')
testing_data = pd.read_csv('../data/test.csv')

Then we will have to explore the available data, so that we can better determine how each feature is related to the final result. Firstly let's view some of the data in the training set, and then we can use plots to visualise our data.

In [164]:
training_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


For the passenger id, we now that the first 4 digits, are the number of the group they are in, and the last 2 their place in the group. We can extract that data, as GroupId, GroupPosition and GroupSize.

In [165]:
training_data['GroupId'] = training_data['PassengerId'].str.split('_').str[0]
training_data['GroupPosition'] = training_data['PassengerId'].str.split('_').str[1]

testing_data['GroupId'] = testing_data['PassengerId'].str.split('_').str[0]
testing_data['GroupPosition'] = testing_data['PassengerId'].str.split('_').str[1]

training_data['GroupId'] = training_data['GroupId'].astype(int)
training_data['GroupPosition'] = training_data['GroupPosition'].astype(int)

testing_data['GroupId'] = testing_data['GroupId'].astype(int)
testing_data['GroupPosition'] = testing_data['GroupPosition'].astype(int)

training_data['GroupSize'] = (
    training_data.groupby('GroupId')['GroupId'].transform('size')
)

testing_data['GroupSize'] = (
    testing_data.groupby('GroupId')['GroupId'].transform('size')
)


training_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupPosition,GroupSize
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,2,1,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,3,1,2
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,3,2,2
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,4,1,1


For the feature cabin, we know that the data has the format: deck/num/side, where the deck can take values: S -> Standard, P -> Port. We will also need to replace these feature with the 3 individual ones.

In [166]:
training_data[['Deck', 'CabinNumber', 'CabinSide']] = (
    training_data['Cabin'].str.split('/', expand=True)
)

testing_data[['Deck', 'CabinNumber', 'CabinSide']] = (
    testing_data['Cabin'].str.split('/', expand=True)
)

training_data['CabinNumber'] = pd.to_numeric(training_data['CabinNumber'], errors='coerce')
testing_data['CabinNumber'] = pd.to_numeric(testing_data['CabinNumber'], errors='coerce')

training_data.drop(['Cabin', 'Name'], axis=1, inplace=True)
testing_data.drop(['Cabin', 'Name'], axis=1, inplace=True)


So, now we can separate the data by their data types, We have boolean, numeric and categorical data. We need to make those distinctions so that we can better handle them when we are trying to fill the missing values. Some models also do not do well or work with at all, with categorical data, so we can probably separate those categories into new columns that hold boolean values (0 / 1, True / False). For the numeric data we will need to fill the missing data with either the median or mean value of the data.

In [167]:
y_train = training_data['Transported']
training_data.drop(columns=['Transported', 'PassengerId'], inplace=True)
passengersId = testing_data['PassengerId']
testing_data.drop(columns=['PassengerId'], inplace=True)

numeric_features = training_data.select_dtypes(include=np.number).columns
categorical_features = training_data.select_dtypes(exclude=['number', 'bool']).columns

median = training_data[numeric_features].median()

training_data[numeric_features] = training_data[numeric_features].fillna(median)
testing_data[numeric_features] = testing_data[numeric_features].fillna(median)

training_data = pd.get_dummies(training_data, columns=categorical_features)
testing_data = pd.get_dummies(testing_data, columns=categorical_features)
training_data.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupId,GroupPosition,GroupSize,CabinNumber,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,CabinSide_P,CabinSide_S
0,39.0,0.0,0.0,0.0,0.0,0.0,1,1,1,0.0,...,False,True,False,False,False,False,False,False,True,False
1,24.0,109.0,9.0,25.0,549.0,44.0,2,1,1,0.0,...,False,False,False,False,False,True,False,False,False,True
2,58.0,43.0,3576.0,0.0,6715.0,49.0,3,1,2,0.0,...,True,False,False,False,False,False,False,False,False,True
3,33.0,0.0,1283.0,371.0,3329.0,193.0,3,2,2,0.0,...,True,False,False,False,False,False,False,False,False,True
4,16.0,303.0,70.0,151.0,565.0,2.0,4,1,1,1.0,...,False,False,False,False,False,True,False,False,False,True


Now, it is time to realign the tables / data frames and train the model:

In [168]:
training_data, testing_data = training_data.align(testing_data, join='left', axis=1, fill_value=0)

xgb_model = xgb.XGBClassifier(n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss')
xgb_model.fit(training_data, y_train)
y_pred = xgb_model.predict(testing_data)

submission = pd.DataFrame({
    'PassengerId': passengersId,
    'Transported': y_pred.astype(bool)
})

submission.to_csv('../data/XGBPredictions.csv', index=False)

We can also try using the CatBoostClassifier:

In [173]:
cbc_model = cb.CatBoostClassifier()
cbc_model.fit(training_data, y_train)
y_cbc_pred = cbc_model.predict(testing_data)

submission = pd.DataFrame({
    'PassengerId': passengersId,
    'Transported': y_cbc_pred.astype(bool)
})

submission.to_csv('../data/CatBoostPredictions.csv', index=False)

Learning rate set to 0.025939
0:	learn: 0.6808412	total: 140ms	remaining: 2m 19s
1:	learn: 0.6677921	total: 147ms	remaining: 1m 13s
2:	learn: 0.6565895	total: 151ms	remaining: 50.2s
3:	learn: 0.6461379	total: 155ms	remaining: 38.7s
4:	learn: 0.6365815	total: 160ms	remaining: 31.8s
5:	learn: 0.6273867	total: 165ms	remaining: 27.3s
6:	learn: 0.6195413	total: 173ms	remaining: 24.5s
7:	learn: 0.6096086	total: 178ms	remaining: 22s
8:	learn: 0.6028934	total: 183ms	remaining: 20.2s
9:	learn: 0.5944975	total: 188ms	remaining: 18.6s
10:	learn: 0.5875693	total: 193ms	remaining: 17.4s
11:	learn: 0.5812405	total: 198ms	remaining: 16.3s
12:	learn: 0.5754222	total: 204ms	remaining: 15.5s
13:	learn: 0.5698538	total: 208ms	remaining: 14.7s
14:	learn: 0.5637680	total: 214ms	remaining: 14.1s
15:	learn: 0.5580840	total: 220ms	remaining: 13.6s
16:	learn: 0.5529259	total: 226ms	remaining: 13.1s
17:	learn: 0.5487979	total: 231ms	remaining: 12.6s
18:	learn: 0.5444587	total: 236ms	remaining: 12.2s
19:	learn: 