In [89]:
import pandas as pd
import numpy as np
import sklearn
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier

Firstly, we will need to load the available data.

In [90]:
training_data = pd.read_csv('../data/train.csv')
testing_data = pd.read_csv('../data/test.csv')

Then we will have to explore the available data, so that we can better determine how each feature is related to the final result. Firstly let's view some of the data in the training set, and then we can use plots to visualise our data.

In [91]:
training_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


For the passenger id, we now that the first 4 digits, are the number of the group they are in, and the last 2 their place in the group. We can extract that data, as GroupId, GroupPosition and GroupSize.

In [92]:
training_data['GroupId'] = training_data['PassengerId'].str.split('_').str[0]
training_data['GroupPosition'] = training_data['PassengerId'].str.split('_').str[1]

testing_data['GroupId'] = testing_data['PassengerId'].str.split('_').str[0]
testing_data['GroupPosition'] = testing_data['PassengerId'].str.split('_').str[1]

training_data['GroupId'] = training_data['GroupId'].astype(int)
training_data['GroupPosition'] = training_data['GroupPosition'].astype(int)

testing_data['GroupId'] = testing_data['GroupId'].astype(int)
testing_data['GroupPosition'] = testing_data['GroupPosition'].astype(int)


all_data = pd.concat([training_data, testing_data], axis=0, ignore_index=True)

all_data['GroupSize'] = (
    all_data.groupby('GroupId')['GroupId'].transform('size')
)

training_data['GroupSize'] = all_data.loc[training_data.index, 'GroupSize']
training_data['IsAlone'] = training_data['GroupSize'] == 1
training_data['IsPair'] = training_data['GroupSize'] == 2
training_data['IsInBigGroup'] = training_data['GroupSize'] >= 5


testing_data['GroupSize'] = all_data.loc[testing_data.index, 'GroupSize']
testing_data['IsAlone'] = testing_data['GroupSize'] == 1
testing_data['IsPair'] = testing_data['GroupSize'] == 2

training_data['IsAdult'] = training_data['Age'] >= 18

testing_data['IsAdult'] = testing_data['Age'] >= 18


training_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,VRDeck,Name,Transported,GroupId,GroupPosition,GroupSize,IsAlone,IsPair,IsInBigGroup,IsAdult
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,0.0,Maham Ofracculy,False,1,1,1,True,False,False,True
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,44.0,Juanna Vines,True,2,1,1,True,False,False,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,49.0,Altark Susent,False,3,1,2,False,True,False,True
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,193.0,Solam Susent,False,3,2,2,False,True,False,True
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,2.0,Willy Santantines,True,4,1,1,True,False,False,False


We, will also try adding more columns about the spending habits of the people, as there may be a correlation to the spending habits and whether the passenger was transported:

For the feature cabin, we know that the data has the format: deck/num/side, where the deck can take values: S -> Standard, P -> Port. We will also need to replace these feature with the 3 individual ones.

In [93]:
training_data[['Deck', 'CabinNumber', 'CabinSide']] = (
    training_data['Cabin'].str.split('/', expand=True)
)

testing_data[['Deck', 'CabinNumber', 'CabinSide']] = (
    testing_data['Cabin'].str.split('/', expand=True)
)

training_data['CabinNumber'] = pd.to_numeric(training_data['CabinNumber'], errors='coerce')
testing_data['CabinNumber'] = pd.to_numeric(testing_data['CabinNumber'], errors='coerce')

training_data.drop(['Cabin', 'Name'], axis=1, inplace=True)
testing_data.drop(['Cabin', 'Name'], axis=1, inplace=True)


So, now we can separate the data by their data types, We have boolean, numeric and categorical data. We need to make those distinctions so that we can better handle them when we are trying to fill the missing values. Some models also do not do well or work with at all, with categorical data, so we can probably separate those categories into new columns that hold boolean values (0 / 1, True / False). For the numeric data we will need to fill the missing data with either the median or mean value of the data.

In [94]:
y_train = training_data['Transported']
training_data.drop(columns=['Transported', 'PassengerId'], inplace=True)
passengersId = testing_data['PassengerId']
testing_data.drop(columns=['PassengerId'], inplace=True)

numeric_features = training_data.select_dtypes(include=np.number).columns
categorical_features = training_data.select_dtypes(exclude=['number', 'bool']).columns

median = training_data[numeric_features].median()

training_data[numeric_features] = training_data[numeric_features].fillna(median)
testing_data[numeric_features] = testing_data[numeric_features].fillna(median)

training_data = pd.get_dummies(training_data, columns=categorical_features)
testing_data = pd.get_dummies(testing_data, columns=categorical_features)
training_data.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupId,GroupPosition,GroupSize,IsAlone,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,CabinSide_P,CabinSide_S
0,39.0,0.0,0.0,0.0,0.0,0.0,1,1,1,True,...,False,True,False,False,False,False,False,False,True,False
1,24.0,109.0,9.0,25.0,549.0,44.0,2,1,1,True,...,False,False,False,False,False,True,False,False,False,True
2,58.0,43.0,3576.0,0.0,6715.0,49.0,3,1,2,False,...,True,False,False,False,False,False,False,False,False,True
3,33.0,0.0,1283.0,371.0,3329.0,193.0,3,2,2,False,...,True,False,False,False,False,False,False,False,False,True
4,16.0,303.0,70.0,151.0,565.0,2.0,4,1,1,True,...,False,False,False,False,False,True,False,False,False,True


Now, it is time to realign the tables / data frames and train the model:

In [95]:
training_data, testing_data = training_data.align(testing_data, join='left', axis=1, fill_value=0)

xgb_model = xgb.XGBClassifier(n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss')
xgb_model.fit(training_data, y_train)
y_pred = xgb_model.predict(testing_data)

submission = pd.DataFrame({
    'PassengerId': passengersId,
    'Transported': y_pred.astype(bool)
})

submission.to_csv('../data/XGBPredictions.csv', index=False)

We can also try using the CatBoostClassifier:

In [96]:
cbc_model = cb.CatBoostClassifier()
cbc_model.fit(training_data, y_train)
y_cbc_pred = cbc_model.predict(testing_data)

submission = pd.DataFrame({
    'PassengerId': passengersId,
    'Transported': y_cbc_pred.astype(bool)
})

submission.to_csv('../data/CatBoostPredictions.csv', index=False)

Learning rate set to 0.025939
0:	learn: 0.6785041	total: 93.4ms	remaining: 1m 33s
1:	learn: 0.6664506	total: 147ms	remaining: 1m 13s
2:	learn: 0.6542411	total: 185ms	remaining: 1m 1s
3:	learn: 0.6428043	total: 237ms	remaining: 59.1s
4:	learn: 0.6327823	total: 308ms	remaining: 1m 1s
5:	learn: 0.6253700	total: 389ms	remaining: 1m 4s
6:	learn: 0.6163431	total: 479ms	remaining: 1m 7s
7:	learn: 0.6076656	total: 568ms	remaining: 1m 10s
8:	learn: 0.5989613	total: 657ms	remaining: 1m 12s
9:	learn: 0.5916782	total: 747ms	remaining: 1m 13s
10:	learn: 0.5856399	total: 838ms	remaining: 1m 15s
11:	learn: 0.5803025	total: 924ms	remaining: 1m 16s
12:	learn: 0.5726401	total: 1.01s	remaining: 1m 16s
13:	learn: 0.5669426	total: 1.09s	remaining: 1m 16s
14:	learn: 0.5612070	total: 1.18s	remaining: 1m 17s
15:	learn: 0.5561615	total: 1.27s	remaining: 1m 17s
16:	learn: 0.5512697	total: 1.36s	remaining: 1m 18s
17:	learn: 0.5474002	total: 1.42s	remaining: 1m 17s
18:	learn: 0.5428183	total: 1.46s	remaining: 1m 

So, another method that could improve the success rate could be stacking. When using stacking we can use multiple different models to make predictions on the data, as the first prediction model, and then according to the performance of said models, we could use a simple linear regression model to make the final prediction. So basically what we are doing is training the first layer with the multiple models on the dataset, and then train the second / final layer on the predictions of the first layer. To avoid overfitting and generate data from the first layer to the second, we can actually use a k-fold cross validation method on the initial data. This is basically separating the initial dataset into k subsets, for those k subsets we perform k iteration, in each iteration we train each model on k-1 of the subsets, and use the other subset to generate predictions from the models. Then, the predictions we get from each model become the new dataset that we will use to train our much simpler final model on the next layer. The final step for stacking to be ready is to retrain / fit the layer 1 models, this time to the full dataset. To get the final result then, we feed the first layer the test set, generate the test set for he second layer and then get the final prediction result.

In [97]:
base_models = {
    'xgboost': xgb.XGBClassifier(n_estimators=300, learning_rate=0.05, random_state=42),
    'lightgbm': lgb.LGBMClassifier(n_estimators=300, learning_rate=0.05, random_state=42, verbose=-1),
    'catboost': cb.CatBoostClassifier(iterations=300, verbose=False, random_state=42),
    'rf': RandomForestClassifier(n_estimators=300, random_state=42),
    'extra_trees': ExtraTreesClassifier(n_estimators=200, random_state=42),
    'gradient_boost': GradientBoostingClassifier(n_estimators=200, random_state=42),
    'logistic': LogisticRegression(max_iter=5000, random_state=42),
    'naive_bayes': GaussianNB()
}

Kfold = sklearn.model_selection.KFold(n_splits=5, shuffle=True, random_state=42)

layer1_predictions = np.zeros((len(training_data), len(base_models)))

for fold_idx, (train_idx, test_idx) in enumerate(Kfold.split(training_data)):
    X_fold_train, X_fold_val = training_data.iloc[train_idx], training_data.iloc[test_idx]
    y_fold_train, y_fold_val = y_train[train_idx], y_train[test_idx]

    for model_idx, (name, model) in enumerate(base_models.items()):
        model.fit(X_fold_train, y_fold_train)
        if hasattr(model, 'predict_proba'):
            predictions = model.predict_proba(X_fold_val)[:, 1]
        else:
            predictions = model.predict(X_fold_val)

        layer1_predictions[test_idx, model_idx] = predictions

meta_model = LogisticRegression(max_iter=5000, random_state=42)
meta_model.fit(layer1_predictions, y_train)

for name, model in base_models.items():
    model.fit(training_data, y_train)

layer1_test_predictions = np.zeros((len(testing_data), len(base_models)))

for model_idx, (name, model) in enumerate(base_models.items()):
    if hasattr(model, 'predict_proba'):
        layer1_test_predictions[:, model_idx] = model.predict_proba(testing_data)[:, 1]
    else:
        layer1_test_predictions[:, model_idx] = model.predict(testing_data)

final_prediction = meta_model.predict(layer1_test_predictions)
print(final_prediction)

submission = pd.DataFrame({
    'PassengerId': passengersId,
    'Transported': final_prediction.astype(bool)
})

submission.to_csv('../data/StackedPredictions.csv', index=False)
print("\n✓ Submission saved!")
print(submission.head())

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=5000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=5000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=5000).
You might also want to 

[ True False  True ...  True  True  True]

✓ Submission saved!
  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True
