In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost
from xgboost import XGBRFClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Record the installed xgboost version in the cell output.
print(xgboost.__version__)

1.5.2


In [2]:
# Read the training rows, the rows to predict, and the example submission file.
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")
example_submission = pd.read_csv("data/gender_submission.csv")

# Row counts of the train and test frames, one per line.
print(len(train_data), len(test_data), sep="\n")

891
418


In [3]:
# Split the target column off the training frame before the features are pooled.
survive_labels = train_data['Survived']
train_data = train_data.drop(columns='Survived')

# Stack train rows on top of test rows so every encoding below is fitted on
# both at once. DataFrame.append was deprecated in pandas 1.4 and removed in
# 2.0; pd.concat is the replacement. ignore_index=True produces the same fresh
# 0..n-1 index the former reset_index().drop('index') step created.
full_data = pd.concat([train_data, test_data], ignore_index=True)

# Drop the columns that are not used as model features.
full_data = full_data.drop(columns=['Name', 'Ticket', 'PassengerId'])

In [4]:
# encode embarked
# Overwrite each port letter with its integer code; rows holding any other
# value (including missing ones) are left as they are.
embarked_codes = {'S': 0, 'Q': 1, 'C': 2}
for port_letter, port_code in embarked_codes.items():
    matches_port = full_data['Embarked'] == port_letter
    full_data.loc[matches_port, 'Embarked'] = port_code

In [5]:
# encode fare
# Replace every fare with the level (0.0, 0.1, ..., 0.9) of the decile bin it
# falls into. 'nearest' interpolation makes each bin edge an observed fare.
quantiles = full_data['Fare'].quantile(
    [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    interpolation='nearest',
)
list_of_quantile_values = list(quantiles.items())

# Bin against a snapshot of the raw fares. Comparing against the column while
# it is being overwritten would let a label written for one bin (e.g. 0.3) be
# re-binned by a later iteration whenever the fares themselves are small.
raw_fare = full_data['Fare'].copy()

for i in range(1, len(list_of_quantile_values)):
    bin_label, lower_edge = list_of_quantile_values[i - 1]
    upper_edge = list_of_quantile_values[i][1]
    if i == 1:
        # Close the first bin on the left so rows at the minimum fare are
        # labelled too instead of keeping their raw value.
        above_lower = raw_fare >= lower_edge
    else:
        above_lower = raw_fare > lower_edge
    # Missing fares satisfy neither comparison and therefore stay missing.
    full_data.loc[above_lower & (raw_fare <= upper_edge), 'Fare'] = bin_label

In [6]:
# encode cabin
# Replace each cabin string by the ordinal of its first character minus 64
# ('A' -> 1, 'B' -> 2, ...). Missing cabins are left untouched.
#
# The mask uses a real isinstance check: the previous `type(value == str)`
# was always truthy, so re-running the cell tried to index the already-encoded
# integers and raised. Non-empty is required because the first character is read.
has_cabin_text = full_data['Cabin'].map(
    lambda value: isinstance(value, str) and len(value) > 0
)
full_data.loc[has_cabin_text, 'Cabin'] = (
    full_data.loc[has_cabin_text, 'Cabin'].map(lambda value: ord(value[0]) - 64)
)

In [7]:
# encode age
def custom_round(x, base=5):
    """Round ``x`` to the nearest multiple of ``base``.

    Returns the rounded value as an ``int`` when ``0 < x < 120``. For any
    other input, including NaN (which fails both comparisons), returns ``None``.
    Ties follow Python's built-in ``round`` (round-half-to-even).
    """
    in_range = 0 < x < 120
    if not in_range:
        return None
    steps = round(float(x) / base)
    return int(base * steps)
# Bucket every age with custom_round; the keyword is forwarded to it by apply.
full_data['Age'] = full_data['Age'].apply(custom_round, base=5)

In [8]:
# Label-encode every column independently; each column is fitted on its own values.
full_data = full_data.apply(lambda column: LabelEncoder().fit_transform(column))

In [9]:
# The pooled frame holds the training rows first, followed by the test rows,
# so the number of training labels marks the boundary. Deriving it from
# survive_labels replaces the hard-coded row labels 890/891, which were only
# valid for one specific training-set size.
n_train_rows = len(survive_labels)
to_train = full_data.iloc[:n_train_rows]
to_test = full_data.iloc[n_train_rows:]

In [10]:
# Hold out 30% of the labelled rows for evaluation. A fixed random_state makes
# the split, and therefore the score computed on it, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    to_train,
    survive_labels,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Fit on the training part of the hold-out split and score on the held-out part.
holdout_params = {
    'n_estimators': 11,
    'max_leaves': 0,
    'use_label_encoder': False,
    'eval_metric': 'error',
}
test_model = XGBRFClassifier(**holdout_params)
test_model.fit(x_train, y_train)
# Kept as the last expression so the notebook displays the score.
test_model.score(x_test, y_test)

0.8134328358208955

In [12]:
# Refit with the same hyper-parameters on every labelled row.
final_params = {
    'n_estimators': 11,
    'max_leaves': 0,
    'use_label_encoder': False,
    'eval_metric': 'error',
}
full_model = XGBRFClassifier(**final_params)
# Kept as the last expression so the notebook displays the fitted estimator.
full_model.fit(to_train, survive_labels)

XGBRFClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bytree=1, enable_categorical=False,
                eval_metric='error', gamma=0, gpu_id=-1, importance_type=None,
                interaction_constraints='', max_delta_step=0, max_depth=6,
                max_leaves=0, min_child_weight=1, missing=nan,
                monotone_constraints='()', n_estimators=11, n_jobs=12,
                num_parallel_tree=11, objective='binary:logistic',
                predictor='auto', random_state=0, reg_alpha=0,
                scale_pos_weight=1, tree_method='exact',
                use_label_encoder=False, validate_parameters=1, verbosity=None)

In [13]:
# Predict labels for the unlabelled rows with the model fitted on all training rows.
predictions = full_model.predict(to_test)

In [14]:
# Build the submission as a new frame. Plain assignment
# (`to_submit = example_submission`) only aliases the loaded example, so
# setting Survived on it would silently overwrite example_submission as well.
# assign() returns a copy with the Survived column replaced by the predictions
# and raises if their length does not match the number of rows.
to_submit = example_submission.assign(Survived=predictions)

In [15]:
# Write the submission file; index=False keeps the row index out of the CSV.
to_submit.to_csv("submit_xgbrfc.csv", index=False)

In [16]:
# Display the submission frame as the cell output.
to_submit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
