# Analysis

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the training data from a path relative to the notebook's working directory.
df_train = pd.read_csv("data/train.csv")
# Bare last expression so the notebook renders the frame.
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Count missing values per column to decide which columns need imputation.
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Encode Sex in place as integer codes: "male" -> 1, "female" -> 0.
for sex_label, sex_code in (("male", 1), ("female", 0)):
    is_label = df_train["Sex"] == sex_label
    df_train.loc[is_label, "Sex"] = sex_code

df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,C


In [5]:
# Distribution of passenger classes; these values become one-hot columns in the next cell.
df_train["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [6]:
# One-hot encode the passenger class (Pclass) into ticket_<class> indicator columns.
pclass_values = df_train["Pclass"].to_numpy().reshape(-1, 1)

onehotObject_ticket = OneHotEncoder(handle_unknown = 'ignore')
onehotObject_ticket.fit(pclass_values)

# Column names follow the category order learned by the encoder.
tickets_columns = ["ticket_{0}".format(category) for category in onehotObject_ticket.categories_[0]]

pclass_indicators = onehotObject_ticket.transform(pclass_values).toarray().astype(int)
onehot_ticket = pd.DataFrame(pclass_indicators, columns = tickets_columns)

# Merge on the row index, then drop the source column now that it is encoded.
df_merged = pd.merge(df_train, onehot_ticket, left_index = True, right_index = True).drop("Pclass", axis = 1)

df_merged

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ticket_1,ticket_2,ticket_3
0,1,0,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,,S,0,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
2,3,1,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,S,1,0,0
4,5,0,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,,S,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,,S,0,1,0
887,888,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,S,1,0,0
888,889,0,"Johnston, Miss. Catherine Helen ""Carrie""",0,,1,2,W./C. 6607,23.4500,,S,0,0,1
889,890,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,C,1,0,0


In [7]:
# Continue with the frame that has Pclass one-hot encoded.
# Both names now refer to the same DataFrame object.
df_train = df_merged

In [8]:
# Replace missing ports of embarkation with the placeholder category "N".
# The result is assigned back to the column: calling fillna(inplace=True) on
# df_train["Embarked"] is chained in-place assignment, which pandas deprecates and
# which does not update the frame once Copy-on-Write is active.
df_train["Embarked"] = df_train["Embarked"].fillna("N")

df_train["Embarked"].value_counts()

S    644
C    168
Q     77
N      2
Name: Embarked, dtype: int64

In [9]:
# One-hot encode Embarked (including the "N" placeholder) into embarked_<port> columns.
embarked_values = np.array(df_train["Embarked"]).reshape(-1, 1)

onehotObject_embarked = OneHotEncoder(handle_unknown = 'ignore')
onehotObject_embarked.fit(embarked_values)

# Column names follow the category order learned by the encoder.
embarked_columns = ["embarked_{0}".format(i) for i in onehotObject_embarked.categories_[0]]

# The indicator frame gets its own name (like onehot_ticket for Pclass), so the
# fitted encoder stays available under onehotObject_embarked instead of being
# overwritten by a DataFrame.
onehot_embarked = pd.DataFrame(onehotObject_embarked.transform(embarked_values).toarray().astype(int), columns = embarked_columns)

# Merge on the row index, then drop the source column now that it is encoded.
df_merged = pd.merge(df_train, onehot_embarked, left_index = True, right_index = True)
df_merged = df_merged.drop("Embarked", axis = 1)

df_merged

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,ticket_1,ticket_2,ticket_3,embarked_C,embarked_N,embarked_Q,embarked_S
0,1,0,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,,0,0,1,0,0,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,1,0,0,1,0,0,0
2,3,1,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,,0,0,1,0,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,1,0,0,0,0,0,1
4,5,0,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,,0,1,0,0,0,0,1
887,888,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,1,0,0,0,0,0,1
888,889,0,"Johnston, Miss. Catherine Helen ""Carrie""",0,,1,2,W./C. 6607,23.4500,,0,0,1,0,0,0,1
889,890,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,1,0,0,1,0,0,0


In [10]:
# Continue with the frame that has Embarked one-hot encoded.
df_train = df_merged

In [11]:
# Mark missing ages with the sentinel -1 so the column has no NaN values.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df_train["Age"] is chained in-place assignment, which pandas deprecates and
# which does not update the frame once Copy-on-Write is active.
df_train["Age"] = df_train["Age"].fillna(-1)

df_train["Age"]

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    -1.0
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [12]:
# Boolean age-bucket indicator columns, in the order they are added to the frame.
ages_ = ["is_baby", "is_kid", "is_teen", "is_middle", "is_old"]

age = df_train["Age"]

# (exclusive lower bound, inclusive upper bound) for the first four buckets.
# Ages <= 0, including the -1 missing-value sentinel, fall into no bucket.
age_bounds = [(0, 5), (5, 12), (12, 20), (20, 60)]

for bucket_name, (lower, upper) in zip(ages_, age_bounds):
    df_train[bucket_name] = age.le(upper) & age.gt(lower)

# The last bucket is open-ended.
df_train["is_old"] = age.gt(60)


df_train

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,ticket_3,embarked_C,embarked_N,embarked_Q,embarked_S,is_baby,is_kid,is_teen,is_middle,is_old
0,1,0,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,,...,1,0,0,0,1,False,False,False,True,False
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,...,0,1,0,0,0,False,False,False,True,False
2,3,1,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,,...,1,0,0,0,1,False,False,False,True,False
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,...,0,0,0,0,1,False,False,False,True,False
4,5,0,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,,...,1,0,0,0,1,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,,...,0,0,0,0,1,False,False,False,True,False
887,888,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,...,0,0,0,0,1,False,False,True,False,False
888,889,0,"Johnston, Miss. Catherine Helen ""Carrie""",0,-1.0,1,2,W./C. 6607,23.4500,,...,1,0,0,0,1,False,False,False,False,False
889,890,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,...,0,1,0,0,0,False,False,False,True,False


In [13]:
# Number of rows with no Cabin value.
df_train["Cabin"].isna().sum()

687

In [14]:
# Replace missing cabins with the placeholder category 'N'.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df_train['Cabin'] is chained in-place assignment, which pandas deprecates and
# which does not update the frame once Copy-on-Write is active.
df_train['Cabin'] = df_train['Cabin'].fillna('N')

In [15]:
# One-hot encode Cabin: one cabin_<value> column per distinct raw Cabin string,
# including the 'N' placeholder for missing cabins.
# NOTE(review): this yields a very wide, sparse set of columns because most cabin
# strings are near-unique — consider encoding only the deck letter.
cabin_values = np.array(df_train["Cabin"]).reshape(-1, 1)

onehotObject_cabin = OneHotEncoder(handle_unknown = 'ignore')
onehotObject_cabin.fit(cabin_values)

# Column names follow the category order learned by the encoder.
cabins_columns = ["cabin_{0}".format(i) for i in onehotObject_cabin.categories_[0]]

# The indicator frame gets its own name (like onehot_ticket for Pclass), so the
# fitted encoder stays available under onehotObject_cabin instead of being
# overwritten by a DataFrame.
onehot_cabin = pd.DataFrame(onehotObject_cabin.transform(cabin_values).toarray().astype(int), columns = cabins_columns)

# Merge on the row index, then drop the source column now that it is encoded.
df_merged = pd.merge(df_train, onehot_cabin, left_index = True, right_index = True)
df_merged = df_merged.drop("Cabin", axis = 1)

df_merged

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,ticket_1,...,cabin_F E69,cabin_F G63,cabin_F G73,cabin_F2,cabin_F33,cabin_F38,cabin_F4,cabin_G6,cabin_N,cabin_T
0,1,0,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,0,...,0,0,0,0,0,0,0,0,1,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,1,...,0,0,0,0,0,0,0,0,0,0
2,3,1,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,0,...,0,0,0,0,0,0,0,0,1,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,1,...,0,0,0,0,0,0,0,0,0,0
4,5,0,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,0,...,0,0,0,0,0,0,0,0,1,0
887,888,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,1,...,0,0,0,0,0,0,0,0,0,0
888,889,0,"Johnston, Miss. Catherine Helen ""Carrie""",0,-1.0,1,2,W./C. 6607,23.4500,0,...,0,0,0,0,0,0,0,0,1,0
889,890,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,1,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Continue with the frame that has Cabin one-hot encoded.
df_train = df_merged

In [17]:
# Display the fully preprocessed training frame.
df_train

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,ticket_1,...,cabin_F E69,cabin_F G63,cabin_F G73,cabin_F2,cabin_F33,cabin_F38,cabin_F4,cabin_G6,cabin_N,cabin_T
0,1,0,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,0,...,0,0,0,0,0,0,0,0,1,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,1,...,0,0,0,0,0,0,0,0,0,0
2,3,1,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,0,...,0,0,0,0,0,0,0,0,1,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,1,...,0,0,0,0,0,0,0,0,0,0
4,5,0,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,0,...,0,0,0,0,0,0,0,0,1,0
887,888,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,1,...,0,0,0,0,0,0,0,0,0,0
888,889,0,"Johnston, Miss. Catherine Helen ""Carrie""",0,-1.0,1,2,W./C. 6607,23.4500,0,...,0,0,0,0,0,0,0,0,1,0
889,890,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,1,...,0,0,0,0,0,0,0,0,0,0


# Train

In [25]:
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

import copy

In [26]:
# Seed shared by the train/test splits and the CatBoost models below.
random_state_ = 42

# Columns passed to CatBoost as categorical: the encoded Sex column, the family
# counts, and every indicator column created during preprocessing.
cat_features_ = ["Sex", "SibSp", "Parch"]
for column_group in (ages_, tickets_columns, embarked_columns, cabins_columns):
    cat_features_.extend(column_group)

# Full feature set: the categorical columns plus the two numeric ones.
X_features = np.array(list(cat_features_) + ['Fare', 'Age'])
# Name of the target column.
y = "Survived"

# Hold out 33% of the rows for evaluation.
XTrain, XTest, YTrain, YTest = train_test_split(
    df_train[X_features],
    df_train[y],
    test_size = 0.33,
    random_state = random_state_,
)

In [27]:
# Baseline CatBoost classifier trained on every feature.
cbc_params = dict(
    random_state = random_state_,
    learning_rate = 0.03,
    n_estimators = 1000,
    max_depth = 6,
    loss_function = 'Logloss',
    silent = True,
)
model_cbc = CatBoostClassifier(**cbc_params)

model_cbc.fit(XTrain, YTrain, cat_features = cat_features_)

<catboost.core.CatBoostClassifier at 0x7efdf84245b0>

In [31]:
# ROC AUC of the baseline model on the held-out split, using the second
# probability column returned by predict_proba.
baseline_positive_proba = model_cbc.predict_proba(XTest)[:, 1]
roc_auc_score(YTest, baseline_positive_proba)

0.9004761904761904

In [32]:
# Per-feature importances reported by the fitted baseline model.
feature_importances = model_cbc.feature_importances_

# Show the feature names next to the importances (cast to float) as a tuple.
# Presumably both are in the same column order as the training frame — verify.
X_features, feature_importances.astype(float)

(array(['Sex', 'SibSp', 'Parch', 'is_baby', 'is_kid', 'is_teen',
        'is_middle', 'is_old', 'ticket_1', 'ticket_2', 'ticket_3',
        'embarked_C', 'embarked_N', 'embarked_Q', 'embarked_S',
        'cabin_A10', 'cabin_A14', 'cabin_A16', 'cabin_A19', 'cabin_A20',
        'cabin_A23', 'cabin_A24', 'cabin_A26', 'cabin_A31', 'cabin_A32',
        'cabin_A34', 'cabin_A36', 'cabin_A5', 'cabin_A6', 'cabin_A7',
        'cabin_B101', 'cabin_B102', 'cabin_B18', 'cabin_B19', 'cabin_B20',
        'cabin_B22', 'cabin_B28', 'cabin_B3', 'cabin_B30', 'cabin_B35',
        'cabin_B37', 'cabin_B38', 'cabin_B39', 'cabin_B4', 'cabin_B41',
        'cabin_B42', 'cabin_B49', 'cabin_B5', 'cabin_B50',
        'cabin_B51 B53 B55', 'cabin_B57 B59 B63 B66', 'cabin_B58 B60',
        'cabin_B69', 'cabin_B71', 'cabin_B73', 'cabin_B77', 'cabin_B78',
        'cabin_B79', 'cabin_B80', 'cabin_B82 B84', 'cabin_B86',
        'cabin_B94', 'cabin_B96 B98', 'cabin_C101', 'cabin_C103',
        'cabin_C104', 'cabin_C106', 

In [35]:
# Score each candidate feature (or one-hot group) on its own: train a CatBoost
# model on just those columns and record its ROC AUC on the held-out split.
features_scores = []

# Single columns are strings; one-hot groups are lists of column names.
features_all = ["Sex", "SibSp", "Parch", "Age", "Fare", embarked_columns, tickets_columns, ages_]

for feature in features_all:
    print("---{0}---".format(feature))
    # Wrap a single column name in a list so DataFrame selection returns a frame.
    feature = [feature] if isinstance(feature, str) else feature

    # Columns of this candidate that CatBoost must treat as categorical
    # (None when the candidate has none).
    _cat = list(set(feature) & set(cat_features_)) or None

    XTrain, XTest, YTrain, YTest = train_test_split(
        df_train[feature],
        df_train[y],
        test_size = 0.33,
        random_state = random_state_,
    )
    model = CatBoostClassifier(
        random_state = random_state_,
        learning_rate = 0.03,
        n_estimators = 1000,
        max_depth = 6,
        loss_function = 'Logloss',
        silent = True,
    )

    # Only pass cat_features when there is at least one categorical column.
    fit_options = {"cat_features": _cat} if _cat else {}
    model.fit(XTrain, YTrain, **fit_options)

    score = roc_auc_score(YTest, model.predict_proba(XTest).T[1])

    print("ROC AUC: {0}".format(score))

    features_scores.append(score)

---Sex---
ROC AUC: 0.7814285714285715
---SibSp---
ROC AUC: 0.5800238095238095
---Parch---
ROC AUC: 0.569952380952381
---Age---
ROC AUC: 0.5514285714285714
---Fare---
ROC AUC: 0.7475476190476191
---['embarked_C', 'embarked_N', 'embarked_Q', 'embarked_S']---
ROC AUC: 0.5625238095238095
---['ticket_1', 'ticket_2', 'ticket_3']---
ROC AUC: 0.7044761904761905
---['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old']---
ROC AUC: 0.5409761904761905


In [36]:
# Order the candidates from lowest to highest single-feature ROC AUC.
# Scores are paired with their candidates before sorting; looking each score up
# with features_scores.index() would return the first match for tied scores,
# duplicating one candidate and dropping another. Sorting on the score only keeps
# tied candidates in their original order (the sort is stable) and never compares
# a string candidate with a list candidate.
features_sorted = [
    candidate
    for _score, candidate in sorted(zip(features_scores, features_all), key = lambda pair: pair[0])
]

features_sorted

[['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old'],
 'Age',
 ['embarked_C', 'embarked_N', 'embarked_Q', 'embarked_S'],
 'Parch',
 'SibSp',
 ['ticket_1', 'ticket_2', 'ticket_3'],
 'Fare',
 'Sex']

In [37]:
# Build every ordered pair of distinct candidates as one flat list of column names.
features_groups = []

i = 0  # not read anywhere in this cell; kept so the notebook namespace is unchanged

for left in features_sorted:
    for right in features_sorted:
        if left == right:
            continue

        left_is_group = isinstance(left, list)
        right_is_group = isinstance(right, list)

        if left_is_group and right_is_group:
            combined = left + right
        elif left_is_group:
            # Group columns first, then the single column.
            combined = left + [right]
        elif right_is_group:
            # Group columns still go first even when the group is the second item.
            combined = right + [left]
        else:
            combined = [left, right]

        features_groups.append(combined)

# Single candidates first, then all pairwise combinations.
features_all = features_sorted + features_groups

features_all

[['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old'],
 'Age',
 ['embarked_C', 'embarked_N', 'embarked_Q', 'embarked_S'],
 'Parch',
 'SibSp',
 ['ticket_1', 'ticket_2', 'ticket_3'],
 'Fare',
 'Sex',
 ['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old', 'Age'],
 ['is_baby',
  'is_kid',
  'is_teen',
  'is_middle',
  'is_old',
  'embarked_C',
  'embarked_N',
  'embarked_Q',
  'embarked_S'],
 ['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old', 'Parch'],
 ['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old', 'SibSp'],
 ['is_baby',
  'is_kid',
  'is_teen',
  'is_middle',
  'is_old',
  'ticket_1',
  'ticket_2',
  'ticket_3'],
 ['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old', 'Fare'],
 ['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old', 'Sex'],
 ['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old', 'Age'],
 ['embarked_C', 'embarked_N', 'embarked_Q', 'embarked_S', 'Age'],
 ['Age', 'Parch'],
 ['Age', 'SibSp'],
 ['ticket_1', 'ticket_2', 'ticket_3', 'Age'],
 ['Age', 'Fare']

In [41]:
# Greedy forward feature selection.
# Candidates are tried in the order of features_all. Each candidate's new columns
# are added to features_current, a CatBoost model is trained on the enlarged set,
# and the columns are kept only if the held-out ROC AUC does not drop below the
# last accepted score.
features_current = []
scores_current = []
models_current = []

# Set for fast membership tests; the categorical list is rebuilt every round.
cat_lookup = set(cat_features_)

for feature in features_all:
    # A candidate is either one column name or a list of column names.
    candidate_columns = [feature] if isinstance(feature, str) else list(feature)

    # Only columns that are not selected yet are new in this round.
    feature_this = [name for name in candidate_columns if name not in features_current]
    if not feature_this:
        continue
    features_current += feature_this

    # Categorical columns among the current selection, in selection order.
    _cat = [name for name in features_current if name in cat_lookup]

    XTrain, XTest, YTrain, YTest = train_test_split(df_train[features_current], df_train[y], test_size = 0.33, random_state = random_state_)
    model = CatBoostClassifier(random_state = random_state_,
                               learning_rate = 0.03,
                               n_estimators = 1000,
                               max_depth = 6,
                               loss_function = 'Logloss',
                               silent = True)

    # An empty list is passed as None, i.e. no categorical columns.
    model.fit(XTrain, YTrain, cat_features = _cat or None)

    score = roc_auc_score(YTest, model.predict_proba(XTest).T[1])

    # Compare against the last accepted score as soon as one exists. The previous
    # check (len(scores_current) > 1) skipped the comparison for the second
    # candidate, so it was always accepted.
    if scores_current and score < scores_current[-1]:
        # Rejected: the new columns were appended last, so drop them from the tail.
        del features_current[-len(feature_this):]
    else:
        print("{0} - {1}".format(features_current, score))
        scores_current.append(score)
        models_current.append(model)

['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old'] - 0.5409761904761905
['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old', 'Age'] - 0.555595238095238
['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old', 'Age', 'embarked_C', 'embarked_N', 'embarked_Q', 'embarked_S'] - 0.5999523809523809
['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old', 'Age', 'embarked_C', 'embarked_N', 'embarked_Q', 'embarked_S', 'Parch'] - 0.6347619047619047
['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old', 'Age', 'embarked_C', 'embarked_N', 'embarked_Q', 'embarked_S', 'Parch', 'SibSp'] - 0.7209285714285714
['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old', 'Age', 'embarked_C', 'embarked_N', 'embarked_Q', 'embarked_S', 'Parch', 'SibSp', 'ticket_1', 'ticket_2', 'ticket_3'] - 0.7859761904761904
['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old', 'Age', 'embarked_C', 'embarked_N', 'embarked_Q', 'embarked_S', 'Parch', 'SibSp', 'ticket_1', 'ticket_2', 'ticket_3', 'Fare'] - 0.78778571428571

In [42]:
# ROC AUC of the last accepted step of the feature selection.
scores_current[-1]

0.8959761904761905

In [43]:
# Report the selected feature set.
print("Best features:", features_current)

Best features: ['is_baby', 'is_kid', 'is_teen', 'is_middle', 'is_old', 'Age', 'embarked_C', 'embarked_N', 'embarked_Q', 'embarked_S', 'Parch', 'SibSp', 'ticket_1', 'ticket_2', 'ticket_3', 'Fare', 'Sex']


In [44]:
# Re-split on the selected features and set up a fresh model with the baseline settings.
XTrain, XTest, YTrain, YTest = train_test_split(
    df_train[features_current],
    df_train[y],
    test_size = 0.33,
    random_state = random_state_,
)

selected_model_params = dict(
    random_state = random_state_,
    learning_rate = 0.03,
    n_estimators = 1000,
    max_depth = 6,
    loss_function = 'Logloss',
    silent = True,
)
model = CatBoostClassifier(**selected_model_params)

In [45]:
# Fit on the selected features, marking the selected categorical columns as such.
selected_cat_features = list(set(features_current) & set(cat_features_))
model.fit(XTrain, YTrain, cat_features = selected_cat_features)

<catboost.core.CatBoostClassifier at 0x7efdfa475910>

In [47]:
# ROC AUC of the refitted model on the held-out split, using the second
# probability column returned by predict_proba.
selected_positive_proba = model.predict_proba(XTest)[:, 1]
roc_auc_score(YTest, selected_positive_proba)

0.8959761904761905

In [48]:
# Same split on the selected features as the previous cells (same seed and test size).
XTrain, XTest, YTrain, YTest = train_test_split(df_train[features_current], df_train[y], test_size = 0.33, random_state = random_state_)
# Unconfigured classifier used as the base estimator for the grid search below.
model_grid = CatBoostClassifier(silent = True, loss_function = 'Logloss')

In [50]:
# Hyperparameter grid for the search below.
# Keys use the names CatBoost reports in its result ('iterations', 'depth',
# 'random_seed') rather than the n_estimators / max_depth / random_state aliases.
# The manual sweep later in the notebook reads grid['iterations'] and
# grid['depth'], so with these keys it works from this cell alone and does not
# rely on the dict having been changed elsewhere.
grid = {"iterations": np.arange(300, 1100, 100),
        "depth": np.arange(2, 9, 1),
        "learning_rate": np.arange(0.01, 0.06, 0.01),
        "random_seed": [random_state_]}

In [51]:
# Run CatBoost's built-in grid search over `grid` on the training split.
# The returned dict is read via params['params'] in a later cell.
# NOTE(review): no cat_features are passed here, unlike the other fits in this
# notebook — confirm this is intended.
params = model_grid.grid_search(grid, XTrain, YTrain)


bestTest = 0.4298784139
bestIteration = 299

0:	loss: 0.4298784	best: 0.4298784 (0)	total: 140ms	remaining: 39s

bestTest = 0.4131285068
bestIteration = 299

1:	loss: 0.4131285	best: 0.4131285 (1)	total: 243ms	remaining: 33.8s

bestTest = 0.4083295654
bestIteration = 295

2:	loss: 0.4083296	best: 0.4083296 (2)	total: 346ms	remaining: 32s

bestTest = 0.4050915831
bestIteration = 255

3:	loss: 0.4050916	best: 0.4050916 (3)	total: 456ms	remaining: 31.5s

bestTest = 0.4004253465
bestIteration = 255

4:	loss: 0.4004253	best: 0.4004253 (4)	total: 563ms	remaining: 31s

bestTest = 0.4197577422
bestIteration = 399

5:	loss: 0.4197577	best: 0.4004253 (4)	total: 699ms	remaining: 31.9s

bestTest = 0.4096537297
bestIteration = 398

6:	loss: 0.4096537	best: 0.4004253 (4)	total: 840ms	remaining: 32.8s

bestTest = 0.4051403149
bestIteration = 393

7:	loss: 0.4051403	best: 0.4004253 (4)	total: 975ms	remaining: 33.1s

bestTest = 0.4050915831
bestIteration = 255

8:	loss: 0.4050916	best: 0.4004253 (4)	t


bestTest = 0.3916451888
bestIteration = 563

71:	loss: 0.3916452	best: 0.3899270 (47)	total: 20.6s	remaining: 59.5s

bestTest = 0.3899270138
bestIteration = 358

72:	loss: 0.3899270	best: 0.3899270 (47)	total: 21s	remaining: 59.6s

bestTest = 0.3920776397
bestIteration = 312

73:	loss: 0.3920776	best: 0.3899270 (47)	total: 21.5s	remaining: 59.7s

bestTest = 0.3936175897
bestIteration = 193

74:	loss: 0.3936176	best: 0.3899270 (47)	total: 21.9s	remaining: 59.8s

bestTest = 0.3925665197
bestIteration = 979

75:	loss: 0.3925665	best: 0.3899270 (47)	total: 22.4s	remaining: 1m

bestTest = 0.3916451888
bestIteration = 563

76:	loss: 0.3916452	best: 0.3899270 (47)	total: 22.8s	remaining: 1m

bestTest = 0.3899270138
bestIteration = 358

77:	loss: 0.3899270	best: 0.3899270 (47)	total: 23.3s	remaining: 1m

bestTest = 0.3920776397
bestIteration = 312

78:	loss: 0.3920776	best: 0.3899270 (47)	total: 23.9s	remaining: 1m

bestTest = 0.3936175897
bestIteration = 193

79:	loss: 0.3936176	best: 0.3899


bestTest = 0.3853357622
bestIteration = 157

142:	loss: 0.3853358	best: 0.3806897 (87)	total: 48.5s	remaining: 46.4s

bestTest = 0.39027917
bestIteration = 223

143:	loss: 0.3902792	best: 0.3806897 (87)	total: 48.9s	remaining: 46.2s

bestTest = 0.3909277839
bestIteration = 205

144:	loss: 0.3909278	best: 0.3806897 (87)	total: 49.4s	remaining: 46s

bestTest = 0.3868177783
bestIteration = 718

145:	loss: 0.3868178	best: 0.3806897 (87)	total: 49.9s	remaining: 45.8s

bestTest = 0.3829424486
bestIteration = 283

146:	loss: 0.3829424	best: 0.3806897 (87)	total: 50.6s	remaining: 45.8s

bestTest = 0.3853357622
bestIteration = 157

147:	loss: 0.3853358	best: 0.3806897 (87)	total: 51.2s	remaining: 45.7s

bestTest = 0.39027917
bestIteration = 223

148:	loss: 0.3902792	best: 0.3806897 (87)	total: 51.7s	remaining: 45.5s

bestTest = 0.3909277839
bestIteration = 205

149:	loss: 0.3909278	best: 0.3806897 (87)	total: 52.4s	remaining: 45.4s

bestTest = 0.3868177783
bestIteration = 718

150:	loss: 0.386


bestTest = 0.3708138377
bestIteration = 175

212:	loss: 0.3708138	best: 0.3708138 (202)	total: 1m 30s	remaining: 28.6s

bestTest = 0.3804788197
bestIteration = 176

213:	loss: 0.3804788	best: 0.3708138 (202)	total: 1m 31s	remaining: 28.2s

bestTest = 0.3852727846
bestIteration = 77

214:	loss: 0.3852728	best: 0.3708138 (202)	total: 1m 32s	remaining: 27.8s

bestTest = 0.3788703345
bestIteration = 547

215:	loss: 0.3788703	best: 0.3708138 (202)	total: 1m 32s	remaining: 27.5s

bestTest = 0.379295374
bestIteration = 235

216:	loss: 0.3792954	best: 0.3708138 (202)	total: 1m 33s	remaining: 27.1s

bestTest = 0.3708138377
bestIteration = 175

217:	loss: 0.3708138	best: 0.3708138 (202)	total: 1m 34s	remaining: 26.8s

bestTest = 0.3804788197
bestIteration = 176

218:	loss: 0.3804788	best: 0.3708138 (202)	total: 1m 35s	remaining: 26.5s

bestTest = 0.3852727846
bestIteration = 77

219:	loss: 0.3852728	best: 0.3708138 (202)	total: 1m 35s	remaining: 26.1s

bestTest = 0.3788703345
bestIteration = 54

In [52]:
# Best parameter combination found by the grid search.
hyperparams = params['params']

hyperparams

{'depth': 8, 'random_seed': 42, 'iterations': 300, 'learning_rate': 0.03}

In [61]:
# Manual sweep over the same grid, scored by ROC AUC on the held-out split.
# Each entry of `scores` is ((learning_rate, iterations, depth), roc_auc).
scores = []

# The split and the categorical column list do not depend on the hyperparameters,
# so they are computed once instead of once per combination.
XTrain, XTest, YTrain, YTest = train_test_split(df_train[features_current], df_train[y], test_size = 0.33, random_state = random_state_)
sweep_cat_features = list(set(features_current) & set(cat_features_))

for lr in grid['learning_rate']:
    for iter_ in grid['iterations']:
        for d in grid['depth']:
            model = CatBoostClassifier(random_state = random_state_,
                                       learning_rate = lr,
                                       iterations = iter_,
                                       depth = d,
                                       loss_function = 'Logloss',
                                       silent = True)

            model.fit(XTrain, YTrain, cat_features = sweep_cat_features)

            score = roc_auc_score(YTest, model.predict_proba(XTest).T[1])

            print("{0} - {1}".format((lr, iter_, d), score))

            scores.append(((lr, iter_, d), score))


(0.01, 300, 2) - 0.8803333333333333
(0.01, 300, 3) - 0.8865476190476191
(0.01, 300, 4) - 0.8878095238095237
(0.01, 300, 5) - 0.8962142857142856
(0.01, 300, 6) - 0.8965
(0.01, 300, 7) - 0.9039761904761905
(0.01, 300, 8) - 0.8992380952380952
(0.01, 400, 2) - 0.8833095238095237
(0.01, 400, 3) - 0.8881428571428571
(0.01, 400, 4) - 0.8895
(0.01, 400, 5) - 0.897
(0.01, 400, 6) - 0.8970238095238094
(0.01, 400, 7) - 0.9027857142857144
(0.01, 400, 8) - 0.8999999999999999
(0.01, 500, 2) - 0.8843571428571428
(0.01, 500, 3) - 0.8905714285714286
(0.01, 500, 4) - 0.8925476190476188
(0.01, 500, 5) - 0.8969999999999999
(0.01, 500, 6) - 0.8987380952380952
(0.01, 500, 7) - 0.9035000000000001
(0.01, 500, 8) - 0.9011904761904762
(0.01, 600, 2) - 0.8858095238095238
(0.01, 600, 3) - 0.8907857142857142
(0.01, 600, 4) - 0.8936190476190475
(0.01, 600, 5) - 0.8980476190476191
(0.01, 600, 6) - 0.8974285714285715
(0.01, 600, 7) - 0.9047142857142857
(0.01, 600, 8) - 0.9023333333333333
(0.01, 700, 2) - 0.8853333333

(0.05, 400, 2) - 0.8866428571428571
(0.05, 400, 3) - 0.8913571428571427
(0.05, 400, 4) - 0.893547619047619
(0.05, 400, 5) - 0.8992619047619047
(0.05, 400, 6) - 0.9017619047619047
(0.05, 400, 7) - 0.8999761904761905
(0.05, 400, 8) - 0.8943095238095238
(0.05, 500, 2) - 0.8853571428571428
(0.05, 500, 3) - 0.8928095238095238
(0.05, 500, 4) - 0.8937857142857143
(0.05, 500, 5) - 0.8973809523809523
(0.05, 500, 6) - 0.8985238095238094
(0.05, 500, 7) - 0.8972142857142856
(0.05, 500, 8) - 0.8936190476190475
(0.05, 600, 2) - 0.8830952380952382
(0.05, 600, 3) - 0.894047619047619
(0.05, 600, 4) - 0.8937142857142857
(0.05, 600, 5) - 0.8950952380952379
(0.05, 600, 6) - 0.8965714285714286
(0.05, 600, 7) - 0.8966428571428571
(0.05, 600, 8) - 0.8900952380952382
(0.05, 700, 2) - 0.8844761904761904
(0.05, 700, 3) - 0.8934285714285716
(0.05, 700, 4) - 0.8896666666666666
(0.05, 700, 5) - 0.8913809523809524
(0.05, 700, 6) - 0.893
(0.05, 700, 7) - 0.8944047619047619
(0.05, 700, 8) - 0.8902857142857143
(0.05, 

In [69]:
# Each entry of `scores` is ((lr, iterations, depth), score): a tuple next to a
# float, which is a ragged nested sequence. dtype=object is given explicitly
# because NumPy deprecated, and from 1.24 rejects, building such arrays implicitly.
# After the transpose, row 0 holds the parameter tuples and row 1 the scores.
scores_np = np.array(scores, dtype = object).T

best_score = max(scores_np[1])

# Every parameter combination that reached the best score, and the score itself.
scores_np[0][np.where(scores_np[1] == best_score)], best_score

  scores_np = np.array(scores).T


(array([(0.04, 300, 7)], dtype=object), 0.9092142857142858)

In [92]:
# Final model: learning_rate 0.04, 300 iterations, depth 7 — the combination the
# manual sweep above reported as best. The grid-search result stored in
# `hyperparams` (random_seed, learning_rate, iterations, depth) is the alternative
# source for these values.
seed = 42

final_model_params = dict(
    random_state = seed,
    learning_rate = 0.04,
    iterations = 300,
    depth = 7,
    loss_function = 'Logloss',
    silent = True,
)

XTrain, XTest, YTrain, YTest = train_test_split(
    df_train[features_current],
    df_train[y],
    test_size = 0.33,
    random_state = seed,
)
model_final = CatBoostClassifier(**final_model_params)

In [93]:
# Fit the final model, marking the selected categorical columns as such.
final_cat_features = list(set(features_current) & set(cat_features_))
model_final.fit(XTrain, YTrain, cat_features = final_cat_features)

<catboost.core.CatBoostClassifier at 0x7efdf8ba72b0>

In [94]:
# ROC AUC of the final model on the held-out split, using the second
# probability column returned by predict_proba.
final_positive_proba = model_final.predict_proba(XTest)[:, 1]
roc_auc_score(YTest, final_positive_proba)

0.9092142857142858

# Prediction

## Preprocessing

In [73]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder

# Raw test set; the cells below apply the preprocessing steps to it.
df_test = pd.read_csv(filepath_or_buffer="data/test.csv")

In [74]:
# Display the freshly loaded test frame (pandas truncates the middle rows).
df_test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [75]:
# Recode Sex in place: "male" -> 1, "female" -> 0. Rows holding any other
# value are left untouched, so re-running the cell is a no-op.
for sex_label, sex_code in (("male", 1), ("female", 0)):
    df_test.loc[df_test["Sex"] == sex_label, "Sex"] = sex_code

df_test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",1,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",0,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",1,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",1,,0,0,359309,8.0500,,S


In [76]:
# One-hot encode Pclass into ticket_1 / ticket_2 / ticket_3.
# The categories are pinned rather than learned from test.csv, so the three
# columns the model consumes (see features_current) always exist, in the same
# order, even if some class were absent from the data being scored.
onehotObject_ticket = OneHotEncoder(categories = [[1, 2, 3]], handle_unknown = 'ignore')

pclass_values = np.array(df_test["Pclass"]).reshape(-1, 1)
onehotObject_ticket.fit(pclass_values)

tickets_columns = ["ticket_{0}".format(i) for i in onehotObject_ticket.categories_[0]]

onehot_ticket = pd.DataFrame(
    onehotObject_ticket.transform(pclass_values).toarray().astype(int),
    columns = tickets_columns,
)

# Index-aligned merge: both frames share the same default index.
df_test = pd.merge(df_test, onehot_ticket, left_index = True, right_index = True)

# The raw column is replaced by its indicator columns.
df_test = df_test.drop(columns = "Pclass")

df_test

Unnamed: 0,PassengerId,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ticket_1,ticket_2,ticket_3
0,892,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q,0,0,1
1,893,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0000,,S,0,0,1
2,894,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q,0,1,0
3,895,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S,0,0,1
4,896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,S,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,"Spector, Mr. Woolf",1,,0,0,A.5. 3236,8.0500,,S,0,0,1
414,1306,"Oliva y Ocana, Dona. Fermina",0,39.0,0,0,PC 17758,108.9000,C105,C,1,0,0
415,1307,"Saether, Mr. Simon Sivertsen",1,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0,0,1
416,1308,"Ware, Mr. Frederick",1,,0,0,359309,8.0500,,S,0,0,1


In [77]:
# NOTE(review): this blanks the Embarked value of the sixth test row,
# apparently so that the "N" placeholder (and hence an embarked_N column)
# shows up when the encoder is fit on the test data. It alters real test
# data -- confirm it is intended.
# A single positional .iloc write replaces the chained `df[col].iloc[i] = ...`
# form, which raises SettingWithCopyWarning and does nothing under pandas
# Copy-on-Write.
df_test.iloc[5, df_test.columns.get_loc("Embarked")] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [78]:
# Replace missing ports with the placeholder category "N".
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated and does not
# update df_test under pandas Copy-on-Write.
df_test["Embarked"] = df_test["Embarked"].fillna("N")

df_test["Embarked"].value_counts()

S    269
C    102
Q     46
N      1
Name: Embarked, dtype: int64

In [79]:
# One-hot encode Embarked into embarked_C / embarked_N / embarked_Q / embarked_S.
# The categories are pinned rather than learned from test.csv, so every
# embarked_* column listed in features_current exists whether or not the test
# data contains a missing ("N") port.
onehotObject_embarked = OneHotEncoder(categories = [["C", "N", "Q", "S"]], handle_unknown = 'ignore')

embarked_values = np.array(df_test["Embarked"]).reshape(-1, 1)
onehotObject_embarked.fit(embarked_values)

embarked_columns = ["embarked_{0}".format(i) for i in onehotObject_embarked.categories_[0]]

# As in the original cell, the encoder name is rebound to the indicator frame.
onehotObject_embarked = pd.DataFrame(
    onehotObject_embarked.transform(embarked_values).toarray().astype(int),
    columns = embarked_columns,
)

# Index-aligned merge: both frames share the same default index.
df_test = pd.merge(df_test, onehotObject_embarked, left_index = True, right_index = True)

# The raw column is replaced by its indicator columns.
df_test = df_test.drop(columns = "Embarked")

df_test

Unnamed: 0,PassengerId,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,ticket_1,ticket_2,ticket_3,embarked_C,embarked_N,embarked_Q,embarked_S
0,892,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,0,0,1,0,0,1,0
1,893,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0000,,0,0,1,0,0,0,1
2,894,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,0,1,0,0,0,1,0
3,895,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,0,0,1,0,0,0,1
4,896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,"Spector, Mr. Woolf",1,,0,0,A.5. 3236,8.0500,,0,0,1,0,0,0,1
414,1306,"Oliva y Ocana, Dona. Fermina",0,39.0,0,0,PC 17758,108.9000,C105,1,0,0,1,0,0,0
415,1307,"Saether, Mr. Simon Sivertsen",1,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,0,0,1,0,0,0,1
416,1308,"Ware, Mr. Frederick",1,,0,0,359309,8.0500,,0,0,1,0,0,0,1


In [80]:
# Missing ages become the sentinel -1. The result is assigned back because
# fillna(inplace=True) on a column selection is deprecated and does not update
# df_test under pandas Copy-on-Write.
df_test['Age'] = df_test['Age'].fillna(-1)

In [81]:
# Placeholder target column (all 0.0); the prediction cell overwrites it.
df_test['Survived'] = np.zeros(len(df_test))

In [82]:
ages_ = ["is_baby", "is_kid", "is_teen", "is_middle", "is_old"]

# (exclusive lower bound, inclusive upper bound) for the four bounded buckets.
# An Age of -1 (the missing-age sentinel) falls into none of them.
age_bounds = {
    "is_baby": (0, 5),
    "is_kid": (5, 12),
    "is_teen": (12, 20),
    "is_middle": (20, 60),
}
for bucket_name, (lower, upper) in age_bounds.items():
    df_test[bucket_name] = df_test["Age"].le(upper) & df_test["Age"].gt(lower)

# The last bucket has no upper bound.
df_test["is_old"] = df_test["Age"].gt(60)


df_test

Unnamed: 0,PassengerId,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,ticket_1,...,embarked_C,embarked_N,embarked_Q,embarked_S,Survived,is_baby,is_kid,is_teen,is_middle,is_old
0,892,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,0,...,0,0,1,0,0.0,False,False,False,True,False
1,893,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0000,,0,...,0,0,0,1,0.0,False,False,False,True,False
2,894,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,0,...,0,0,1,0,0.0,False,False,False,False,True
3,895,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,0,...,0,0,0,1,0.0,False,False,False,True,False
4,896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,0,...,0,0,0,1,0.0,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,"Spector, Mr. Woolf",1,-1.0,0,0,A.5. 3236,8.0500,,0,...,0,0,0,1,0.0,False,False,False,False,False
414,1306,"Oliva y Ocana, Dona. Fermina",0,39.0,0,0,PC 17758,108.9000,C105,1,...,1,0,0,0,0.0,False,False,False,True,False
415,1307,"Saether, Mr. Simon Sivertsen",1,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,0,...,0,0,0,1,0.0,False,False,False,True,False
416,1308,"Ware, Mr. Frederick",1,-1.0,0,0,359309,8.0500,,0,...,0,0,0,1,0.0,False,False,False,False,False


In [83]:
# Missing cabins become the placeholder category 'N'. The result is assigned
# back because fillna(inplace=True) on a column selection is deprecated and
# does not update df_test under pandas Copy-on-Write.
df_test['Cabin'] = df_test['Cabin'].fillna('N')

In [84]:
# One-hot encode Cabin, producing one cabin_<value> indicator column for each
# distinct value found in the test data.
cabin_values = np.array(df_test["Cabin"]).reshape(-1, 1)
onehotObject_cabin = OneHotEncoder(handle_unknown = 'ignore').fit(cabin_values)

cabins_columns = ["cabin_{0}".format(label) for label in onehotObject_cabin.categories_[0]]

cabin_indicators = onehotObject_cabin.transform(cabin_values).toarray().astype(int)
# The encoder name is rebound to the indicator frame from here on.
onehotObject_cabin = pd.DataFrame(cabin_indicators, columns = cabins_columns)

# Index-aligned merge, then discard the raw column.
df_test = pd.merge(df_test, onehotObject_cabin, left_index = True, right_index = True)
df_test = df_test.drop(columns = "Cabin")

df_test

Unnamed: 0,PassengerId,Name,Sex,Age,SibSp,Parch,Ticket,Fare,ticket_1,ticket_2,...,cabin_E60,cabin_F,cabin_F E46,cabin_F E57,cabin_F G63,cabin_F2,cabin_F33,cabin_F4,cabin_G6,cabin_N
0,892,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,0,0,...,0,0,0,0,0,0,0,0,0,1
1,893,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0000,0,0,...,0,0,0,0,0,0,0,0,0,1
2,894,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,0,1,...,0,0,0,0,0,0,0,0,0,1
3,895,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,0,0,...,0,0,0,0,0,0,0,0,0,1
4,896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,"Spector, Mr. Woolf",1,-1.0,0,0,A.5. 3236,8.0500,0,0,...,0,0,0,0,0,0,0,0,0,1
414,1306,"Oliva y Ocana, Dona. Fermina",0,39.0,0,0,PC 17758,108.9000,1,0,...,0,0,0,0,0,0,0,0,0,0
415,1307,"Saether, Mr. Simon Sivertsen",1,38.5,0,0,SOTON/O.Q. 3101262,7.2500,0,0,...,0,0,0,0,0,0,0,0,0,1
416,1308,"Ware, Mr. Frederick",1,-1.0,0,0,359309,8.0500,0,0,...,0,0,0,0,0,0,0,0,0,1


## Prediction

In [85]:
# Columns fed to the model; the prediction cell selects exactly these from df_test.
features_current

['is_baby',
 'is_kid',
 'is_teen',
 'is_middle',
 'is_old',
 'Age',
 'embarked_C',
 'embarked_N',
 'embarked_Q',
 'embarked_S',
 'Parch',
 'SibSp',
 'ticket_1',
 'ticket_2',
 'ticket_3',
 'Fare',
 'Sex']

In [86]:
# Selected features that are also declared categorical (element order is arbitrary).
_cat = list(set(features_current).intersection(set(cat_features_)))

In [95]:
# Overwrite the placeholder Survived column with the final model's predictions.
survived_pred = model_final.predict(df_test[features_current])
df_test['Survived'] = survived_pred

In [96]:
# Display the test frame after the Survived column has been filled with predictions.
df_test

Unnamed: 0,PassengerId,Name,Sex,Age,SibSp,Parch,Ticket,Fare,ticket_1,ticket_2,...,cabin_E60,cabin_F,cabin_F E46,cabin_F E57,cabin_F G63,cabin_F2,cabin_F33,cabin_F4,cabin_G6,cabin_N
0,892,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,0,0,...,0,0,0,0,0,0,0,0,0,1
1,893,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0000,0,0,...,0,0,0,0,0,0,0,0,0,1
2,894,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,0,1,...,0,0,0,0,0,0,0,0,0,1
3,895,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,0,0,...,0,0,0,0,0,0,0,0,0,1
4,896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,"Spector, Mr. Woolf",1,-1.0,0,0,A.5. 3236,8.0500,0,0,...,0,0,0,0,0,0,0,0,0,1
414,1306,"Oliva y Ocana, Dona. Fermina",0,39.0,0,0,PC 17758,108.9000,1,0,...,0,0,0,0,0,0,0,0,0,0
415,1307,"Saether, Mr. Simon Sivertsen",1,38.5,0,0,SOTON/O.Q. 3101262,7.2500,0,0,...,0,0,0,0,0,0,0,0,0,1
416,1308,"Ware, Mr. Frederick",1,-1.0,0,0,359309,8.0500,0,0,...,0,0,0,0,0,0,0,0,0,1


In [97]:
# Keep only the passenger id and the predicted label for the result file.
df_test_result = df_test.loc[:, ["PassengerId", "Survived"]]

In [98]:
# Display the two-column result frame before it is written to disk.
df_test_result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [99]:
# Write the predictions without the index column.
df_test_result.to_csv(path_or_buf="data/test_result.csv", index=False)