# Import packages and Settings

In [1]:
import pandas as pd

import matplotlib.pyplot as plt

from xgboost.sklearn import XGBClassifier

from catboost import CatBoostClassifier

%matplotlib inline

In [2]:
from sklearn.base import clone
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import ShuffleSplit, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

from sklearnex import patch_sklearn

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
train_data = pd.read_csv('./Data/train.csv')
test_data = pd.read_csv('./Data/test.csv')

train_data['Sex'] = train_data['Sex'].map({'male':0, 'female':1})
test_data['Sex'] = test_data['Sex'].map({'male':0, 'female':1})

label = 'Survived'

id = test_data['PassengerId']

In [4]:
models = [
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
    XGBClassifier(random_state=0),
    CatBoostClassifier(random_state=0, verbose=0, allow_writing_files=False)
]

In [5]:
tuning_parameters = {
    'DecisionTree': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 4, 6, None]
    },
    'RandomForest': {
        'n_estimators': [50, 100, 300],
        'max_depth': [2, 4, 6, None]
    },
    'GradientBoosting': {
        'learning_rate': [0.2, 0.1, 0.05],
        'n_estimators': [50, 100, 300],
        'max_depth': [1, 3, 5, None]
    },
    'xgboost': {
        'learning_rate': [0.5, 0.3, 0.1],
        'max_depth': [4, 6, 8]
    },
    'catboost': {
        'learning_rate': [0.05, 0.03, 0.01],
        'iterations': [300, 500, 800],
        'depth': [4, 6, 8]
    }
}

# Functions

In [6]:
def makeOneHot(column, train_data, test_data):
    train_data_new = train_data.copy()
    test_data_new = test_data.copy()

    encoder_onehot = OneHotEncoder(sparse_output=False)

    train_data_onehot = encoder_onehot.fit_transform(pd.DataFrame(train_data[column]))
    train_feature_name_onehot = encoder_onehot.get_feature_names_out()
    train_data_onehot = pd.DataFrame(train_data_onehot, columns=train_feature_name_onehot, dtype=int)

    test_data_onehot = encoder_onehot.transform(pd.DataFrame(test_data[column]))
    test_data_onehot = pd.DataFrame(test_data_onehot, columns=train_feature_name_onehot, dtype=int)

    train_data_new = pd.concat([train_data_new, train_data_onehot], axis=1)
    test_data_new = pd.concat([test_data_new, test_data_onehot], axis=1)

    train_data_new = train_data_new.drop(column, axis=1)
    test_data_new = test_data_new.drop(column, axis=1)

    return train_data_new, test_data_new

In [7]:
def prepareData(columns_to_drop, label, train_data, test_data):
    X_train = train_data.drop([label] + columns_to_drop + ['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    Y_train = train_data[label]
    X_test = test_data.drop(columns_to_drop + ['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    return X_train, Y_train, X_test

In [8]:
def evaluation(models, X_train, Y_train):
    model_names = [model.__class__.__name__ for model in models]
    model_scores = []

    cv = ShuffleSplit(random_state=0)

    for model in models:
        model = clone(model)
        cv_scores = cross_val_score(model, X_train, Y_train, cv=cv)
        model_scores.append(cv_scores.mean())

    cv_results = pd.DataFrame(columns=['name', 'accuracy_mean'])
    cv_results['name'] = model_names
    cv_results['accuracy_mean'] = model_scores

    return cv_results

In [9]:
def tuneParameters(models, tuning_parameters, X_train, Y_train):
    model_names = [model.__class__.__name__ for model in models]
    model_scores = []
    model_params = []

    cv = ShuffleSplit(random_state=0)

    for model, params in zip(models, tuning_parameters.items()):
        model = clone(model)
        tune_model = GridSearchCV(model, param_grid=params[1], scoring='accuracy', cv=cv)
        tune_model.fit(X_train, Y_train)
        model_scores.append(tune_model.best_score_)
        model_params.append(tune_model.best_params_)
    
    tuning_results = pd.DataFrame(columns=['name', 'best_score', 'best_params'])
    tuning_results['name'] = model_names
    tuning_results['best_score'] = model_scores
    tuning_results['best_params'] = model_params

    return tuning_results

In [10]:
def predict(model, X_train, Y_train, X_test, id, label, file_name = 'result.csv'):
    model = clone(model)
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)

    result = pd.DataFrame({
        id.name: id,
        label: predictions
    })
    result.to_csv('./Result/' + file_name, index=False)

# Base Model

In [11]:
columns_to_drop = ['Age', 'Fare', 'Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data, test_data)
print(X_train.columns)

Index(['Pclass', 'Sex', 'SibSp', 'Parch'], dtype='object')


In [32]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.795556
1      RandomForestClassifier       0.796667
2  GradientBoostingClassifier       0.813333
3               XGBClassifier       0.800000
4          CatBoostClassifier       0.810000


In [23]:
tuning_results = tuneParameters(models, tuning_parameters, X_train, Y_train)
print(tuning_results)

                         name  best_score  \
0      DecisionTreeClassifier    0.811111   
1      RandomForestClassifier    0.821111   
2  GradientBoostingClassifier    0.825556   
3               XGBClassifier    0.815556   
4          CatBoostClassifier    0.826667   

                                         best_params  
0              {'criterion': 'gini', 'max_depth': 4}  
1              {'max_depth': 2, 'n_estimators': 300}  
2  {'learning_rate': 0.1, 'max_depth': 1, 'n_esti...  
3             {'learning_rate': 0.1, 'max_depth': 4}  
4  {'depth': 6, 'iterations': 300, 'learning_rate...  


In [30]:
model = DecisionTreeClassifier(random_state=0, **tuning_results.iloc[0]['best_params'])
predict(model, X_train, Y_train, X_test, id, label, 'result_basic.csv')

# Features

In [34]:
def processFare(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Fare'
    fill_value = train_data_new[feature].mode()[0]
    test_data_new[feature] = test_data_new[feature].fillna(value=fill_value)

    return train_data_new, test_data_new

In [35]:
def processAge(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Age'

    age_median_by_pclass = train_data_new[['Pclass', 'Age']].groupby('Pclass').median()

    for pclass, row in age_median_by_pclass.iterrows():
        train_data_new.loc[(train_data_new['Age'].isnull()) & (train_data_new['Pclass'] == pclass), 'Age'] = row.iloc[0]
        test_data_new.loc[(test_data_new['Age'].isnull()) & (test_data_new['Pclass'] == pclass), 'Age'] = row.iloc[0]

    return train_data_new, test_data_new

In [36]:
def processEmbarked(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Embarked'
    fill_value = train_data_new[feature].mode()[0]
    train_data_new[feature] = train_data_new[feature].fillna(value=fill_value)
    test_data_new[feature] = test_data_new[feature].fillna(value=fill_value)

    train_data_new, test_data_new = makeOneHot('Embarked', train_data_new, test_data_new)

    return train_data_new, test_data_new

In [37]:
def processTitle(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Title'

    def process(name):
        new_feature = name.str.extract(r' ([A-Za-z]+)\.', expand=False)
        new_feature = new_feature.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
        new_feature = new_feature.replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
        
        return new_feature
    
    train_data_new[feature] = process(train_data_new['Name'])
    test_data_new[feature] = process(test_data_new['Name'])

    train_data_new, test_data_new = makeOneHot(feature, train_data_new, test_data_new)

    return train_data_new, test_data_new

In [38]:
def processFamily(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    train_data_new['Family'] = train_data_new['SibSp'] + train_data_new['Parch'] + 1
    test_data_new['Family'] = test_data_new['SibSp'] + test_data_new['Parch'] + 1

    return train_data_new, test_data_new

In [39]:
def processAlone(train_data, test_data):
    train_data_new, test_data_new = processFamily(train_data, test_data)

    train_data_new['Alone'] = train_data_new['Family'].apply(lambda x: 1 if x == 1 else 0)
    test_data_new['Alone'] = test_data_new['Family'].apply(lambda x: 1 if x == 1 else 0)

    return train_data_new, test_data_new

In [55]:
def processFareBin(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Fare'
    new_feature = 'FareBinned'

    fill_value = train_data[feature].mode()[0]
    test_data_new[feature] = test_data_new[feature].fillna(value=fill_value)

    train_data_new[new_feature], bins = pd.qcut(train_data_new[feature], 5, labels=[0,1,2,3,4], retbins=True)
    train_data_new[new_feature] = train_data_new[new_feature].astype(int)

    def bin(x):
        if x < bins[1]:
            return 0
        elif x < bins[2]:
            return 1
        elif x < bins[3]:
            return 2
        elif x < bins[4]:
            return 3
        else:
            return 4

    test_data_new[new_feature] = test_data_new[feature].apply(bin)

    return train_data_new, test_data_new

In [41]:
age_median_by_pclass = train_data[['Pclass', 'Age']].groupby('Pclass').median()
age_median_by_pclass

Unnamed: 0_level_0,Age
Pclass,Unnamed: 1_level_1
1,37.0
2,29.0
3,24.0


In [56]:
def processAgeBin(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Age'
    new_feature = 'AgeBinned'

    age_median_by_pclass = train_data_new[['Pclass', 'Age']].groupby('Pclass').median()

    for pclass, row in age_median_by_pclass.iterrows():
        train_data_new.loc[(train_data_new['Age'].isnull()) & (train_data_new['Pclass'] == pclass), 'Age'] = row.iloc[0]
        test_data_new.loc[(test_data_new['Age'].isnull()) & (test_data_new['Pclass'] == pclass), 'Age'] = row.iloc[0]

    train_data_new[new_feature], bins = pd.qcut(train_data_new[feature], 5, labels=[0,1,2,3,4], retbins=True)
    train_data_new[new_feature] = train_data_new[new_feature].astype(int)

    def bin(x):
        if x < bins[1]:
            return 0
        elif x < bins[2]:
            return 1
        elif x < bins[3]:
            return 2
        elif x < bins[4]:
            return 3
        else:
            return 4

    test_data_new[new_feature] = test_data_new[feature].apply(bin)

    return train_data_new, test_data_new

In [43]:
def processAgeMinor(train_data, test_data):
    train_data_new, test_data_new = processAge(train_data, test_data)

    feature = 'Age'
    new_feature = 'AgeMinor'
    minorAge = 16
    train_data_new[new_feature] = (train_data_new[feature] < minorAge).astype(int)
    test_data_new[new_feature] = (test_data_new[feature] < minorAge).astype(int)

    return train_data_new, test_data_new

In [44]:
def processTicketNumber(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    def get_ticket_number(ticket):
        num = ticket.split(' ')[-1]
        if num.isdigit():
            return int(num)
        else:
            return -1

    train_data_new['TicketNumber'] = train_data_new['Ticket'].apply(get_ticket_number)
    test_data_new['TicketNumber'] = test_data_new['Ticket'].apply(get_ticket_number)


    return train_data_new, test_data_new

In [45]:
def processSurvivedGroup(train_data, test_data):
    train_data_new, test_data_new = processTicketNumber(train_data, test_data)

    train_data_new['SurvivedGroup'] = 0.5
    test_data_new['SurvivedGroup'] = 0.5

    survive_mean_by_ticketnum = train_data_new[['TicketNumber', 'Survived']].groupby(['TicketNumber']).mean()

    for ticketnum, row in survive_mean_by_ticketnum.iterrows():
        train_data_new.loc[train_data_new['TicketNumber'] == ticketnum, 'SurvivedGroup'] = row.iloc[0]
        test_data_new.loc[test_data_new['TicketNumber'] == ticketnum, 'SurvivedGroup'] = row.iloc[0]

    return train_data_new, test_data_new

## Feature Selection

In [57]:
train_data_new, test_data_new = processFare(train_data, test_data)
train_data_new, test_data_new = processAge(train_data_new, test_data_new)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)
train_data_new, test_data_new = processTitle(train_data_new, test_data_new)
train_data_new, test_data_new = processFamily(train_data_new, test_data_new)
train_data_new, test_data_new = processAlone(train_data_new, test_data_new)
train_data_new, test_data_new = processFareBin(train_data_new, test_data_new)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processAgeMinor(train_data_new, test_data_new)
train_data_new, test_data_new = processTicketNumber(train_data_new, test_data_new)
train_data_new, test_data_new = processSurvivedGroup(train_data_new, test_data_new)

In [58]:
columns_to_drop = []
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)
print(X_train.columns)

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C',
       'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss', 'Title_Mr',
       'Title_Mrs', 'Title_Rare', 'Family', 'Alone', 'FareBinned', 'AgeBinned',
       'AgeMinor', 'TicketNumber', 'SurvivedGroup'],
      dtype='object')


In [53]:
cv = ShuffleSplit(random_state=0)
model = RandomForestClassifier(random_state=0)
selector = RFECV(estimator=model, cv=cv, scoring='accuracy')
selector.fit(X_train, Y_train)

for feature, rank in zip(X_train.columns, selector.ranking_):
    print(feature, rank)

Pclass 6
Sex 2
Age 5
SibSp 11
Parch 13
Fare 3
Embarked_C 17
Embarked_Q 20
Embarked_S 15
Title_Master 14
Title_Miss 9
Title_Mr 1
Title_Mrs 8
Title_Rare 19
Family 7
Alone 16
FareBinned 10
AgeBinned 12
AgeMinor 18
TicketNumber 4
SurvivedGroup 1


# Training

In [59]:
columns_selected = ['SurvivedGroup', 'Sex', 'FareBinned', 'AgeMinor', 'Pclass']
cv_results = evaluation(models, X_train[columns_selected], Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.980000
1      RandomForestClassifier       0.985556
2  GradientBoostingClassifier       0.978889
3               XGBClassifier       0.975556
4          CatBoostClassifier       0.977778


In [60]:
tuning_results = tuneParameters(models, tuning_parameters, X_train[columns_selected], Y_train)
print(tuning_results)

                         name  best_score  \
0      DecisionTreeClassifier    0.982222   
1      RandomForestClassifier    0.985556   
2  GradientBoostingClassifier    0.982222   
3               XGBClassifier    0.981111   
4          CatBoostClassifier    0.981111   

                                         best_params  
0              {'criterion': 'gini', 'max_depth': 4}  
1               {'max_depth': 6, 'n_estimators': 50}  
2  {'learning_rate': 0.2, 'max_depth': 1, 'n_esti...  
3             {'learning_rate': 0.1, 'max_depth': 4}  
4  {'depth': 4, 'iterations': 300, 'learning_rate...  


# Prediction

In [62]:
model = RandomForestClassifier(random_state=0, **tuning_results.iloc[1]['best_params'])
predict(model, X_train[columns_selected], Y_train, X_test[columns_selected], id, label)

# Improvement

In [None]:
data = pd.concat([train_data, test_data], axis=0)

data['Fare_5'] = pd.qcut(data.Fare, 5, labels=[0, 1, 2, 3, 4]).astype(int)

data['Survival_group'] = 0.5

for _, group in data.groupby('Ticket'):
    if(len(group) > 1):
        for index, row in group.iterrows():
            smax = group.drop(index)['Survived'].max()
            smin = group.drop(index)['Survived'].min()
            id = row['PassengerId']
            if(smax == 1.0):
                data.loc[data['PassengerId'] == id, 'Survival_group'] = 1.0
            elif(smin == 0.0):
                data.loc[data['PassengerId'] == id, 'Survival_group'] = 0.0

data['Title'] = data['Name'].str.extract('([A-Za-z]+)\.', expand=False)
data['Title'] = data['Title'].replace(['Capt', 'Col', 'Countess', 'Don', 'Dr', 'Dona', 'Jonkheer', 'Major', 'Rev', 'Sir'], 'Rare')
data['Title'] = data['Title'].replace(['Mlle', 'Ms', 'Mme'], 'Miss')
data['Title'] = data['Title'].replace(['Lady'], 'Mrs')

age_median = data.groupby('Title')['Age'].median().values
data['Age_group'] = data['Age']

data['Title_enc'] = data['Title'].map({'Mr': 0, 'Rare': 1, 'Master': 2, 'Miss': 3, 'Mrs': 4}).astype(int)
for i in range(0, 5):
    data.loc[(data['Age'].isnull())&(data['Title_enc']==i), 'Age_group'] = age_median[i]

data['Age_minor'] = (data['Age_group'] < 16.0).astype(int)

train = data[:len(train_data)]
test = data[len(train_data):]

features = ['Pclass', 'Sex_b', 'Fare_5_enc', 'Survival_group', 'Age_minor']

model = RandomForestClassifier(random_state=2, n_estimators=250, min_samples_split=20, oob_score=True)
model.fit(train[features], train['Survived'])

predictions = pd.DataFrame(model.predict(test[features]))

result = pd.concat([test.PassengerId, predictions], axis=1)
result.columns = ['PassengerId', 'Survived']

result.to_csv('./result.csv', index=False)