## Imports

In [159]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

## Load Data

In [160]:
raw_data = pd.read_csv('train.csv')

In [161]:
raw_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [162]:
data = raw_data.copy()

## Data Preprocessing

In [163]:
train_labels = data['Survived']
data.drop('Survived', axis=1, inplace=True)

In [164]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [165]:
from sklearn.base import BaseEstimator, TransformerMixin

class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        self.age_mean = X['Age'].mean()
        self.fare_mean = X['Fare'].mean()
        return self
    def transform(self, X, y=None):
        # Drop unnecessary features
        X.drop(['PassengerId', 'Ticket', 'Cabin'], axis=1, inplace=True)
        
        # Maps
        X['Embarked'] = X['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})
        X['Sex'] = X['Sex'].map({'male': 0, 'female': 1})
        
        # New attribute adds number of siblings and parents
        X['FamCount'] = X['SibSp'] + X['Parch']
        X.drop(['SibSp', 'Parch'], axis=1, inplace=True)
        
        # Title attribute extracts titles from name feature
        title = [i.split(",")[1].split(".")[0].strip() for i in X["Name"]]
        X["Title"] = pd.Series(title)
        X.drop('Name', axis=1, inplace=True)
        X['Title'] = X['Title'].map({'Mr': 0, 'Mrs': 1, 'Miss': 4, 'Mme': 1, 'Ms': 1, 'Master': 2, 
                                             'Don': 3, 'Rev': 3, 'Dr': 3, 'Major': 3, 'Lady': 1, 'Sir': 0, 'Mlle': 3, 
                                             'Col': 3, 'Capt': 3, 'the Countess': 3, 'Jonkheer': 3})
        
        # Fill NA
        X['Age'].fillna(self.age_mean, inplace=True)
        X['Embarked'].fillna(0, inplace=True)
        X['Fare'].fillna(self.fare_mean, inplace=True)
        X['Title'].fillna(0, inplace=True)
        
        # One Hot Encode
        X = pd.get_dummies(X, columns=['Pclass', 'Sex', 'Embarked', 'FamCount', 'Title'])
        
        return X

In [166]:
preprocess = Preprocessor()

In [167]:
train_processed = preprocess.fit_transform(data)

In [168]:
train_processed.head()

Unnamed: 0,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_0,Sex_1,Embarked_0.0,Embarked_1.0,Embarked_2.0,FamCount_0,FamCount_1,FamCount_2,FamCount_3,FamCount_4,FamCount_5,FamCount_6,FamCount_7,FamCount_10,Title_0,Title_1,Title_2,Title_3,Title_4
0,22.0,7.25,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0
1,38.0,71.2833,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
2,26.0,7.925,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
3,35.0,53.1,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0
4,35.0,8.05,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0


## Model Selection

In [169]:
from sklearn.model_selection import cross_val_score

In [170]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
def score_model(model):
    scores = cross_val_score(model, train_processed, train_labels,
                            scoring="accuracy", cv=10)
    display_scores(scores)

In [171]:
from xgboost import XGBClassifier

In [172]:
xgb = XGBClassifier()
score_model(xgb)

Scores: [0.8        0.82222222 0.78651685 0.8988764  0.87640449 0.83146067
 0.85393258 0.7752809  0.85393258 0.84090909]
Mean: 0.8339535807513336
Standard deviation: 0.03721984020292451


## Fine Tuning

In [146]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

def random_search(model, param_distribs):
    rnd_search = RandomizedSearchCV(model, param_distributions=param_distribs,
                                n_iter=100, cv=5, scoring='accuracy', random_state=42)
    rnd_search.fit(train_processed, train_labels)
    print(rnd_search.best_params_)
    print(rnd_search.best_score_)

In [147]:
from sklearn.model_selection import GridSearchCV

In [98]:
from scipy.stats import uniform

xgb_param_distribs = {
      'n_estimators': randint(100, 1000),
      'learning_rate': uniform(0.01, 0.6),
      'subsample': uniform(),
      'max_depth': [3, 4, 5, 6, 7, 8, 9],
      'colsample_bytree': uniform(),
      'min_child_weight': [1, 2, 3, 4]
}

random_search(xgb, xgb_param_distribs)

{'colsample_bytree': 0.5398410913016731, 'learning_rate': 0.13183673484086164, 'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 938, 'subsample': 0.289751452913768}
0.8439955106621774


## Test Submission

In [173]:
test_data = pd.read_csv('test.csv')

In [174]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [175]:
test_ids = test_data['PassengerId']

In [176]:
test_processed = preprocess.transform(test_data)

In [187]:
xgb_final = XGBClassifier(colsample_bytree=0.8331949117361643, 
                          learning_rate=0.11401879210466324, 
                          max_depth=3, 
                          min_child_weight=4, 
                          n_estimators=661, 
                          subsample=0.662522284353982)

xgb_final.fit(train_processed.as_matrix(), train_labels)

  


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8331949117361643, gamma=0,
       learning_rate=0.11401879210466324, max_delta_step=0, max_depth=3,
       min_child_weight=4, missing=None, n_estimators=661, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.662522284353982)

In [188]:
pd.set_option('display.max_columns', None)
train_processed.head()

Unnamed: 0,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_0,Sex_1,Embarked_0.0,Embarked_1.0,Embarked_2.0,FamCount_0,FamCount_1,FamCount_2,FamCount_3,FamCount_4,FamCount_5,FamCount_6,FamCount_7,FamCount_10,Title_0,Title_1,Title_2,Title_3,Title_4
0,22.0,7.25,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0
1,38.0,71.2833,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
2,26.0,7.925,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
3,35.0,53.1,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0
4,35.0,8.05,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0


In [189]:
test_processed.head()

Unnamed: 0,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_0,Sex_1,Embarked_0,Embarked_1,Embarked_2,FamCount_0,FamCount_1,FamCount_2,FamCount_3,FamCount_4,FamCount_5,FamCount_6,FamCount_7,FamCount_10,Title_0,Title_1,Title_2,Title_3,Title_4
0,34.5,7.8292,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
1,47.0,7.0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0
2,62.0,9.6875,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,27.0,8.6625,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0
4,22.0,12.2875,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0


In [190]:
preds = xgb_final.predict(test_processed.as_matrix())

  """Entry point for launching an IPython kernel.


In [191]:
import csv

values = []
for i in range(len(preds)):
    values.append([test_ids[i], preds[i]])

with open('submission_2', 'w') as file:
    writer = csv.writer(file)
    writer.writerow(['PassengerId', 'Survived'])
    writer.writerows(values)