In [177]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
import os

from sklearn.preprocessing import StandardScaler

# use data spliter to simplfy the train and test data organisation
from sklearn.model_selection import train_test_split
# use random forest as the model, benfits from not needing the data scaled 
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score

seed = 42

In [178]:
train0 = pd.read_csv( '../input/train.csv')
test0 = pd.read_csv('../input/test.csv')

n_train = train0.shape[0]
n_test = test0.shape[0]

In [179]:
# a map of more aggregated titles
Title_Dictionary = {
                    "Capt":       "Officer",
                    "Col":        "Officer",
                    "Major":      "Officer",
                    "Jonkheer":   "Royalty",
                    "Don":        "Royalty",
                    "Sir" :       "Royalty",
                    "Dr":         "Officer",
                    "Rev":        "Officer",
                    "the Countess":"Royalty",
                    "Dona":       "Royalty",
                    "Mme":        "Mrs",
                    "Mlle":       "Miss",
                    "Ms":         "Mrs",
                    "Mr" :        "Mr",
                    "Mrs" :       "Mrs",
                    "Miss" :      "Miss",
                    "Master" :    "Master",
                    "Lady" :      "Royalty"
                    }

titles = set()
for _, title in Title_Dictionary.items():
    titles.add(title)

train = train0.copy()
test = test0.copy()

full_data = [train, test]

# Feature that tells whether a passenger had a cabin on the Titanic
train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)


# Create new feature FamilySize as a combination of SibSp and Parch
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    
# Create new feature IsAlone from FamilySize
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    
# Remove all NULLS in the Embarked column (fill with most common category, which is 'S')
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    
# Remove all NULLS in the Fare column and create a new feature CategoricalFare
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())

# create new feature title, extracted from full name, and map to one of 6 title categories
for dataset in full_data:
    dataset['Title'] = dataset['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())
    # we map each title
    dataset['Title'] = dataset.Title.map(Title_Dictionary)

# map categorical features to numerical (consider creating dummies for non ordinal!)
for dataset in full_data:
    # Mapping Sex
    #dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    dataset['Sex'] = dataset['Sex'].astype('category').cat.codes
    # dataset['Embarked'] = dataset['Embarked'].astype('category').cat.codes

    
# For the age we do sth a bit more elaborate. Instead of using the median of the entire dataset
# we calculate the median age for a finer grouping, namely by (Sex, Pclass, Title)
# We then fill all the null age values with the median corresponding to (Sex, Pclass, Title) of that entry

# calculate median of ENTIRE dataset (including test)
combined = train.append(test)
combined.reset_index(inplace=True)  # probably not necessary, but do it anyway
combined.drop('index', inplace=True, axis=1)  # probably not necessary, but do it anyway
age_medians = combined[['Sex', 'Pclass', 'Title', 'Age']].groupby(['Sex', 'Pclass', 'Title']).median()

def fill_age(row):
    return age_medians.loc[(row['Sex'], row['Pclass'], row['Title']), 'Age']

for dataset in full_data:
    age_msk = dataset['Age'].isnull()  # select entries with missing age
    dataset.loc[age_msk, 'Age'] = dataset.loc[age_msk].apply(fill_age, axis=1)

# create additional ordinal categorical age feature
n_age_bins = 5
train['CategoricalAge'], age_bins = pd.cut(train['Age'], n_age_bins, retbins=True)

train = pd.get_dummies(train, columns=['Embarked'])
test = pd.get_dummies(test, columns=['Embarked'])

train = pd.get_dummies(train, columns=['Title'])
test = pd.get_dummies(test, columns=['Title'])

In [180]:
drop_elements = ['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
train = train.drop(drop_elements, axis = 1)
test = test.drop(drop_elements, axis = 1)

train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Has_Cabin,FamilySize,IsAlone,CategoricalAge,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
0,1,0,3,1,22.0,7.25,0,2,0,"(16.336, 32.252]",0,0,1,0,0,1,0,0,0
1,2,1,1,0,38.0,71.2833,1,2,0,"(32.252, 48.168]",1,0,0,0,0,0,1,0,0
2,3,1,3,0,26.0,7.925,0,1,1,"(16.336, 32.252]",0,0,1,0,1,0,0,0,0
3,4,1,1,0,35.0,53.1,1,2,0,"(32.252, 48.168]",0,0,1,0,0,0,1,0,0
4,5,0,3,1,35.0,8.05,0,1,1,"(32.252, 48.168]",0,0,1,0,0,1,0,0,0
5,6,0,3,1,26.0,8.4583,0,1,1,"(16.336, 32.252]",0,1,0,0,0,1,0,0,0
6,7,0,1,1,54.0,51.8625,1,1,1,"(48.168, 64.084]",0,0,1,0,0,1,0,0,0
7,8,0,3,1,2.0,21.075,0,5,0,"(0.34, 16.336]",0,0,1,1,0,0,0,0,0
8,9,1,3,0,27.0,11.1333,0,3,0,"(16.336, 32.252]",0,0,1,0,0,0,1,0,0
9,10,1,2,0,14.0,30.0708,0,2,0,"(0.34, 16.336]",1,0,0,0,0,0,1,0,0


In [181]:
# wanted_features = ['Pclass', 'Sex', 'Age', 'Fare', 'Has_Cabin', 'FamilySize', 'IsAlone',\
                   #'Embarked_C', 'Embarked_Q', 'Embarked_S']
# wanted_features = ['Pclass', 'Sex', 'Age', 'Fare', 'Has_Cabin', 'IsAlone']
# wanted_features = ['Pclass', 'Sex', 'Age', 'Fare', 'Has_Cabin', 'FamilySize']
# wanted_features = ['Pclass', 'Sex', 'Age', 'Fare', 'Has_Cabin', 'IsAlone', 'FamilySize', 'Embarked']
wanted_features = ['Pclass', 'Sex', 'Age', 'Fare', 'Has_Cabin', 'IsAlone', 'FamilySize',
                   'Embarked_C', 'Embarked_Q', 'Embarked_S',
                   'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Title_Royalty']


X_train = train[wanted_features].values
X_test = test[wanted_features].values

Y_train = train['Survived'].values
n_features = X_train.shape[1]


In [182]:
# split into test and training data
do_cv = False

if do_cv:
    X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=seed)

In [183]:
# create model - We use Random Forest
parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50, 
                  'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}

clf = RandomForestClassifier(**parameters)

# train model
clf.fit(X_train, Y_train)


RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=6, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [184]:
#  check the score of the model /accurancy/ precision recall
if do_cv:
    clf.score(X_val, Y_val)
    Y_pred = clf.predict(X_val)

    print('Accuracy:')
    print(accuracy_score(Y_val, Y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(Y_val, Y_pred))
    print(classification_report(Y_val, Y_pred))


In [185]:
def fix_param_string(params):
    out = ''
    for param, val in params.items():
        lst = param.split('_')
        paramstr = ''
        for term in lst:
            paramstr += term.capitalize()
        out += '_' + paramstr + str(val).capitalize()
    return out

param_str = fix_param_string(parameters)
file_name = 'predictions_RF' + param_str + '.csv'
print(file_name)

Y_final = clf.predict(X_test)
test['Survived'] = Y_final
df_out = test[['PassengerId', 'Survived']]
#df_out.head(10)
df_out.to_csv(os.path.join('../output', file_name), index=False)

predictions_RF_BootstrapFalse_MinSamplesLeaf3_NEstimators50_MinSamplesSplit10_MaxFeaturesSqrt_MaxDepth6.csv


In [186]:
test.head(10)

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Has_Cabin,FamilySize,IsAlone,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Survived
0,892,3,1,34.5,7.8292,0,1,1,0,1,0,0,0,1,0,0,0,0
1,893,3,0,47.0,7.0,0,2,0,0,0,1,0,0,0,1,0,0,0
2,894,2,1,62.0,9.6875,0,1,1,0,1,0,0,0,1,0,0,0,0
3,895,3,1,27.0,8.6625,0,1,1,0,0,1,0,0,1,0,0,0,0
4,896,3,0,22.0,12.2875,0,3,0,0,0,1,0,0,0,1,0,0,1
5,897,3,1,14.0,9.225,0,1,1,0,0,1,0,0,1,0,0,0,0
6,898,3,0,30.0,7.6292,0,1,1,0,1,0,0,1,0,0,0,0,1
7,899,2,1,26.0,29.0,0,3,0,0,0,1,0,0,1,0,0,0,0
8,900,3,0,18.0,7.2292,0,1,1,1,0,0,0,0,0,1,0,0,1
9,901,3,1,21.0,24.15,0,3,0,0,0,1,0,0,1,0,0,0,0
