In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import StandardScaler

# use random forest as the model, benfits from not needing the data scaled 
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score

In [2]:
train = pd.read_csv( '../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [3]:
full_data = [train, test]

# Feature that tells whether a passenger had a cabin on the Titanic
train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

# Feature engineering steps taken from Sina
# Create new feature FamilySize as a combination of SibSp and Parch
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
# Create new feature IsAlone from FamilySize
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
# Remove all NULLS in the Embarked column
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
# Remove all NULLS in the Fare column and create a new feature CategoricalFare
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
# Create a New feature CategoricalAge
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()  # counts the number of NaN is the Age column
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    # fill missing ages with random ages around mean age (questionable...)
    dataset.loc[dataset['Age'].isnull(), 'Age'] = age_null_random_list  
    dataset['Age'] = dataset['Age'].astype(int)

for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].astype("category").cat.codes
    
    # Mapping Embarked (maybe create dummies ?)
    # dataset['Embarked'] = dataset['Embarked'].astype("category").cat.codes
    # dataset = pd.get_dummies(dataset, columns=['Embarked'])  # this friggin has no effect, WHY!!??

# why the hell does this not work if done in the above for loop (it has no effect)
train = pd.get_dummies(train, columns=['Embarked'])
test = pd.get_dummies(test, columns=['Embarked'])
    

In [4]:
drop_elements = ['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
train = train.drop(drop_elements, axis = 1)
test = test.drop(drop_elements, axis = 1)

train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Has_Cabin,FamilySize,IsAlone,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,1,22,7.25,0,2,0,0,0,1
1,2,1,1,0,38,71.2833,1,2,0,1,0,0
2,3,1,3,0,26,7.925,0,1,1,0,0,1
3,4,1,1,0,35,53.1,1,2,0,0,0,1
4,5,0,3,1,35,8.05,0,1,1,0,0,1
5,6,0,3,1,40,8.4583,0,1,1,0,1,0
6,7,0,1,1,54,51.8625,1,1,1,0,0,1
7,8,0,3,1,2,21.075,0,5,0,0,0,1
8,9,1,3,0,27,11.1333,0,3,0,0,0,1
9,10,1,2,0,14,30.0708,0,2,0,1,0,0


In [5]:
#wanted_features = ['Pclass', 'Sex', 'Age', 'Fare', 'Has_Cabin', 'FamilySize', 'IsAlone',\
                   #'Embarked_C', 'Embarked_Q', 'Embarked_S']
#wanted_features = ['Pclass', 'Sex', 'Age', 'Fare', 'Has_Cabin', 'IsAlone']
wanted_features = ['Pclass', 'Sex', 'Age', 'Fare', 'Has_Cabin', 'FamilySize']

X_train = train[wanted_features]
X_test = test[wanted_features]

Y_train = train['Survived']
Y_test = truth['Survived']

X_train.head(10)

Unnamed: 0,Pclass,Sex,Age,Fare,Has_Cabin,IsAlone
0,3,1,22,7.25,0,0
1,1,0,38,71.2833,1,0
2,3,0,26,7.925,0,1
3,1,0,35,53.1,1,0
4,3,1,35,8.05,0,1
5,3,1,40,8.4583,0,1
6,1,1,54,51.8625,1,1
7,3,1,2,21.075,0,0
8,3,0,27,11.1333,0,0
9,2,0,14,30.0708,0,0


In [139]:
min_

clf = RandomForestClassifier()


In [140]:
clf.fit(X_train, Y_train)

MLPClassifier(activation='relu', alpha=1e-08, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(4, 2), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=10000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=1e-08, validation_fraction=0.1,
       verbose=1, warm_start=False)

In [141]:
Y_pred = clf.predict(X_test)

n_train = len(Y_train)
n_test = len(Y_test)
train_frac = Y_train.sum()/n_train
pred_frac = Y_pred.sum()/n_train
test_frac = Y_test.sum()/n_train

print('test set contains %i samples' % n_test)

print('%2.2f%% survived in training set' % (100*train_frac))
print('%2.2f%% survived in test set' % (100*test_frac))
print('%2.2f%% survived in predicted test set' % (100*pred_frac))

acc = accuracy_score(Y_test, Y_pred)
print('Test Accuracy: %2.2f%%' % (100*acc))

test set contains 418 samples
38.38% survived in training set
17.06% survived in test set
15.38% survived in predicted test set
Test Accuracy: 93.06%


In [142]:
cm = confusion_matrix(Y_test, Y_pred)
f_score = f1_score(Y_test, Y_pred)
print('Confusion Matrix:')
print(cm)

print('\nin percent per row:')
print(100*cm.astype('float') / cm.sum(axis = 1)[:, np.newaxis])


print('\nF-Score: %2.4f' % (f_score))

Confusion Matrix:
[[259   7]
 [ 22 130]]

in percent per row:
[[ 97.36842105   2.63157895]
 [ 14.47368421  85.52631579]]

F-Score: 0.8997
