# Titanic: Machine Learning from Disaster Entry

This notebook was created for a class competition hosted by the instructors of UC Berkeley's Data 144: Data Mining. The _Submission #_ headings correspond to the submissions I entered in the class competition, not the public competition. My best score in the class competition was .7751, on my 5th submission; that same submission scored .7583 in the public competition. This placed me in the top 10 teams in the class.

Note: This is my initial solo entry before fine-tuning and collaboration. This was later edited by me and my teammates in the class competition to improve this model.


- **Submission 1**: logistic regression
- **Submission 2**: logistic regression, decision tree, neural network - ended up keeping neural network
- **Submission 3, 4, 5**: neural network

# Submission 1

In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the labelled training set and the unlabelled competition test set.
df_train = pd.read_csv('../input/titanic/train.csv')
df_test = pd.read_csv('../input/titanic/test.csv')

# Preview the first rows of the training data.
df_train.head()

In [None]:
# Separate the target column from the feature columns.
Y_train = df_train['Survived']
X_train = df_train.drop(columns='Survived')

In [None]:
# Hold out part of train.csv for evaluation.
# The split is taken straight from df_train (not from X_train/Y_train) so re-running
# this cell never re-splits an already split frame, and random_state pins the split
# so every accuracy reported below is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df_train.drop('Survived', axis=1), df_train['Survived'], random_state=42)

# Untouched copies of both splits; Submission 2 starts again from these.
X_train_2 = X_train.copy()
X_test_2 = X_test.copy()

### Only using the numeric columns as is.

In [None]:
# Baseline: logistic regression on the raw numeric columns, with missing values set to 0.
numeric_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
basic_X_train = X_train[numeric_columns].fillna(0)
model = LogisticRegression().fit(basic_X_train, Y_train)
training_accuracy = model.score(X=basic_X_train, y=Y_train)
print("Training Accuracy: ", training_accuracy)

In [None]:
# Evaluate the baseline model on the held-out split.
X_test_num = X_test[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']].fillna(0)

# This is the held-out score, so it is stored and labelled as test accuracy
# (it was previously printed as "Training Accuracy" and overwrote training_accuracy).
test_accuracy = model.score(X_test_num, Y_test)
print("Test Accuracy: ", test_accuracy)

In [None]:
# Encode Sex numerically: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
X_train['Sex'] = X_train['Sex'].replace(sex_codes)
X_train

## Filling NAs

In [None]:
# Print the number of missing values in every column.
for column, n_missing in X_train.isna().sum().items():
    print(column, n_missing)

### Embarked

In [None]:
# Row count per embarkation port (rows with a missing Embarked value are not counted).
X_train.groupby('Embarked').size()

In [None]:
# Only a few Embarked values are missing, so they take the majority class 'S'.
embarked_missing = X_train['Embarked'].isna()
X_train['Embarked'] = X_train['Embarked'].where(~embarked_missing, 'S')

### Age

In [None]:
# Rows whose Age is missing.
age_is_missing = X_train['Age'].isna()
X_train.loc[age_is_missing]

In [None]:
def _mean_age_with_title(frame, title):
    """Return the mean Age (rounded to 2 decimals) of rows whose Name contains `title`.

    The title is matched literally (regex=False). With the default regex matching the
    trailing '.' is a wildcard, so 'Mr.' would also match 'Mrs.' and 'Dr.' would match
    surnames such as 'Drew', skewing the means. Missing ages are skipped by `.mean()`;
    the result is NaN when no row with a known age carries the title.
    """
    has_title = frame['Name'].str.contains(title, regex=False)
    return round(frame.loc[has_title, 'Age'].mean(), 2)


# Mean known age per title, used in the next cell to impute missing ages.
avg_miss_age = _mean_age_with_title(X_train, 'Miss.')
avg_ms_age = _mean_age_with_title(X_train, 'Ms.')
avg_mrs_age = _mean_age_with_title(X_train, 'Mrs.')
avg_mr_age = _mean_age_with_title(X_train, 'Mr.')
avg_dr_age = _mean_age_with_title(X_train, 'Dr.')
avg_master_age = _mean_age_with_title(X_train, 'Master.')

In [None]:
# Fill each missing Age with the mean age for the passenger's title.
# Two fixes over the earlier version:
#   * titles are matched literally (regex=False), so the 'Mr.' pass no longer
#     overwrites the values just written for 'Mrs.' rows;
#   * 'Master.' rows receive avg_master_age (they previously received avg_dr_age).
age_missing = X_train['Age'].isna()
X_train_age = X_train.copy()
for title, title_mean in [('Miss.', avg_miss_age), ('Ms.', avg_ms_age),
                          ('Mrs.', avg_mrs_age), ('Mr.', avg_mr_age),
                          ('Dr.', avg_dr_age), ('Master.', avg_master_age)]:
    has_title = X_train['Name'].str.contains(title, regex=False)
    X_train_age['Age'] = X_train_age['Age'].mask(has_title & age_missing, title_mean)
X_train['Age'] = X_train_age['Age']

______________________________________________________________________________________________________________

## Train and test accuracies for train.csv

Removing `Cabin` for now, but removing `Name` and `Ticket` permanently.

In [None]:
# Drop the free-text columns, then one-hot encode the remaining categorical columns.
X_train = pd.get_dummies(X_train.drop(columns=['Name', 'Cabin', 'Ticket']))
X_train.head()

In [None]:
# Refit logistic regression on the cleaned, encoded training features.
model = LogisticRegression().fit(X_train, Y_train)
training_accuracy = model.score(X=X_train, y=Y_train)
print("Training Accuracy: ", training_accuracy)

In [None]:
# Apply the Submission 1 preprocessing to the held-out split.
X_test_copy = X_test.copy()
X_test_copy['Sex'] = X_test_copy['Sex'].replace('male', 0).replace('female', 1)
X_test_copy['Embarked'] = X_test_copy['Embarked'].fillna('S')

# Rows carrying each title. regex=False keeps the trailing '.' literal, so 'Mr.'
# does not also match 'Mrs.' and 'Dr.' does not match surnames such as 'Drew'.
titles = ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']
has_title = {title: X_test_copy['Name'].str.contains(title, regex=False) for title in titles}

# Mean known age per title (2 decimals), computed on this split.
title_means = [round(X_test_copy.loc[has_title[title], 'Age'].mean(), 2) for title in titles]
avg_miss_age, avg_ms_age, avg_mrs_age, avg_mr_age, avg_dr_age, avg_master_age = title_means

# Fill only the missing ages; 'Master.' now gets its own mean instead of the 'Dr.' mean.
age_missing = X_test['Age'].isna()
X_test_age = X_test.copy()
for title, title_mean in zip(titles, title_means):
    X_test_age['Age'] = X_test_age['Age'].mask(has_title[title] & age_missing, title_mean)
X_test_copy['Age'] = X_test_age['Age']

X_test_copy = X_test_copy.drop(['Name', 'Cabin', 'Ticket'], axis=1).fillna(0)
X_test_copy = pd.get_dummies(X_test_copy)
# get_dummies only creates columns for categories present in this split; align with
# the training matrix so the model sees the same columns in the same order.
X_test_copy = X_test_copy.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Accuracy of the refit logistic regression on the held-out split.
test_accuracy = model.score(X=X_test_copy, y=Y_test)
print("Test Accuracy: ", test_accuracy)

## test.csv

In [None]:
# Apply the Submission 1 preprocessing to test.csv.
df_test_copy = df_test.copy()
df_test_copy['Sex'] = df_test_copy['Sex'].replace('male', 0).replace('female', 1)
df_test_copy['Embarked'] = df_test_copy['Embarked'].fillna('S')

# Rows carrying each title. regex=False keeps the trailing '.' literal, so 'Mr.'
# does not also match 'Mrs.' and 'Dr.' does not match surnames such as 'Drew'.
titles = ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']
has_title = {title: df_test_copy['Name'].str.contains(title, regex=False) for title in titles}

# Mean known age per title (2 decimals), computed on test.csv itself.
title_means = [round(df_test_copy.loc[has_title[title], 'Age'].mean(), 2) for title in titles]
avg_miss_age, avg_ms_age, avg_mrs_age, avg_mr_age, avg_dr_age, avg_master_age = title_means

# Fill only the missing ages; 'Master.' now gets its own mean instead of the 'Dr.' mean.
age_missing = df_test['Age'].isna()
df_test_age = df_test.copy()
for title, title_mean in zip(titles, title_means):
    df_test_age['Age'] = df_test_age['Age'].mask(has_title[title] & age_missing, title_mean)
df_test_copy['Age'] = df_test_age['Age']

df_test_copy = df_test_copy.drop(['Name', 'Cabin', 'Ticket'], axis=1)
# Any value still missing after imputation is set to 0.
df_test_copy = pd.get_dummies(df_test_copy).fillna(0)
# Align with the columns the model was fitted on (same set, same order).
df_test_copy = df_test_copy.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Predict survival for every passenger in test.csv with the logistic regression model.
test_predictions = model.predict(df_test_copy)
test_predictions

In [None]:
# Build the submission table: one prediction per PassengerId.
submission = pd.DataFrame(
    {'Survived': model.predict(df_test_copy)},
    index=df_test.PassengerId,
)
submission

In [None]:
# submission.reset_index().to_csv('submission.csv', index=False)

______________________________________________________________________________________________________________

# Submission 2

## Adding `Cabin` and filled NAs 

In [None]:
# Start again from the untouched splits saved right after train_test_split.
X_train = X_train_2
X_test = X_test_2

X_train_copy = X_train.copy()
X_train_copy['Sex'] = X_train_copy['Sex'].replace('male', 0).replace('female', 1)
X_train_copy['Embarked'] = X_train_copy['Embarked'].fillna('S')

# Rows carrying each title. regex=False keeps the trailing '.' literal, so 'Mr.'
# does not also match 'Mrs.' and 'Dr.' does not match surnames such as 'Drew'.
titles = ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']
has_title = {title: X_train_copy['Name'].str.contains(title, regex=False) for title in titles}

# Mean known age per title (2 decimals).
title_means = [round(X_train_copy.loc[has_title[title], 'Age'].mean(), 2) for title in titles]
avg_miss_age, avg_ms_age, avg_mrs_age, avg_mr_age, avg_dr_age, avg_master_age = title_means

# Fill only the missing ages; 'Master.' now gets its own mean instead of the 'Dr.' mean.
age_missing = X_train['Age'].isna()
X_train_age = X_train.copy()
for title, title_mean in zip(titles, title_means):
    X_train_age['Age'] = X_train_age['Age'].mask(has_title[title] & age_missing, title_mean)
X_train_copy['Age'] = X_train_age['Age']

X_train_copy = X_train_copy.drop(['Name', 'Cabin', 'Ticket'], axis=1)
X_train_copy = pd.get_dummies(X_train_copy)

# Cabin -> deck letter. Missing cabins get a deck chosen by class (1 -> C, 2 -> F,
# 3 -> G), any cabin containing 'T' is relabelled 'A', and only the first character is kept.
X_train_cabin = X_train.copy()
X_train_cabin['Cabin'] = X_train_cabin['Cabin'].fillna(
    X_train_cabin['Pclass'].map({1: 'C', 2: 'F', 3: 'G'}))
X_train_cabin['Cabin'] = X_train_cabin['Cabin'].mask(
    X_train_cabin['Cabin'].str.contains('T'), 'A')
cabin = X_train_cabin['Cabin'].str[0]
X_train_copy['Cabin'] = cabin
X_train_copy = pd.get_dummies(X_train_copy)

In [None]:
# Logistic regression on the features that now include the Cabin deck.
model = LogisticRegression().fit(X_train_copy, Y_train)

# The score is deterministic, so one call serves both names the notebook uses.
training_accuracy = train_accuracy = model.score(X=X_train_copy, y=Y_train)
print("Train Accuracy: ", train_accuracy)

In [None]:
# Apply the Submission 2 preprocessing (with Cabin deck) to the held-out split.
X_test_copy = X_test.copy()
X_test_copy['Sex'] = X_test_copy['Sex'].replace('male', 0).replace('female', 1)
X_test_copy['Embarked'] = X_test_copy['Embarked'].fillna('S')

# Rows carrying each title. regex=False keeps the trailing '.' literal, so 'Mr.'
# does not also match 'Mrs.' and 'Dr.' does not match surnames such as 'Drew'.
titles = ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']
has_title = {title: X_test_copy['Name'].str.contains(title, regex=False) for title in titles}

# Mean known age per title (2 decimals), computed on this split.
title_means = [round(X_test_copy.loc[has_title[title], 'Age'].mean(), 2) for title in titles]
avg_miss_age, avg_ms_age, avg_mrs_age, avg_mr_age, avg_dr_age, avg_master_age = title_means

# Fill only the missing ages; 'Master.' now gets its own mean instead of the 'Dr.' mean.
age_missing = X_test['Age'].isna()
X_test_age = X_test.copy()
for title, title_mean in zip(titles, title_means):
    X_test_age['Age'] = X_test_age['Age'].mask(has_title[title] & age_missing, title_mean)
X_test_copy['Age'] = X_test_age['Age']

X_test_copy = X_test_copy.drop(['Name', 'Cabin', 'Ticket'], axis=1)
X_test_copy = pd.get_dummies(X_test_copy)

# Cabin -> deck letter. Missing cabins get a deck chosen by class (1 -> C, 2 -> F,
# 3 -> G), any cabin containing 'T' is relabelled 'A', and only the first character is kept.
X_test_cabin = X_test.copy()
X_test_cabin['Cabin'] = X_test_cabin['Cabin'].fillna(
    X_test_cabin['Pclass'].map({1: 'C', 2: 'F', 3: 'G'}))
X_test_cabin['Cabin'] = X_test_cabin['Cabin'].mask(
    X_test_cabin['Cabin'].str.contains('T'), 'A')
cabin = X_test_cabin['Cabin'].str[0]
X_test_copy['Cabin'] = cabin
X_test_copy = pd.get_dummies(X_test_copy)
# A deck or port absent from this split produces no dummy column; align with the
# training matrix so the model sees the same columns in the same order.
X_test_copy = X_test_copy.reindex(columns=X_train_copy.columns, fill_value=0)

In [None]:
# Held-out accuracy of the logistic regression that uses the Cabin deck.
test_accuracy = model.score(X=X_test_copy, y=Y_test)
print("Test Accuracy: ", test_accuracy)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Rebuild the training features (same preprocessing as the logistic regression above).
X_train_copy = X_train.copy()
X_train_copy['Sex'] = X_train_copy['Sex'].replace('male', 0).replace('female', 1)
X_train_copy['Embarked'] = X_train_copy['Embarked'].fillna('S')

# Rows carrying each title. regex=False keeps the trailing '.' literal, so 'Mr.'
# does not also match 'Mrs.' and 'Dr.' does not match surnames such as 'Drew'.
titles = ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']
has_title = {title: X_train_copy['Name'].str.contains(title, regex=False) for title in titles}

# Mean known age per title (2 decimals).
title_means = [round(X_train_copy.loc[has_title[title], 'Age'].mean(), 2) for title in titles]
avg_miss_age, avg_ms_age, avg_mrs_age, avg_mr_age, avg_dr_age, avg_master_age = title_means

# Fill only the missing ages; 'Master.' now gets its own mean instead of the 'Dr.' mean.
age_missing = X_train['Age'].isna()
X_train_age = X_train.copy()
for title, title_mean in zip(titles, title_means):
    X_train_age['Age'] = X_train_age['Age'].mask(has_title[title] & age_missing, title_mean)
X_train_copy['Age'] = X_train_age['Age']

X_train_copy = X_train_copy.drop(['Name', 'Cabin', 'Ticket'], axis=1)
X_train_copy = pd.get_dummies(X_train_copy)

# Cabin -> deck letter. Missing cabins get a deck chosen by class (1 -> C, 2 -> F,
# 3 -> G), any cabin containing 'T' is relabelled 'A', and only the first character is kept.
X_train_cabin = X_train.copy()
X_train_cabin['Cabin'] = X_train_cabin['Cabin'].fillna(
    X_train_cabin['Pclass'].map({1: 'C', 2: 'F', 3: 'G'}))
X_train_cabin['Cabin'] = X_train_cabin['Cabin'].mask(
    X_train_cabin['Cabin'].str.contains('T'), 'A')
cabin = X_train_cabin['Cabin'].str[0]
X_train_copy['Cabin'] = cabin
X_train_copy = pd.get_dummies(X_train_copy)

# Shallow tree (depth 3); report accuracy on the data it was fitted on.
clf = DecisionTreeClassifier(max_depth=3)
clf = clf.fit(X_train_copy, Y_train)
Y_pred = clf.predict(X_train_copy)
print(accuracy_score(Y_train, Y_pred))

In [None]:
# Rebuild the held-out features and score the decision tree on them.
X_test_copy = X_test.copy()
X_test_copy['Sex'] = X_test_copy['Sex'].replace('male', 0).replace('female', 1)
X_test_copy['Embarked'] = X_test_copy['Embarked'].fillna('S')

# Rows carrying each title. regex=False keeps the trailing '.' literal, so 'Mr.'
# does not also match 'Mrs.' and 'Dr.' does not match surnames such as 'Drew'.
titles = ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']
has_title = {title: X_test_copy['Name'].str.contains(title, regex=False) for title in titles}

# Mean known age per title (2 decimals), computed on this split.
title_means = [round(X_test_copy.loc[has_title[title], 'Age'].mean(), 2) for title in titles]
avg_miss_age, avg_ms_age, avg_mrs_age, avg_mr_age, avg_dr_age, avg_master_age = title_means

# Fill only the missing ages; 'Master.' now gets its own mean instead of the 'Dr.' mean.
age_missing = X_test['Age'].isna()
X_test_age = X_test.copy()
for title, title_mean in zip(titles, title_means):
    X_test_age['Age'] = X_test_age['Age'].mask(has_title[title] & age_missing, title_mean)
X_test_copy['Age'] = X_test_age['Age']

X_test_copy = X_test_copy.drop(['Name', 'Cabin', 'Ticket'], axis=1)
X_test_copy = pd.get_dummies(X_test_copy)

# Cabin -> deck letter. Missing cabins get a deck chosen by class (1 -> C, 2 -> F,
# 3 -> G), any cabin containing 'T' is relabelled 'A', and only the first character is kept.
X_test_cabin = X_test.copy()
X_test_cabin['Cabin'] = X_test_cabin['Cabin'].fillna(
    X_test_cabin['Pclass'].map({1: 'C', 2: 'F', 3: 'G'}))
X_test_cabin['Cabin'] = X_test_cabin['Cabin'].mask(
    X_test_cabin['Cabin'].str.contains('T'), 'A')
cabin = X_test_cabin['Cabin'].str[0]
X_test_copy['Cabin'] = cabin
X_test_copy = pd.get_dummies(X_test_copy)
# Align with the training matrix: a category absent from this split has no dummy column.
X_test_copy = X_test_copy.reindex(columns=X_train_copy.columns, fill_value=0)

Y_pred = clf.predict(X_test_copy)
print(accuracy_score(Y_test, Y_pred))

## Adding `Ticket` - Decision Tree

In [None]:
# Experiment: reduce each ticket to its alphabetic prefix and keep only common prefixes.
# Column-wise string methods replace the per-row chained writes
# (`frame['Ticket'][index] = ...`), which pandas does not guarantee will reach the
# frame, and the digit pattern is now a raw string.
X_train_try2 = X_train.copy()
most_common = ['PC', 'CA', 'STON/O', 'A/', 'W/C', 'SC/PARIS']

# Strip digits first, then apply the literal clean-ups and spelling merges in order.
ticket_prefix = X_train_try2['Ticket'].str.replace(r'\d', '', regex=True)
for old, new in [('.', ''), (' ', ''), ('Paris', 'PARIS'), ('SOTON/O', 'STON/O'),
                 ('STON/OQ', 'STON/O'), ('CA/SOTON', 'STON/O')]:
    ticket_prefix = ticket_prefix.str.replace(old, new, regex=False)

# Any prefix outside the common list is blanked.
X_train_try2['Ticket'] = ticket_prefix.where(ticket_prefix.isin(most_common), '')

In [None]:
# Training features with Cabin deck and Ticket prefix, fitted with the decision tree.
X_train_copy = X_train.copy()
X_train_copy['Sex'] = X_train_copy['Sex'].replace('male', 0).replace('female', 1)
X_train_copy['Embarked'] = X_train_copy['Embarked'].fillna('S')

# Rows carrying each title. regex=False keeps the trailing '.' literal, so 'Mr.'
# does not also match 'Mrs.' and 'Dr.' does not match surnames such as 'Drew'.
titles = ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']
has_title = {title: X_train_copy['Name'].str.contains(title, regex=False) for title in titles}

# Mean known age per title (2 decimals).
title_means = [round(X_train_copy.loc[has_title[title], 'Age'].mean(), 2) for title in titles]
avg_miss_age, avg_ms_age, avg_mrs_age, avg_mr_age, avg_dr_age, avg_master_age = title_means

# Fill only the missing ages; 'Master.' now gets its own mean instead of the 'Dr.' mean.
age_missing = X_train['Age'].isna()
X_train_age = X_train.copy()
for title, title_mean in zip(titles, title_means):
    X_train_age['Age'] = X_train_age['Age'].mask(has_title[title] & age_missing, title_mean)
X_train_copy['Age'] = X_train_age['Age']

X_train_copy = X_train_copy.drop(['Name'], axis=1)

# Cabin -> deck letter. Missing cabins get a deck chosen by class (1 -> C, 2 -> F,
# 3 -> G), any cabin containing 'T' is relabelled 'A', and only the first character is kept.
X_train_cabin = X_train.copy()
X_train_cabin['Cabin'] = X_train_cabin['Cabin'].fillna(
    X_train_cabin['Pclass'].map({1: 'C', 2: 'F', 3: 'G'}))
X_train_cabin['Cabin'] = X_train_cabin['Cabin'].mask(
    X_train_cabin['Cabin'].str.contains('T'), 'A')
cabin = X_train_cabin['Cabin'].str[0]
X_train_copy['Cabin'] = cabin

# Ticket -> alphabetic prefix, spelling variants merged, uncommon prefixes blanked.
# Column-wise string methods replace the chained `frame['Ticket'][index] = ...`
# writes, which pandas does not guarantee will reach the frame.
most_common = ['PC', 'CA', 'STON/O', 'A/', 'SC/PARIS']
ticket_prefix = X_train_copy['Ticket'].str.replace(r'\d', '', regex=True)
for old, new in [('.', ''), (' ', ''), ('Paris', 'PARIS'), ('SOTON/O', 'STON/O'),
                 ('STON/OQ', 'STON/O'), ('CA/SOTON', 'STON/O')]:
    ticket_prefix = ticket_prefix.str.replace(old, new, regex=False)
X_train_copy['Ticket'] = ticket_prefix.where(ticket_prefix.isin(most_common), '')

X_train_copy = pd.get_dummies(X_train_copy)

# Refit the classifier currently bound to `clf` on the extended feature set.
clf = clf.fit(X_train_copy, Y_train)
Y_pred = clf.predict(X_train_copy)
print(accuracy_score(Y_train, Y_pred))

In [None]:
# Held-out features with Cabin deck and Ticket prefix, scored with the decision tree.
X_test_copy = X_test.copy()
X_test_copy['Sex'] = X_test_copy['Sex'].replace('male', 0).replace('female', 1)
X_test_copy['Embarked'] = X_test_copy['Embarked'].fillna('S')

# Rows carrying each title. regex=False keeps the trailing '.' literal, so 'Mr.'
# does not also match 'Mrs.' and 'Dr.' does not match surnames such as 'Drew'.
titles = ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']
has_title = {title: X_test_copy['Name'].str.contains(title, regex=False) for title in titles}

# Mean known age per title (2 decimals), computed on this split.
title_means = [round(X_test_copy.loc[has_title[title], 'Age'].mean(), 2) for title in titles]
avg_miss_age, avg_ms_age, avg_mrs_age, avg_mr_age, avg_dr_age, avg_master_age = title_means

# Fill only the missing ages; 'Master.' now gets its own mean instead of the 'Dr.' mean.
age_missing = X_test['Age'].isna()
X_test_age = X_test.copy()
for title, title_mean in zip(titles, title_means):
    X_test_age['Age'] = X_test_age['Age'].mask(has_title[title] & age_missing, title_mean)
X_test_copy['Age'] = X_test_age['Age']

X_test_copy = X_test_copy.drop(['Name'], axis=1)

# Cabin -> deck letter. Missing cabins get a deck chosen by class (1 -> C, 2 -> F,
# 3 -> G), any cabin containing 'T' is relabelled 'A', and only the first character is kept.
X_test_cabin = X_test.copy()
X_test_cabin['Cabin'] = X_test_cabin['Cabin'].fillna(
    X_test_cabin['Pclass'].map({1: 'C', 2: 'F', 3: 'G'}))
X_test_cabin['Cabin'] = X_test_cabin['Cabin'].mask(
    X_test_cabin['Cabin'].str.contains('T'), 'A')
cabin = X_test_cabin['Cabin'].str[0]
X_test_copy['Cabin'] = cabin

# Ticket -> alphabetic prefix, spelling variants merged, uncommon prefixes blanked.
# Column-wise string methods replace the chained `frame['Ticket'][index] = ...`
# writes, which pandas does not guarantee will reach the frame.
most_common = ['PC', 'CA', 'STON/O', 'A/', 'SC/PARIS']
ticket_prefix = X_test_copy['Ticket'].str.replace(r'\d', '', regex=True)
for old, new in [('.', ''), (' ', ''), ('Paris', 'PARIS'), ('SOTON/O', 'STON/O'),
                 ('STON/OQ', 'STON/O'), ('CA/SOTON', 'STON/O')]:
    ticket_prefix = ticket_prefix.str.replace(old, new, regex=False)
X_test_copy['Ticket'] = ticket_prefix.where(ticket_prefix.isin(most_common), '')

X_test_copy = pd.get_dummies(X_test_copy)
# Align with the training matrix: a category absent from this split has no dummy column.
X_test_copy = X_test_copy.reindex(columns=X_train_copy.columns, fill_value=0)

Y_pred = clf.predict(X_test_copy)
print(accuracy_score(Y_test, Y_pred))

## Neural Network (submission #2)

In [None]:
# Three hidden layers of 20 tanh units, trained with L-BFGS for at most 300 iterations.
mlp_params = dict(
    hidden_layer_sizes=(20, 20, 20),
    activation='tanh',
    solver='lbfgs',
    max_iter=300,
    random_state=42,
    verbose=1,
)
clf = MLPClassifier(**mlp_params)
clf.fit(X_train_copy, Y_train)

# Accuracy on the data the network was fitted on.
print('Accuracy on training---')
y_pred_train = clf.predict(X_train_copy)
print(accuracy_score(y_true=Y_train, y_pred=y_pred_train))

# Accuracy on the held-out split.
print('Accuracy on testing---')
y_pred_test = clf.predict(X_test_copy)
print(accuracy_score(y_true=Y_test, y_pred=y_pred_test))

___

# Submission 3

## Neural Network

The only difference between this neural network and the previous one is that `max_iter` is set to 600 instead of 300.

In [None]:
# Same architecture as the previous network, with the iteration cap raised to 600.
mlp_params = dict(
    hidden_layer_sizes=(20, 20, 20),
    activation='tanh',
    solver='lbfgs',
    max_iter=600,
    random_state=42,
    verbose=1,
)
clf = MLPClassifier(**mlp_params)
clf.fit(X_train_copy, Y_train)

# Accuracy on the data the network was fitted on.
print('Accuracy on training---')
y_pred_train = clf.predict(X_train_copy)
print(accuracy_score(y_true=Y_train, y_pred=y_pred_train))

# Accuracy on the held-out split.
print('Accuracy on testing---')
y_pred_test = clf.predict(X_test_copy)
print(accuracy_score(y_true=Y_test, y_pred=y_pred_test))

In [None]:
# Apply the Cabin-deck and Ticket-prefix preprocessing to test.csv for the neural network.
df_test_copy = df_test.copy()
df_test_copy['Sex'] = df_test_copy['Sex'].replace('male', 0).replace('female', 1)
df_test_copy['Embarked'] = df_test_copy['Embarked'].fillna('S')

# Rows carrying each title. regex=False keeps the trailing '.' literal, so 'Mr.'
# does not also match 'Mrs.' and 'Dr.' does not match surnames such as 'Drew'.
titles = ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']
has_title = {title: df_test_copy['Name'].str.contains(title, regex=False) for title in titles}

# Mean known age per title (2 decimals), computed on test.csv itself.
title_means = [round(df_test_copy.loc[has_title[title], 'Age'].mean(), 2) for title in titles]
avg_miss_age, avg_ms_age, avg_mrs_age, avg_mr_age, avg_dr_age, avg_master_age = title_means

# Fill only the missing ages; 'Master.' now gets its own mean instead of the 'Dr.' mean.
age_missing = df_test['Age'].isna()
df_test_age = df_test.copy()
for title, title_mean in zip(titles, title_means):
    df_test_age['Age'] = df_test_age['Age'].mask(has_title[title] & age_missing, title_mean)
df_test_copy['Age'] = df_test_age['Age']

df_test_copy = df_test_copy.drop(['Name'], axis=1)

# Cabin -> deck letter. Missing cabins get a deck chosen by class (1 -> C, 2 -> F,
# 3 -> G), any cabin containing 'T' is relabelled 'A', and only the first character is kept.
df_test_cabin = df_test.copy()
df_test_cabin['Cabin'] = df_test_cabin['Cabin'].fillna(
    df_test_cabin['Pclass'].map({1: 'C', 2: 'F', 3: 'G'}))
df_test_cabin['Cabin'] = df_test_cabin['Cabin'].mask(
    df_test_cabin['Cabin'].str.contains('T'), 'A')
cabin = df_test_cabin['Cabin'].str[0]
df_test_copy['Cabin'] = cabin

# Ticket -> alphabetic prefix, spelling variants merged, uncommon prefixes blanked.
# Column-wise string methods replace the chained `frame['Ticket'][index] = ...`
# writes, which pandas does not guarantee will reach the frame.
most_common = ['PC', 'CA', 'STON/O', 'A/', 'SC/PARIS']
ticket_prefix = df_test_copy['Ticket'].str.replace(r'\d', '', regex=True)
for old, new in [('.', ''), (' ', ''), ('Paris', 'PARIS'), ('SOTON/O', 'STON/O'),
                 ('STON/OQ', 'STON/O'), ('CA/SOTON', 'STON/O')]:
    ticket_prefix = ticket_prefix.str.replace(old, new, regex=False)
df_test_copy['Ticket'] = ticket_prefix.where(ticket_prefix.isin(most_common), '')

# Any value still missing after imputation is set to 0.
df_test_copy = pd.get_dummies(df_test_copy).fillna(0)
# Align with the matrix the network was fitted on (same columns, same order).
df_test_copy = df_test_copy.reindex(columns=X_train_copy.columns, fill_value=0)

In [None]:
# Neural Network
# Predict survival for every passenger in test.csv with the classifier bound to `clf`.
test_predictions = clf.predict(df_test_copy)
test_predictions

In [None]:
# Build the submission table (one prediction per PassengerId) and show the class balance.
submission = pd.DataFrame(
    {'Survived': clf.predict(df_test_copy)},
    index=df_test.PassengerId,
)
submission['Survived'].value_counts()

In [None]:
# submission.reset_index().to_csv('submission.csv', index=False)

## Summary

### Version 1
_submission 1_
* Training Accuracy:  0.8038922155688623
* Test Accuracy:  0.7937219730941704

### Version 2 (logistic regression)
* Train Accuracy:  0.8053892215568862
* Test Accuracy:  0.8116591928251121

### Version 2 (decision tree)
* Train Accuracy: 0.8293413173652695
* Test Accuracy: 0.8026905829596412

### Version 2 (neural network w/ max_iter=300)
_submission 2_
* Train Accuracy: 0.8188622754491018
* Train Accuracy: 0.820627802690583

### Version 3 (neural network w/ max_iter=600)
_submission 3_
* Train Accuracy: 0.8338323353293413
* Test Accuracy: 0.8385650224215246

___

# Submission 4

### Log of `Fare`, log of `PassengerId`

Initially tried log of `Age` too, but it lowered the accuracy. Initially thought `PassengerId` would not change anything, but logging it increased the accuracy.

In [None]:
# Submission 4 training features: as before, plus log-transformed Fare and PassengerId.
X_train_copy = X_train.copy()
X_train_copy['Sex'] = X_train_copy['Sex'].replace('male', 0).replace('female', 1)
X_train_copy['Embarked'] = X_train_copy['Embarked'].fillna('S')

# Rows carrying each title. regex=False keeps the trailing '.' literal, so 'Mr.'
# does not also match 'Mrs.' and 'Dr.' does not match surnames such as 'Drew'.
titles = ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']
has_title = {title: X_train_copy['Name'].str.contains(title, regex=False) for title in titles}

# Mean known age per title (2 decimals).
title_means = [round(X_train_copy.loc[has_title[title], 'Age'].mean(), 2) for title in titles]
avg_miss_age, avg_ms_age, avg_mrs_age, avg_mr_age, avg_dr_age, avg_master_age = title_means

# Fill only the missing ages; 'Master.' now gets its own mean instead of the 'Dr.' mean.
age_missing = X_train['Age'].isna()
X_train_age = X_train.copy()
for title, title_mean in zip(titles, title_means):
    X_train_age['Age'] = X_train_age['Age'].mask(has_title[title] & age_missing, title_mean)
X_train_copy['Age'] = X_train_age['Age']

X_train_copy = X_train_copy.drop(['Name'], axis=1)

# Cabin -> deck letter. Missing cabins get a deck chosen by class (1 -> C, 2 -> F,
# 3 -> G), any cabin containing 'T' is relabelled 'A', and only the first character is kept.
X_train_cabin = X_train.copy()
X_train_cabin['Cabin'] = X_train_cabin['Cabin'].fillna(
    X_train_cabin['Pclass'].map({1: 'C', 2: 'F', 3: 'G'}))
X_train_cabin['Cabin'] = X_train_cabin['Cabin'].mask(
    X_train_cabin['Cabin'].str.contains('T'), 'A')
cabin = X_train_cabin['Cabin'].str[0]
X_train_copy['Cabin'] = cabin

# Ticket -> alphabetic prefix, spelling variants merged, uncommon prefixes blanked.
# Column-wise string methods replace the chained `frame['Ticket'][index] = ...`
# writes, which pandas does not guarantee will reach the frame.
most_common = ['PC', 'CA', 'STON/O', 'A/', 'SC/PARIS']
ticket_prefix = X_train_copy['Ticket'].str.replace(r'\d', '', regex=True)
for old, new in [('.', ''), (' ', ''), ('Paris', 'PARIS'), ('SOTON/O', 'STON/O'),
                 ('STON/OQ', 'STON/O'), ('CA/SOTON', 'STON/O')]:
    ticket_prefix = ticket_prefix.str.replace(old, new, regex=False)
X_train_copy['Ticket'] = ticket_prefix.where(ticket_prefix.isin(most_common), '')

X_train_copy = pd.get_dummies(X_train_copy).fillna(0)

# New
# Natural log of Fare and PassengerId; log(0) = -inf is mapped back to 0.
X_train_copy['Fare'] = np.log(X_train_copy['Fare']).replace(-np.inf,0)
X_train_copy['PassengerId'] = np.log(X_train_copy['PassengerId']).replace(-np.inf,0)


clf = MLPClassifier(hidden_layer_sizes=(20,20,20), max_iter=600, solver='lbfgs', 
                    verbose=1, random_state=42, activation='tanh')
                                                                
clf.fit(X_train_copy, Y_train)

print('Accuracy on training---')
y_pred_train = clf.predict(X_train_copy)
print(accuracy_score(Y_train, y_pred_train))

In [None]:
# Submission 4 held-out features: same preprocessing as training, then score the network.
X_test_copy = X_test.copy()
X_test_copy['Sex'] = X_test_copy['Sex'].replace('male', 0).replace('female', 1)
X_test_copy['Embarked'] = X_test_copy['Embarked'].fillna('S')

# Rows carrying each title. regex=False keeps the trailing '.' literal, so 'Mr.'
# does not also match 'Mrs.' and 'Dr.' does not match surnames such as 'Drew'.
titles = ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']
has_title = {title: X_test_copy['Name'].str.contains(title, regex=False) for title in titles}

# Mean known age per title (2 decimals), computed on this split.
title_means = [round(X_test_copy.loc[has_title[title], 'Age'].mean(), 2) for title in titles]
avg_miss_age, avg_ms_age, avg_mrs_age, avg_mr_age, avg_dr_age, avg_master_age = title_means

# Fill only the missing ages; 'Master.' now gets its own mean instead of the 'Dr.' mean.
age_missing = X_test['Age'].isna()
X_test_age = X_test.copy()
for title, title_mean in zip(titles, title_means):
    X_test_age['Age'] = X_test_age['Age'].mask(has_title[title] & age_missing, title_mean)
X_test_copy['Age'] = X_test_age['Age']

X_test_copy = X_test_copy.drop(['Name'], axis=1)

# Cabin -> deck letter. Missing cabins get a deck chosen by class (1 -> C, 2 -> F,
# 3 -> G), any cabin containing 'T' is relabelled 'A', and only the first character is kept.
X_test_cabin = X_test.copy()
X_test_cabin['Cabin'] = X_test_cabin['Cabin'].fillna(
    X_test_cabin['Pclass'].map({1: 'C', 2: 'F', 3: 'G'}))
X_test_cabin['Cabin'] = X_test_cabin['Cabin'].mask(
    X_test_cabin['Cabin'].str.contains('T'), 'A')
cabin = X_test_cabin['Cabin'].str[0]
X_test_copy['Cabin'] = cabin

# Ticket -> alphabetic prefix, spelling variants merged, uncommon prefixes blanked.
# Column-wise string methods replace the chained `frame['Ticket'][index] = ...`
# writes, which pandas does not guarantee will reach the frame.
most_common = ['PC', 'CA', 'STON/O', 'A/', 'SC/PARIS']
ticket_prefix = X_test_copy['Ticket'].str.replace(r'\d', '', regex=True)
for old, new in [('.', ''), (' ', ''), ('Paris', 'PARIS'), ('SOTON/O', 'STON/O'),
                 ('STON/OQ', 'STON/O'), ('CA/SOTON', 'STON/O')]:
    ticket_prefix = ticket_prefix.str.replace(old, new, regex=False)
X_test_copy['Ticket'] = ticket_prefix.where(ticket_prefix.isin(most_common), '')

X_test_copy = pd.get_dummies(X_test_copy).fillna(0)
# Align with the training matrix: a category absent from this split has no dummy column.
X_test_copy = X_test_copy.reindex(columns=X_train_copy.columns, fill_value=0)

# New
# Natural log of Fare and PassengerId; log(0) = -inf is mapped back to 0.
X_test_copy['Fare'] = np.log(X_test_copy['Fare']).replace(-np.inf,0)
X_test_copy['PassengerId'] = np.log(X_test_copy['PassengerId']).replace(-np.inf,0)

print('Accuracy on testing---')
y_pred_test = clf.predict(X_test_copy)
print(accuracy_score(Y_test, y_pred_test))

In [None]:
# Submission 4, competition test set: rebuild the training features from df_test.
df_test_copy = df_test.copy()
df_test_copy['Sex'] = df_test_copy['Sex'].replace('male', 0).replace('female', 1)
df_test_copy['Embarked'] = df_test_copy['Embarked'].fillna('S')

# Fill missing ages with the mean age of passengers carrying the same title.
# regex=False matches the title literally: as a regex the '.' is a wildcard, so
# 'Mr.' also matched 'Mrs.' and overwrote those rows with the 'Mr.' mean.
# 'Master.' now receives its own mean (it previously received the 'Dr.' mean).
age_missing = df_test['Age'].isna()
for title in ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']:
    has_title = df_test['Name'].str.contains(title, regex=False)
    title_mean_age = round(df_test.loc[has_title, 'Age'].mean(), 2)
    df_test_copy.loc[has_title & age_missing, 'Age'] = title_mean_age

df_test_copy = df_test_copy.drop(['Name'], axis=1)

# Cabin deck letter: missing cabins default by class (1 -> C, 2 -> F, 3 -> G),
# any cabin containing 'T' is folded into 'A', then only the first letter is kept.
deck_default = df_test['Pclass'].map({1: 'C', 2: 'F', 3: 'G'})
cabin = df_test['Cabin'].fillna(deck_default)
cabin = cabin.mask(cabin.str.contains('T', na=False), 'A')
df_test_copy['Cabin'] = cabin.str[0]

# Ticket prefix: strip digits, dots and spaces, unify spelling variants, and keep
# only the most common prefixes. Done column-wise instead of the former
# `df['Ticket'][index] = ...` loop, which is chained assignment and does not
# write back under pandas copy-on-write.
most_common = ['PC', 'CA', 'STON/O', 'A/', 'SC/PARIS']
ticket_prefix = (
    df_test_copy['Ticket']
    .str.replace(r'[\d. ]', '', regex=True)
    .str.replace('Paris', 'PARIS', regex=False)
    .str.replace('SOTON/O', 'STON/O', regex=False)
    .str.replace('STON/OQ', 'STON/O', regex=False)
    .str.replace('CA/SOTON', 'STON/O', regex=False)
)
df_test_copy['Ticket'] = ticket_prefix.where(ticket_prefix.isin(most_common), '')

# fillna(0) runs before the log so a missing Fare becomes 0 and then maps to 0 below.
df_test_copy = pd.get_dummies(df_test_copy).fillna(0)

df_test_copy['Fare'] = np.log(df_test_copy['Fare']).replace(-np.inf, 0)
df_test_copy['PassengerId'] = np.log(df_test_copy['PassengerId']).replace(-np.inf, 0)

# Dummy columns are derived per data set, so align them (names and order) with
# the frame `clf` was fitted on; categories unseen here become all-zero columns.
# This replaces a second, redundant get_dummies call.
df_test_copy = df_test_copy.fillna(0).reindex(columns=X_train_copy.columns, fill_value=0)

In [None]:
# Predict labels for the prepared competition test frame with the most recently
# fitted `clf`, then display the predicted array.
test_predictions = clf.predict(df_test_copy)
test_predictions

In [None]:
# Assemble the submission table keyed by PassengerId and show the class balance
# of the predicted labels.
submission = pd.DataFrame(
    {'Survived': clf.predict(df_test_copy)},
    index=df_test['PassengerId'],
)
submission['Survived'].value_counts()

In [None]:
# submission.reset_index().to_csv('submission_4.csv', index=False)

### Log-transforming `Fare` and `PassengerId` made this submission score lower than all other submissions

______

# Submission 5

### Normalizing all numeric columns instead of using log

In [None]:
# Submission 5, training split: same feature engineering as before, but numeric
# columns are scaled by their maximum instead of being log-transformed.
X_train_copy = X_train.copy()
X_train_copy['Sex'] = X_train_copy['Sex'].replace('male', 0).replace('female', 1)
X_train_copy['Embarked'] = X_train_copy['Embarked'].fillna('S')

# Fill missing ages with the mean age of passengers carrying the same title.
# regex=False matches the title literally: as a regex the '.' is a wildcard, so
# 'Mr.' also matched 'Mrs.' and overwrote those rows with the 'Mr.' mean.
# 'Master.' now receives its own mean (it previously received the 'Dr.' mean).
age_missing = X_train['Age'].isna()
for title in ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']:
    has_title = X_train['Name'].str.contains(title, regex=False)
    title_mean_age = round(X_train.loc[has_title, 'Age'].mean(), 2)
    X_train_copy.loc[has_title & age_missing, 'Age'] = title_mean_age

X_train_copy = X_train_copy.drop(['Name'], axis=1)

# Cabin deck letter: missing cabins default by class (1 -> C, 2 -> F, 3 -> G),
# any cabin containing 'T' is folded into 'A', then only the first letter is kept.
deck_default = X_train['Pclass'].map({1: 'C', 2: 'F', 3: 'G'})
cabin = X_train['Cabin'].fillna(deck_default)
cabin = cabin.mask(cabin.str.contains('T', na=False), 'A')
X_train_copy['Cabin'] = cabin.str[0]

# Ticket prefix: strip digits, dots and spaces, unify spelling variants, and keep
# only the most common prefixes. Done column-wise instead of the former
# `df['Ticket'][index] = ...` loop, which is chained assignment and does not
# write back under pandas copy-on-write.
most_common = ['PC', 'CA', 'STON/O', 'A/', 'SC/PARIS']
ticket_prefix = (
    X_train_copy['Ticket']
    .str.replace(r'[\d. ]', '', regex=True)
    .str.replace('Paris', 'PARIS', regex=False)
    .str.replace('SOTON/O', 'STON/O', regex=False)
    .str.replace('STON/OQ', 'STON/O', regex=False)
    .str.replace('CA/SOTON', 'STON/O', regex=False)
)
X_train_copy['Ticket'] = ticket_prefix.where(ticket_prefix.isin(most_common), '')

X_train_copy = pd.get_dummies(X_train_copy).fillna(0)

# New
# Scale each numeric column by its maximum in the raw split.
# NOTE(review): X_norm is the raw X_train, so the 'Age' assignment replaces the
# title-imputed ages above with raw ages (missing ages become 0 after fillna).
# Confirm whether that is intended for this submission.
X_norm = X_train.copy()
for col in ['Fare', 'Age', 'PassengerId', 'Pclass', 'SibSp']:
    X_train_copy[col] = X_norm[col] / X_norm[col].max()

X_train_copy = X_train_copy.fillna(0)

clf = MLPClassifier(hidden_layer_sizes=(20,20,20), max_iter=600, solver='lbfgs',
                    verbose=1, random_state=42, activation='relu') # changed to relu

clf.fit(X_train_copy, Y_train)

print('Accuracy on training---')
y_pred_train = clf.predict(X_train_copy)
print(accuracy_score(Y_train, y_pred_train))

In [None]:
# Submission 5, validation split: rebuild the training features from X_test and
# score the fitted `clf` against Y_test.
X_test_copy = X_test.copy()
X_test_copy['Sex'] = X_test_copy['Sex'].replace('male', 0).replace('female', 1)
X_test_copy['Embarked'] = X_test_copy['Embarked'].fillna('S')

# Fill missing ages with the mean age of passengers carrying the same title.
# regex=False matches the title literally: as a regex the '.' is a wildcard, so
# 'Mr.' also matched 'Mrs.' and overwrote those rows with the 'Mr.' mean.
# 'Master.' now receives its own mean (it previously received the 'Dr.' mean).
age_missing = X_test['Age'].isna()
for title in ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']:
    has_title = X_test['Name'].str.contains(title, regex=False)
    title_mean_age = round(X_test.loc[has_title, 'Age'].mean(), 2)
    X_test_copy.loc[has_title & age_missing, 'Age'] = title_mean_age

X_test_copy = X_test_copy.drop(['Name'], axis=1)

# Cabin deck letter: missing cabins default by class (1 -> C, 2 -> F, 3 -> G),
# any cabin containing 'T' is folded into 'A', then only the first letter is kept.
deck_default = X_test['Pclass'].map({1: 'C', 2: 'F', 3: 'G'})
cabin = X_test['Cabin'].fillna(deck_default)
cabin = cabin.mask(cabin.str.contains('T', na=False), 'A')
X_test_copy['Cabin'] = cabin.str[0]

# Ticket prefix: strip digits, dots and spaces, unify spelling variants, and keep
# only the most common prefixes. Done column-wise instead of the former
# `df['Ticket'][index] = ...` loop, which is chained assignment and does not
# write back under pandas copy-on-write.
most_common = ['PC', 'CA', 'STON/O', 'A/', 'SC/PARIS']
ticket_prefix = (
    X_test_copy['Ticket']
    .str.replace(r'[\d. ]', '', regex=True)
    .str.replace('Paris', 'PARIS', regex=False)
    .str.replace('SOTON/O', 'STON/O', regex=False)
    .str.replace('STON/OQ', 'STON/O', regex=False)
    .str.replace('CA/SOTON', 'STON/O', regex=False)
)
X_test_copy['Ticket'] = ticket_prefix.where(ticket_prefix.isin(most_common), '')

X_test_copy = pd.get_dummies(X_test_copy).fillna(0)

# New
# Scale each numeric column by its maximum in the raw split.
# NOTE(review): the maxima come from this split rather than the training split,
# and 'Age' is taken from the raw frame, replacing the imputed ages above.
X_norm = X_test.copy()
for col in ['Fare', 'Age', 'PassengerId', 'Pclass', 'SibSp']:
    X_test_copy[col] = X_norm[col] / X_norm[col].max()

X_test_copy = X_test_copy.fillna(0)

# Dummy columns are derived per split, so align them (names and order) with the
# frame `clf` was fitted on; categories unseen in this split become all-zero.
X_test_copy = X_test_copy.reindex(columns=X_train_copy.columns, fill_value=0)

print('Accuracy on testing---')
y_pred_test = clf.predict(X_test_copy)
print(accuracy_score(Y_test, y_pred_test))

In [None]:
# Submission 5, competition test set: rebuild the training features from df_test.
df_test_copy = df_test.copy()
df_test_copy['Sex'] = df_test_copy['Sex'].replace('male', 0).replace('female', 1)
df_test_copy['Embarked'] = df_test_copy['Embarked'].fillna('S')

# Fill missing ages with the mean age of passengers carrying the same title.
# regex=False matches the title literally: as a regex the '.' is a wildcard, so
# 'Mr.' also matched 'Mrs.' and overwrote those rows with the 'Mr.' mean.
# 'Master.' now receives its own mean (it previously received the 'Dr.' mean).
age_missing = df_test['Age'].isna()
for title in ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']:
    has_title = df_test['Name'].str.contains(title, regex=False)
    title_mean_age = round(df_test.loc[has_title, 'Age'].mean(), 2)
    df_test_copy.loc[has_title & age_missing, 'Age'] = title_mean_age

df_test_copy = df_test_copy.drop(['Name'], axis=1)

# Cabin deck letter: missing cabins default by class (1 -> C, 2 -> F, 3 -> G),
# any cabin containing 'T' is folded into 'A', then only the first letter is kept.
deck_default = df_test['Pclass'].map({1: 'C', 2: 'F', 3: 'G'})
cabin = df_test['Cabin'].fillna(deck_default)
cabin = cabin.mask(cabin.str.contains('T', na=False), 'A')
df_test_copy['Cabin'] = cabin.str[0]

# Ticket prefix: strip digits, dots and spaces, unify spelling variants, and keep
# only the most common prefixes. Done column-wise instead of the former
# `df['Ticket'][index] = ...` loop, which is chained assignment and does not
# write back under pandas copy-on-write.
most_common = ['PC', 'CA', 'STON/O', 'A/', 'SC/PARIS']
ticket_prefix = (
    df_test_copy['Ticket']
    .str.replace(r'[\d. ]', '', regex=True)
    .str.replace('Paris', 'PARIS', regex=False)
    .str.replace('SOTON/O', 'STON/O', regex=False)
    .str.replace('STON/OQ', 'STON/O', regex=False)
    .str.replace('CA/SOTON', 'STON/O', regex=False)
)
df_test_copy['Ticket'] = ticket_prefix.where(ticket_prefix.isin(most_common), '')

df_test_copy = pd.get_dummies(df_test_copy).fillna(0)

# Scale each numeric column by its maximum in the raw test frame. The original
# read these columns from `X_norm` (the validation split), which assigned
# index-aligned validation rows to the test passengers.
# NOTE(review): the maxima come from the test data rather than the training
# split, and 'Age' is taken from the raw frame, replacing the imputed ages above.
df_norm = df_test.copy()
for col in ['Fare', 'Age', 'PassengerId', 'Pclass', 'SibSp']:
    df_test_copy[col] = df_norm[col] / df_norm[col].max()

# Dummy columns are derived per data set, so align them (names and order) with
# the frame `clf` was fitted on; categories unseen here become all-zero columns.
# This replaces a second, redundant get_dummies call.
df_test_copy = df_test_copy.fillna(0).reindex(columns=X_train_copy.columns, fill_value=0)

In [None]:
# Predict labels for the prepared competition test frame with the most recently
# fitted `clf`, then display the predicted array.
test_predictions = clf.predict(df_test_copy)
test_predictions

In [None]:
# Assemble the submission table keyed by PassengerId and show the class balance
# of the predicted labels.
submission = pd.DataFrame(
    {'Survived': clf.predict(df_test_copy)},
    index=df_test['PassengerId'],
)
submission['Survived'].value_counts()

In [None]:
# submission.reset_index().to_csv('submission_5_10.csv', index=False)

### This is the highest-scoring submission so far, with a score of 0.77511.

______

# Submission 6

## Forming `Group Size` and `Solo Traveler` columns

`SibSp` and `Parch` are combined to form `Group Size`. For passengers whose `Group Size` is 1 (traveling by themselves) but who share an identical `Ticket` number with other passengers, `Group Size` is replaced with the number of passengers on that ticket.

In [None]:
# Exploration frame: training features plus the label and two derived columns.
X_groups = X_train.assign(Survived=Y_train)
# Siblings/spouses + parents/children + the passenger themselves.
party_size = X_groups['SibSp'] + X_groups['Parch'] + 1
X_groups['Group Size'] = party_size
# 1 when the passenger has no recorded relatives aboard, else 0.
X_groups['Solo Traveler'] = (party_size == 1) * 1

In [None]:
# Number of rows per ticket string in the full training data; a count above 1
# means several passengers share that ticket, which is taken here to indicate
# people traveling together.
df_train.groupby('Ticket').size()

In [None]:
# Count the tickets shared by more than one passenger flagged as a solo traveler,
# i.e. people marked solo who actually share a ticket number with someone else.
# (13 in the author's run; the figure depends on the train/validation split.)
solo_ticket_sizes = X_groups.loc[X_groups['Solo Traveler'] == 1].groupby('Ticket').size()
(solo_ticket_sizes > 1).sum()

In [None]:
# Per-ticket head count among solo travelers, kept as a one-column frame.
solo_rows = X_groups.loc[X_groups['Solo Traveler'] == 1]
tics = solo_rows.groupby('Ticket').size().to_frame()
# Tickets held by more than one "solo" passenger, and how many hold each.
shared_tics = tics.loc[tics.iloc[:, 0] > 1]
identical_tics = shared_tics.index
tic_counts = shared_tics.values
shared_tics.T

In [None]:
# Overwrite rows on each shared ticket with that ticket's solo-traveler count;
# only the resulting 'Group Size' column is copied back into X_groups.
X_groups2 = X_groups.copy()
for tic, count_row in zip(identical_tics, tic_counts):
    on_ticket = X_groups['Ticket'] == tic
    X_groups2 = X_groups2.mask(on_ticket, count_row.item())
X_groups['Group Size'] = X_groups2['Group Size']

# Tickets still shared by more than one passenger with a group size of 1.
remaining_solo = X_groups.loc[X_groups['Group Size'] == 1]
(remaining_solo.groupby('Ticket').size() > 1).sum()

In [None]:
# Recompute the solo flag from the adjusted group sizes, then count rows that are
# flagged solo despite a group size above 1.
is_solo = X_groups['Group Size'] == 1
X_groups['Solo Traveler'] = is_solo * 1
in_group = X_groups.loc[X_groups['Group Size'] > 1]
(in_group['Solo Traveler'] == 1).sum()

In [None]:
# Survival rate per group size. 'Survived' is selected before aggregating:
# X_groups still holds string columns (Name, Ticket, ...), and averaging the whole
# frame raises TypeError on pandas >= 2.0, where numeric_only defaults to False.
X_groups.groupby('Group Size')['Survived'].mean()

###  Train & test

In [None]:
# Submission 6, training split: Submission 5 features plus 'Group Size', with
# normalization applied to the engineered frame (so imputed ages are kept).
X_train_copy = X_train.copy()
X_train_copy['Sex'] = X_train_copy['Sex'].replace('male', 0).replace('female', 1)
X_train_copy['Embarked'] = X_train_copy['Embarked'].fillna('S')

# Fill missing ages with the mean age of passengers carrying the same title.
# regex=False matches the title literally: as a regex the '.' is a wildcard, so
# 'Mr.' also matched 'Mrs.' and overwrote those rows with the 'Mr.' mean.
# 'Master.' now receives its own mean (it previously received the 'Dr.' mean).
age_missing = X_train['Age'].isna()
for title in ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']:
    has_title = X_train['Name'].str.contains(title, regex=False)
    title_mean_age = round(X_train.loc[has_title, 'Age'].mean(), 2)
    X_train_copy.loc[has_title & age_missing, 'Age'] = title_mean_age

X_train_copy = X_train_copy.drop(['Name'], axis=1)

# Cabin deck letter: missing cabins default by class (1 -> C, 2 -> F, 3 -> G),
# any cabin containing 'T' is folded into 'A', then only the first letter is kept.
deck_default = X_train['Pclass'].map({1: 'C', 2: 'F', 3: 'G'})
cabin = X_train['Cabin'].fillna(deck_default)
cabin = cabin.mask(cabin.str.contains('T', na=False), 'A')
X_train_copy['Cabin'] = cabin.str[0]

# Ticket prefix: strip digits, dots and spaces, unify spelling variants, and keep
# only the most common prefixes. Done column-wise instead of the former
# `df['Ticket'][index] = ...` loop, which is chained assignment and does not
# write back under pandas copy-on-write.
most_common = ['PC', 'CA', 'STON/O', 'A/', 'SC/PARIS']
ticket_prefix = (
    X_train_copy['Ticket']
    .str.replace(r'[\d. ]', '', regex=True)
    .str.replace('Paris', 'PARIS', regex=False)
    .str.replace('SOTON/O', 'STON/O', regex=False)
    .str.replace('STON/OQ', 'STON/O', regex=False)
    .str.replace('CA/SOTON', 'STON/O', regex=False)
)
X_train_copy['Ticket'] = ticket_prefix.where(ticket_prefix.isin(most_common), '')

X_train_copy = pd.get_dummies(X_train_copy).fillna(0)

# New
# Group size = relatives aboard + self, computed on the raw ticket strings.
X_groups = X_train.copy()
X_groups['Group Size'] = X_groups['SibSp'] + X_groups['Parch'] + 1
# Tickets held by more than one passenger with no recorded relatives: every row on
# such a ticket gets that ticket's solo-traveler count as its group size.
solo_per_ticket = X_groups.loc[X_groups['Group Size'] == 1].groupby('Ticket').size()
shared_solo = solo_per_ticket[solo_per_ticket > 1]
group_size = X_groups['Group Size'].copy()
for tic, n_solo in shared_solo.items():
    group_size.loc[X_groups['Ticket'] == tic] = n_solo
X_train_copy['Group Size'] = group_size
# (The 'Solo Traveler' column was created and dropped straight away; it is omitted.)

# Moved normalization
# X_norm is an alias of X_train_copy, so each column is scaled in place by its own
# maximum after imputation and encoding.
X_norm = X_train_copy
for col in ['Fare', 'Age', 'PassengerId', 'Pclass', 'SibSp', 'Parch', 'Group Size']:
    X_train_copy[col] = X_norm[col] / X_norm[col].max()

X_train_copy = X_train_copy.fillna(0)

clf = MLPClassifier(hidden_layer_sizes=(20,20,20), max_iter=600, solver='lbfgs',
                    verbose=1, random_state=42, activation='relu')

clf.fit(X_train_copy, Y_train)

print('Accuracy on training---')
y_pred_train = clf.predict(X_train_copy)
print(accuracy_score(Y_train, y_pred_train))

In [None]:
# Submission 6, validation split: rebuild the training features from X_test and
# score the fitted `clf` against Y_test.
X_test_copy = X_test.copy()
X_test_copy['Sex'] = X_test_copy['Sex'].replace('male', 0).replace('female', 1)
X_test_copy['Embarked'] = X_test_copy['Embarked'].fillna('S')

# Fill missing ages with the mean age of passengers carrying the same title.
# regex=False matches the title literally: as a regex the '.' is a wildcard, so
# 'Mr.' also matched 'Mrs.' and overwrote those rows with the 'Mr.' mean.
# 'Master.' now receives its own mean (it previously received the 'Dr.' mean).
age_missing = X_test['Age'].isna()
for title in ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']:
    has_title = X_test['Name'].str.contains(title, regex=False)
    title_mean_age = round(X_test.loc[has_title, 'Age'].mean(), 2)
    X_test_copy.loc[has_title & age_missing, 'Age'] = title_mean_age

X_test_copy = X_test_copy.drop(['Name'], axis=1)

# Cabin deck letter: missing cabins default by class (1 -> C, 2 -> F, 3 -> G),
# any cabin containing 'T' is folded into 'A', then only the first letter is kept.
deck_default = X_test['Pclass'].map({1: 'C', 2: 'F', 3: 'G'})
cabin = X_test['Cabin'].fillna(deck_default)
cabin = cabin.mask(cabin.str.contains('T', na=False), 'A')
X_test_copy['Cabin'] = cabin.str[0]

# Ticket prefix: strip digits, dots and spaces, unify spelling variants, and keep
# only the most common prefixes. Done column-wise instead of the former
# `df['Ticket'][index] = ...` loop, which is chained assignment and does not
# write back under pandas copy-on-write.
most_common = ['PC', 'CA', 'STON/O', 'A/', 'SC/PARIS']
ticket_prefix = (
    X_test_copy['Ticket']
    .str.replace(r'[\d. ]', '', regex=True)
    .str.replace('Paris', 'PARIS', regex=False)
    .str.replace('SOTON/O', 'STON/O', regex=False)
    .str.replace('STON/OQ', 'STON/O', regex=False)
    .str.replace('CA/SOTON', 'STON/O', regex=False)
)
X_test_copy['Ticket'] = ticket_prefix.where(ticket_prefix.isin(most_common), '')

X_test_copy = pd.get_dummies(X_test_copy).fillna(0)

# New
# Group size = relatives aboard + self, computed on the raw ticket strings.
X_groups = X_test.copy()
X_groups['Group Size'] = X_groups['SibSp'] + X_groups['Parch'] + 1
# Tickets held by more than one passenger with no recorded relatives: every row on
# such a ticket gets that ticket's solo-traveler count as its group size.
solo_per_ticket = X_groups.loc[X_groups['Group Size'] == 1].groupby('Ticket').size()
shared_solo = solo_per_ticket[solo_per_ticket > 1]
group_size = X_groups['Group Size'].copy()
for tic, n_solo in shared_solo.items():
    group_size.loc[X_groups['Ticket'] == tic] = n_solo
X_test_copy['Group Size'] = group_size
# (The 'Solo Traveler' column was created and dropped straight away; it is omitted.)

# Moved normalization
# X_norm is an alias of X_test_copy, so each column is scaled in place.
# NOTE(review): the maxima come from this split rather than the training split.
X_norm = X_test_copy
for col in ['Fare', 'Age', 'PassengerId', 'Pclass', 'SibSp', 'Parch', 'Group Size']:
    X_test_copy[col] = X_norm[col] / X_norm[col].max()

X_test_copy = X_test_copy.fillna(0)

# Dummy columns are derived per split, so align them (names and order) with the
# frame `clf` was fitted on; categories unseen in this split become all-zero.
X_test_copy = X_test_copy.reindex(columns=X_train_copy.columns, fill_value=0)

print('Accuracy on testing---')
y_pred_test = clf.predict(X_test_copy)
print(accuracy_score(Y_test, y_pred_test))

In [None]:
# Submission 6, competition test set: rebuild the training features from df_test.
df_test_copy = df_test.copy()
df_test_copy['Sex'] = df_test_copy['Sex'].replace('male', 0).replace('female', 1)
df_test_copy['Embarked'] = df_test_copy['Embarked'].fillna('S')

# Fill missing ages with the mean age of passengers carrying the same title.
# regex=False matches the title literally: as a regex the '.' is a wildcard, so
# 'Mr.' also matched 'Mrs.' and overwrote those rows with the 'Mr.' mean.
# 'Master.' now receives its own mean (it previously received the 'Dr.' mean).
age_missing = df_test['Age'].isna()
for title in ['Miss.', 'Ms.', 'Mrs.', 'Mr.', 'Dr.', 'Master.']:
    has_title = df_test['Name'].str.contains(title, regex=False)
    title_mean_age = round(df_test.loc[has_title, 'Age'].mean(), 2)
    df_test_copy.loc[has_title & age_missing, 'Age'] = title_mean_age

df_test_copy = df_test_copy.drop(['Name'], axis=1)

# Cabin deck letter: missing cabins default by class (1 -> C, 2 -> F, 3 -> G),
# any cabin containing 'T' is folded into 'A', then only the first letter is kept.
deck_default = df_test['Pclass'].map({1: 'C', 2: 'F', 3: 'G'})
cabin = df_test['Cabin'].fillna(deck_default)
cabin = cabin.mask(cabin.str.contains('T', na=False), 'A')
df_test_copy['Cabin'] = cabin.str[0]

# Ticket prefix: strip digits, dots and spaces, unify spelling variants, and keep
# only the most common prefixes. Done column-wise instead of the former
# `df['Ticket'][index] = ...` loop, which is chained assignment and does not
# write back under pandas copy-on-write.
most_common = ['PC', 'CA', 'STON/O', 'A/', 'SC/PARIS']
ticket_prefix = (
    df_test_copy['Ticket']
    .str.replace(r'[\d. ]', '', regex=True)
    .str.replace('Paris', 'PARIS', regex=False)
    .str.replace('SOTON/O', 'STON/O', regex=False)
    .str.replace('STON/OQ', 'STON/O', regex=False)
    .str.replace('CA/SOTON', 'STON/O', regex=False)
)
df_test_copy['Ticket'] = ticket_prefix.where(ticket_prefix.isin(most_common), '')

df_test_copy = pd.get_dummies(df_test_copy).fillna(0)

# New
# Group size = relatives aboard + self, computed on the raw ticket strings.
X_groups = df_test.copy()
X_groups['Group Size'] = X_groups['SibSp'] + X_groups['Parch'] + 1
# Tickets held by more than one passenger with no recorded relatives: every row on
# such a ticket gets that ticket's solo-traveler count as its group size.
solo_per_ticket = X_groups.loc[X_groups['Group Size'] == 1].groupby('Ticket').size()
shared_solo = solo_per_ticket[solo_per_ticket > 1]
group_size = X_groups['Group Size'].copy()
for tic, n_solo in shared_solo.items():
    group_size.loc[X_groups['Ticket'] == tic] = n_solo
df_test_copy['Group Size'] = group_size
# (The 'Solo Traveler' column was created and dropped straight away; it is omitted.)

# Moved normalization
# df_norm is an alias of df_test_copy, so each column is scaled in place. The
# original read these columns from `X_norm` (the already-normalized validation
# frame), which assigned index-aligned validation rows to the test passengers.
# NOTE(review): the maxima come from the test data rather than the training split.
df_norm = df_test_copy
for col in ['Fare', 'Age', 'PassengerId', 'Pclass', 'SibSp', 'Parch', 'Group Size']:
    df_test_copy[col] = df_norm[col] / df_norm[col].max()

# Dummy columns are derived per data set, so align them (names and order) with
# the frame `clf` was fitted on; categories unseen here become all-zero columns.
# This replaces a second, redundant get_dummies call.
df_test_copy = df_test_copy.fillna(0).reindex(columns=X_train_copy.columns, fill_value=0)

In [None]:
# Predict labels for the prepared competition test frame with the most recently
# fitted `clf`, then display the predicted array.
test_predictions = clf.predict(df_test_copy)
test_predictions

In [None]:
# Assemble the submission table keyed by PassengerId and show the class balance
# of the predicted labels.
submission = pd.DataFrame(
    {'Survived': clf.predict(df_test_copy)},
    index=df_test['PassengerId'],
)
submission['Survived'].value_counts()

In [None]:
# submission.reset_index().to_csv('submission_9.csv', index=False)