In [64]:
df = pandas.read_csv("titanic_alternative.csv")

In [65]:
X = df.drop(['Survived'], axis=1)
y = df['Survived']

In [66]:
# Mr == 1
# Mrs == 2
# Miss == 3
# Master == 4
X['Name'] = X['Name'].apply(lambda x: 1 if "Mr." in x else 2 if "Mrs." in x else 3 if "Miss." in x else 4)

X['Sex'] = X['Sex'].factorize()[0]

In [67]:
X['AdultWithoutChildren'] = X.apply(lambda x: 1 if x['Parents/Children Aboard'] == 0 and x['Siblings/Spouses Aboard'] == 1 else 0.1, axis=1)

In [68]:
X['TravelingAlone'] = X.apply(lambda x: 1 if x['Parents/Children Aboard'] == 0 and x['Siblings/Spouses Aboard'] == 0 else 0.1, axis=1)

In [69]:
X['IsTravelingWithSiblings'] = X.apply(lambda x: 1 if x['Parents/Children Aboard'] > 0 and x['Siblings/Spouses Aboard'] > 1 else 0.1, axis=1)

In [70]:
X['TravelingWithChildren'] = X.apply(lambda x: 1 if x['Parents/Children Aboard'] > 2 else 0.1, axis=1)

In [71]:
age = X.dropna(subset=['Age'])
age_mean = age['Age'].mean()
adult_without_children_mean = age['Age'][age['AdultWithoutChildren'] == 1].mean()
traveling_alone = age['Age'][age['TravelingAlone'] == 1].mean()
is_traveling_with_siblings = age['Age'][age['IsTravelingWithSiblings'] == 1].mean()
traveling_with_children = age['Age'][age['TravelingWithChildren'] == 1].mean()

print("Age mean: ", age_mean)
print("Adults without children mean: ", adult_without_children_mean)
print("Traveling alone mean: ", traveling_alone)
print("Traveling with siblings mean: ", is_traveling_with_siblings)
print("Traveling with children mean: ", traveling_with_children)

Age mean:  29.471443066516347
Adults without children mean:  31.6260162601626
Traveling alone mean:  31.75984990619137
Traveling with siblings mean:  10.080357142857142
Traveling with children mean:  38.86666666666667


In [72]:
from sklearn.metrics import mean_squared_error

age_X_train, age_X_test, age_y_train, age_y_test = train_test_split(age.drop('Age', axis=1), age['Age'], test_size=0.2, random_state=42)

def get_mean_age(a):
    if a['AdultWithoutChildren'] == 1:
        return 32.32
    if a['TravelingAlone'] == 1:
        return 32.33
    if a['IsTravelingWithSiblings'] == 1:
        return 10.03
    if a['TravelingWithChildren'] == 1:
        return 40.0
    return 29.87
    
age_y_pred_with_just_mean = age_X_test.apply(lambda x: 29.87, axis=1)
age_y_pred_with_categories = age_X_test.apply(lambda x: get_mean_age(x), axis=1)

print('Error using just the mean: ', mean_squared_error(age_y_test, age_y_pred_with_just_mean))
print('Error using the categories: ', mean_squared_error(age_y_test, age_y_pred_with_categories))

Error using just the mean:  201.48451235955054
Error using the categories:  184.61235898876402


In [73]:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

age = X.dropna(subset=['Age'])

age_X = age.drop(columns=['Age'])
age_X = StandardScaler().fit_transform(age_X)
age_y = age['Age']

age_model = MLPRegressor(solver='adam', random_state=0, hidden_layer_sizes=[2, 2], max_iter=10000, alpha=5)

age_X_train, age_X_test, age_y_train, age_y_test = train_test_split(age_X, age_y, test_size=0.2, random_state=42)

age_model.fit(age_X_train, age_y_train)

errors = mean_squared_error(age_y_test, age_model.predict(age_X_test))

print('Errors: ', errors)

Errors:  137.76559822141124


In [74]:
model = age_model

X_scaled = StandardScaler().fit_transform(X.drop(columns=['Age']))
new_ages = DataFrame(model.predict(X_scaled), columns=['Age'])

X = X.fillna(new_ages)

print(X['Age'].mean())
print(X['Age'].shape)

29.471443066516347
(887,)


In [75]:
X_scaled = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

svm = SVC(kernel='poly')
svm.fit(X_train, y_train)
print('SVM: ', accuracy_score(y_test, svm.predict(X_test)))

rf = RandomForestClassifier(n_estimators=1000, max_depth=12, random_state=0)
rf.fit(X_train, y_train)
print('RF: ', accuracy_score(y_test, rf.predict(X_test)))

mlp = MLPClassifier(solver='adam', random_state=0, hidden_layer_sizes=[10, 10, 10], max_iter=10000, alpha=5)
mlp.fit(X_train, y_train)
print('MLP: ', accuracy_score(y_test, mlp.predict(X_test)))


SVM:  0.7808988764044944
RF:  0.7752808988764045
MLP:  0.7471910112359551


In [76]:
from sklearn.metrics import confusion_matrix

predictions = rf.predict(X_test)
[[true_pos, false_pos], [false_neg, true_neg]] = confusion_matrix(y_test, predictions)

print('True positives: ', true_pos)
print('False positives: ', false_pos)
print('True negatives: ', true_neg)
print('False negatives: ', false_neg)
print('Accuracy: ', (true_pos + true_neg) / (false_pos + false_neg + true_pos + true_neg))
print('Precision: ', true_pos / (true_pos + false_pos))
print('Recall: ', true_pos / (true_pos + false_neg))

True positives:  92
False positives:  19
True negatives:  46
False negatives:  21
Accuracy:  0.7752808988764045
Precision:  0.8288288288288288
Recall:  0.8141592920353983


In [77]:

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(estimator=SVC(), param_grid={'kernel': ('linear', 'poly', 'rbf', 'sigmoid'), 'degree': np.arange(1, 5), 'C': np.arange(1, 5)}, scoring='f1')
grid.fit(X_train, y_train)

print(grid.best_params_)

svm = grid.best_estimator_
svm.fit(X_train, y_train)
print('Accuracy: ', accuracy_score(y_test, svm.predict(X_test)))

{'C': 1, 'degree': 1, 'kernel': 'rbf'}
Accuracy:  0.7808988764044944
