In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import svm
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the path of every file found beneath the input directory,
# in the order os.walk visits the folders.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subfolders, entries in os.walk('./input')
    for entry in entries
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data loading and preparation

In [None]:
# Load the raw train/test CSV files.
training = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')

# Tag each row with its origin so the two sets can be told apart after
# concatenation (1 = training row, 0 = test row).
training['train_test'] = 1
test['train_test'] = 0
# Test rows get a missing label. `np.nan` is used because the `np.NaN`
# alias was removed in NumPy 2.0.
test['Survived'] = np.nan
all_data = pd.concat([training,test])

# Numeric encoding of the Sex column, shared by both prepared frames.
SEX_CODES = {'male': 0, 'female': 1}

# `assign` returns a new frame, so the raw frames keep their string labels.
# `Series.map` produces a numeric column directly instead of relying on
# `replace` down-casting an object column (deprecated in pandas 2.2).
# Any value that is not a key of SEX_CODES becomes NaN.
training_prepared = training.assign(Sex=training['Sex'].map(SEX_CODES))
test_prepared = test.assign(Sex=test['Sex'].map(SEX_CODES))

%matplotlib inline
# Show the column names of the concatenated train + test frame.
all_data.columns

# Data analysis

In [None]:
# Display the prepared training frame (Sex encoded as 0 = male, 1 = female).
training_prepared

In [None]:
# Summary statistics of the raw training frame (Sex still holds the
# 'male'/'female' strings here).
training.describe()

In [None]:
# Summary statistics of the prepared training frame, where Sex has been
# encoded numerically.
training_prepared.describe()

In [None]:
# Separate numerical and categorical values
NUMERIC_FEATURES = ['Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'Pclass']
df_num = training_prepared[NUMERIC_FEATURES]
df_numsv = training_prepared[NUMERIC_FEATURES + ['Survived']]
numsv = df_numsv.to_numpy()
# Drop only the rows that have a NaN in the columns used for modelling:
# the last four columns (Fare, Sex, Pclass, Survived). Age, SibSp and Parch
# are not used as model features later in the notebook, so a missing value
# there must not discard an otherwise usable training row. Rows kept this
# way may still contain NaN in the first three columns.
numsv = numsv[~np.isnan(numsv[:, -4:]).any(axis=1)]

# Feature matrix for the test set, in the same column order as the model
# features taken from numsv (Fare, Sex, Pclass).
fdf = test_prepared[['Fare', 'Sex', 'Pclass']].to_numpy()
# use 0s instead of nan (for testing data)
fdf = np.nan_to_num(fdf, copy=False)

In [None]:
# Draw one histogram per column of df_numsv, each on its own figure
# titled with the column name.
for column_name in df_numsv.columns:
    hist_fig, hist_ax = plt.subplots()
    hist_ax.hist(df_numsv[column_name])
    hist_ax.set_title(column_name)
    plt.show()

In [None]:
# Pairwise correlations between the numeric columns and Survived,
# computed once, printed as a table and then drawn as a heatmap.
correlations = df_numsv.corr()
print(correlations)
sns.heatmap(correlations)

As can be seen, Fare and Sex are positively correlated with survival, whilst Pclass is negatively correlated with it.

In [None]:
# Mean of Age, SibSp, Parch, Fare, Sex and Pclass within each Survived group
# (pivot_table aggregates with the mean when no aggfunc is given).
training_prepared.pivot_table(
    index='Survived',
    values=['Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'Pclass'],
)

In [None]:
# Inspect the NaN-filtered NumPy array that the classifiers are trained on.
print(numsv)

# Classifiers fitting and comparison

The code below is mostly adapted from the scikit-learn classifier comparison example: https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [None]:
# Fit several scikit-learn classifiers on three standardised features
# (Fare, Sex, Pclass), print each one's accuracy on a held-out half of the
# data and draw, per classifier, a 3-D scatter of the held-out passengers
# coloured by whether the prediction was correct.
#
# No decision surface is drawn: with three feature dimensions only the data
# points themselves are plotted, so no evaluation mesh is built.

SEED = 42  # fixed seed so the split, the fitted models and the scores are reproducible

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

# The order must stay aligned with `names`; the list is indexed by position
# later in the notebook to pick the classifier used for the submission.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=SEED),
    SVC(gamma=2, C=1, random_state=SEED),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=SEED),
    DecisionTreeClassifier(max_depth=5, random_state=SEED),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=3,
                           random_state=SEED),
    MLPClassifier(alpha=1, max_iter=10000, random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

# Select Fare, Sex, and Pclass as features
#   and Survived as target
X, y = numsv[:, -4:-1], numsv[:, -1]

X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=.5, random_state=SEED)

# Per-feature [min - 0.5, max + 0.5] ranges, used as axis limits so that
# every panel shares the same view of the data.
mms = [np.array([X[:, r].min() - .5, X[:, r].max() + .5])
       for r in range(X.shape[1])]


def _format_axes(ax, limits):
    """Apply the shared x/y/z limits to a 3-D axes and hide its ticks.

    ax: the 3-D matplotlib axes to format.
    limits: sequence of three arrays; the min and max of each array give
        the lower and upper limit of the x, y and z axis respectively.
    """
    ax.set_xlim(limits[0].min(), limits[0].max())
    ax.set_ylim(limits[1].min(), limits[1].max())
    ax.set_zlim(limits[2].min(), limits[2].max())
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_zticks(())


figure = plt.figure(figsize=(27, 9))
# Two rows of panels: one for the input data plus one per classifier.
n_cols = (len(classifiers) + 1 + 1) // 2

# just plot the dataset first
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
ax = plt.subplot(2, n_cols, 1, projection='3d')
ax.set_title("Input data")
# Plot the training points
ax.scatter(X_train[:, 0], X_train[:, 1], X_train[:, 2], c=y_train, cmap=cm_bright,
           edgecolors='k')
# Plot the testing points
ax.scatter(X_test[:, 0], X_test[:, 1], X_test[:, 2], c=y_test, cmap=cm_bright, alpha=0.6,
           edgecolors='k')
_format_axes(ax, mms)

# iterate over classifiers; panel 1 is the input data, so classifiers start at 2
for i, (name, clf) in enumerate(zip(names, classifiers), start=2):
    ax = plt.subplot(2, n_cols, i, projection='3d')
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print(f'Classifier: {name};  score: {score}')
    # True where the held-out prediction equals the real label.
    matches = (clf.predict(X_test) == y_test)

    # Plot the testing points, coloured by prediction correctness
    ax.scatter(X_test[:, 0], X_test[:, 1], X_test[:, 2], c=matches, cmap='PiYG',
               edgecolors='k', alpha=0.5)

    _format_axes(ax, mms)
    # The number in the title is the classifier's index in `classifiers`.
    ax.set_title(f'{name} ({i - 2})')
    ax.text(mms[0].max() - .3, mms[1].min() + .3, mms[2].min() + .3, ('%.3f' % score).lstrip('0'),
            size=15, horizontalalignment='right')

plt.tight_layout()
plt.show()

# Survivors prediction

In [None]:
# Select "Gaussian Process" classifier (index 3 in `classifiers`), already
# fitted in the comparison cell on standardised Fare/Sex/Pclass features.
gcp = classifiers[3]

# The test features must be scaled with the statistics of the TRAINING
# features, not with statistics fitted on the test set itself; otherwise
# the classifier sees inputs on a different scale than it was trained on.
# The scaler is rebuilt from the same numsv columns (Fare, Sex, Pclass)
# that the comparison cell standardised, then only applied to fdf.
scaler = StandardScaler().fit(numsv[:, -4:-1])
X = scaler.transform(fdf)
final_prediction = gcp.predict(X)

# Two int32 columns: the passenger id and the predicted 0/1 survival label.
final_df = pd.DataFrame({
    'PassengerId': test['PassengerId'].to_numpy().astype(np.int32),
    'Survived': final_prediction.astype(np.int32),
})
print(final_df)
# Let pandas write the file directly (no index column).
final_df.to_csv('result.csv', index=False)
