# Machine Learning

## Configuration

In [None]:
# Jupyter config
# Load the IPython extension that ships with rpy2.
# NOTE(review): no R magic is used in the cells visible here — confirm it is still needed.
%load_ext rpy2.ipython
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Emit inline figures as SVG (vector output).
%config InlineBackend.figure_format = 'svg'  # Or 'retina'

In [None]:
# Python imports
from collections import defaultdict
from itertools import chain, combinations
from typing import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import *
from sklearn.preprocessing import *
from sklearn.mixture import *
from tqdm.notebook import tqdm

#plt.style.use('seaborn-whitegrid')  # Set the aesthetic style of the plots

## Preprocessing

In [None]:
# Load the pre-processed train / test tables from the working directory.
training_data = pd.read_csv('train_processed.csv')
test_data = pd.read_csv('test_processed.csv')

# Discard, in place, every row that has at least one missing value.
for frame in (training_data, test_data):
    frame.dropna(inplace=True)

In [None]:
# Preview the training rows that remain after dropping rows with missing values.
training_data

In [None]:
# One LabelEncoder per categorical column; each column is replaced by integer codes.
label_encoders = {
    feature: LabelEncoder()
    for feature in (
        'Sex',
        'Ticket',
        'Embarked',
        'NameTitle',
        'FirstName',
        'MiddleNames',
        'LastName',
        'Deck',
    )
}
for feature, label_encoder in label_encoders.items():
    # Fit on the train and test values together so both frames share a
    # single value -> code mapping for this column.
    all_values = pd.concat((training_data[feature], test_data[feature]))
    label_encoder.fit(all_values)
    for frame in (training_data, test_data):
        frame[feature] = label_encoder.transform(frame[feature])

In [None]:
# Preview the training frame again: the columns listed in `label_encoders` now hold integer codes.
training_data

## Selecting Features

We will initially select the features that we believe most affect the survival odds of an individual aboard the Titanic.

#### We decide to keep the following features:

* <b>PClass</b> - the class of the ticket, as we all know this had a large say in deciding who got on the escape boats
* <b>Age</b> - An older person is weaker than a younger one on average.
* <b>Fare</b> - Someone who paid a lot more money would be in a far different position than someone who did not
* <b>Embarked</b> - The port at which the passenger boarded; this might play a role, but we are not sure (we may drop it in a later attempt)
* <b>Deck</b> - The deck the person was staying on is important when the ship is sinking
* <b>FamilySize</b> - If an individual had a family it is possible that they gave up their spot on an escape boat or attempted to rescue them
* <b>FarePerPerson</b> - The amount paid per person (based on family size) could indicate how they were treated

## Classifier Decision


Following the approach in https://www.kaggle.com/mosleylm/titanic-data-set-exploration/execution#II.-Format-Data, we test several different classifiers and then select the best-performing one based on the F1 score.
We evaluate each classifier on 10 stratified random splits (90% train / 10% test each) and average its scores over the splits. <br>
We test the following classifiers:

* <b>Gradient Boosting</b>
* <b>Random Forest</b>
* <b>KNeighbors</b>
* <b>SVC</b>
* <b>Decision Tree</b>
* <b>Ada Boost</b>
* <b>GaussianNB</b>
* <b>Logistic Regression</b>

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, accuracy_score, log_loss

In [None]:
class Classifier(NamedTuple):
    """Identifies one evaluated model: an estimator class name plus the feature subset it used.

    Instances serve as dictionary keys for the score tables in this notebook,
    so ``features`` must be hashable in practice (the evaluation loop passes a
    tuple).
    """
    # Name of the estimator class (the loop passes ``classifier.__class__.__name__``).
    name: str
    # Names of the feature columns the model was trained on.
    features: Sequence[str]


def powerset(iterable):
    """Return a lazy iterator over every subset of ``iterable`` as a tuple.

    Subsets come out ordered by size (the empty tuple first, the full set
    last) and, within a size, in the order ``itertools.combinations``
    produces them. The input is consumed once, up front, so one-shot
    iterators are accepted.

    Example: ``powerset([1, 2, 3])`` yields
    ``() (1,) (2,) (3,) (1, 2) (1, 3) (2, 3) (1, 2, 3)``.
    """
    pool = tuple(iterable)
    subset_sizes = range(len(pool) + 1)
    return chain.from_iterable(combinations(pool, size) for size in subset_sizes)

In [None]:
# --- Evaluation protocol ------------------------------------------------------
# Ten stratified random 90/10 train/test splits. This is shuffle-split
# resampling, not k-fold: test sets of different splits may overlap. The fixed
# seed makes every call to `splitter.split` produce the same splits.
num_splits = 10
splitter = StratifiedShuffleSplit(n_splits=num_splits, test_size=0.1, random_state=0)

train_true = ['Survived']
# A list rather than a one-shot `filter` iterator, so it can be counted and reused.
candidate_features = [feature for feature in training_data.columns if feature not in train_true]

# Mean score over the splits, keyed by (estimator class name, feature subset).
classifiers_by_f1_score: Dict[Classifier, float] = defaultdict(float)
classifiers_by_accuracy_score: Dict[Classifier, float] = defaultdict(float)

# Neither the target vector nor the split indices depend on which features are
# selected, so compute them once. Only `y` and the sample count determine the
# splits, hence the zero placeholder for X.
y = training_data[train_true].values.reshape(-1)
splits = list(splitter.split(np.zeros(len(y)), y))

# NOTE: the search is exhaustive. Every non-empty subset of the candidate
# columns (2**n - 1 of them) is fitted with 8 models on `num_splits` splits,
# so the run time doubles with each extra candidate column.
num_subsets = 2 ** len(candidate_features) - 1
feature_subsets = (subset for subset in powerset(candidate_features) if len(subset) > 0)

for selected_features in tqdm(feature_subsets, total=num_subsets):
    X = training_data[list(selected_features)].values

    # Fresh estimators for every feature subset. Stochastic models are seeded
    # so that a re-run reproduces the same scores.
    estimators = [
        KNeighborsClassifier(5),
        # NOTE(review): probability=True is only needed for predict_proba, which
        # this cell never calls, and it slows down fit — kept in case a later
        # cell relies on it.
        SVC(probability=True, random_state=0),
        DecisionTreeClassifier(random_state=0),
        RandomForestClassifier(random_state=0),
        AdaBoostClassifier(random_state=0),
        GradientBoostingClassifier(random_state=0),
        GaussianNB(),
        LogisticRegression(),
    ]
    classifiers: Dict[Classifier, Any] = {
        Classifier(name=type(estimator).__name__, features=tuple(selected_features)): estimator
        for estimator in estimators
    }

    for classifier_id, classifier in classifiers.items():
        f1_scores = []
        accuracy_scores = []
        for train_idx, test_idx in splits:
            X_train, X_test = X[train_idx], X[test_idx]
            Y_train, Y_test = y[train_idx], y[test_idx]

            # `fit` retrains from scratch, so reusing the instance across splits is safe.
            classifier.fit(X_train, Y_train)
            test_predictions = classifier.predict(X_test)

            f1_scores.append(f1_score(Y_test, test_predictions))
            accuracy_scores.append(accuracy_score(Y_test, test_predictions))

        classifiers_by_f1_score[classifier_id] = float(np.mean(f1_scores))
        classifiers_by_accuracy_score[classifier_id] = float(np.mean(accuracy_scores))

# Report every evaluated (classifier, feature subset) pair, best F1 first.
# The table is built from the accumulated score dictionaries: `classifiers`
# only holds the models of the last feature subset visited by the loop.
log = pd.DataFrame(
    {
        'Classifier': list(classifiers_by_f1_score),
        'F1 Score': list(classifiers_by_f1_score.values()),
        'Accuracy Score': [
            classifiers_by_accuracy_score[classifier_id]
            for classifier_id in classifiers_by_f1_score
        ],
    },
    columns=['Classifier', 'F1 Score', 'Accuracy Score'],
).sort_values('F1 Score', ascending=False).reset_index(drop=True)

log

Based on our results it appears that the <b> 