# Machine Learning

## Configuration

In [None]:
# Jupyter config
%matplotlib inline
%config InlineBackend.figure_format = 'svg'  # Or 'retina'

In [None]:
# Python imports
from collections import defaultdict
from itertools import chain, combinations
from typing import *

import numpy as np
import numpy.typing as npt
import pandas as pd
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from scipy.stats import *
from sklearn.preprocessing import *
from sklearn.mixture import *
from tqdm.notebook import tqdm

#plt.style.use('seaborn-whitegrid')  # Set the aesthetic style of the plots

## Preprocessing

In [None]:
# Training split: load the CSV, then drop (in place) every row containing a missing value.
training_data = pd.read_csv('train_processed.csv')
training_data.dropna(inplace=True)

# Test split: same treatment.
test_data = pd.read_csv('test_processed.csv')
test_data.dropna(inplace=True)

In [None]:
# Display the training frame as it stands after rows with missing values were dropped.
training_data

In [None]:
# One independent LabelEncoder per column that is replaced by integer codes below.
label_encoders = {
    column: LabelEncoder()
    for column in (
        'Sex',
        'Ticket',
        'Embarked',
        'NameTitle',
        'FirstName',
        'MiddleNames',
        'LastName',
        'Deck',
    )
}

for feature, label_encoder in label_encoders.items():
    # Fit on the values of BOTH splits so that a value appearing only in the
    # test split still has a code when it is transformed.
    label_encoder.fit(pd.concat([training_data[feature], test_data[feature]]))

    # Overwrite the original column with its encoded version in each frame.
    training_data[feature] = label_encoder.transform(training_data[feature])
    test_data[feature] = label_encoder.transform(test_data[feature])

In [None]:
# Display the training frame again, now with the label-encoded columns.
training_data

## Selecting Features

We will initially select the features that we believe most affect an individual's odds of surviving aboard the Titanic.

#### We decide to keep the following features:

* <b>PClass</b> - The class of the ticket. As we all know, this had a large say in deciding who got on the escape boats.
* <b>Age</b> - An older person is weaker than a younger one on average.
* <b>Fare</b> - Someone who paid a lot more money would be in a far different position than someone who did not
* <b>Embarked</b> - The port at which the passenger boarded. This might play a role, but we are not sure; we might get rid of it in another attempt.
* <b>Deck</b> - The deck of the ship on which the person was staying is important when the ship is sinking.
* <b>FamilySize</b> - If an individual had family aboard, it is possible that they gave up their spot on an escape boat or attempted to rescue their family members.
* <b>FarePerPerson</b> - The amount paid per person (based on family size) could indicate how they were treated

## Classifier Decision


Based on the information in https://www.kaggle.com/mosleylm/titanic-data-set-exploration/execution#II.-Format-Data, we will test many different classifiers and then select the highest-performing one based on its F1 score.
We will use stratified cross validation with 10 shuffle splits (each split holds out a stratified 10% of the training data for testing) and average the scores over the splits. <br>
We test the following classifiers:

* <b>Gradient Boosting</b>
* <b>Random Forest</b>
* <b>KNeighbors</b>
* <b>SVC</b>
* <b>Decision Tree</b>
* <b>Ada Boost</b>
* <b>GaussianNB</b>
* <b>Logistic Regression</b>

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, accuracy_score, log_loss

In [None]:
class Classifier(NamedTuple):
    """Identifier of one experiment: an estimator type paired with a feature subset.

    Instances are used as dictionary keys by ``classifiers`` and
    ``train_all_classifiers``, so both fields must be hashable.
    """
    # Class name of the estimator, e.g. ``type(estimator).__name__``.
    name: str
    # Names of the feature columns the estimator is trained on (stored as a tuple by ``classifiers``).
    features: Sequence[str]


class ClassifierResult(NamedTuple):
    """Evaluation outcome returned by ``train_classifier`` for one estimator."""
    # Predictions for the held-out rows of the LAST split only (not all splits).
    test_predictions: npt.ArrayLike
    # F1 score averaged over all splits.
    f1_score: np.float64
    # Accuracy averaged over all splits.
    accuracy_score: np.float64


def powerset(iterable):
    """Return an iterator over every subset of ``iterable`` as a tuple.

    Subsets are produced by increasing size, starting with the empty tuple:
    ``powerset([1, 2, 3])`` yields
    ``() (1,) (2,) (3,) (1, 2) (1, 3) (2, 3) (1, 2, 3)``.
    """
    pool = list(iterable)
    # One lazy ``combinations`` stream per subset size, 0 through len(pool).
    subsets_by_size = (combinations(pool, size) for size in range(len(pool) + 1))
    return chain.from_iterable(subsets_by_size)


def train_classifier(selected_features, train_true, classifier, num_splits=10) -> ClassifierResult:
    """Fit and score ``classifier`` over repeated stratified train/test splits.

    Data is read from the module-level ``training_data`` frame.

    Args:
        selected_features: Column names of ``training_data`` used as model inputs.
        train_true: Column label(s) of the target; the values are flattened to 1-D.
        classifier: Estimator exposing ``fit`` and ``predict``. It is refit in
            place on every split.
        num_splits: Number of splits to evaluate.

    Returns:
        A ``ClassifierResult`` holding the F1 and accuracy scores averaged over
        the splits, together with the predictions of the final split only.
    """
    feature_matrix = training_data[list(selected_features)].values
    targets = np.asarray(training_data[train_true]).reshape(-1)

    # NOTE: this is a shuffle split (independent random 90/10 partitions), not
    # k-fold; the fixed seed makes every call see the same partitions.
    shuffle_split = StratifiedShuffleSplit(n_splits=num_splits, test_size=0.1, random_state=0)

    f1_total = np.float64()
    accuracy_total = np.float64()
    for fit_rows, held_out_rows in shuffle_split.split(feature_matrix, targets):
        classifier.fit(feature_matrix[fit_rows], targets[fit_rows])
        test_predictions = classifier.predict(feature_matrix[held_out_rows])

        expected = targets[held_out_rows]
        f1_total += f1_score(expected, test_predictions)
        accuracy_total += accuracy_score(expected, test_predictions)

    mean_f1 = f1_total / num_splits
    mean_accuracy = accuracy_total / num_splits

    return ClassifierResult(
        test_predictions=test_predictions,
        f1_score=mean_f1,
        accuracy_score=mean_accuracy,
    )


def classifiers(selected_features):
    """Create a fresh set of estimators for one feature subset.

    Args:
        selected_features: Feature names the estimators will be trained on.

    Returns:
        A dict mapping ``Classifier(name, features)`` ids to newly constructed,
        unfitted estimators. ``name`` is the estimator's class name and
        ``features`` is ``selected_features`` converted to a tuple.
    """
    estimators = (
        KNeighborsClassifier(5),
        SVC(probability=True),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        GaussianNB(),
        LogisticRegression(),
    )

    return {
        Classifier(name=type(estimator).__name__, features=tuple(selected_features)): estimator
        for estimator in estimators
    }


def train_all_classifiers():
    """Score every classifier on every non-empty subset of the candidate features.

    Candidate features are all columns of the module-level ``training_data``
    except the target column ``'Survived'``. For each non-empty subset (see
    ``powerset``) a fresh set of estimators is created by ``classifiers`` and
    each estimator is evaluated by ``train_classifier`` in a joblib worker pool.

    Note: the number of jobs is ``2 ** n_features - 1`` times the number of
    estimators returned by ``classifiers``, so it grows exponentially with the
    number of columns.

    Returns:
        A ``pandas.DataFrame`` with one row per (estimator, feature subset) and
        the columns ``'Classifier'`` (the ``Classifier`` id), ``'F1 Score'``
        and ``'Accuracy Score'``, in the order the experiments were enumerated.
    """
    train_true = ['Survived']
    candidate_features = tuple(feature for feature in training_data
                               if feature not in train_true)

    # Enumerate the experiments exactly once. Keeping each id next to its
    # estimator removes the need to re-walk the power set in lockstep later.
    experiments = [
        (classifier_id, classifier)
        for selected_features in powerset(candidate_features)
        if selected_features
        for classifier_id, classifier in classifiers(selected_features).items()
    ]

    # joblib returns results in the same order the tasks were submitted, so
    # job_results[i] belongs to experiments[i].
    job_results = Parallel(n_jobs=-1, verbose=10)(
        delayed(train_classifier)(classifier_id.features, train_true, classifier)
        for classifier_id, classifier in experiments
    )

    # The progress total is derived from the real job count instead of a
    # hard-coded number of estimators per feature subset.
    results_by_id = {}
    for (classifier_id, _), result in tqdm(zip(experiments, job_results),
                                           total=len(experiments)):
        results_by_id[classifier_id] = result

    return pd.DataFrame(
        {
            'Classifier': list(results_by_id),
            'F1 Score': [result.f1_score for result in results_by_id.values()],
            'Accuracy Score': [result.accuracy_score for result in results_by_id.values()],
        },
        columns=['Classifier', 'F1 Score', 'Accuracy Score'],
    )


In [None]:
# Run the full sweep and write the resulting score table to CSV, omitting the row index.
train_all_classifiers().to_csv('classifier_stats.csv', index=False)

Based on our results it appears that the <b> 