# Machine Learning

## Configuration

In [None]:
# Jupyter config
# Load the IPython extension shipped with rpy2.
%load_ext rpy2.ipython
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Output format used by the inline backend for figures.
%config InlineBackend.figure_format = 'svg'  # Or 'retina'

In [None]:
# Python imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import *
from sklearn.preprocessing import *
from sklearn.mixture import *

#plt.style.use('seaborn-whitegrid')  # Set the aesthetic style of the plots

## Preprocessing

In [None]:
# Read the two pre-processed CSV files (paths are relative to the
# notebook's working directory) into the train and test frames.
training_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('train_processed.csv', 'test_processed.csv')
)

In [None]:
# Display the training frame exactly as it was read from the CSV file
# (the last expression of a cell is rendered as its output).
training_data

In [None]:
# One LabelEncoder per categorical column. The encoders are kept in a dict
# so the integer codes can be mapped back to the original labels later.
label_encoders = {
    name: LabelEncoder()
    for name in ('Sex', 'Ticket', 'Embarked', 'NameTitle', 'Deck')
}

for column, encoder in label_encoders.items():
    # Fit on the train and test values together so that both frames are
    # transformed with one shared label -> integer mapping.
    all_values = pd.concat((training_data[column], test_data[column]))
    encoder.fit(all_values)

    # Replace the column with its integer codes, train frame first.
    for frame in (training_data, test_data):
        frame[column] = encoder.transform(frame[column])

In [None]:
# Display the training frame again, now that the categorical columns
# have been replaced by their integer label codes.
training_data

## Selecting Features

We will initially select the features that we believe most affect the survival odds of an individual aboard the Titanic.

In [None]:
# Whitelist of columns to keep; 'Survived' is the prediction target.
selected_features = [
    'PassengerId', 'Survived',
    'Pclass', 'Age', 'Embarked', 'Deck',
    'Fare', 'FamilySize', 'FarePerPerson',
]

# Drop every column that is not whitelisted. Only columns actually present
# in a frame are considered, so a whitelisted column that is missing from
# one of the frames does not raise.
training_data = training_data.drop(
    columns=[column for column in training_data.columns if column not in selected_features]
)
test_data = test_data.drop(
    columns=[column for column in test_data.columns if column not in selected_features]
)

In [None]:
# Keep only the rows that have a value in every remaining column.
# NOTE(review): this also removes incomplete rows from test_data, so no
# prediction can be produced for them - confirm that this is intended.
training_data = training_data.dropna()
test_data = test_data.dropna()

In [None]:
# Display the training frame after the column selection and the removal
# of rows containing missing values.
training_data

#### We decide to keep the following features:

* <b>Pclass</b> - The class of the ticket; as we all know, this had a large say in deciding who got on the escape boats.
* <b>Age</b> - An older person is weaker than a younger one on average.
* <b>Fare</b> - Someone who paid a lot more money would be in a far different position than someone who did not
* <b>Embarked</b> - The port at which the passenger boarded (this might play a role, but we are not sure; we might remove it in another attempt).
* <b>Deck</b> - The deck of the ship on which the person was staying is likely to matter when the ship is sinking.
* <b>FamilySize</b> - If an individual had a family it is possible that they gave up their spot on an escape boat or attempted to rescue them
* <b>FarePerPerson</b> - The amount paid per person (based on family size) could indicate how they were treated

In [None]:
# Predictor columns and the (single) target column.
features = ['Pclass', 'Age', 'Fare', 'Embarked', 'Deck', 'FamilySize', 'FarePerPerson']
train_true = ['Survived']

# Design matrix: the predictor columns of the training frame.
X = training_data.loc[:, features]

# Selecting with a one-element list yields a single-column frame, so the
# values are flattened into a 1-D label vector.
y = training_data.loc[:, train_true].to_numpy().reshape(-1)

# Echo both column lists as the cell output.
features, train_true

## Classifier Decision


Using information seen in https://www.kaggle.com/mosleylm/titanic-data-set-exploration/execution#II.-Format-Data we decide that we will test many different classifiers and then select the highest performing one based on the F1 score.
We will use 10 stratified shuffle splits, each holding out a random 10% of the data for testing, so that every classifier is trained and evaluated on the same splits. <br>
We test the following classifiers:

* <b>Gradient Boosting</b>
* <b>Random Forest</b>
* <b>KNeighbors</b>
* <b>SVC</b>
* <b>Decision Tree</b>
* <b>Ada Boost</b>
* <b>GaussianNB</b>
* <b>Logistic Regression</b>

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, accuracy_score, log_loss

In [None]:
# Candidate classifiers. Estimators with internal randomness are given a
# fixed random_state so the scores below are reproducible from run to run.
classes = [
    KNeighborsClassifier(5),
    SVC(probability=True, random_state=0),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    AdaBoostClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
    GaussianNB(),
    LogisticRegression(),
]

# Ten stratified random train/test splits with 10% held out each time.
# These are shuffle splits, not disjoint k-folds: test sets may overlap.
splits = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
n_splits = splits.get_n_splits()

log_cs = ['Classifier', 'F1', 'Acc']

# Positional (integer-array) indexing below needs a plain ndarray.
# np.asarray leaves an existing ndarray untouched, so this cell can be
# re-run without failing (DataFrame-only `.values` could not).
X = np.asarray(X)

# Running sums of the per-split scores, keyed by classifier class name.
f1s = {type(cls).__name__: 0.0 for cls in classes}
acc = {type(cls).__name__: 0.0 for cls in classes}

for trn_idx, tst_idx in splits.split(X, y):
    X_train, X_test = X[trn_idx], X[tst_idx]
    Y_train, Y_test = y[trn_idx], y[tst_idx]

    for cls in classes:
        name = type(cls).__name__
        # Each estimator is re-fitted from scratch on every split.
        cls.fit(X_train, Y_train)
        # Predictions for the held-out part of the current split.
        test_preds = cls.predict(X_test)

        f1s[name] += f1_score(Y_test, test_preds)
        acc[name] += accuracy_score(Y_test, test_preds)

# Turn the accumulated sums into means over all splits.
for name in f1s:
    f1s[name] /= n_splits
    acc[name] /= n_splits

# Build the summary table in one step; DataFrame.append no longer exists
# in pandas >= 2.0 and row-by-row growth is quadratic anyway.
log = pd.DataFrame(
    [[name, f1s[name], acc[name]] for name in f1s],
    columns=log_cs,
)

log

Based on our results it appears that the <b> 