# Machine Learning

## Configuration

In [None]:
# Jupyter config
# Load the IPython extension shipped with rpy2.
%load_ext rpy2.ipython
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Choose the image format used for inline figures.
%config InlineBackend.figure_format = 'svg'  # Or 'retina'

In [None]:
# Python imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import *
from sklearn.preprocessing import *
from sklearn.mixture import *

#plt.style.use('seaborn-whitegrid')  # Set the aesthetic style of the plots

## Preprocessing

In [None]:
# Load the preprocessed train/test splits; paths are relative to the
# notebook's working directory.
TRAIN_CSV = 'train_processed.csv'
TEST_CSV = 'test_processed.csv'

training_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [None]:
# Preview only the first rows of the freshly loaded frame rather than
# rendering the whole DataFrame as cell output.
training_data.head()

In [None]:
# One LabelEncoder per string-valued column, keyed by column name so the
# fitted encoders remain available after this cell.
label_encoders = {
    column: LabelEncoder()
    for column in ('Sex', 'Ticket', 'Embarked', 'NameTitle', 'Deck')
}

for feature in label_encoders:
    label_encoder = label_encoders[feature]
    # Fit on the values of both splits so every label seen in either
    # split is known to the encoder before transforming.
    all_values = pd.concat((training_data[feature], test_data[feature]))
    label_encoder.fit(all_values)
    # Replace the original column in each split with its encoded values.
    training_data[feature] = label_encoder.transform(training_data[feature])
    test_data[feature] = label_encoder.transform(test_data[feature])

In [None]:
# Show a short preview of the frame after label encoding instead of
# rendering the whole DataFrame as cell output.
training_data.head()

## Selecting Features

We will initially select the features that we believe would most affect the survival odds of an individual aboard the Titanic.

In [None]:
# Columns left out of the initial feature set; the same columns are removed
# from both the training and the test frame.
dropped_columns = [
    'FirstName', 'MiddleNames', 'LastName', 'Sex', 'Ticket',
    'CabinNumber', 'SibSp', 'Parch', 'NameTitle',
]

training_data = training_data.drop(columns=dropped_columns)
test_data = test_data.drop(columns=dropped_columns)

In [None]:
# Show a short preview of the remaining columns instead of rendering the
# whole DataFrame as cell output.
training_data.head()

#### We decide to keep the following features:

* <b>PClass</b> - The class of the ticket; as we all know, this had a large say in deciding who got on the escape boats
* <b>Age</b> - An older person is weaker than a younger one on average.
* <b>Fare</b> - Someone who paid a lot more money would be in a far different position than someone who did not
* <b>Embarked</b> - The port at which they boarded might play a role (not sure; we might get rid of this in another attempt)
* <b>Deck</b> - The deck of the boat the person was staying on is important when a boat is floating
* <b>FamilySize</b> - If an individual had a family, it is possible that they gave up their spot on an escape boat or attempted to rescue their family members
* <b>FarePerPerson</b> - The amount paid per person (based on family size) could indicate how they were treated

In [None]:
# Columns are split by position: the single column at index 2 is used as
# the label and every column after it as a model input.
# NOTE(review): presumably index 2 is the survival label — confirm against
# the column order of the processed CSV.
label_position = 2
column_names = training_data.columns
train_true = column_names[label_position:label_position + 1]
features = column_names[label_position + 1:]

X = training_data[features]
# Selecting with a one-element Index yields a one-column frame; flatten it
# to a 1-D array.
y = np.asarray(training_data[train_true]).reshape(-1)

features

## Classifier Decision

In [None]:
# NOTE(review): this constructs a LabelEncoder without assigning it, so the
# cell only displays the object's repr. It looks like a leftover scratch
# cell — confirm before removing.
LabelEncoder()