# Titanic Machine Learning

## Libraries and Tools

In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn import metrics

## Open Data

In [4]:
# Open data files and read into dataframes
train = (pd.read_csv('train.csv')).set_index('PassengerId')
X_score = (pd.read_csv('test.csv')).set_index('PassengerId')

In [5]:
# Split training features from class
X_train = train[list(X_score)]
y_train = train[['Survived']]

## Data Cleaning and Feature Selection

### Imputation of Missing Values

In [6]:
## Use median imputation for age
median_age = np.median(X_train['Age'].dropna())
X_train = X_train.fillna({'Age': median_age})
X_score = X_score.fillna({'Age': median_age})

In [7]:
# Only use the letter for the cabin
def cabin_clean(row):
    if not pd.isnull(row['Cabin']):
        return (row['Cabin'])[0]
X_train.Cabin = X_train.apply(cabin_clean, axis=1)
X_score.Cabin = X_score.apply(cabin_clean, axis=1)
# Fill any NaNs with most common cabin letter from training set
cabin_imp_val = X_train.Cabin.value_counts().index[0]
X_train = X_train.fillna({'Cabin': cabin_imp_val})
X_score = X_score.fillna({'Cabin': cabin_imp_val})

In [8]:
# Fill NaNs with most common categorical value
embarked_imp_val = X_train.Embarked.value_counts().index[0]
X_train = X_train.fillna({'Embarked': embarked_imp_val})
X_score = X_score.fillna({'Embarked': embarked_imp_val})

In [9]:
# Use median impuation for fare
median_fare = np.median(X_train['Fare'].dropna())
X_train = X_train.fillna({'Fare': median_fare})
X_score = X_score.fillna({'Fare': median_fare})

### Encode Categorical Features

In [10]:
# Features with presumably no predictive power hsould be dropped rather than encoded
X_train = X_train.drop(['Name', 'Ticket'], axis=1)
X_score = X_score.drop(['Name', 'Ticket'], axis=1)

In [11]:
# One-hot-encode remaining categorical features
X_train = pd.get_dummies(X_train, columns=['Sex', 'Cabin', 'Embarked'])
X_score = pd.get_dummies(X_score, columns=['Sex', 'Cabin', 'Embarked'])
# Add empty column for 'Cabin_T' which was not in score
X_score['Cabin_T'] = 0

In [12]:
# Ensure columns are ordered the same for scikit-learn
X_score = X_score[list(X_train.columns)]

## Model Analysis

### Split Known Data for Training and Testing

In [13]:
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train, y_train, random_state=42
)

### Perform Cross Validation

In [14]:
classifiers = [
    
]

classifier_names = [
    '',
    ''
]

In [15]:
test_pipeline = Pipeline([
    ('grd', GradientBoostingClassifier(n_estimators=500))
])
test_pipeline.fit(X_train_split, y_train_split.values.ravel())
y_pred_split = test_pipeline.predict(X_test_split)
print(classification_report(y_test_split, y_pred_split))
print(metrics.accuracy_score(y_test_split, y_pred_split))

             precision    recall  f1-score   support

          0       0.83      0.82      0.82       134
          1       0.73      0.74      0.74        89

avg / total       0.79      0.79      0.79       223

0.789237668161


## Train the Gradient Boosting Forest On Full Training Set

In [16]:
# Create the pipeline with a PCA decomp step
pipeline = Pipeline([
    ('pca', PCA()),
    ('grd', GradientBoostingClassifier(n_estimators=500))
])

In [17]:
# Fit the pipeline to the training set
pipeline.fit(X_train, y_train.values.ravel())

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('grd', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=N...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))])

## Predict the Unknowns

In [18]:
X_score['Survived'] = pipeline.predict(X_score)

In [20]:
X_score[['Survived']].to_csv('submission.csv')