# Titanic - A Kaggle Submission
## By Michael Neiman

In [25]:
import warnings
warnings.filterwarnings('ignore')

## 0. Data Import

In [26]:
import pandas as pd

X_test = pd.read_csv('test.csv', header=0)
X_train = pd.read_csv('train.csv', header=0)
y_train = X_train.pop('Survived')
y_train = pd.to_numeric(y_train, errors='coerce')

## 1. Data Preprocessing

Let's first create copies of the data:

In [27]:
# Create copies of the data:
X_train_preprocessed = X_train.copy()
X_test_preprocessed = X_test.copy()

### 1.1. Dropping Irrelevant or hard to use data:

We will remove the following columns:
- PassengerId
- Name
- Ticket
- Cabin (might use later)

In [28]:
X_test_preprocessed = X_test_preprocessed.drop(columns = ['PassengerId', 'Name', 'Ticket', 'Cabin'])
X_train_preprocessed = X_train_preprocessed.drop(columns = ['PassengerId', 'Name', 'Ticket', 'Cabin'])

### 1.2. Using an ordinal encoder on the "sex" column:

In [29]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
X_train_preprocessed['Sex'] = ordinal_encoder.fit_transform(X_train_preprocessed[['Sex']])
X_test_preprocessed['Sex'] = ordinal_encoder.transform(X_test_preprocessed[['Sex']])

### 1.3. Using a "One-hot" encoder on the ports of embarkation:

In [30]:
X_preprocessed_total = pd.concat([X_train_preprocessed, X_test_preprocessed], axis=0)

embark_one_hot = pd.get_dummies(X_preprocessed_total['Embarked'], prefix='Embarked')

X_train_preprocessed[embark_one_hot.columns] = embark_one_hot.iloc[0:X_train_preprocessed.shape[0]-1, :]
X_test_preprocessed[embark_one_hot.columns] = embark_one_hot.iloc[0:X_test_preprocessed.shape[0]-1, :]

X_train_preprocessed = X_train_preprocessed.drop(['Embarked'], axis=1)
X_test_preprocessed = X_test_preprocessed.drop(['Embarked'], axis=1)


### 1.4. Imputing columns with missing data:

In [31]:
from sklearn.impute import SimpleImputer

imp = SimpleImputer()
X_train_imputed = pd.DataFrame(imp.fit_transform(X_train_preprocessed))
X_test_imputed = pd.DataFrame(imp.fit_transform(X_test_preprocessed))

X_train_imputed.columns = X_train_preprocessed.columns
X_test_imputed.columns = X_test_preprocessed.columns

X_train_preprocessed = X_train_imputed
X_test_preprocessed = X_test_imputed

**_TODO 1:_**  There might be added value in marking some of the missing values as such.
**_TODO 2:_**  The cabin data needs to be parsed, it is a good indicator of passenger location at the moment of disaster.

### 1.5. Scaling the numeric data:

In [32]:
from sklearn.preprocessing import StandardScaler

scaler= StandardScaler().fit(pd.concat([X_train_preprocessed, X_test_preprocessed], axis=0))

X_train_scaled = pd.DataFrame(scaler.transform(X_train_preprocessed), columns=X_train_preprocessed.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_preprocessed), columns=X_test_preprocessed.columns)

X_train_preprocessed = X_train_scaled
X_test_preprocessed = X_test_scaled

## 2. Training.
Now let's try out some ML techniques which we've encountered:

### 2.1. Training the data on some different classifiers:

We will use the following models for comparison:
- XGBoost
- Decision Tree
- Random Forest
- Logistic Regression

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

logistic_regressor = LogisticRegression(random_state=0).fit(X_train_preprocessed, y_train)
xgb_classifier = XGBClassifier(eval_metric='logloss', random_state=0).fit(X_train_preprocessed, y_train) # eval_metric is specifically mentioned in order to prevent warnings from popping up.
tree_classifier = DecisionTreeClassifier(random_state=0).fit(X_train_preprocessed, y_train)
random_forest_classifier = RandomForestClassifier(random_state=0).fit(X_train_preprocessed, y_train)

## 3. Evaluation

### 3.1. Assessing the cross-validation score of each model:

In [34]:
from sklearn.model_selection import cross_val_score
import numpy as np

score_logistic = cross_val_score(logistic_regressor, X_train_preprocessed, y_train)
score_xgb = cross_val_score(xgb_classifier, X_train_preprocessed, y_train)
score_tree = cross_val_score(tree_classifier, X_train_preprocessed, y_train)
score_random_forest = cross_val_score(random_forest_classifier, X_train_preprocessed, y_train)

print(f"logistic:       {score_logistic}\n"
      f"mean:           {np.mean(score_logistic)}\n"
      f"---")
print(f"xgb:            {score_xgb}\n"
      f"mean:           {np.mean(score_xgb)}\n"
      f"---")
print(f"tree:           {score_tree}\n"
      f"mean:           {np.mean(score_tree)}\n"
      f"---")
print(f"random forest:  {score_random_forest}\n"
      f"mean:           {np.mean(score_random_forest)}\n"
      f"---")

logistic:       [0.77653631 0.78651685 0.78089888 0.76966292 0.82022472]
mean:           0.7867679367271359
---
xgb:            [0.77653631 0.80898876 0.85393258 0.79213483 0.84831461]
mean:           0.815981419873203
---
tree:           [0.73743017 0.78089888 0.79775281 0.74719101 0.80898876]
mean:           0.7744523256543845
---
random forest:  [0.79329609 0.81460674 0.84831461 0.78089888 0.83146067]
mean:           0.8137153976523759
---


It seems as though the best performing classifier, though not by a significant margin, is the XGB Classifier.

### 3.2. Assessing the confusion matrix for each model:

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train_preprocessed, y_train, test_size=0.35)
xgb_classifier = XGBClassifier(eval_metric='logloss', random_state=0).fit(X_train_split, y_train_split)
y_preds = xgb_classifier.predict(X_valid_split)

confusion_matrix(y_valid_split, y_preds)

array([[168,  24],
       [ 44,  76]], dtype=int64)

Satisfied with the result, we select the *XGB Classifier* as our candidate for predicting the test set.

## 4. Prediction

In [36]:
logistic_regressor = LogisticRegression(random_state=0).fit(X_train_preprocessed, y_train)
y_preds_submission = logistic_regressor.predict(X_test_preprocessed)

submission = pd.DataFrame({
    'PassengerId': X_test['PassengerId'],
    'Survived': y_preds_submission
})

submission.to_csv('submission.csv')