# Titanic - A Kaggle Submission
## By Michael Neiman

In [84]:
import warnings
warnings.filterwarnings('ignore')

## 0. Data Import

In [85]:
import pandas as pd

X_test = pd.read_csv('titanic_test.csv', header=0)
X_train = pd.read_csv('titanic_train.csv', header=0)
y_train = X_train.pop('survived')
y_train = pd.to_numeric(y_train, errors='coerce')

## 1. Data Preprocessing

Let's first create copies of the data:

In [86]:
# Create copies of the data:
X_train_preprocessed = X_train.copy()
X_test_preprocessed = X_test.copy()

### 1.1. Dropping Irrelevant or hard to use data:

We will remove the following columns:
- passenger_id
- name
- ticket
- cabin (might use later)
- home.dest

In [87]:
def drop_irrelevant_columns(df: pd.DataFrame):
    df.drop(['passenger_id', 'name', 'ticket', 'cabin', 'embarked', 'boat', 'home.dest'], axis=1)
    return df

X_test_preprocessed = X_test_preprocessed.drop(columns = ['passenger_id', 'name', 'ticket', 'cabin', 'home.dest'])
X_train_preprocessed = X_train_preprocessed.drop(columns = ['passenger_id', 'name', 'ticket', 'cabin', 'home.dest'])

### 1.2. Using an ordinal encoder on the "sex" column:

In [88]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
X_train_preprocessed['sex'] = ordinal_encoder.fit_transform(X_train_preprocessed[['sex']])
X_test_preprocessed['sex'] = ordinal_encoder.transform(X_test_preprocessed[['sex']])

### 1.3. Using a "One-hot" encoder on the boat numbers and ports of embarkation:

In [89]:
X_preprocessed_total = pd.concat([X_train_preprocessed, X_test_preprocessed], axis=0)

embark_one_hot = pd.get_dummies(X_preprocessed_total['embarked'], prefix='embarked')
boat_one_hot = pd.get_dummies(X_preprocessed_total['boat'], prefix='boat')

X_train_preprocessed[embark_one_hot.columns] = embark_one_hot.iloc[0:X_train_preprocessed.shape[0]-1, :]
X_train_preprocessed[boat_one_hot.columns] = boat_one_hot.iloc[0:X_train_preprocessed.shape[0]-1, :]
X_test_preprocessed[embark_one_hot.columns] = embark_one_hot.iloc[0:X_test_preprocessed.shape[0]-1, :]
X_test_preprocessed[boat_one_hot.columns] = boat_one_hot.iloc[0:X_test_preprocessed.shape[0]-1, :]

X_train_preprocessed = X_train_preprocessed.drop(['embarked', 'boat'], axis=1)
X_test_preprocessed = X_test_preprocessed.drop(['embarked', 'boat'], axis=1)

### 1.4. Replacing the "body" variable:

The order of the bodies bears no significance and might hinder the performance of our model. Thus, we change it to "1" if a number exists and "0" if it doesn't.

In [90]:
def preprocess_body(data: pd.DataFrame):
    data['body'][data['body'].notna()] = 1
    data['body'][data['body'].isna()] = 0
    return data

X_train_preprocessed = preprocess_body(X_train_preprocessed)
X_test_preprocessed = preprocess_body(X_test_preprocessed)


### 1.5. Imputing columns with missing data:

In [91]:
from sklearn.impute import SimpleImputer

imp = SimpleImputer()
X_train_imputed = pd.DataFrame(imp.fit_transform(X_train_preprocessed))
X_test_imputed = pd.DataFrame(imp.fit_transform(X_test_preprocessed))

X_train_imputed.columns = X_train_preprocessed.columns
X_test_imputed.columns = X_test_preprocessed.columns

X_train_preprocessed = X_train_imputed
X_test_preprocessed = X_test_imputed

**_TODO 1:_**  There might be added value in marking some of the missing values as such.
**_TODO 2:_**  The cabin data needs to be parsed, it is a good indicator of passenger location at the moment of disaster.

### 1.6. Scaling the numeric data:

In [92]:
from sklearn.preprocessing import StandardScaler

scaler= StandardScaler().fit(pd.concat([X_train_preprocessed, X_test_preprocessed], axis=0))

X_train_scaled = pd.DataFrame(scaler.transform(X_train_preprocessed), columns=X_train_preprocessed.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_preprocessed), columns=X_test_preprocessed.columns)

X_train_preprocessed = X_train_scaled
X_test_preprocessed = X_test_scaled

## 2. Training.
Now let's try out some ML techniques which we've encountered:

### 2.1. Training the data on some different classifiers:

We will use the following models for comparison:
- XGBoost
- Decision Tree
- Random Forest
- Logistic Regression

In [93]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

logistic_regressor = LogisticRegression(random_state=0).fit(X_train_preprocessed, y_train)
xgb_classifier = XGBClassifier(eval_metric='logloss', random_state=0).fit(X_train_preprocessed, y_train) # eval_metric is specifically mentioned in order to prevent warnings from popping up.
tree_classifier = DecisionTreeClassifier(random_state=0).fit(X_train_preprocessed, y_train)
random_forest_classifier = RandomForestClassifier(random_state=0).fit(X_train_preprocessed, y_train)

## 3. Evaluation

### 3.1. Assessing the cross-validation score of each model:

In [94]:
from sklearn.model_selection import cross_val_score
import numpy as np

score_logistic = cross_val_score(logistic_regressor, X_train_preprocessed, y_train)
score_xgb = cross_val_score(xgb_classifier, X_train_preprocessed, y_train)
score_tree = cross_val_score(tree_classifier, X_train_preprocessed, y_train)
score_random_forest = cross_val_score(random_forest_classifier, X_train_preprocessed, y_train)

print(f"logistic:       {score_logistic}\n"
      f"mean: {np.mean(score_logistic)}\n"
      f"---")
print(f"xgb:            {score_xgb}\n"
      f"mean: {np.mean(score_xgb)}\n"
      f"---")
print(f"tree:           {score_tree}\n"
      f"mean: {np.mean(score_tree)}\n"
      f"---")
print(f"random forest:  {score_random_forest}\n"
      f"mean: {np.mean(score_random_forest)}\n"
      f"---")

logistic:       [0.96470588 0.97647059 0.94117647 0.95294118 1.        ]
mean: 0.9670588235294117
---
xgb:            [0.95294118 0.90588235 0.87647059 0.91764706 0.91176471]
mean: 0.9129411764705881
---
tree:           [0.92352941 0.87647059 0.90588235 0.90588235 0.93529412]
mean: 0.9094117647058824
---
random forest:  [0.94705882 0.88235294 0.87647059 0.92941176 0.95294118]
mean: 0.9176470588235295
---


It seems as though the best performing classifier, though not by a significant margin, is the logistic regressor. Given that this is a fairly simple and straightforward binary classification problem, it doesn't seem unlikely.

### 3.2. Assessing the confusion matrix for each model:

In [95]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train_preprocessed, y_train, test_size=0.35)
logistic_regressor = LogisticRegression(random_state=0).fit(X_train_split, y_train_split)
y_preds = logistic_regressor.predict(X_valid_split)

confusion_matrix(y_valid_split, y_preds)

array([[184,   1],
       [  7, 106]], dtype=int64)

Satisfied with the result, we select the *logistic regressor* as our candidate for predicting the test set.

## 4. Prediction

In [96]:
logistic_regressor = LogisticRegression(random_state=0).fit(X_train_preprocessed, y_train)
y_preds_submission = logistic_regressor.predict(X_test_preprocessed)

submission = pd.DataFrame({
    'passenger_id': X_test['passenger_id'],
    'survived': y_preds_submission
})