# Titanic - Machine Learning Project (UT)
---

In [None]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import metrics
!pip install -q plotnine
from plotnine import *

---

## Plan

1) **Input Data** - importing our train and test data sets.

2) **Data Exploration** - exploring our train data set.

3) **Feature Engineering** - for merging and removing columns

4) **Data Processing** - preparing data for model fitting

5) **Building Models:**

    1. Logistic Regression

    2. Random Forest Classifier

    3. KNN Classifier

    4. SVC

    5. XGBClassifier

    6. Neural Network
6) **Ensemble Learning** - soft voting for the final prediction

7) **Performance metrics** - for advanced measurement
---

### Input Data

In [None]:
# List the files available in the input directory.
os.listdir('inputs/')

In [None]:
# Load the train and test CSV files into DataFrames.
train_df = pd.read_csv("./inputs/train.csv")
test_df = pd.read_csv("./inputs/test.csv")

In [None]:
# Keep references to both frames in one list.
# NOTE(review): `data_list` is not used anywhere else in the visible notebook, and
# later cells rebind `train_df`/`test_df`, so this list keeps pointing at the
# original objects - confirm whether it is still needed.
data_list = [train_df, test_df]

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# Print the column summary of the test data.
test_df.info()

---

### Data Exploration

In [None]:
# Report the size of both data sets.
train_rows, train_cols = train_df.shape
test_rows, test_cols = test_df.shape
print(f"Train data is {train_rows} rows, with {train_cols} columns")
print(f"Test data is {test_rows} rows, with {test_cols} columns")

In [None]:
# Print the column summary of the training data.
train_df.info()

In [None]:
# Print the column summary of the test data.
test_df.info()

In [None]:
# Descriptive statistics of the training data.
train_df.describe()

In [None]:
# Split the training columns into numeric and categorical views for exploration.
numeric_names = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_names = ['Survived', 'Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked']

numeric_columns = train_df[numeric_names]
categorical_columns = train_df[categorical_names]

In [None]:
# Draw one histogram per numeric column, titled with the column name.
for column_name, column_values in numeric_columns.items():
    plt.hist(column_values)
    plt.title(column_name)
    plt.show()

In [None]:
# Aggregate each numeric feature per Survived value (pivot_table's default aggregation is the mean).
pd.pivot_table(train_df, index = 'Survived', values = ['Age','SibSp','Parch','Fare'])

In [None]:
# Passenger counts by Survived (rows) and Pclass (columns).
pd.pivot_table(train_df, index = 'Survived', columns = 'Pclass', values = 'PassengerId' ,aggfunc ='count')


In [None]:
# Passenger counts by Survived (rows) and Sex (columns).
pd.pivot_table(train_df, index = 'Survived', columns = 'Sex', values = 'PassengerId' ,aggfunc ='count')


In [None]:
# Passenger counts by Survived (rows) and Embarked (columns).
pd.pivot_table(train_df, index = 'Survived', columns = 'Embarked', values = 'PassengerId' ,aggfunc ='count')


In [None]:
# Mean of Survived per Pclass, sorted from highest to lowest.
train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)

In [None]:
# Correlation heatmap of the numeric columns only: DataFrame.corr() raises on
# text columns such as Name or Sex in pandas >= 2.0, so select numeric dtypes first.
sns.heatmap(train_df.select_dtypes(include='number').corr())

In [None]:
# Percentage of passengers per Embarked value.
# dropna() skips missing values explicitly instead of relying on NaN being the
# fourth entry returned by unique(); the total is computed once, outside the loop.
total_passengers = len(train_df)
for place in train_df['Embarked'].dropna().unique():
    place_count = len(train_df[train_df.Embarked == place])
    print(f"Number of people from {place} are {place_count*100/total_passengers}%")

### Feature Engineering
---

In [None]:
# FamilyOnBoard = siblings/spouses (SibSp) + parents/children (Parch), for both data sets.
for frame in (train_df, test_df):
    frame['FamilyOnBoard'] = frame['SibSp'] + frame['Parch']

In [None]:
# Scatter plot of family size against the Survived column.
plot = train_df.plot.scatter(x='FamilyOnBoard', y='Survived')

In [None]:
# Print the column summary after adding FamilyOnBoard.
train_df.info()

In [None]:
# cabin_adv = first character of the Cabin value converted to a string
# (a NaN cabin therefore becomes 'n', the first letter of 'nan').
for frame in (train_df, test_df):
    frame['cabin_adv'] = frame['Cabin'].map(lambda cabin: str(cabin)[0])

In [None]:
# Passenger counts by Survived (rows) and cabin_adv (columns).
pd.pivot_table(train_df,index='Survived',columns='cabin_adv', values = 'Name', aggfunc='count')

### Data Processing

---


In [None]:
# FamilyOnBoard already combines the two family counters, so drop them from both data sets.
family_columns = ['Parch', 'SibSp']
train_df = train_df.drop(columns=family_columns)
test_df = test_df.drop(columns=family_columns)

In [None]:
def fill_empty_Median(dataFrame, column, groupColumns):
    """Fill missing values of ``column`` with the median of the row's group.

    Groups are formed by ``groupColumns``. Values whose whole group is missing
    stay missing. The column is updated in place on ``dataFrame`` and the
    updated column is also returned.
    """
    group_medians = dataFrame.groupby(groupColumns)[column].transform('median')
    filled_column = dataFrame[column].fillna(group_medians)
    dataFrame[column] = filled_column
    return dataFrame[column]

In [None]:
# Distribution of Age before the missing values are filled.
# `palette` was dropped: without a `hue` variable seaborn ignores it and only emits a warning.
fig = plt.figure(figsize = (12,9))
sns.histplot(train_df["Age"], kde=True)
plt.title('Age hist Before filling')
plt.show()

In [None]:
# Impute Age with the median of rows sharing FamilyOnBoard, Sex, Pclass and Fare;
# rows whose group has no known age fall back to the overall median of their own data set.
age_group_columns = ['FamilyOnBoard', 'Sex', 'Pclass', 'Fare']
for frame in (train_df, test_df):
    frame['Age'] = fill_empty_Median(frame, "Age", age_group_columns)
    frame['Age'] = frame['Age'].fillna(frame['Age'].median())

In [None]:
# Print the column summary after filling Age.
train_df.info()

In [None]:
# Distribution of Age after the missing values are filled.
# `palette` was dropped: without a `hue` variable seaborn ignores it and only emits a warning.
fig = plt.figure(figsize = (12,9))
sns.histplot(train_df["Age"], kde=True)
plt.title('Age hist After filling')
plt.show()

In [None]:
# Drop the raw Cabin column from both data sets (cabin_adv was derived from it earlier).
train_df = train_df.drop(columns='Cabin')
test_df = test_df.drop(columns='Cabin')

In [None]:
# Print the column summary of the training data after dropping Cabin.
train_df.info()

In [None]:
# Print the column summary of the test data after dropping Cabin.
test_df.info()

In [None]:
# Fill missing test fares with the median fare of the training data.
train_fare_median = train_df['Fare'].median()
test_df['Fare'] = test_df['Fare'].fillna(train_fare_median)

### Building Models

---

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
features = ['Pclass', "Sex", 'Age', "Fare", "Embarked", "FamilyOnBoard", 'cabin_adv', 'Survived']

# One-hot encode the text columns of the selected features.
train_df = pd.get_dummies(train_df[features])

# Split the labelled data into a training part (80%) and a validation part (20%).
train_df, train_df_val = train_test_split(train_df, random_state = 111, test_size = 0.20)

y = train_df["Survived"]
x = train_df.drop(columns=['Survived'])

y_val = train_df_val["Survived"]
x_val = train_df_val.drop(columns=['Survived'])

# The test data is selected without the label column.
features.remove("Survived")

# Align the test columns with the training columns: a dummy column that never
# occurs in the test data (e.g. cabin_adv_T) is added as all zeros, and the
# column order matches what the fitted models expect. This replaces the
# hard-coded insert(16, 'cabin_adv_T', 418*[0]), which only worked for one
# exact column layout and row count.
test_x = pd.get_dummies(test_df[features]).reindex(columns=x.columns, fill_value=0)

In [None]:
# Print the column summary of the training split.
train_df.info()

In [None]:
# Print the column summary of the validation split.
train_df_val.info()

In [None]:
# 5-fold cross-validation of logistic regression on the training split.
lr = LogisticRegression(max_iter=2000)
cv = cross_val_score(lr, x, y, cv=5)

separator = '-'*40
print(separator)
for fold, score in enumerate(cv):
    print(f"Accuracy #{fold}: {score} ")

print(separator)
print(f"Mean value: {cv.mean()}")
x.info()

In [None]:
# 5-fold cross-validation of a default k-nearest-neighbours classifier.
knn = KNeighborsClassifier()
cv = cross_val_score(knn, x, y, cv=5)

separator = '-'*40
print(separator)
for fold, score in enumerate(cv):
    print(f"Accuracy #{fold}: {score} ")

print(separator)
print(f"Mean value: {cv.mean()}")

In [None]:
# 5-fold cross-validation of a random forest (100 trees, maximum depth 3).
rfc = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=1)
cv = cross_val_score(rfc, x, y, cv=5)

separator = '-'*40
print(separator)
for fold, score in enumerate(cv):
    print(f"Accuracy #{fold}: {score} ")

print(separator)
print(f"Mean value: {cv.mean()}")

In [None]:
# 5-fold cross-validation of an SVC; probability=True enables predict_proba,
# which the soft-voting ensemble and the ROC code call later.
svc = SVC(probability=True)
cv = cross_val_score(svc, x, y, cv=5)

separator = '-'*40
print(separator)
for fold, score in enumerate(cv):
    print(f"Accuracy #{fold}: {score} ")

print(separator)
print(f"Mean value: {cv.mean()}")

In [None]:
from xgboost import XGBClassifier

# 5-fold cross-validation of an XGBoost classifier with a fixed random seed.
xgb = XGBClassifier(random_state=1)
cv = cross_val_score(xgb, x, y, cv=5)

separator = '-'*40
print(separator)
for fold, score in enumerate(cv):
    print(f"Accuracy #{fold}: {score} ")

print(separator)
print(f"Mean value: {cv.mean()}")

#### Custom Neural Network

---


In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Two hidden ReLU layers as wide as the input, followed by one sigmoid output unit.
n_features = x.columns.size
model = Sequential([
    Dense(n_features, activation='relu', input_shape=(n_features,)),
    Dense(n_features, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss, Adam optimiser, accuracy reported during training.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# NOTE(review): presumably set so scikit-learn style tooling treats the Keras
# model as a classifier - confirm it is still required.
model._estimator_type = "classifier"

In [None]:
# Train the network on the training split: 50 epochs with a batch size of 4.
model.fit(x, y, epochs=50, batch_size=4, verbose=1)

In [None]:
# Print the result of evaluating the network on the same data it was trained on
# (x, y) - not on the validation split.
print(model.evaluate(x, y))

#### Ensemble Learning

---



In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the logistic regression, KNN, SVC and XGBoost models
# (the random forest is not part of it).
ensemble_members = [
    ('lr', lr),
    ('knn', knn),
    ('svc', svc),
    ('xgb', xgb),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

cv = cross_val_score(voting_clf, x, y, cv=5)

separator = '-'*40
print(separator)
for fold, score in enumerate(cv):
    print(f"Accuracy #{fold}: {score} ")

print(separator)
print(f"Mean value: {cv.mean()}")

In [None]:
# Fit the voting ensemble on the training split.
voting_clf.fit(x, y)

In [None]:
def results_to_submit(model, test_x, file_name):
    """Fit ``model`` and write its test predictions to ``./outputs/<file_name>.csv``.

    The model is fitted on the notebook-level training split (globals ``x`` and
    ``y``); the PassengerId column is read from the global ``test_df``.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    test_x : feature matrix to predict on, row-aligned with ``test_df``.
    file_name : output file name without the ``.csv`` extension.
    """
    model.fit(x, y)
    results = model.predict(test_x).astype(int)
    submission = pd.DataFrame(
        data={'PassengerId': test_df.PassengerId, 'Survived': results}
    )
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs('./outputs', exist_ok=True)
    submission.to_csv(f'./outputs/{file_name}.csv', index=False)

In [None]:
# Refit each model and write its test-set predictions to its own submission file.
submission_jobs = [
    (voting_clf, 'submission_ensemble'),
    (lr, 'submission_lr'),
    (knn, 'submission_knn'),
    (svc, 'submission_svc'),
    (xgb, 'submission_xgb'),
    (rfc, 'submission_rf'),
]
for estimator, submission_name in submission_jobs:
    results_to_submit(estimator, test_x, submission_name)

In [None]:
# Run one more fit with the default settings, then predict on the test set.
model.fit(x,y)

# The network emits one sigmoid output per row. Threshold at 0.5 and flatten to
# 1-D integer labels so the CSV holds 0/1 instead of 0.0/1.0, matching the
# integer labels written by results_to_submit.
predictions = (model.predict(test_x) > 0.5).astype(int).ravel()

submission = pd.read_csv('./inputs/submission_sample.csv')
submission['Survived'] = predictions
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('./outputs', exist_ok=True)
submission.to_csv('./outputs/submission_neural.csv', index=False)

---

### Performance metrics

---


In [None]:
def generate_coordinates(scores, classes, verbose = True):
    """Compute ROC coordinates, using every distinct score as a threshold.

    Parameters
    ----------
    scores : array-like
        Predicted scores, one per sample.
    classes : array-like
        True labels (1 = positive, 0 = negative), aligned with ``scores`` by
        position. Lists, NumPy arrays, Series and single-column DataFrames
        are all accepted.
    verbose : bool
        When true, print the confusion counts and rates for each threshold.

    Returns
    -------
    pandas.DataFrame
        Indexed by the unique scores in ascending order, with columns 'FPR'
        and 'TPR' rounded to two decimals. A rate is NaN when its denominator
        is zero (no positive or no negative samples).
    """
    scores = np.asarray(scores).ravel()
    labels = np.asarray(classes).ravel()

    # Every distinct score is a candidate threshold.
    thresholds = np.unique(scores)
    rows = []

    for threshold in thresholds:
        if verbose:
            print(f'For threshold {threshold}')

        # A sample is predicted positive when its score reaches the threshold.
        predicted_positive = scores >= threshold
        pred_positive = labels[predicted_positive]
        pred_negative = labels[~predicted_positive]

        tp = np.sum(pred_positive == 1)
        fn = np.sum(pred_negative == 1)
        tn = np.sum(pred_negative == 0)
        fp = np.sum(pred_positive == 0)

        if verbose:
            print(f'tp = {tp}, fn = {fn}, tn = {tn}, fp = {fp}')

        # Guard the divisions so a missing class yields NaN without a warning.
        tpr = tp / (tp + fn) if (tp + fn) else float('nan')  # the same as recall
        fpr = fp / (fp + tn) if (fp + tn) else float('nan')

        if verbose:
            print(f'FPR = {np.round(fpr, 2)}, TPR = {np.round(tpr, 2)}\n')
        rows.append((np.round(fpr, 2), np.round(tpr, 2)))

    return pd.DataFrame(rows, columns=['FPR', 'TPR'], index=thresholds)

In [None]:
def plot_roc(coordinates_dict):
    """
    Plot the ROC curves of all models on one plot.

    ``coordinates_dict`` maps a method name to a DataFrame with 'FPR' and
    'TPR' columns; the points of each curve are connected in row order.
    The input frames are left unmodified. Returns the plotnine plot object.
    """
    # assign() returns a labelled copy, so the caller's frames are not mutated.
    labelled_frames = [
        coordinates.assign(Method=method_name)
        for method_name, coordinates in coordinates_dict.items()
    ]

    # Concatenate all frames in one call; concatenating onto an empty frame
    # relies on deprecated pandas dtype handling. pd.concat rejects an empty
    # list, so an empty input falls back to an empty frame.
    if labelled_frames:
        plotting_data = pd.concat(labelled_frames)
    else:
        plotting_data = pd.DataFrame(columns=['FPR', 'TPR', 'Method'])

    plotting_data['Method'] = pd.Categorical(plotting_data['Method'])

    # Each line of the plot specification is explained in its trailing comment:
    roc_plot = (
        ggplot(data = plotting_data, # creates a canvas
        mapping = aes(x = 'FPR', y = 'TPR', colour = 'Method')) + # specifies dimensions
        geom_path(size = 4) + # determines geometric primitive to be visualised (path/line in our case) and its thickness
        labs(title ='', x = 'FPR', y = 'TPR') + # labels of the x and y axes
        # this is all for the figure, below are only formatting specs
        theme_bw() + # colour schema
        theme(figure_size = (50, 50), # figure size
              axis_line = element_line(size = 1.5, colour = "black"),
              panel_grid_major = element_line(size = 0.05, colour = "black"),
              panel_grid_minor = element_line(size = 0.05, colour = "black"),
              axis_text = element_text(size = 70, colour ='black')) # more formatting details
      )
    return roc_plot

In [None]:
from sklearn.metrics import classification_report
def show_metrics(model_list, real_survival_result):
    """Print a classification report for every set of predictions.

    ``model_list`` maps a model name to that model's predicted labels;
    ``real_survival_result`` holds the true labels the predictions are
    compared against.
    """
    separator = '-'*40
    for model_key, prediction_in_use in model_list.items():
        print(separator)
        print(f"Classification report for #{model_key}:")
        print(classification_report(real_survival_result, prediction_in_use))
        print(separator)

In [None]:
# Fit all models on the training split
rfc.fit(x,y)
knn.fit(x,y)
lr.fit(x,y)
svc.fit(x,y)
xgb.fit(x,y)
model.fit(x,y)
voting_clf.fit(x,y)

# Predict the validation split with every model
rfc_prediction = rfc.predict(x_val)
knn_prediction = knn.predict(x_val)
lr_prediction = lr.predict(x_val)
svc_prediction = svc.predict(x_val)
xgb_prediction = xgb.predict(x_val)
# NOTE(review): the neural network's prediction is computed but not included in the report below.
model_prediction = model.predict(x_val)
voting_clf_prediction = voting_clf.predict(x_val)

# Predictions keyed by the label printed in the report.
# `lr` is a LogisticRegression, so it is labelled as such (was 'LinearRegression').
all_models_predictions = {'RandomForest':rfc_prediction,
              'KNN':knn_prediction,
              'LogisticRegression':lr_prediction,
              'SVC':svc_prediction,
              'XGB':xgb_prediction,
              'Voting':voting_clf_prediction
              }


show_metrics(all_models_predictions, y_val)

In [None]:
def generate_roc_coordinates(model_dict, data_X , data_Y):
    """Build plot-ready ROC coordinates for every model and print each AUC.

    Parameters
    ----------
    model_dict : dict
        Maps a model name to a fitted model exposing ``predict_proba``.
    data_X : features passed to ``predict_proba``.
    data_Y : true labels, passed to ``generate_coordinates`` and ``roc_auc_score``.

    Returns
    -------
    dict
        Maps each model name to a float DataFrame with 'FPR' and 'TPR'
        columns. Rows start at the (0, 0) anchor and continue from the
        highest to the lowest threshold, so a path drawn in row order runs
        from (0, 0) towards (1, 1).
    """
    # Column of predict_proba that holds the probability of survival.
    positive_class = 1
    val = pd.DataFrame()
    all_roc_coordinates = {}
    separator = '-'*40

    for model_key, model_in_use in model_dict.items():
        print(separator)
        print(f"Calculationg prediction probabilities for #{model_key}:")
        val[model_key] = model_in_use.predict_proba(data_X)[:,positive_class]
        print(separator)

    for model_key in model_dict:
        # Turn each model's probabilities and the true labels into ROC coordinates.
        print(separator)
        print(f"Generating ROC coordinates for #{model_key}:")
        all_roc_coordinates[model_key] = generate_coordinates(val[model_key].values, data_Y, verbose=False)
        print(separator)

    for model_key in model_dict:
        res = np.round(metrics.roc_auc_score(data_Y, val[model_key]), 3)
        print(f"AUC of {model_key} classifier is {res}")

    # Anchor point of every curve: nothing is predicted positive.
    top_row = pd.DataFrame({'FPR':0, 'TPR':0}, index=[1.0])
    final_roc_coordinates = {}
    for model_key in model_dict:
        # Order the rows from the highest to the lowest threshold before adding
        # the anchor; with ascending rows the path would jump from (0, 0)
        # straight to (1, 1) and then trace the curve backwards.
        ordered = all_roc_coordinates[model_key].sort_index(ascending=False)
        final_roc_coordinates[model_key] = pd.concat([top_row, ordered]).astype('float')

    return final_roc_coordinates

In [None]:
# Fitted models to compare, keyed by the label shown in the AUC printout and plot legend.
# `lr` is a LogisticRegression, so it is labelled as such (was 'LinearRegression').
all_models = {'RandomForest':rfc,
              'KNN':knn,
              'LogisticRegression':lr,
              'SVC':svc,
              'XGB':xgb
              }
final_roc = generate_roc_coordinates(all_models, x_val, y_val)
plot_roc(dict(final_roc))