# 1. Preparation

## 1.1 Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

## 1.2 Load Dataset

In [None]:
# Load the training and test splits from the Kaggle input directory.
df_train = pd.read_csv("../input/titanic/train.csv")
df_test = pd.read_csv("../input/titanic/test.csv")

# Preview the first rows of each split.
for frame in (df_train, df_test):
    display(frame.head())

## 1.3 Checking Null Values

In [None]:
# Percentage of missing values per column, with train and test side by side.
null_pct = [
    df_train.isna().sum() / len(df_train) * 100,
    df_test.isna().sum() / len(df_test) * 100,
]
pd.DataFrame(null_pct, index=["Train Null (%)", "Test Null (%)"]).T.style.background_gradient(cmap='summer_r')

We can see that the Age and Cabin variables have quite a lot of null values in both the train and test data. The Embarked variable in the train data has 2 null values, and the Fare variable in the test data has 1 null value. We will need to analyze these variables first to decide whether to drop each column or impute its missing values.

## 1.4 Checking Duplicate Data

In [None]:
# Count fully duplicated rows in each split.
for label, frame in (("Train", df_train), ("Test", df_test)):
    print(f"{label} data has {frame.duplicated().sum()} duplicated data")

No duplicated data in both train and test data. Looks good.

## 1.5 Checking Dataset Information

In [None]:
# Print the column names, non-null counts and dtypes of the training data.
df_train.info()

Looks like we have 12 variables, and each variable has 891 records. The types of variables in this dataset are as follows:

**Numerical:**
- **Discrete:**
    1. SibSp
    2. Parch
    <br><br>
    
- **Continuous:**
    1. Age
    2. Fare
    <br>
    
**Categorical:**
- **Nominal:**
    1. Sex
    2. Embarked
    3. Survived (Target Variable)
    <br><br>
    
- **Ordinal:**
    1. Pclass
    
**Others:**
- **Text**
    1. Ticket
    2. Name
    <br><br>
    
- **ID:**
    1. PassengerId

# 2. Exploratory Data Analysis

Let's group our variable types and make two functions to visualize our numerical and categorical data first.

In [None]:
# Numerical predictors (discrete counts first, then continuous values).
num_var = ["SibSp", "Parch", "Age", "Fare"]
# Categorical predictors (nominal Sex/Embarked, ordinal Pclass).
cat_var = ["Sex", "Embarked", "Pclass"]
# Name of the label column.
target = "Survived"

In [None]:
def num_dist(data, var):
    """Show a histogram (with KDE) and a boxplot of ``data[var]`` side by side.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to plot.
    var : str
        Name of the numerical column.
    """
    fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

    # Left panel: distribution shape.
    sns.histplot(data=data, x=var, kde=True, ax=hist_ax)
    hist_ax.set_title(f"{var} Distribution Histogram")

    # Right panel: quartiles and outliers.
    sns.boxplot(data=data, x=var, ax=box_ax)
    box_ax.set_title(f"{var} Distribution Boxplot")

    plt.show()
    
def cat_dist(data, var):
    """Show the category shares (pie) and counts (annotated bars) of ``data[var]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to plot. Both panels are drawn from this
        frame (previously the global ``df_train`` was always used).
    var : str
        Name of the categorical column.
    """
    fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # value_counts() drops NaN, so one explode offset per plotted slice.
    counts = data[var].value_counts()
    counts.plot(kind="pie", explode=[0.05] * len(counts), autopct='%1.1f%%', ax=pie_ax, shadow=True)
    pie_ax.set_title(f"{var} Pie Chart")
    pie_ax.set_ylabel('')

    count = sns.countplot(x=var, data=data, ax=bar_ax)
    # Write each bar's height just above the bar.
    for bar in count.patches:
        count.annotate(format(bar.get_height()),
            (bar.get_x() + bar.get_width() / 2,
            bar.get_height()), ha='center', va='center',
            size=11, xytext=(0, 8),
            textcoords='offset points')
    bar_ax.set_title(f"{var} Bar Chart")
    plt.show()

## 2.1 Checking Target Distribution

In [None]:
# Pie and bar chart of the target variable.
cat_dist(df_train, "Survived")

Unfortunately, 61.6% of the passengers did not survive the fatal accident. Only 342 passengers survived out of a total of 891 passengers in the training data. So, what helped them survive the accident? We will try to analyze the features in this dataset that might be able to answer this question.

So, Let's begin.

## 2.2 Checking Predictor Distribution

### 2.2.1 Numerical

In [None]:
# Summary statistics for the numerical predictors.
df_train[num_var].describe()

In [None]:
# Histogram and boxplot for every numerical predictor.
for var in num_var:
    num_dist(df_train, var)

Most passengers at that time traveled alone without their families. The age of the passengers also varies from 0.42 to 80 years, with an average of 29.7 years. Fares for these passengers also vary, but what's interesting here is that there are passengers who don't have to pay for this cruise. 

As we can see, all of the above variables have outliers. Age variable seems to have almost a normal distribution, but there are some outliers that cause the distribution to be skewed to the right. Other variables appear to have a right skewed distribution because of that.

### 2.2.2 Categorical

In [None]:
# Pie and bar chart for every categorical predictor.
for var in cat_var:
    cat_dist(df_train, var)

64.8% of the passengers in this training set are male, while the remaining 35.2% are female. More than 70% of these passengers embarked from port S (Southampton). Very few passengers embarked from port Q (Queenstown), which is only 8.7%, while the rest embarked from port C (Cherbourg). Most of the passengers have 3rd ticket class, while the number of passengers who have 1st and 2nd ticket class is almost equal.

## 2.3 Survived vs Not Survived

In [None]:
# 2x4 grid: one panel per predictor, split by the target (the last panel stays empty).
fig, ax = plt.subplots(2, 4, figsize=(20, 8))
ax = ax.flatten()

for i, (var, panel) in enumerate(zip(num_var + cat_var, ax)):
    if i >= 4:
        # Remaining entries are categorical: grouped counts.
        sns.countplot(data=df_train, x=var, hue=target, ax=panel)
    else:
        # First four entries are numerical: overlaid histograms.
        sns.histplot(data=df_train, x=var, hue=target, kde=True, ax=panel)

    panel.set_title(f"{var}: Survived vs Not Survived")

plt.subplots_adjust(hspace=0.5)
plt.show()

Apparently, passengers who travel with small numbers of family have a greater survival chance. Also, large number of children (Age <= 10) survived in this tragedy, while passengers with the age above them tend not to survive. We can also see the number of female who survived was much more than male (women and children first code). Passengers embarked from port C seem to be luckier as many of them survived. Pclass also plays quite a role here because most of the passengers from Pclass 1 managed to survive or have more priority to be saved, while passengers with Pclass 3 tend not to survive.

Now, let's get deeper.

In [None]:
# Age distribution per sex, with survivors and non-survivors on opposite halves of each violin.
sns.violinplot(data=df_train, x="Sex", y="Age", hue="Survived", split=True)
plt.show()

Children survival rate for both gender seems to be good, even though the survival rate for boys is higher. For old people (Age > 60), the survival rate for old men tends to be lower, in contrast to the survival rate for old women which tends to be higher.

In [None]:
# Mean survival rate for each SibSp / Parch value, annotated above every bar.
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

for i, var in enumerate(["SibSp", "Parch"]):
    surv = sns.barplot(data=df_train, x=var, y=target, ax=ax[i], ci=None)
    for bar in surv.patches:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width() / 2
        surv.annotate(
            f"{height:.3f}",
            (x_center, height),
            ha='center', va='center',
            size=11, xytext=(0, 8),
            textcoords='offset points',
        )

    ax[i].set_title(f"{var} Survival Rate")

The graph above shows that passengers with a small number of family members tend to survive. Meanwhile, only ~34% of the lone passengers (SibSp and Parch = 0) survived. Unfortunately, 0% of passengers with SibSp > 4 survived, and almost no passengers with Parch > 3 survived.

In [None]:
# One fare histogram per passenger class, split by the target.
fig, ax = plt.subplots(1, 3, figsize=(20, 6))

pclasses = sorted(df_train["Pclass"].unique())
for pc, panel in zip(pclasses, ax):
    in_class = df_train.loc[df_train["Pclass"] == pc]
    sns.histplot(data=in_class, x="Fare", hue=target, kde=True, ax=panel)
    panel.set_title(f"Fare in Pclass {pc} Survival Rate")

Passengers with higher fares are more likely to survive, although some of those with the highest fares in Pclass 2 didn't survive. Although most of the passengers who didn't survive were in Pclass 3, Pclass 1 and 2 did not guarantee passenger safety either, especially for those who paid lower fares.

# 3. Feature Engineering

## 3.1 PassengerId

In [None]:
# Inspect the PassengerId column.
df_train["PassengerId"]

This variable only contains the unique identity number of each passenger and cannot be categorized, so we can just remove this variable.

In [None]:
# Remove the identifier column from both splits in place.
for frame in (df_train, df_test):
    frame.drop(columns=["PassengerId"], inplace=True)

## 3.2 Name

In [None]:
# Inspect the Name column.
df_train["Name"]

As we can see, there is a pattern in the names of the passengers that we can extract, which is their title (Mr, Mrs, Miss, etc.). Let's extract the string value followed by dot (.).

In [None]:
# Capture the run of letters immediately before a literal dot (e.g. "Mr" in "Smith, Mr. John").
# A raw string keeps "\." a regex escape instead of an invalid Python string escape.
title_pattern = r"([A-Za-z]+)\."

# expand=False returns a Series for the single capture group rather than a one-column DataFrame.
df_train["Title"] = df_train["Name"].str.extract(title_pattern, expand=False)
df_test["Title"] = df_test["Name"].str.extract(title_pattern, expand=False)

df_train["Title"].value_counts()

Some titles are held by very few people, and some titles have the same meaning as others. We will group them accordingly.

In [None]:
# Raw title -> consolidated group. "Mlle" (Mademoiselle) and "Ms" fold into
# "Miss"; "Mme" (Madame) folds into "Mrs".
_TITLE_GROUPS = {
    "Ms": "Miss",
    "Mlle": "Miss",
    "Miss": "Miss",
    "Mme": "Mrs",
    "Mrs": "Mrs",
    "Mr": "Mr",
    "Master": "Master",
}


def convert_title(title):
    """Map a raw title to one of "Miss", "Mrs", "Mr", "Master" or "Other".

    Parameters
    ----------
    title : str
        Title extracted from a passenger name, without the trailing dot.

    Returns
    -------
    str
        The consolidated group; any title not listed in ``_TITLE_GROUPS``
        (including a missing value) becomes "Other".
    """
    return _TITLE_GROUPS.get(title, "Other")
        
# Collapse the raw titles into the consolidated groups on both splits.
for frame in (df_train, df_test):
    frame["Title"] = frame["Title"].map(convert_title)

df_train["Title"].value_counts()

We've got the title of each passenger, so the Name variable is no longer needed. We can remove this variable now.

In [None]:
# The title has been extracted, so drop the raw name from both splits in place.
for frame in (df_train, df_test):
    frame.drop(columns=["Name"], inplace=True)

## 3.3 Ticket

In [None]:
# Inspect the Ticket column.
df_train["Ticket"]

This variable also only contains the unique ticket value for each passenger, and there is no pattern that we can extract. So, we'll just remove this variable.

In [None]:
# Drop the ticket identifier from both splits in place.
for frame in (df_train, df_test):
    frame.drop(columns=["Ticket"], inplace=True)

## 3.4 Cabin

This variable has too many missing values in both train and test data (more than 77%), so this variable is missing a lot of information. We will also remove this variable.

In [None]:
# Drop the mostly-missing Cabin column from both splits in place.
for frame in (df_train, df_test):
    frame.drop(columns=["Cabin"], inplace=True)

## 3.5 Age

The age variable has a missing value of 19.87% in the training data and 20.57% in the test data. That number is not really much, so we will try to do data imputation on this variable. So, how are we going to do that?

Usually, we can just fill missing values using the mean or median. But the problem is that this dataset contains passengers of many different ages. We can't just assign the mean age of 29 years to a 4-year-old child or a 60-year-old man. First, let's check the correlation between the Age variable and the other variables.

In [None]:
# Correlate Age with the numeric columns only: Sex, Embarked and Title are still
# strings here, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
df_train.select_dtypes(include="number").corr()["Age"].sort_values(ascending=False)

Their correlation doesn't look so good. Now let's try to check a categorical variable that might be able to classify the age of the passengers, which is Title variable.

In [None]:
# Age distribution within each title group.
sns.violinplot(data=df_train, x="Title", y="Age")
plt.show()

That looks pretty good. We can use Title variable to classify the age of the passengers. Let's check passenger's average age based on their title.

In [None]:
# Mean age per title group in the training data.
df_train.groupby('Title')['Age'].mean()

Fill in the missing age value using the average passenger age for each title.

In [None]:
# Age assigned to passengers with a missing Age, keyed by title group.
title_age = {"Master": 5, "Miss": 22, "Mr": 32, "Mrs": 36, "Other": 44}

data = [df_train, df_test]
for df in data:
    for title, age in title_age.items():
        # Only rows of this title whose Age is still missing are touched.
        missing = df["Age"].isnull() & (df["Title"] == title)
        df.loc[missing, 'Age'] = age

## 3.6 Fare

We know that Fare variable in test data has missing value, so we will try to impute that variable value based on train data. Let's check the correlation Fare variable with other variables.

In [None]:
# Correlate Fare with the numeric columns only: DataFrame.corr() raises on the
# remaining string columns (Sex, Embarked, Title) in pandas >= 2.0.
df_train.select_dtypes(include="number").corr()["Fare"].sort_values(ascending=False)

As we can see, Pclass variable has a relatively strong negative correlation with Fare variable. So, we will use the mean value of Fare based on the Pclass variable to fill in the missing values. 

In [None]:
# Show the test rows whose Fare is missing.
df_test[df_test["Fare"].isna()]

Passenger with missing fare has Pclass = 3.

In [None]:
# Mean training fare per passenger class. Selecting "Fare" before aggregating
# avoids averaging the string columns, which raises in pandas >= 2.0.
fare_by_pclass = df_train.groupby("Pclass")["Fare"].mean()

# Fill each missing test fare with the mean of that row's own Pclass, and assign
# the result back: an inplace fillna on the `df_test.Fare` attribute is not
# guaranteed to modify df_test under pandas copy-on-write.
df_test["Fare"] = df_test["Fare"].fillna(df_test["Pclass"].map(fare_by_pclass))

## 3.7 SibSp and Parch

We can create a new variable that shows the number of family members accompanying each passenger by adding up the values of the SibSp and Parch variables. From this variable, we can then create another variable that indicates whether the passenger is travelling alone or not.

In [None]:
data = [df_train, df_test]
for df in data:
    # Total number of family members on board (siblings/spouses + parents/children).
    df['Relatives'] = df['SibSp'] + df['Parch']
    # 1 when the passenger has no relatives on board, 0 otherwise. The flag was
    # previously inverted (1 for passengers *with* relatives).
    df['Alone'] = (df['Relatives'] == 0).astype(int)

We can remove the SibSp and Parch variables since we don't need them anymore.

In [None]:
# Relatives/Alone replace the two raw family counts, so drop them from both splits in place.
for frame in (df_train, df_test):
    frame.drop(columns=["SibSp", "Parch"], inplace=True)

## 3.8 Categorical Variable Encoding

Some variables like Sex, Embarked, and Title are categorical, so we need to encode them first so that it can be used into machine learning models.

In [None]:
# One-hot encode the remaining string columns. Naming them explicitly keeps the
# prefix list tied to these columns instead of relying on dtype inference.
categorical = ["Sex", "Embarked", "Title"]
df_train = pd.get_dummies(df_train, columns=categorical, prefix=categorical)
df_test = pd.get_dummies(df_test, columns=categorical, prefix=categorical)

# Give the test frame exactly the training feature columns, in the same order.
# A category seen in only one split would otherwise produce mismatched columns
# and break (or silently misalign) the scaler and the models.
df_test = df_test.reindex(columns=df_train.columns.drop("Survived"), fill_value=0)

# 4. Model Building

## 4.1 Splitting Dataset

In [None]:
# Separate the label from the training features.
y_train = df_train["Survived"]
X_train = df_train.drop(columns=["Survived"])

# The test split has no label; work on a copy so df_test stays untouched.
X_test = df_test.copy()

## 4.1 Feature Scaling

In [None]:
# Learn the per-feature mean and variance from the training features only.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the same training statistics to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 4.2 Building Classification Model

### 4.2.1 Choosing Best Algorithm

The next step is to choose the best algorithm that we will use to predict the test data. We will try to apply several algorithms to the training data using cross-validation with a total of 10 folds. Apart from accuracy, we will also use the F1 score to evaluate the performance of each model because we have an imbalanced dataset.

In [None]:
# Candidate models, keyed by the short label used in the results table.
classifiers = {
    "KNN": KNeighborsClassifier(),
    "LR": LogisticRegression(max_iter=1000),
    "DT": DecisionTreeClassifier(),
    "RF": RandomForestClassifier(),
    "SVM": SVC(),
    "MLP": MLPClassifier(max_iter=1000),
    "XGB": XGBClassifier(),
    "LGBM": LGBMClassifier(),
}

# Collect one row per classifier and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0.
rows = []
for name, clf in classifiers.items():
    # 10-fold cross-validation on the scaled training data, scored on accuracy and F1.
    cv_results = cross_validate(
        clf, X_train_scaled, y_train, cv=10,
        scoring=['accuracy', 'f1']
    )
    rows.append({
        "Classifier": name,
        "Avg_Accuracy": cv_results['test_accuracy'].mean(),
        "Avg_F1_Score": cv_results['test_f1'].mean(),
    })

results = pd.DataFrame(rows, columns=["Classifier", "Avg_Accuracy", "Avg_F1_Score"])

# Rank the models by the plain average of the two metrics.
results["Avg_Overall"] = (results["Avg_Accuracy"] + results["Avg_F1_Score"]) / 2
results = results.sort_values("Avg_Overall", ascending=False)
results

In [None]:
# Horizontal bar chart of the combined CV score per classifier, in the sorted order of `results`.
plt.figure(figsize=(12, 6))
sns.barplot(data=results, x="Avg_Overall", y="Classifier")
plt.title("Average Overall CV Score")
plt.show()

Looks like Logistic Regression is our best model here. We will use this algorithm to predict the test data. But before that, we will adjust the hyperparameters on this algorithm using Grid Search Cross Validation.

### 4.2.2 Hyperparameter Tuning

In [None]:
# "saga" is the solver that supports all three penalties. The default "lbfgs"
# rejects "l1" and "elasticnet", so those candidates failed to fit and were
# scored NaN (hidden by the suppressed warnings).
lr = LogisticRegression(solver="saga", max_iter=5000)

# Settings shared by every penalty.
base_grid = {
    "tol": (0.1, 0.01, 0.001, 0.0001),
    "C": (10.0, 1.0, 0.1, 0.01),
}
# "elasticnet" additionally requires l1_ratio, so it gets its own sub-grid.
params = [
    {"penalty": ("l1", "l2"), **base_grid},
    {"penalty": ("elasticnet",), "l1_ratio": (0.25, 0.5, 0.75), **base_grid},
]

# 10-fold grid search over the parameter grids above.
clf = GridSearchCV(lr, params, cv=10)
clf.fit(X_train_scaled, y_train)
print("Best hyperparameter:", clf.best_params_)

In [None]:
# Evaluate the tuned model on the data it was trained on.
y_pred = clf.predict(X_train_scaled)

train_scores = {
    "Accuracy": accuracy_score(y_train, y_pred),
    "F1-Score": f1_score(y_train, y_pred),
}
for label, score in train_scores.items():
    print(f"Train {label}: {score}")

# Confusion matrix as an annotated heatmap.
train_cm = confusion_matrix(y_train, y_pred)
sns.heatmap(train_cm, fmt='.3g', annot=True, cmap='summer_r')
plt.show()

In [None]:
# Per-class precision, recall and F1 on the training predictions.
print(classification_report(y_train, y_pred))

### 4.2.3 Submit Test Prediction

In [None]:
# Predict the test labels with the tuned grid-search model.
y_pred = clf.predict(X_test_scaled)

# PassengerId was dropped from df_test earlier, so the sample submission file is
# used as the template and only its Survived column is overwritten.
# NOTE(review): assumes gender_submission.csv lists passengers in the same row order as test.csv — confirm.
submission = pd.read_csv("../input/titanic/gender_submission.csv")
submission["Survived"] = y_pred
submission.to_csv('submission.csv', index=False)