Titanic - Machine Learning from Disaster

In [182]:
import numpy as np
import pandas as pd

In [183]:
# Load the raw training and test CSVs into DataFrames ("df" = DataFrame).
# NOTE(review): both files are read again in later cells with
# index_col="PassengerId", which replaces these two frames.
df_train = pd.read_csv("./Titanic_Project_Information_Data/train.csv")
df_test = pd.read_csv("./Titanic_Project_Information_Data/test.csv")

In [None]:
# Show the column labels of the training frame.
df_train.columns

In [None]:
# Show the column labels of the test frame.
df_test.columns

In [None]:
# Re-read the training set, this time using PassengerId as the row index.
df_train = pd.read_csv("./Titanic_Project_Information_Data/train.csv", index_col="PassengerId")

# Only the last expression of a cell is rendered automatically, so show both
# ends of the frame explicitly (display is provided by the IPython kernel).
display(df_train.head(), df_train.tail())

In [187]:
conv_train_features = ["Survived", "Pclass", "Sex", "SibSp", "Parch", "Embarked"]
def convert_train_categorical(df, feature):
    """Cast the given column(s) of ``df`` to the pandas ``category`` dtype in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the converted columns are assigned back onto it.
    feature : str or iterable of str
        A single column name, or a collection of column names, to convert.

    Returns
    -------
    None
    """
    # Use the argument rather than the module-level list (which the loop
    # variable used to shadow), and accept a lone column name as well.
    columns = [feature] if isinstance(feature, str) else list(feature)
    for column in columns:
        df[column] = df[column].astype("category")

# Convert the listed training columns to the category dtype (modifies df_train).
convert_train_categorical(df_train, conv_train_features)

In [None]:
# Re-read the test set, this time using PassengerId as the row index.
df_test = pd.read_csv("./Titanic_Project_Information_Data/test.csv", index_col="PassengerId")

# Only the last expression of a cell is rendered automatically, so show both
# ends of the frame explicitly (display is provided by the IPython kernel).
display(df_test.head(), df_test.tail())

In [189]:
conv_test_features = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]
def convert_test_categorical(df, feature):
    """Cast the given column(s) of ``df`` to the pandas ``category`` dtype in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the converted columns are assigned back onto it.
    feature : str or iterable of str
        A single column name, or a collection of column names, to convert.

    Returns
    -------
    None
    """
    # Use the argument rather than the module-level list (which the loop
    # variable used to shadow), and accept a lone column name as well.
    columns = [feature] if isinstance(feature, str) else list(feature)
    for column in columns:
        df[column] = df[column].astype("category")

# Convert the listed test columns to the category dtype (modifies df_test).
convert_test_categorical(df_test, conv_test_features)

In [None]:
# Summary statistics: the default describe() call, then the category-dtype columns.
# Both are passed to display() because a cell only renders its last expression.
display(
    df_train.describe(),
    df_train.describe(include="category"),
)

Exploratory Data Analysis (EDA) - Categorical features: Survived, Sex, Embarked, Pclass (ordinal), SibSp, Parch

In [None]:
# Absolute and relative frequencies of the Survived target.
# Both are passed to display() because a cell only renders its last expression.
display(
    df_train["Survived"].value_counts().to_frame(),
    df_train["Survived"].value_counts(normalize=True).to_frame(),
)

In [None]:
# Absolute and relative frequencies of the Sex column.
# Both are passed to display() because a cell only renders its last expression.
display(
    df_train["Sex"].value_counts().to_frame(),
    df_train["Sex"].value_counts(normalize=True).to_frame(),
)

In [193]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Categorical features to plot against Survived, laid out on a rows x cols grid.
colums = ["Sex", "Pclass", "SibSp", "Parch", "Embarked"]
rows = 2
cols = 3

fig, axs = plt.subplots(rows, cols, figsize=(cols*3.5, rows*3.5))

# ravel() walks the grid row by row, i.e. in the same order as r * cols + c.
grid_axes = axs.ravel()
for ax_i, feature in zip(grid_axes, colums):
    sns.countplot(data=df_train, x=feature, hue="Survived", ax=ax_i)
    ax_i.set_title(f"Survival rate by {feature}")
    ax_i.legend(title="", loc="upper right", labels=["No", "Yes"])

# Hide every grid cell that received no plot, instead of assuming that exactly
# the last one is unused (which would hide a real plot on a full grid).
for spare_ax in grid_axes[len(colums):]:
    spare_ax.set_visible(False)

plt.tight_layout()  # Adjust the layout.
plt.show()

Exploratory Data Analysis (EDA) - Numerical (continuous) features: Age, Fare

In [None]:
# Histogram of Age in 40 bins, coloured by Survived, with a KDE curve overlaid.
sns.histplot(data=df_train, x="Age", bins=40, hue="Survived", kde=True)

In [None]:
# Histogram of Fare in 40 bins, coloured by Survived, with a KDE curve overlaid.
sns.histplot(data=df_train, x="Fare", bins=40, hue="Survived", kde=True)

In [None]:
# Bin Fare into four quantile-based groups so it can be plotted like a category.
categories_fare = ["Cheap", "Normal", "Expensive", "Luxury"]
fare_quartiles = pd.qcut(df_train["Fare"], q=4, labels=categories_fare)

# Reuse the binned series rather than computing the same qcut a second time.
sns.countplot(x=fare_quartiles, hue="Survived", data=df_train)

Feature Engineering & Data Wrangling

Feature Engineering - Title (extracted from Name)


In [None]:
import re
# Peek at the raw Name strings; display() is needed because this is not the
# last expression of the cell and would otherwise never be rendered.
display(df_train["Name"].tail(10))

# A title is the run of word characters / whitespace between a comma and the
# period that follows it, e.g. "Braund, Mr. Owen Harris" -> "Mr".
# Compiled once here instead of on every call.
_TITLE_PATTERN = re.compile(r",([\w\s]+)\.")

def title_extract(name):
    """Return the title embedded in a passenger name.

    Parameters
    ----------
    name : str
        Full name string, e.g. ``"Braund, Mr. Owen Harris"``.

    Returns
    -------
    str
        The matched title with surrounding whitespace stripped, or
        ``"Unknown"`` when the name contains no ", <title>." segment
        (previously this case crashed with an AttributeError on ``None``).
    """
    match = _TITLE_PATTERN.search(name)
    if match is None:
        return "Unknown"
    return match.group(1).strip()

# Derive a Title column from every training Name, then tally the raw titles.
train_titles = df_train["Name"].apply(title_extract)
df_train["Title"] = train_titles
df_train["Title"].value_counts()

In [None]:
# Derive a Title column from every test Name, then tally the raw titles.
test_titles = df_test["Name"].apply(title_extract)
df_test["Title"] = test_titles
df_test["Title"].value_counts()

In [None]:
# Group the titles together to reduce the number of distinct title values.
def title_group(title):
    """Map a raw title onto one of: Mr, Miss, Mrs, Master, Others.

    "Ms" is folded into "Miss"; the four common titles are kept as they are;
    every other value becomes "Others".
    """
    if title == "Ms":
        return "Miss"
    kept_titles = ("Mr", "Miss", "Mrs", "Master")
    return title if title in kept_titles else "Others"

# Collapse the raw titles into the grouped categories in both frames.
df_train["Title"] = df_train["Title"].apply(title_group)
df_test["Title"] = df_test["Title"].apply(title_group)

# These counts are not the cell's last expression, so render them explicitly.
display(df_train["Title"].value_counts(), df_test["Title"].value_counts())

sns.countplot(data=df_train, x="Title", hue="Survived")

Feature Engineering - Family(SibSp, Parch)

In [None]:
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger).
# SibSp and Parch were converted to the category dtype earlier, so they are
# cast back to int before being added.
family_bins = [0, 1, 4, 6, 20]
family_labels = ["Single", "Small", "Medium", "Large"]

for frame in (df_train, df_test):
    siblings_spouses = frame["SibSp"].astype(int)
    parents_children = frame["Parch"].astype(int)
    frame["FamilySize"] = siblings_spouses + parents_children + 1
    # Bucket the size into the labelled ranges (0,1], (1,4], (4,6], (6,20].
    frame["FamilyCate"] = pd.cut(frame["FamilySize"], bins=family_bins, labels=family_labels)

sns.countplot(data=df_train, x="FamilyCate", hue="Survived")

Data Wrangling - Data Cleaning
- Feature Numerical : Age, Fare
- Feature Categorical : Sex, Pclass, Embarked, [Title(Name), FamilyCate(SibSp, Parch)] - Created in Feature Engineering

In [None]:
# Columns fed to the model, split by how they are preprocessed.
numerical_features = ["Age", "Fare"]
categorical_features = ["Sex", "Pclass", "Embarked", "Title", "FamilyCate"]

# Full feature list: numerical columns first, then the categorical ones.
columns_features = [*numerical_features, *categorical_features]
print(columns_features)

In [None]:
def check_missing(df, columns_features):
    """Print one line per listed column that contains missing values.

    Each line shows the column name, the number of missing entries and their
    share of the frame's rows as a percentage. Columns without missing values
    produce no output. Returns None.
    """
    for name in columns_features:
        n_missing = df[name].isnull().sum()
        if not n_missing > 0:
            continue
        print(f"{name}: {n_missing} missing value(s) - {n_missing/len(df)*100:.2f}%")
          
# Report which model features still contain missing values in each frame.
check_missing(df_train, columns_features)
check_missing(df_test, columns_features)

Find how Age relates to the other columns, then fill its missing values with the per-group median (grouped by Sex and Pclass).

In [None]:
# Median Age of each (Sex, Pclass) group in the training data.
# observed=True matches the imputation cell that follows and makes the handling
# of the categorical group keys explicit instead of relying on the pandas default.
age_by_sex_Pclass = df_train.groupby(["Sex", "Pclass"], observed=True)["Age"].median().to_frame()
age_by_sex_Pclass

In [205]:
# Fill missing Age values with the median Age of the row's (Sex, Pclass) group.
# transform("median") broadcasts each group's median back onto the original
# rows; the result is assigned as a Series, so no single-column DataFrame
# round-trip (.to_frame()) is needed.
# NOTE(review): the test set is imputed from its own group medians rather than
# the training set's -- confirm that this is intended.
df_train["Age"] = df_train["Age"].fillna(
    df_train.groupby(["Sex", "Pclass"], observed=True)["Age"].transform("median")
)
df_test["Age"] = df_test["Age"].fillna(
    df_test.groupby(["Sex", "Pclass"], observed=True)["Age"].transform("median")
)

In [None]:
# Re-run the missing-value report after the Age imputation above.
check_missing(df_train, columns_features)
check_missing(df_test, columns_features)

Split df_train into X (the feature columns) and Y (the Survived column)

In [207]:
# Feature matrix: only the selected numerical + categorical columns.
X_train = df_train[columns_features]
# Target vector: the Survived column.
Y_train = df_train["Survived"]

In [208]:
# Test features use the same column selection as X_train.
X_test = df_test[columns_features]

Embarked, Fare - Preprocess Pipeline

In [209]:
# OneHotEncoder: encode categorical features as a one-hot numeric array.
# StandardScaler: standardize features by removing the mean and scaling to unit variance.
# SimpleImputer: impute missing values.
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Pipeline: fill missing values, standardize features, and encode categorical features.
from sklearn.pipeline import Pipeline

# Numerical columns: fill gaps with the column median, then standardize.
numer_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numer_transformer = Pipeline(steps=numer_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cate_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
cate_transformer = Pipeline(steps=cate_steps)

In [210]:
# ColumnTransformer: apply different transformers to different columns.
from sklearn.compose import ColumnTransformer

# Route each column group to its own pipeline: numerical features go through
# numer_transformer, categorical features through cate_transformer.
column_transformers = [
    ("numer", numer_transformer, numerical_features),
    ("cate", cate_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
# Fit the preprocessor (imputers, scaler, one-hot encoder) on X_train only;
# the test set is not used for fitting.
# NOTE(review): this runs before the train/validation split further down, so
# the rows later held out for validation also contribute to the fitted statistics.
preprocessor.fit(X_train)

In [212]:
# Apply the fitted preprocessor to both the training and the test features.
# NOTE(review): X_train and X_test are rebound from the selected DataFrame
# columns to the transformed output, so re-running this cell on its own would
# feed already-transformed data back into the preprocessor.
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

Model Training

In [213]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split repeatable.
split_parts = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=0,
)
X_for_train, X_for_val, Y_for_train, Y_for_val = split_parts

In [None]:
# Shapes of the training split, the validation split and the transformed test set.
X_for_train.shape, X_for_val.shape, X_test.shape

Model 1 : Binary Classification - Logistic Regression

In [215]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, classification_report

In [None]:
# Logistic regression fitted on the training split (fit() returns the estimator).
logistic_reg = LogisticRegression(solver="liblinear", max_iter=1000).fit(X_for_train, Y_for_train)

# Accuracy on the same rows the model was fitted on.
logistic_reg.score(X_for_train, Y_for_train)

In [None]:
# Predict the validation data and evaluate the model.
y_pred = logistic_reg.predict(X_for_val)

# A bare tuple in the middle of a cell is never rendered, so the headline
# metrics are printed explicitly ahead of the full report.
val_metrics = {
    "precision": precision_score(Y_for_val, y_pred),
    "recall": recall_score(Y_for_val, y_pred),
    "f1": f1_score(Y_for_val, y_pred),
    "accuracy": accuracy_score(Y_for_val, y_pred),
}
print(", ".join(f"{name}={value:.4f}" for name, value in val_metrics.items()))
print(classification_report(Y_for_val, y_pred))

Model 2 : Binary Classification - Polynomial Features

In [218]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)  # degree=2: generate the second-degree feature terms.

# NOTE(review): X_train is the full pre-split matrix, so it also contains the
# rows that were held out as X_for_val.
poly_features_Xtrain = poly.fit_transform(X_train)

# The transformer is already fitted above; validation data is only transformed,
# never used to re-fit it.
poly_features_Xval = poly.transform(X_for_val)

In [None]:
# Fit on the training split only. The full X_train / Y_train contain the rows
# held out as X_for_val, so fitting on them would make the validation metrics
# be measured on data the model has already seen.
poly_log_reg = LogisticRegression(solver="liblinear", max_iter=1000)
poly_log_reg.fit(poly.transform(X_for_train), Y_for_train)

In [None]:
# Accuracy on the training split (X_for_train), the same rows the logistic
# regression and decision tree cells report their training score on.
poly_log_reg.score(poly.transform(X_for_train), Y_for_train)

In [None]:
# Predict the validation data and evaluate the model.
y_pred = poly_log_reg.predict(poly_features_Xval)

# A bare tuple in the middle of a cell is never rendered, so the headline
# metrics are printed explicitly ahead of the full report.
val_metrics = {
    "precision": precision_score(Y_for_val, y_pred),
    "recall": recall_score(Y_for_val, y_pred),
    "f1": f1_score(Y_for_val, y_pred),
    "accuracy": accuracy_score(Y_for_val, y_pred),
}
print(", ".join(f"{name}={value:.4f}" for name, value in val_metrics.items()))
print(classification_report(Y_for_val, y_pred))

Model 3 : Binary Classification - Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree, capped at depth 5, fitted on the training split
# (fit() returns the estimator).
decision_tree = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=5,
    random_state=0,
).fit(X_for_train, Y_for_train)

# Accuracy on the same rows the tree was fitted on.
decision_tree.score(X_for_train, Y_for_train)

In [None]:
# Predict the validation data and evaluate the model.
y_pred = decision_tree.predict(X_for_val)

# A bare tuple in the middle of a cell is never rendered, so the headline
# metrics are printed explicitly ahead of the full report.
val_metrics = {
    "precision": precision_score(Y_for_val, y_pred),
    "recall": recall_score(Y_for_val, y_pred),
    "f1": f1_score(Y_for_val, y_pred),
    "accuracy": accuracy_score(Y_for_val, y_pred),
}
print(", ".join(f"{name}={value:.4f}" for name, value in val_metrics.items()))
print(classification_report(Y_for_val, y_pred))

Model 4 : Cross Validation k-Fold

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validation: score fresh estimators with 5-fold accuracy instead of a
# single hold-out split.
# NOTE(review): the tree here uses max_depth=8 while the Model 3 cell uses
# max_depth=5 -- confirm the difference is intentional.
logistic_reg_cross_val = LogisticRegression(solver="liblinear", max_iter=1000)
decision_tree_cross_val = DecisionTreeClassifier(criterion="entropy", max_depth=8, random_state=0)

# NOTE(review): X_train was transformed by a preprocessor fitted on all
# training rows, so each fold's held-out rows influenced that fitting.
logistic_reg_scores = cross_val_score(logistic_reg_cross_val, X_train, Y_train, cv=5, scoring="accuracy")
# Mean and standard deviation of the per-fold accuracies.
logistic_reg_scores.mean(), logistic_reg_scores.std()

In [None]:
# 5-fold cross-validated accuracy for the decision tree defined in the previous cell.
decision_tree_scores = cross_val_score(decision_tree_cross_val, X_train, Y_train, cv=5, scoring="accuracy")
# Mean and standard deviation of the per-fold accuracies.
decision_tree_scores.mean(), decision_tree_scores.std()