# Titanic Survival Prediction

An end-to-end machine learning pipeline covering exploratory data analysis (EDA), feature engineering, model training, and evaluation.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the training and test splits from the local data/ directory.
train_df = pd.read_csv(filepath_or_buffer="data/train.csv")
test_df = pd.read_csv(filepath_or_buffer="data/test.csv")

# Quick sanity check: (rows, columns) of the training frame, then its first rows.
print(train_df.shape)
train_df.head(5)

In [None]:
# Print the training frame's column names, dtypes and non-null counts; the
# non-null counts show which columns need imputation.
train_df.info()

In [None]:
# Summary statistics of the training frame (with default arguments pandas
# reports on the numeric columns only).
train_df.describe()

In [None]:
# Bar chart of how many training rows fall under each value of Survived.
sns.countplot(data=train_df, x="Survived")
plt.show()

In [None]:
# Row counts per Sex, with one bar per Survived value inside each group.
sns.countplot(data=train_df, x="Sex", hue="Survived")
plt.show()

In [None]:
# Row counts per Pclass, with one bar per Survived value inside each group.
sns.countplot(data=train_df, x="Pclass", hue="Survived")
plt.show()

In [None]:
# --- Missing-value handling -------------------------------------------------
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on `df[col]`. That chained form operates on a temporary
# selection: recent pandas 2.x releases warn about it, and with Copy-on-Write
# (the default from pandas 3.0) the fill never reaches the frame, silently
# leaving the NaNs in place.

# Age: each frame is filled with its own median.
# NOTE(review): filling test_df with the *training* median would keep the
# imputation consistent with what the model is trained on - confirm intent.
train_df["Age"] = train_df["Age"].fillna(train_df["Age"].median())
test_df["Age"] = test_df["Age"].fillna(test_df["Age"].median())

# Embarked: fill with the most frequent training value. The same value is
# applied to test_df so that both frames are treated alike and no NaN reaches
# the label-encoding step (a no-op when test_df has no missing Embarked).
embarked_mode = train_df["Embarked"].mode()[0]
train_df["Embarked"] = train_df["Embarked"].fillna(embarked_mode)
test_df["Embarked"] = test_df["Embarked"].fillna(embarked_mode)

# Cabin is removed from both frames rather than imputed.
train_df.drop(columns="Cabin", inplace=True)
test_df.drop(columns="Cabin", inplace=True)

In [None]:
# FamilySize = siblings/spouses + parents/children + the passenger themself.
train_df["FamilySize"] = train_df["SibSp"].add(train_df["Parch"]).add(1)
test_df["FamilySize"] = test_df["SibSp"].add(test_df["Parch"]).add(1)

# IsAlone is a 0/1 flag: 1 exactly when FamilySize is 1, otherwise 0.
train_df["IsAlone"] = train_df["FamilySize"].eq(1).astype("int64")
test_df["IsAlone"] = test_df["FamilySize"].eq(1).astype("int64")

In [None]:
# Mapping applied to the extracted titles: the listed titles collapse into a
# single "Rare" bucket, and Mlle/Ms/Mme are folded into Miss/Mrs.
title_aliases = dict.fromkeys(
    ["Lady", "Countess","Capt","Col","Don","Dr","Major","Rev","Sir","Jonkheer","Dona"],
    "Rare",
)
title_aliases.update({"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"})

for df in [train_df, test_df]:
    # The title is the run of letters that follows a space and ends with a
    # period inside Name (first match only).
    df["Title"] = df["Name"].str.extract(" ([A-Za-z]+)\\.", expand=False)
    df["Title"] = df["Title"].replace(title_aliases)

In [None]:
# Columns removed from both frames before modelling.
# NOTE(review): after this cell test_df no longer carries PassengerId - keep a
# copy beforehand if a later step needs to key predictions by it.
drop_cols = ["PassengerId", "Name", "Ticket"]
train_df.drop(columns=drop_cols, inplace=True)
test_df.drop(columns=drop_cols, inplace=True)

In [None]:
# Categorical columns that are converted to integer codes.
label_cols = ["Sex", "Embarked", "Title"]

# Keep one fitted encoder per column. Re-fitting a single shared encoder
# overwrites the mapping of every earlier column, so codes for "Sex" or
# "Embarked" could not be decoded (inverse_transform) or re-applied later.
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    # Learn the category -> code mapping from the training frame only ...
    train_df[col] = le.fit_transform(train_df[col])
    # ... and apply that same mapping to the test frame. transform() raises
    # ValueError if test_df contains a category that was not seen in train_df.
    test_df[col] = le.transform(test_df[col])
    label_encoders[col] = le
# `le` is left bound to the encoder of the last column, as before.

In [None]:
# Target is the Survived column; features are every other remaining column.
y = train_df["Survived"]
X = train_df.drop(columns="Survived")

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# Random forest with 100 trees; the fixed seed makes training repeatable.
model = RandomForestClassifier(random_state=42, n_estimators=100)
# Fit on the training portion of the split only.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out validation rows.
y_pred = model.predict(X_val)

# Compute the three validation metrics up front, then report them.
val_accuracy = accuracy_score(y_val, y_pred)
val_confusion = confusion_matrix(y_val, y_pred)
val_report = classification_report(y_val, y_pred)

print("Accuracy:", val_accuracy)
print("\nConfusion Matrix:\n", val_confusion)
print("\nClassification Report:\n", val_report)