#  Titanic Survival Prediction Using Machine Learning 

# Importing Libraries 

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Loading Data

In [None]:
# Read data/train.csv into `train_data` and data/test.csv into `test`.
# Both paths are relative to the current working directory.
train_data = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# EDA

In [None]:
# Print each column's dtype and non-null count to spot missing values.
train_data.info()

In [None]:
# Split the columns explored below into a numeric view and a categorical view.
numeric_columns = ["Age", "SibSp", "Parch", "Fare"]
categorical_columns = ["Survived", "Sex", "Cabin", "Embarked", "Ticket"]

df_num = train_data[numeric_columns]
df_cat = train_data[categorical_columns]

In [None]:
# Draw one histogram per numeric feature, titled with the column name.
for feature_name, feature_values in df_num.items():
    plt.hist(feature_values)
    plt.title(feature_name)
    plt.show()

In [None]:
# Bar height is the mean Fare (barplot's default estimator) for each Pclass,
# with separate bars per Survived value.
sns.barplot(data=train_data, x="Pclass", y="Fare", hue="Survived")

In [None]:
# Average of each numeric feature per Survived value
# (pivot_table aggregates with the mean when no aggfunc is given).
pd.pivot_table(train_data, index="Survived", values=["Age", "SibSp", "Parch", "Fare"])

In [None]:
# Plot how often each distinct value occurs, one chart per categorical column.
for category_name in df_cat.columns:
    level_counts = df_cat[category_name].value_counts()
    sns.barplot(x=level_counts.index, y=level_counts)
    plt.show()

In [None]:
def _survival_counts_by(column):
    """Count non-null Ticket values per (Survived, `column`) combination."""
    return pd.pivot_table(
        train_data,
        index="Survived",
        columns=column,
        values="Ticket",
        aggfunc="count",
    )


# The Sex breakdown is kept in `x` and shown as the cell's output; the other
# two breakdowns are printed, each followed by a blank line.
x = _survival_counts_by("Sex")
print()
for grouping in ("Pclass", "Embarked"):
    print(_survival_counts_by(grouping))
    print()
x

# Data Cleaning  

In [None]:
# Count missing values per column before any cleaning.
train_data.isnull().sum()

In [None]:
# Remove the columns that are not used as model features further down.
train_data = train_data.drop(columns=["PassengerId", "Cabin", "Name", "Ticket"])

In [None]:
# Fill missing ages with the mean of the observed ages.
mean_age = train_data["Age"].mean()
train_data["Age"] = train_data["Age"].fillna(mean_age)

In [None]:
# Fill missing embarkation values with the most frequent one
# (mode() returns its results sorted, so [0] picks the first of any ties).
most_common_embarked = train_data["Embarked"].mode()[0]
train_data["Embarked"] = train_data["Embarked"].fillna(most_common_embarked)

In [None]:
# Re-check the per-column missing-value counts after imputation.
train_data.isnull().sum()

# Feature Engineering

In [None]:
# Replace Fare with log(Fare + 1); the +1 keeps a fare of 0 finite (log(1) == 0).
# Presumably done to reduce the skew of the fare distribution.
train_data["Fare"] = np.log(train_data["Fare"] + 1)

In [None]:
# Plot the distribution of the transformed Fare with a KDE curve overlaid.
sns.displot(train_data["Fare"], kde=True)

In [None]:
# Pairwise correlations between the numeric columns only, drawn as a heatmap
# with each cell annotated to two decimal places.
corr = train_data.corr(numeric_only=True)
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the string categories in Sex and Embarked with integer codes.
cols = ["Sex", "Embarked"]
le = LabelEncoder()
for col in cols:
    # fit_transform refits `le` on every iteration, so after the loop it only
    # holds the mapping of the last column in `cols`.
    train_data[col] = le.fit_transform(train_data[col])

train_data.head()

In [None]:
# Target is the Survived column; every other remaining column is a feature.
y = train_data["Survived"]
X = train_data.drop(columns="Survived")
train_data

# Setting up the Parameters for the Model  

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score


def classify(model):
    """Fit `model` on a hold-out split and print two accuracy estimates.

    The module-level `X` and `y` are split 75/25 with a fixed seed; the model
    is fitted on the larger part and scored on the smaller one. A 5-fold
    cross-validation score over all of `X`, `y` is then printed as well.

    Args:
        model: An estimator exposing `fit` and `score` that
            `cross_val_score` can evaluate. It is fitted in place.
    """
    features_fit, features_eval, target_fit, target_eval = train_test_split(
        X, y, test_size=0.25, random_state=40
    )
    model.fit(features_fit, target_fit)
    holdout_accuracy = model.score(features_eval, target_eval)
    print("Accuracy", holdout_accuracy)

    fold_scores = cross_val_score(model, X, y, cv=5)
    print("CV SCORE :", np.mean(fold_scores))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Score a decision tree with default hyperparameters via classify().
model = DecisionTreeClassifier()
classify(model)

In [None]:
from lightgbm import LGBMClassifier

# Score a LightGBM classifier with default hyperparameters via classify().
model = LGBMClassifier()
classify(model)

In [None]:
from xgboost import XGBClassifier

# Score an XGBoost classifier with default hyperparameters via classify().
model = XGBClassifier()
classify(model)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Score a random forest with default hyperparameters via classify().
model = RandomForestClassifier()
classify(model)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Score an extra-trees ensemble with default hyperparameters via classify().
model = ExtraTreesClassifier()
classify(model)

In [None]:
from sklearn.linear_model import LogisticRegression

# Score logistic regression via classify(); the iteration cap is raised to 1000.
model = LogisticRegression(max_iter=1000)
classify(model)

In [None]:
# Fit a fresh default XGBClassifier on all of X, y (no hold-out). `model` is
# what the prediction cell below calls. Relies on XGBClassifier having been
# imported by an earlier cell.
model = XGBClassifier()
model.fit(X, y)

In [None]:
# Drop the same four columns from the test frame that were dropped from the
# training frame, then display the result.
X_test = test.drop(columns=["PassengerId", "Name", "Cabin", "Ticket"])

X_test

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode Sex and Embarked in the test features.
cols = ["Sex", "Embarked"]
le = LabelEncoder()

for col in cols:
    # NOTE(review): the encoder is fitted on the test column itself, so the
    # codes match the training codes only if both sets contain the same
    # distinct values — confirm against the data.
    X_test[col] = le.fit_transform(X_test[col])

X_test.head()

In [None]:
# Fill gaps in the numeric test columns with each column's own mean.
for numeric_column in ("Age", "Fare"):
    column_mean = X_test[numeric_column].mean()
    X_test[numeric_column] = X_test[numeric_column].fillna(column_mean)

X_test.isnull().sum()

# Data Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

# Rebuild the test feature matrix from the raw `test` frame so this cell does
# not depend on any earlier in-place edits of X_test.
X_test = test.drop(columns=["PassengerId", "Name", "Cabin", "Ticket"])

# Fill gaps in the numeric columns with each column's own mean.
X_test["Age"] = X_test["Age"].fillna(X_test["Age"].mean())
X_test["Fare"] = X_test["Fare"].fillna(X_test["Fare"].mean())

# The training Fare was replaced by log(Fare + 1) before the model was fitted,
# so the test Fare must go through the same transform; otherwise the model
# sees fares on a different scale from the one it learned on.
X_test["Fare"] = np.log(X_test["Fare"] + 1)

cols = ["Sex", "Embarked"]
le = LabelEncoder()

for col in cols:
    # NOTE(review): the encoder is fitted on the test column itself, so the
    # codes match the training codes only if both sets contain the same
    # distinct values — confirm against the data.
    X_test[col] = le.fit_transform(X_test[col])

X_test

# Model Testing 

In [None]:
# Predict a label for every row of the prepared test features.
pred = model.predict(X_test)
pred

# Test Submission 

In [None]:
# Load data/gender_submission.csv; its Survived column is overwritten with
# the model's predictions in the next cell.
submit = pd.read_csv("data/gender_submission.csv")
submit

In [None]:
# Overwrite Survived with the predictions.
# Assumes the rows of `submit` are in the same order as data/test.csv — TODO confirm.
submit["Survived"] = pred

submit

In [None]:
# Check the submission frame's columns, dtypes and non-null counts before saving.
submit.info()

In [None]:
# Write the submission to Submission.csv in the working directory, without
# the DataFrame index column.
submit.to_csv("Submission.csv", index=False)

# The Accuracy of the Model is 73.8%