# Important Notes

There will be no EDA or previously used approaches present here.

I am only using scikit-learn and pandas. If needed one can install them by running the cell below:

In [1]:
%pip install scikit-learn
%pip install "pandas<3.0"


Note: you may need to restart the kernel to use updated packages.


# Imports and loading the data

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

import random

from pathlib import Path

random.seed(58)
np.random.seed(58)

In [3]:
data_path = Path.cwd()

train_df = pd.read_csv(data_path / "train.csv")
test_df = pd.read_csv(data_path / "test.csv")


data_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Feature Engineering

## Sex

In [4]:
import re

gender_mapping = {
    "Mr": "male",
    "Mrs": "female",
    "Miss": "female",
    "Mme": "female",
    "Ms": "female",
    "Lady": "female",
    "Mlle": "female",
    "Dona": "female",
    "Col": "male",
    "Capt": "male",
    "Jonkheer": "male",
    "Master": "male",
    "Don": "male",
    "Rev": "male",
    "Dr": "male",
    "Sir": "male",
    "Major": "male",
}


def get_gender(title):
    if title in gender_mapping:
        return gender_mapping[title]
    else:
        return "male"


def extract_title_regex(name):
    pattern = r",\s*([A-Za-z]+)\.\s*"
    match = re.search(pattern, name)
    if match:
        return match.group(1)
    else:
        return "Missing"  # Handle cases with no title


def transform_sex(X):
    X["Title"] = X["Name"].apply(extract_title_regex)
    # print(X["Title"].value_counts())
    X["Sex"] = X["Title"].apply(get_gender)
    X.drop(["Title"], axis=1, inplace=True)
    return X


data_df = transform_sex(data_df)

## Age (Imputing and Bins)

In [5]:
data_df["Title"] = data_df["Name"]
# Cleaning name and extracting Title
for name_string in data_df["Name"]:
    data_df["Title"] = data_df["Name"].str.extract("([A-Za-z]+)\.", expand=True)

# Replacing rare titles with more common ones
mapping = {
    "Mlle": "Miss",
    "Major": "Mr",
    "Col": "Mr",
    "Sir": "Mr",
    "Don": "Mr",
    "Mme": "Miss",
    "Jonkheer": "Mr",
    "Lady": "Mrs",
    "Capt": "Mr",
    "Countess": "Mrs",
    "Ms": "Miss",
    "Dona": "Mrs",
}

data_df.replace({"Title": mapping}, inplace=True)

titles = data_df["Title"].unique()

for title in titles:
    age_to_impute = data_df.groupby("Title")["Age"].median()[title]
    data_df.loc[(data_df["Age"].isnull()) & (data_df["Title"] == title), "Age"] = (
        age_to_impute
    )

In [6]:
data_df["AgeBin"] = pd.qcut(data_df["Age"], 4)

label = LabelEncoder()
data_df["AgeBin_Code"] = label.fit_transform(data_df["AgeBin"])

## Family Size

In [7]:
data_df["Family_Size"] = data_df["Parch"] + data_df["SibSp"]

## Family Survival

I extract last names from a full name and handle missing fare values by filling them with the average fare. Then, I assign a "Family_Survival" score to each passenger based on the survival status of others with the same last name and fare (indicating family groups) or the same ticket number (indicating traveling groups). The score is set to 1 if any family or group member survived, 0 if all did not survive, and remains 0.5 if there is no conclusive information from the group.

In [8]:
def update_family_survival(row, group):
    """Update the family survival status based on the group data."""
    if (row["Family_Survival"] == 0) | (row["Family_Survival"] == 0.5):
        smax = group.drop(row.name)["Survived"].max()
        smin = group.drop(row.name)["Survived"].min()
        if smax == 1.0:
            return 1
        elif smin == 0.0:
            return 0
    return row["Family_Survival"]


# Initial setup
data_df["Last_Name"] = data_df["Name"].apply(lambda x: x.split(",")[0])
data_df["Fare"].fillna(data_df["Fare"].mean(), inplace=True)
data_df["Family_Survival"] = 0.5

# Update Family_Survival based on family groups (Last_Name and Fare)
for _, grp_df in data_df.groupby(["Last_Name", "Fare"]):
    if len(grp_df) > 1:
        data_df.loc[grp_df.index, "Family_Survival"] = grp_df.apply(
            update_family_survival, axis=1, args=(grp_df,)
        )

# Update Family_Survival based on ticket groups
for _, grp_df in data_df.groupby("Ticket"):
    if len(grp_df) > 1:
        data_df.loc[grp_df.index, "Family_Survival"] = grp_df.apply(
            update_family_survival, axis=1, args=(grp_df,)
        )

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_df["Fare"].fillna(data_df["Fare"].mean(), inplace=True)


## Fare

In [9]:
data_df["Fare"].fillna(data_df["Fare"].median(), inplace=True)

# Making Bins
data_df["FareBin"] = pd.qcut(data_df["Fare"], 5)

label = LabelEncoder()
data_df["FareBin_Code"] = label.fit_transform(data_df["FareBin"])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_df["Fare"].fillna(data_df["Fare"].median(), inplace=True)


# Preparing Data for Training and Prediction

In [10]:
data_df["Sex"].replace(["male", "female"], [0, 1], inplace=True)

data_df.drop(
    columns=[
        "Name",
        "PassengerId",
        "SibSp",
        "Parch",
        "Ticket",
        "Cabin",
        "Embarked",
        "Age",
        "Fare",
        "Last_Name",
        "FareBin",
        "AgeBin",
        "Title",
    ],
    inplace=True,
)

train_df = data_df[:891]
test_df = data_df[891:]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_df["Sex"].replace(["male", "female"], [0, 1], inplace=True)
  data_df["Sex"].replace(["male", "female"], [0, 1], inplace=True)


In [11]:
X = train_df.drop(columns=["Survived"])
y = train_df["Survived"]
X_test = test_df.copy().drop(columns=["Survived"])

In [12]:
X.columns

Index(['Pclass', 'Sex', 'AgeBin_Code', 'Family_Size', 'Family_Survival',
       'FareBin_Code'],
      dtype='object')

In [13]:
std_scaler = StandardScaler()
X = std_scaler.fit_transform(X)
X_test = std_scaler.transform(X_test)

# Training and Hyperparameter Tuning

In [14]:
n_neighbors = [6, 7, 8, 9, 10, 11, 12, 14, 16, 18, 20, 22]
algorithm = ["auto"]
weights = ["uniform", "distance"]
leaf_size = list(range(1, 50, 5))

hyperparams = {
    "algorithm": algorithm,
    "weights": weights,
    "leaf_size": leaf_size,
    "n_neighbors": n_neighbors,
}

gd = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=hyperparams,
    verbose=True,
    cv=10,
    scoring="roc_auc",
)

gd.fit(X, y)
print(gd.best_score_)
print(gd.best_estimator_)
print(gd.best_params_)

Fitting 10 folds for each of 240 candidates, totalling 2400 fits
0.8772168180403475
KNeighborsClassifier(leaf_size=26, n_neighbors=16)
{'algorithm': 'auto', 'leaf_size': 26, 'n_neighbors': 16, 'weights': 'uniform'}


In [15]:
gd.best_estimator_.fit(X, y)
y_pred = gd.best_estimator_.predict(X_test)

**Disclaimer:**

I chose `n_neighbours=6` because I achieved a higher score on the scoreboard with that. That is most likely due to the fact, that we are overfitting on the train data in the hyperparameter tuning.

In [16]:
params = gd.best_params_
params["n_neighbors"] = 6

In [17]:
knn = KNeighborsClassifier(
    algorithm=params["algorithm"],
    leaf_size=params["leaf_size"],
    n_neighbors=params["n_neighbors"],
    weights=params["weights"],
)


knn.fit(X, y)

y_pred = knn.predict(X_test)

# Creating Submission File

In [18]:
submission = pd.DataFrame(pd.read_csv(data_path / "test.csv")["PassengerId"])
submission["Survived"] = y_pred.astype(int)
submission.to_csv(data_path / "submission_test.csv", index=False)