# Feature set ablation – Dataset: Titanic. Create 3 feature sets (basic, engineered, engineered + scaling)

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score


In [3]:
# Read the passenger table from the CSV sitting next to the notebook.
data = pd.read_csv("titanic.csv")

# Fill the gaps in one pass: missing ages get the column mean, missing
# embarkation ports get the most frequent port.
data = data.fillna({
    'Age': data['Age'].mean(),
    'Embarked': data['Embarked'].mode()[0],
})

# Replace the string categories with integer codes. Any value absent from a
# mapping becomes NaN, because Series.map does not pass unknown keys through.
data = data.assign(
    Sex=data['Sex'].map({'male': 0, 'female': 1}),
    Embarked=data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}),
)


In [4]:
# Target column shared by every feature set.
y = data['Survived']

# Baseline feature set: only columns that exist in the cleaned table,
# with no derived features.
X_basic = data.loc[:, ['Pclass', 'Sex', 'Age', 'Embarked']]


In [5]:
# Household size from the two relative-count columns; the +1 presumably
# counts the passenger themself.
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
# 1 when FamilySize is exactly 1, otherwise 0.
data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

# Bucket Age into three labelled groups. pd.cut builds right-closed
# intervals, so without include_lowest an age of exactly 0 would fall outside
# the first bin and become NaN; include_lowest=True keeps it in group 0.
# Ages above 100 (or below 0) still receive no group.
data['AgeGroup'] = pd.cut(
    data['Age'],
    bins=[0, 12, 60, 100],
    labels=[0, 1, 2],  # Child, Adult, Senior
    include_lowest=True,
)

# Engineered feature set: the basic columns plus the three derived ones.
X_engineered = data[
    ['Pclass', 'Sex', 'Age', 'Embarked', 'FamilySize', 'IsAlone', 'AgeGroup']
]


In [6]:
# Standardise every engineered column.
# NOTE: the scaler is fitted on all rows of X_engineered, before any
# train/test split, so its statistics include rows that are later held out.
# evaluate_model(..., scale=True) fits its own scaler on the training split.
scaler = StandardScaler().fit(X_engineered)
X_scaled = scaler.transform(X_engineered)


In [7]:
def evaluate_model(X, y, model, scale=False, test_size=0.2, random_state=42):
    """Fit ``model`` on a train split of ``(X, y)`` and score it on the held-out rows.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix; passed to ``train_test_split`` and then to the model.
    y : array-like or Series
        Target labels aligned with ``X``.
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so the caller's object holds this fit after the call returns.
    scale : bool, default False
        When true, a fresh ``StandardScaler`` is fitted on the training rows
        only and then applied to both splits, so no test-set statistics reach
        the model.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` on the held-out split, computed with
        ``accuracy_score`` and ``f1_score`` at their default settings.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    if scale:
        # Learn mean/variance from the training rows only, then reuse them
        # for the test rows.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return (
        accuracy_score(y_test, y_pred),
        f1_score(y_test, y_pred)
    )


In [8]:
# Shallow entropy-based tree. random_state is pinned because the tree permutes
# features at every split, so ties between equally good splits would otherwise
# be broken differently from run to run and the printed scores could change.
dt = DecisionTreeClassifier(max_depth=4, criterion='entropy', random_state=42)

# The same estimator object is reused; each evaluate_model call refits it.
print("DT – Basic:", evaluate_model(X_basic, y, dt))
print("DT – Engineered:", evaluate_model(X_engineered, y, dt))


DT – Basic: (0.7988826815642458, 0.7313432835820896)
DT – Engineered: (0.8100558659217877, 0.7424242424242424)


In [9]:
# One 5-nearest-neighbour estimator serves both runs; evaluate_model refits it
# on every call.
knn = KNeighborsClassifier(n_neighbors=5)

# Engineered columns passed through unchanged.
knn_raw_scores = evaluate_model(X_engineered, y, knn, scale=False)
print("KNN – Engineered (no scaling):", knn_raw_scores)

# Same columns, standardised inside evaluate_model before fitting.
knn_scaled_scores = evaluate_model(X_engineered, y, knn, scale=True)
print("KNN – Engineered + Scaling:", knn_scaled_scores)


KNN – Engineered (no scaling): (0.8212290502793296, 0.7714285714285715)
KNN – Engineered + Scaling: (0.7988826815642458, 0.7391304347826086)
