In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [13]:
# -------------------- LOAD DATA --------------------
# Read the dataset from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="titanic_5000_missing_age.csv")

# Preview the first five rows, then the per-column count of missing values
# (this is what tells us which columns need imputation later on).
print(df.head(n=5), df.isna().sum(), sep="\n")

         Age  Pclass      Fare  Survived
0   5.939577       3   51.3938         0
1        NaN       3  391.5619         1
2  25.695244       3  495.7963         0
3   6.694663       1  267.9434         1
4   1.995994       3  342.9759         1
Age         699
Pclass        0
Fare          0
Survived      0
dtype: int64


In [14]:
# -------------------- TRAIN-TEST SPLIT --------------------
# The target is the "Survived" column; every remaining column is a feature.
y = df["Survived"]
X = df.drop(columns=["Survived"])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Sanity check: (rows, columns) of the training and test feature frames.
tuple(part.shape for part in (X_train, X_test))

((4000, 3), (1000, 3))

In [17]:
# -------------------- FEATURE TYPES --------------------
# Column groups used to route features to their preprocessing pipelines:
# Age and Fare are handled as numeric columns, Pclass as a categorical one.
numeric_features, categorical_features = ["Age", "Fare"], ["Pclass"]

In [18]:
# -------------------- NUMERIC PIPELINE --------------------
# Two stages applied in order to the numeric columns:
#   1. "imputer" - replace missing values with the column median
#   2. "scaler"  - scale the imputed values
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [19]:
# -------------------- CATEGORICAL PIPELINE --------------------
# Two stages applied in order to the categorical columns:
#   1. "imputer" - replace missing values with the most frequent category
#   2. "onehot"  - expand each category into its own indicator column;
#                  handle_unknown="ignore" avoids an error on categories
#                  that were not seen during fitting
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [20]:
# -------------------- COLUMN TRANSFORMER --------------------
# Route each column group through its own pipeline and concatenate the results.
# remainder="drop" is the library default, spelled out here for clarity:
# any column not listed in a group is discarded.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)


In [21]:
# -------------------- FULL PIPELINE --------------------
# Chain preprocessing and the classifier into one estimator so that a single
# fit/predict call runs both stages.
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", LogisticRegression(max_iter=200)),
    ]
)


In [22]:
# -------------------- TRAIN MODEL & PREDICT --------------------
# fit() returns the fitted pipeline itself, so training on the training split
# and predicting on the held-out split can be chained in one statement.
y_pred = clf.fit(X_train, y_train).predict(X_test)


In [23]:
# -------------------- EVALUATE --------------------
# Compare the held-out labels with the predictions: overall accuracy first,
# then the per-class precision / recall / f1 table.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)

Accuracy: 0.525

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.61      0.55       474
           1       0.56      0.45      0.50       526

    accuracy                           0.53      1000
   macro avg       0.53      0.53      0.52      1000
weighted avg       0.53      0.53      0.52      1000



In [24]:
# -------------------- CHECK TRAINED COLUMNS --------------------
# Look up the fitted preprocessing step by name and list the column names it
# produces after imputation, scaling and one-hot encoding.
print(
    "\nTransformed Feature Names:",
    clf["preprocessor"].get_feature_names_out(),
    sep="\n",
)


Transformed Feature Names:
['num__Age' 'num__Fare' 'cat__Pclass_1' 'cat__Pclass_2' 'cat__Pclass_3']
