In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the Kaggle Titanic CSVs into DataFrames (training split, then test split).
train_df = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/train.csv")
test_df = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/test.csv")

In [3]:
# Separate the label column from the predictors.
# y: the "Survived" column; X: every remaining column of train_df.
y = train_df["Survived"]
X = train_df.drop(columns=["Survived"])

In [4]:
# Hold out 20% of the rows for validation.
# stratify=y keeps the Survived class ratio the same in both parts (the classes
# are imbalanced); random_state=42 makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [5]:
# Pre-processing: decide which transformer each column goes through, by dtype.
# Select by dtype family rather than by exact dtype: a column that ends up in
# neither list is silently dropped by the ColumnTransformer built from these
# lists (its default is remainder="drop"). "number" also covers int32/float32/
# nullable numerics, and "category" covers pandas categoricals alongside plain
# object columns. For a frame holding only int64/float64/object columns the
# selection is identical to listing those dtypes explicitly.
# NOTE(review): dtype-based selection also picks up identifier-like columns
# (presumably PassengerId as numeric and Name/Ticket/Cabin as categorical in the
# Kaggle Titanic schema — verify); consider dropping them from X explicitly.
numeric_features = X_train.select_dtypes(include="number").columns
categorical_features = X_train.select_dtypes(include=["object", "category"]).columns

In [6]:
# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),  # scaling was added to fix the ConvergenceWarning
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown="ignore" means a category not seen during fit
# is encoded as all zeros at transform time instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

In [7]:
# Combined preprocessing step: each column group goes through its own transformer.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [8]:
# Classifier: logistic regression; max_iter=1000 caps the number of solver iterations.
model = LogisticRegression(max_iter=1000)

# End-to-end estimator: preprocessing followed by the classifier, so both are
# fitted together whenever clf is fitted.
clf = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

In [9]:
# 5-fold stratified cross-validation. Shuffling with a fixed seed keeps the
# fold assignment reproducible across runs.
# (StratifiedKFold and cross_val_score are imported in the notebook's top import cell.)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Run cross-validation on the full pipeline over all labelled rows.
# cross_val_score fits a fresh clone of clf on each training fold, so the
# imputers, scaler and encoder are learned from that fold's training rows only.
# scores holds one accuracy value per fold.
scores = cross_val_score(clf, X, y, cv=cv, scoring="accuracy")

In [10]:
np.mean(scores)  # mean accuracy over the CV folds: the baseline score to beat

0.8069612704789403

In [11]:
np.std(scores)  # fold-to-fold spread of accuracy (population std, ddof=0)

0.0332360185803288

## Cross-Validation Results

- Number of folds: 5
- Metric: Accuracy
- Mean CV accuracy: 0.807
- Standard deviation: 0.0332

Observations:
- CV score vs single split comparison
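      With 5-fold stratified cross-validation, each fold holds out 20% of the data. A standard deviation of about 3.3 percentage points across the folds is reasonably small: the score on any single 20% hold-out would typically fall between roughly 0.77 and 0.84 (mean ± 1 std). This means the model isn't overly dependent on a specific 20% of the data to perform well, and the CV mean is a more trustworthy estimate than the score from one split.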
- Variance indicates stability of the model
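      The low fold-to-fold variance suggests the model generalizes consistently and is not overly sensitive to noise in the data or to which rows it happens to be trained on.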


## Leakage Check

- Are any features derived from the target? (No)
    Target leakage happens when a feature contains information that only becomes known after the target event has already occurred.
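- Is preprocessing inside the pipeline? (Yes)
    Imputation, scaling and encoding are steps of the pipeline, so they are re-fit on the training portion of each fold and never see that fold's held-out rows.
- Is validation performed outside the training data? (Yes)
    Each fold is scored only on rows that were held out from that fold's training data.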
