# Titanic Baseline EDA + Submission

Goal: complete the full Kaggle Titanic loop (load data -> baseline model -> cross-validation -> generate submission file).


## 1. Environment and Paths


In [None]:
from pathlib import Path

# Anchor every path on the parent of the current working directory, made
# absolute, and derive the input and output folders from it.
PROJECT_ROOT = Path('..').resolve()
DATA_RAW = PROJECT_ROOT.joinpath('data', 'raw')
SUBMISSIONS = PROJECT_ROOT.joinpath('submissions')

# Echo both folders as the cell output for a quick sanity check.
(DATA_RAW, SUBMISSIONS)


## 2. Load Data


In [None]:
import pandas as pd

# Read the training and test tables from the raw-data folder, in that order.
train, test = (
    pd.read_csv(DATA_RAW / csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

# Preview the first rows of the training table as the cell output.
train.head()


## 3. Basic Overview


In [None]:
# Print pandas' summary report for the training frame (DataFrame.info) to
# inspect the columns before modelling.
train.info()


In [None]:
# Count missing values per column, then list the columns with the most gaps first.
missing_counts = train.isna().sum()
missing_counts.sort_values(ascending=False)


## 4. Minimal Baseline (Gender Rule)


In [None]:
# Rule-based baseline: predict 1 for every passenger whose Sex is 'female',
# 0 for everyone else.
is_female = test['Sex'].eq('female')
submission = test[['PassengerId']].assign(Survived=is_female.astype(int))

# Make sure the output folder exists, then write the submission without the index.
SUBMISSIONS.mkdir(parents=True, exist_ok=True)
output_path = SUBMISSIONS.joinpath('submission_gender.csv')
submission.to_csv(output_path, index=False)
output_path


## 5. Pipeline Baseline (Logistic Regression + Cross-Validation)


In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Columns fed to the model, the target, and the matching test matrix.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X, y = train[features], train['Survived']
X_test = test[features]

numeric_cols = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_cols = ['Pclass', 'Sex', 'Embarked']

# Numeric branch: fill gaps with the column median.
numeric_steps = SimpleImputer(strategy='median')

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps prediction from failing on unseen values.
categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocess = ColumnTransformer(transformers=[
    ('num', numeric_steps, numeric_cols),
    ('cat', categorical_steps, categorical_cols),
])

# Preprocessing and the classifier live in one estimator so every CV fold
# refits the imputers/encoder on its own training split.
clf = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', LogisticRegression(max_iter=1000)),
])

# Shuffled, stratified 5-fold split with a fixed seed for repeatable scores.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(clf, X, y, scoring='accuracy', cv=cv)

dict(cv_mean=scores.mean(), cv_std=scores.std())


In [None]:
# Refit the pipeline on the full training set and predict the test set.
clf.fit(X, y)
pred = clf.predict(X_test)

submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': pred})

# Create the output folder here as well: to_csv does not create missing parent
# directories, so without this the cell fails unless the gender-baseline cell
# (the only other place that calls mkdir) has already been run.
SUBMISSIONS.mkdir(parents=True, exist_ok=True)
output_path = SUBMISSIONS / 'submission_lr.csv'
submission.to_csv(output_path, index=False)
output_path


## 6. Next Steps
- Incrementally add features such as Title, FamilySize, and Deck
- Keep the preprocessing and the CV splitter (same folds, same `random_state`) fixed across experiments so scores are directly comparable
- Submit only when CV shows a stable improvement


## 7. Feature Engineering (Title / FamilySize / Deck)


In [None]:
import re

# Work on copies so the frames loaded earlier stay untouched.
train_fe = train.copy()
test_fe = test.copy()

# Titles collapsed into a single 'Rare' category, and spelling variants that
# are folded into their common equivalent.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

# Apply the same derivations to both frames so train and test share one schema.
for df in [train_fe, test_fe]:
    # Title from Name: the first run of letters that follows a space and is
    # followed by a period. The pattern is a raw string so that `\.` reaches
    # the regex engine as an escaped dot instead of being an invalid string
    # escape (which newer Python versions warn about).
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')
    df['Title'] = df['Title'].replace(title_aliases)

    # FamilySize + IsAlone: siblings/spouses + parents/children + the passenger.
    df['FamilySize'] = df['SibSp'].fillna(0) + df['Parch'].fillna(0) + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Deck from Cabin: first character of the cabin string, 'U' when Cabin is missing.
    df['Deck'] = df['Cabin'].str.slice(0, 1).fillna('U')

# Preview the engineered columns as the cell output.
train_fe[['Title','FamilySize','IsAlone','Deck']].head()


### Retrain with Extended Features


In [None]:
# Original columns plus the engineered ones, taken from the *_fe frames.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title', 'FamilySize', 'IsAlone', 'Deck']
X, y = train_fe[features], train_fe['Survived']
X_test = test_fe[features]

numeric_cols = ['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone']
categorical_cols = ['Pclass', 'Sex', 'Embarked', 'Title', 'Deck']

# Numeric branch: fill gaps with the column median.
numeric_steps = SimpleImputer(strategy='median')

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps prediction from failing on unseen values.
categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocess = ColumnTransformer(transformers=[
    ('num', numeric_steps, numeric_cols),
    ('cat', categorical_steps, categorical_cols),
])

clf = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', LogisticRegression(max_iter=1000)),
])

# Same shuffled, stratified 5-fold split and seed as the baseline so the two
# accuracy figures can be compared directly.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(clf, X, y, scoring='accuracy', cv=cv)
dict(cv_mean=scores.mean(), cv_std=scores.std())


In [None]:
# Refit the extended-feature pipeline on the full training set and predict the test set.
clf.fit(X, y)
pred = clf.predict(X_test)

submission = pd.DataFrame({'PassengerId': test_fe['PassengerId'], 'Survived': pred})

# Create the output folder here as well: to_csv does not create missing parent
# directories, so without this the cell fails unless the gender-baseline cell
# (the only other place that calls mkdir) has already been run.
SUBMISSIONS.mkdir(parents=True, exist_ok=True)
output_path = SUBMISSIONS / 'submission_lr_feature_engineered.csv'
submission.to_csv(output_path, index=False)
output_path
