
# 🚢 Titanic Survival Prediction – Full Machine Learning Pipeline

This notebook includes:
- Advanced Feature Engineering
- Multiple Models (Logistic Regression, Random Forest, Gradient Boosting)
- Cross-Validation
- ROC-AUC Evaluation
- Feature Importance
- Kaggle Submission Generation

Goal: End-to-end, portfolio-ready ML project.


In [None]:

# ===============================
# Imports
# ===============================
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, RocCurveDisplay

# Select seaborn's "whitegrid" style for the plots drawn later in this notebook.
sns.set(style="whitegrid")


## 1. Load Dataset

In [None]:

# Read the labelled passenger table from the Kaggle input directory and
# preview the first rows as the cell output.
titanic_csv_path = "/kaggle/input/titanic-dataset/Titanic-Dataset.csv"
df = pd.read_csv(titanic_csv_path)
df.head()


## 2. Initial Cleaning

In [None]:

# Drop columns that are never listed as model inputs further down.
# Reassigning (rather than inplace=True) together with errors='ignore' makes
# this cell safe to re-run: a second execution is a no-op instead of a KeyError.
df = df.drop(columns=['Cabin', 'Ticket'], errors='ignore')


## 3. Advanced Feature Engineering

In [None]:

# Extract the title from the name: the run of letters that follows a space and
# is terminated by a period (e.g. "Mr" in "Smith, Mr. John").
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being an invalid Python string escape (which newer
# Python versions warn about).
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse infrequent titles into a single 'Rare' bucket ...
df['Title'] = df['Title'].replace(
    ['Lady', 'Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'],
    'Rare'
)
# ... and fold spelling variants into their common equivalents.
df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Family features: the passenger counts as one member of their own family.
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
# 1 when the passenger travels with no siblings/spouse/parents/children, else 0.
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

# Name has served its purpose once Title is extracted.
df.drop(columns=['Name'], inplace=True)
df.head()


## 4. Define Features & Target

In [None]:

# Target vector: the survival label. Feature frame: every remaining column.
y = df['Survived']
X = df.drop(columns=['Survived'])


## 5. Preprocessing

In [None]:

# Column groups routed to each preprocessing branch. Columns not listed in any
# group are dropped by ColumnTransformer (its default remainder behaviour).
numeric_features = ['Age', 'Fare', 'SibSp', 'Parch', 'FamilySize']
categorical_features = ['Sex', 'Embarked', 'Pclass', 'Title']
# IsAlone is engineered earlier as a 0/1 flag but was previously not listed in
# any group, so it was silently discarded before reaching the models. It is
# derived from a comparison, so it never contains NaN and needs no imputing
# or scaling; it is passed through unchanged.
binary_features = ['IsAlone']

# Numeric branch: fill gaps with the column median, then standardise.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# NOTE(review): with drop='first', a category unseen during fit is encoded as
# all zeros, i.e. indistinguishable from the dropped first category - confirm
# this is acceptable for rare titles.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
    ('bin', 'passthrough', binary_features)
])


## 6. Train/Test Split

In [None]:

# Hold out 20% of the rows for final evaluation; stratifying on the label keeps
# the class ratio the same in both parts, and the fixed seed makes the split
# repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


## 7. Models Definition

In [None]:

# Candidate classifiers keyed by display name; insertion order is the order in
# which they are reported during cross-validation.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000)
models["Random Forest"] = RandomForestClassifier(n_estimators=200, random_state=42)
models["Gradient Boosting"] = GradientBoostingClassifier(random_state=42)


## 8. Cross-Validation (ROC-AUC)

In [None]:

# Shuffled, stratified 5-fold splitter with a fixed seed so every candidate is
# scored on identical folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, clf in models.items():
    # Wrapping preprocessing and model in one pipeline means the imputers,
    # scaler and encoder are fitted on the training part of each fold only.
    pipe = Pipeline([
        ('preprocessing', preprocessor),
        ('model', clf)
    ])
    scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring='roc_auc')
    # Report mean and standard deviation across folds (the plus/minus sign was
    # previously mis-encoded and printed as garbage characters).
    print(f"{name}: ROC-AUC = {scores.mean():.3f} ± {scores.std():.3f}")


## 9. Train Best Model (Random Forest)

In [None]:

# Assemble the final pipeline: shared preprocessing followed by a 300-tree
# random forest with a fixed seed.
forest = RandomForestClassifier(n_estimators=300, random_state=42)
best_model = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', forest),
])

# Fit on the training split, then score the held-out split.
best_model.fit(X_train, y_train)

# Probability of the positive class (second column) and hard class labels.
y_prob = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)


## 10. Evaluation

In [None]:

# Hold-out metrics: accuracy uses the hard labels, ROC-AUC uses the predicted
# positive-class probabilities.
holdout_accuracy = accuracy_score(y_test, y_pred)
holdout_auc = roc_auc_score(y_test, y_prob)

print("Accuracy:", holdout_accuracy)
print("ROC-AUC:", holdout_auc)

# Per-class precision / recall / F1 summary.
per_class_report = classification_report(y_test, y_pred)
print(per_class_report)


## 11. Confusion Matrix

In [None]:

# Confusion matrix on the held-out split: rows are true labels, columns are
# predicted labels.
cm = confusion_matrix(y_test, y_pred)

# Draw on an explicit Axes and label it, so the figure can be read on its own
# (without labels it is ambiguous which axis is the ground truth).
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion Matrix')
plt.show()


## 12. ROC Curve

In [None]:

# Plot the ROC curve for the held-out split from the true labels and the
# predicted positive-class probabilities.
roc_display = RocCurveDisplay.from_predictions(y_test, y_prob)
plt.show()


## 13. Kaggle Submission File

In [None]:

# Train on full data: refit the chosen pipeline on every labelled row before
# predicting the competition test set.
best_model.fit(X, y)

test_df = pd.read_csv("/kaggle/input/titanic/test.csv")

# Apply exactly the same Title engineering that was applied to the training
# data. Previously only the raw extraction was done here, so un-normalised
# titles (rare ones, and the Mlle/Ms/Mme variants) reached the encoder as
# categories it had never seen during fit.
# The regex is a raw string so `\.` is an escaped dot, not an invalid escape.
test_df['Title'] = test_df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
test_df['Title'] = test_df['Title'].replace(
    ['Lady', 'Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'],
    'Rare'
)
test_df['Title'] = test_df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Same family features as in training.
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1
test_df['IsAlone'] = (test_df['FamilySize'] == 1).astype(int)

# errors='ignore' tolerates a test file that lacks any of these columns.
test_df.drop(columns=['Cabin', 'Ticket', 'Name'], inplace=True, errors='ignore')

predictions = best_model.predict(test_df)

# Two-column submission frame: passenger id and predicted label.
submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": predictions
})

submission.to_csv("submission.csv", index=False)
submission.head()



## ✅ Final Notes

This project demonstrates:
- Real feature engineering
- Model comparison with CV
- Robust evaluation (ROC-AUC)
- Production-ready pipeline
- Kaggle-ready submission

This is **portfolio-grade** work.
