In [None]:
# Environment checks: confirm the artifacts this notebook reads/writes are present.
import os

data_path = '../../data/processed/cleaned_titanic.csv'
model_path = '../../backend/model.pkl'

data_found = os.path.exists(data_path)
model_found = os.path.exists(model_path)

print('Data exists:', data_found, '->', data_path)
print('Model exists:', model_found, '->', model_path)

# For each missing artifact, point at the script that produces it.
for found, hint in (
    (data_found, '\nCleaned data not found. Run: python scripts/prepare_data.py'),
    (model_found, '\nModel not found. Run: python scripts/train_model.py'),
):
    if not found:
        print(hint)


# Titanic Survival Prediction Model

This notebook implements a machine learning pipeline for predicting passenger survival on the Titanic.

## Model training and evaluation plan

In this notebook we will:

1. Load the cleaned dataset produced by `01_data_cleaning.ipynb`.
2. Create derived features (e.g., family size, titles, fare per person) that capture relevant passenger information.
3. Train a Logistic Regression pipeline (preprocessing + classifier) as the primary model for its simplicity and interpretability.
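4. Evaluate the model using accuracy, ROC AUC, a classification report, a confusion matrix, and ROC curves (Random Forest and Gradient Boosting are trained for comparison only).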
5. Save the final pipeline to `backend/model.pkl` for use by the API consumer.

Rationale: Logistic Regression provides a transparent baseline and is suitable for binary classification when features are preprocessed appropriately.

In [None]:
# Import necessary libraries (standard library, then third-party, then scikit-learn)
import json
import os
import sys
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, 
                           confusion_matrix, roc_auc_score, roc_curve, auc)
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

# Set random seed for reproducibility
np.random.seed(42)

# Set plot style
sns.set_style('whitegrid')
%matplotlib inline

In [None]:
# Load and prepare data
try:
    # Load the cleaned data
    df = pd.read_csv('../../data/processed/cleaned_titanic.csv')
except Exception as e:
    # Report the problem, then re-raise: every later cell needs `df`, so
    # continuing would only produce a confusing NameError further down.
    print(f"Error loading data: {e}")
    raise

print("Data loaded successfully!")
print(f"Shape of the dataset: {df.shape}")
print("\nFirst few rows of the dataset:")
display(df.head())

# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Basic statistics
print("\nDataset statistics:")
display(df.describe())

# Check data types
print("\nData types:")
print(df.dtypes)

In [None]:
# Feature Engineering and Preprocessing

# `backend` sits at the project root, two levels up (the same '../../' prefix
# used for the data and model paths in this notebook). Append the root to
# sys.path so the import below also works on a fresh kernel when the project
# is not installed as a package. Appending (rather than inserting at the front)
# leaves an already-importable `backend` untouched.
project_root = str(Path('..', '..').resolve())
if project_root not in sys.path:
    sys.path.append(project_root)

# Use shared create_features utility from backend so notebooks and backend share logic
from backend.utils import create_features

# Apply feature engineering.
# NOTE(review): this rebinds `df`, so re-running the cell feeds already-engineered
# data back into create_features — confirm that helper is safe to apply twice.
df = create_features(df.copy())

# Fail with a clear message if the helper stops producing a column that the
# modeling cells select later.
engineered_columns = ['FamilySize', 'IsAlone', 'Title', 'AgeGroup', 'FarePerPerson']
missing_columns = [col for col in engineered_columns if col not in df.columns]
assert not missing_columns, f"create_features did not produce: {missing_columns}"

# Display new features
print("New features created:")
display(df[engineered_columns].head())

In [None]:
# Data Preprocessing

# Target column and the model inputs (raw columns plus engineered ones)
target = 'Survived'
features = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'Embarked', 'FamilySize', 'IsAlone', 'Title', 'AgeGroup', 'FarePerPerson'
]
X, y = df[features], df[target]

# Hold out 20% of the rows for testing, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# Numeric columns: median imputation, then standardisation
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'FarePerPerson']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation, then dense one-hot encoding
# (categories unseen during fit are ignored at transform time)
categorical_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'AgeGroup', 'IsAlone']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Column names after encoding: the numeric columns first, then one
# "<column>_<value>" entry per distinct non-null training value
encoded_names = [
    f"{col}_{val}"
    for col in categorical_features
    if col in X_train.columns
    for val in sorted(X_train[col].dropna().unique())
]
feature_names = numeric_features + encoded_names

# Persist the names next to the other model artifacts
os.makedirs('../../models', exist_ok=True)
with open('../../models/feature_names.json', 'w') as names_file:
    json.dump(feature_names, names_file)

print("\nPreprocessing pipeline created.")

In [None]:
# Model Training and Evaluation

def evaluate_pipeline(fitted_pipeline, X_eval, y_eval):
    """Score a fitted classification pipeline on an evaluation set.

    Returns a tuple ``(predictions, positive_class_probabilities, accuracy, roc_auc)``.
    """
    predictions = fitted_pipeline.predict(X_eval)
    probabilities = fitted_pipeline.predict_proba(X_eval)[:, 1]
    return (
        predictions,
        probabilities,
        accuracy_score(y_eval, predictions),
        roc_auc_score(y_eval, probabilities),
    )


# Define primary model (Logistic Regression) and a short list of candidates for comparison
primary_model = LogisticRegression(max_iter=1000, random_state=42)
other_models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Train the primary pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', primary_model)
])

print("Training Logistic Regression pipeline...")
pipeline.fit(X_train, y_train)

# Evaluate
y_pred, y_proba, accuracy, roc_auc = evaluate_pipeline(pipeline, X_test, y_test)

print("Logistic Regression Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Optionally train other models for comparison (not saved as primary)
results = {
    'Logistic Regression': {'model': pipeline, 'accuracy': accuracy, 'roc_auc': roc_auc}
}
for name, model in other_models.items():
    # Pipeline does not copy its steps, so reusing `preprocessor` here would
    # refit the very object that lives inside the primary `pipeline`. Give
    # each candidate its own unfitted clone instead.
    candidate = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('classifier', model)
    ])
    candidate.fit(X_train, y_train)
    _, _, candidate_accuracy, candidate_roc_auc = evaluate_pipeline(candidate, X_test, y_test)
    results[name] = {
        'model': candidate,
        'accuracy': candidate_accuracy,
        'roc_auc': candidate_roc_auc
    }
    print(f"{name}: Accuracy = {results[name]['accuracy']:.4f}, ROC AUC = {results[name]['roc_auc']:.4f}")

In [None]:
# Model Comparison and Visualization

# Compare model performance (highest ROC AUC first)
model_comparison = pd.DataFrame({
    'Model': list(results.keys()),
    'Accuracy': [results[model_name]['accuracy'] for model_name in results],
    'ROC AUC': [results[model_name]['roc_auc'] for model_name in results]
}).sort_values('ROC AUC', ascending=False)

print("\nModel Performance Comparison:")
display(model_comparison)

# Plot ROC curves
plt.figure(figsize=(10, 8))
for name, result in results.items():
    fpr, tpr, _thresholds = roc_curve(y_test, result['model'].predict_proba(X_test)[:, 1])
    # Use a loop-local name so the Logistic Regression `roc_auc` metric
    # computed in the training cell is not overwritten.
    curve_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {curve_auc:.2f})')

plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curves')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

# Feature importance for the best model (top row of the ROC AUC ranking)
best_model_name = model_comparison.iloc[0]['Model']
best_model = results[best_model_name]['model']
best_classifier = best_model.named_steps['classifier']

if hasattr(best_classifier, 'feature_importances_'):
    # For tree-based models
    importances = best_classifier.feature_importances_

    # Get feature names after one-hot encoding. Errors are allowed to surface:
    # silently falling back to an empty list would only fail later with a
    # less helpful length-mismatch error.
    onehot_encoder = (best_model.named_steps['preprocessor']
                      .named_transformers_['cat']
                      .named_steps['onehot'])
    ohe_columns = list(onehot_encoder.get_feature_names_out(input_features=categorical_features))

    # The ColumnTransformer lists the numeric block before the categorical one
    all_feature_names = numeric_features + ohe_columns
    if len(all_feature_names) != len(importances):
        raise ValueError(
            f"Got {len(all_feature_names)} feature names for "
            f"{len(importances)} importances from {best_model_name}"
        )

    # Create feature importance DataFrame
    feature_importance = pd.DataFrame({
        'Feature': all_feature_names,
        'Importance': importances
    }).sort_values('Importance', ascending=False)

    # Plot feature importance
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15))
    plt.title(f'Feature Importance - {best_model_name}')
    plt.tight_layout()
    plt.show()

    # Display top 10 features
    print("\nTop 10 Most Important Features:")
    display(feature_importance.head(10))
else:
    # e.g. Logistic Regression has no `feature_importances_` attribute
    print(f"\n{best_model_name} does not expose feature_importances_; "
          "skipping the feature importance plot.")

In [None]:
# Save the final Logistic Regression pipeline and metadata

# Persist the pipeline in the backend folder so the consumer can load it
os.makedirs('../../backend', exist_ok=True)
model_path_backend = os.path.join('..', '..', 'backend', 'model.pkl')
joblib.dump(pipeline, model_path_backend)
print(f"Model pipeline saved to {model_path_backend}")

# Record which raw fields the API payload must carry, plus the training columns
feature_info_path = os.path.join('..', '..', 'backend', 'feature_info.json')
feature_info = dict(
    required_raw_features=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Name'],
    training_features=features,
)
with open(feature_info_path, 'w') as info_file:
    json.dump(feature_info, info_file, indent=2)
print(f"Feature info saved to {feature_info_path}")

# Keep a second copy under models/ for record keeping
os.makedirs('../../models', exist_ok=True)
joblib.dump(pipeline, '../../models/titanic_survival_model.pkl')

# Final evaluation on the held-out test set
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

summary_lines = (
    "\nFinal Model Evaluation:",
    "=" * 50,
    "Model: Logistic Regression",
    f"Accuracy: {accuracy_score(y_test, y_pred):.4f}",
    f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}",
    "\nClassification Report:",
    classification_report(y_test, y_pred),
)
for line in summary_lines:
    print(line)

# Confusion matrix heatmap (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
class_labels = ['Not Survived', 'Survived']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

print("\nModel training and evaluation complete!")

In [None]:
# Model Training for Titanic Survival Prediction — Random Forest baseline
#
# Trains a Random Forest directly on the cleaned CSV columns (no engineered
# features, no preprocessing pipeline). It relies on the names imported in the
# imports cell at the top of the notebook.
#
# The baseline is written to its own file: saving it as
# '../../models/titanic_survival_model.pkl' would overwrite the record copy of
# the Logistic Regression pipeline stored by the previous cell.
#
# NOTE(review): this cell rebinds df, X, y, the train/test splits, y_pred,
# feature_importance and feature_names from earlier cells; re-run those cells
# before reusing them.

# Create directories if they don't exist
os.makedirs('../../models', exist_ok=True)

# Load the cleaned data
df = pd.read_csv('../../data/processed/cleaned_titanic.csv')

# Prepare features and target
# NOTE(review): RandomForestClassifier needs numeric inputs; this assumes no
# string columns remain once Name/Ticket are dropped — confirm against the
# cleaned CSV.
X = df.drop(['Survived', 'Name', 'Ticket'], axis=1, errors='ignore')
y = df['Survived']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize and train the model
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42
)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Model Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

# Save the baseline under its own name. A dedicated variable is used so the
# `model_path` set in the environment-check cell keeps pointing at the backend model.
baseline_model_path = '../../models/titanic_rf_baseline.pkl'
joblib.dump(model, baseline_model_path)
print(f"\nModel saved to {baseline_model_path}")

# Save feature names for later use
feature_names = list(X.columns)
joblib.dump(feature_names, '../../models/feature_names.pkl')
print("Feature names saved.")