# Model Factory Demo

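This notebook demonstrates how to use `ModelFactory` for Titanic survival prediction: creating and comparing several model types, tuning the best one, preparing a Kaggle submission, and inspecting feature importance.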

## 1. Setup and Data Loading

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

# Add the project root to path so we can import modules.
# '../' is resolved against the current working directory, so this assumes the
# kernel was started one level below the project root — TODO confirm.
sys.path.append(os.path.abspath('../'))

# Import project modules (these rely on the sys.path entry added just above)
from src.data_processing.data_loader import DataLoader
from src.data_processing.data_preprocessor import DataPreprocessor
from src.modelling.model_factory import ModelFactory

# Set plotting style: seaborn 'whitegrid' theme and a 12x8 default figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

In [None]:
# Read the train and test splits through the project's DataLoader
data_loader = DataLoader()
train_data, test_data = data_loader.load_train_data(), data_loader.load_test_data()

# Report the dimensions of each split
for split_label, split_frame in (("Training", train_data), ("Testing", test_data)):
    print(f"{split_label} data shape: {split_frame.shape}")

# Show the first rows of the training split as the cell output
train_data.head()

## 2. Data Preprocessing

In [None]:
# Preprocess the data
preprocessor = DataPreprocessor()

# Get combined data for consistent preprocessing
combined_data, n_train_samples = data_loader.get_combined_data()

# Fit the preprocessor on the combined data.
# NOTE(review): if combined_data holds train + test rows (as the slicing below
# suggests), test-set statistics influence the fitted transforms — confirm
# this is acceptable for the intended evaluation.
processed_data = preprocessor.fit_transform(combined_data)

# Split back into train and test by row position
train_rows = processed_data[:n_train_samples]
test_rows = processed_data[n_train_samples:]

# The target must be removed from BOTH feature matrices. Previously X_test
# kept the 'Survived' column, so it had one more column than X_train and
# could not be passed to a model fitted on X_train.
X_train = train_rows.drop('Survived', axis=1)
y_train = train_rows['Survived']
X_test = test_rows.drop('Survived', axis=1)

# Guard against train/test feature drift before any model sees the data
assert list(X_test.columns) == list(X_train.columns), \
    "X_test and X_train must expose identical feature columns"

# Further split training data for model evaluation
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"X_train_final shape: {X_train_final.shape}")
print(f"X_val shape: {X_val.shape}")

## 3. Explore Model Factory

In [None]:
# Ask the factory which model types it can build and echo the answer
available_models = ModelFactory.get_available_models()
print("Available models: {}".format(available_models))

In [None]:
# Print every model type's default parameters, one "name: value" line each
for model_type in available_models:
    params = ModelFactory.get_model_default_params(model_type)
    report_lines = [f"\nDefault parameters for {model_type}:"]
    report_lines.extend(f"  {param}: {value}" for param, value in params.items())
    print("\n".join(report_lines))

## 4. Train and Evaluate a Basic Model

In [None]:
# Build a logistic regression model via the factory and fit it on the
# training portion of the split
log_reg_model = ModelFactory.create_model('logistic_regression')
log_reg_model.fit(X_train_final, y_train_final)

# Score the fitted model on the held-out validation portion
metrics = log_reg_model.evaluate(X_val, y_val)

# Report each metric to four decimal places
print("Logistic Regression Metrics:")
for metric_name in metrics:
    print(f"  {metric_name}: {metrics[metric_name]:.4f}")

In [None]:
# Get feature importance
feature_importance = log_reg_model.get_feature_importance(feature_names=X_train.columns)

# Rank features by absolute coefficient so the chart really shows the
# "Top 15" by "Coefficient Magnitude", as its title and axis label claim.
# Previously the signed values of the first 15 rows were plotted in whatever
# order get_feature_importance() returned them. `assign` returns a new frame,
# so `feature_importance` itself keeps its signed coefficients.
top_coefficients = (
    feature_importance
    .assign(coefficient=feature_importance['coefficient'].abs())
    .sort_values('coefficient', ascending=False)
    .head(15)
)

# Plot feature importance
plt.figure(figsize=(10, 8))
sns.barplot(x='coefficient', y='feature', data=top_coefficients, palette='viridis')
plt.title('Logistic Regression Feature Importance (Top 15)')
plt.xlabel('Coefficient Magnitude')
plt.tight_layout()
plt.show()

## 5. Create and Compare Multiple Models

In [None]:
# Per-model-type stores: fitted model, validation metrics, hard predictions,
# and positive-class probabilities
models, model_metrics = {}, {}
model_predictions, model_probabilities = {}, {}

# Fit and score one model per type offered by the factory
for model_type in available_models:
    print(f"Training {model_type}...")

    # Build the model, fit it on the training portion, and keep it
    model = ModelFactory.create_model(model_type)
    model.fit(X_train_final, y_train_final)
    models[model_type] = model

    # Validation metrics
    metrics = model.evaluate(X_val, y_val)
    model_metrics[model_type] = metrics

    # Validation predictions; column 1 of predict_proba is kept as the score
    val_labels = model.predict(X_val)
    model_predictions[model_type] = val_labels
    val_scores = model.predict_proba(X_val)[:, 1]
    model_probabilities[model_type] = val_scores

    accuracy_line = f"  Accuracy: {metrics['accuracy']:.4f}"
    print(accuracy_line)
    roc_auc_line = f"  ROC AUC: {metrics['roc_auc']:.4f}\n"
    print(roc_auc_line)

In [None]:
# One row per model: its type under 'model' plus every metric as a column,
# ordered from highest to lowest accuracy with a fresh 0..n-1 index
comparison_df = (
    pd.DataFrame(
        [{'model': name, **scores} for name, scores in model_metrics.items()]
    )
    .sort_values('accuracy', ascending=False)
    .reset_index(drop=True)
)

# Render the table as the cell output
comparison_df

In [None]:
# Grouped bar chart: one group per model, one bar per metric
metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
colors = sns.color_palette('viridis', len(metrics_to_plot))
bar_width = 0.15
index = np.arange(len(comparison_df))

plt.figure(figsize=(14, 8))

# Shift each metric's bars right by one bar width per position in the list
for position, (metric, bar_color) in enumerate(zip(metrics_to_plot, colors)):
    legend_label = metric.replace('_', ' ').title()
    plt.bar(
        index + position * bar_width,
        comparison_df[metric],
        bar_width,
        label=legend_label,
        color=bar_color,
    )

plt.title('Model Performance Comparison')
plt.xlabel('Model')
plt.ylabel('Score')
# Two bar widths puts the tick under the middle (third) of the five bars
plt.xticks(index + 2 * bar_width, comparison_df['model'])
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Overlay one validation ROC curve per trained model
plt.figure(figsize=(10, 8))

for model_type, val_scores in model_probabilities.items():
    fpr, tpr, _ = roc_curve(y_val, val_scores)
    roc_auc = auc(fpr, tpr)
    curve_label = '{} (AUC = {:.4f})'.format(model_type, roc_auc)
    plt.plot(fpr, tpr, lw=2, label=curve_label)

# Dashed diagonal as the chance-level reference
plt.plot([0, 1], [0, 1], 'k--', lw=2)

# Axis limits, labels and legend
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.title('ROC Curves')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

## 6. Hyperparameter Tuning Demo

In [None]:
# The model type with the highest validation ROC AUC is the one to tune
ranked_by_auc = comparison_df.sort_values('roc_auc', ascending=False)
best_model_type = ranked_by_auc.iloc[0]['model']
print(f"Tuning hyperparameters for {best_model_type}...")

# Start from a new, unfitted instance of that type
best_model = ModelFactory.create_model(best_model_type)

# Show the model's parameter grid (printed for information only; it is not
# passed to the tuning call below)
param_grid = best_model.get_param_grid()
print(f"Parameter grid: {param_grid}")

# 3-fold search scored by ROC AUC on the training portion (may take some time)
best_model.tune_hyperparameters(X_train_final, y_train_final, cv=3, scoring='roc_auc')

# Score the tuned model on the validation portion and report each metric
tuned_metrics = best_model.evaluate(X_val, y_val)
print("\nTuned Model Metrics:")
for metric_name in tuned_metrics:
    print(f"  {metric_name}: {tuned_metrics[metric_name]:.4f}")

In [None]:
# Side-by-side validation metrics for the untuned and tuned model of the
# selected type: metrics as rows, 'Original' / 'Tuned' as columns
original_metrics = model_metrics[best_model_type]
labelled_scores = {'Original': original_metrics, 'Tuned': tuned_metrics}
metric_comparison = pd.DataFrame(
    {label: pd.Series(scores) for label, scores in labelled_scores.items()}
)

print(f"Performance comparison for {best_model_type}:")
metric_comparison

## 7. Final Model Selection and Kaggle Submission Preparation

In [None]:
# Select the final model, using validation ROC AUC as the single criterion
if tuned_metrics['roc_auc'] > original_metrics['roc_auc']:
    final_model = best_model
    print(f"Selected the tuned {best_model_type} as the final model")
else:
    # Tuning did not help, so keep the untuned model of the SAME type.
    # Previously this branch took comparison_df.iloc[0], which is ordered by
    # accuracy, and so could pick a model that was never part of the ROC AUC
    # comparison above.
    best_original_type = best_model_type
    final_model = models[best_original_type]
    print(f"Selected the original {best_original_type} as the final model")

# Train the final model on the full training set.
# Note: this refits the selected object in place, so in the fallback branch
# the entry in `models` is refitted as well.
final_model.fit(X_train, y_train)

# Predict on exactly the columns (and column order) the model was fitted on.
# This raises a KeyError if a training feature is missing from X_test and
# ignores any extra column, such as a leftover 'Survived' target.
X_test_features = X_test.loc[:, X_train.columns]

# Generate predictions for the test set
test_predictions = final_model.predict(X_test_features)
assert len(test_predictions) == len(test_data), \
    "Expected exactly one prediction per test passenger"

# Create a Kaggle submission
submission_df = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': test_predictions.astype(int)
})

# Save the submission
submission_path = '../kaggle_submission.csv'
submission_df.to_csv(submission_path, index=False)
print(f"Submission saved to {submission_path}")
submission_df.head(10)

## 8. Feature Importance Analysis

In [None]:
# Get feature importance from the final model
feature_importance = final_model.get_feature_importance(feature_names=X_train.columns)

if feature_importance is not None:
    # Determine column to use based on what the model returned
    if 'importance' in feature_importance.columns:
        value_col = 'importance'
        title = 'Feature Importance'
    else:
        value_col = 'coefficient'
        # Work on a copy and rank coefficients by absolute value
        feature_importance = feature_importance.copy()
        feature_importance[value_col] = abs(feature_importance[value_col])
        title = 'Feature Coefficient Magnitude'

    # Sort once and reuse the top 20 rows for both the chart and the table
    top_features = feature_importance.sort_values(value_col, ascending=False).head(20)

    # Plot feature importance
    plt.figure(figsize=(12, 10))
    sns.barplot(x=value_col, y='feature', data=top_features, palette='viridis')
    plt.title(f'Top 20 Features by {title}')
    plt.xlabel(title)
    plt.tight_layout()
    plt.show()

    # Show the top-20 table. Jupyter only auto-renders the last top-level
    # expression of a cell, so a bare expression inside this `if` body was
    # never shown. `display` is provided by the IPython kernel without an
    # import.
    display(top_features)
else:
    print("Feature importance not available for this model type.")

## 9. Conclusion

In this notebook, we demonstrated the use of the ModelFactory to create, train, and evaluate different machine learning models for Titanic survival prediction. We:

1. Created and compared multiple model types
2. Visualized model performance metrics
3. Performed hyperparameter tuning on the best model
4. Generated a Kaggle submission file
5. Analyzed feature importance

The ModelFactory provides a standardized interface for working with different models, making it easy to experiment with various algorithms and find the best performing model for our task.