# Model Persistence and Pipeline Management

This notebook demonstrates the complete model persistence system, including:
- Saving and loading complete ML pipelines
- Model versioning and metadata management
- Model validation and integrity checks
- Deployment preparation
- Performance benchmarking

In [2]:
import sys
import os
sys.path.append('../src')

import numpy as np
import pandas as pd
import joblib
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Import our custom modules
from model_persistence import ModelPersistence, create_production_model
from data_processor import DataProcessor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Set up plotting: select matplotlib's 'default' style sheet and make seaborn's
# "husl" palette the active color palette for the figures drawn below.
plt.style.use('default')
sns.set_palette("husl")
# IPython magic (not plain Python): render matplotlib figures inline in the notebook.
%matplotlib inline

## 1. Initialize Model Persistence System

In [None]:
# Build the persistence manager rooted at ../models and report the directories it exposes.
persistence = ModelPersistence(base_dir="../models", log_level="INFO")

print("Model Persistence System Initialized")

# Human-readable label -> directory attribute, printed in a fixed order.
storage_locations = {
    "Base directory": persistence.base_dir,
    "Production directory": persistence.production_dir,
    "Versions directory": persistence.versions_dir,
    "Metadata directory": persistence.metadata_dir,
}
for _label, _location in storage_locations.items():
    print(f"{_label}: {_location}")

## 2. Load and Prepare Data for Demonstration

In [None]:
# Prefer the already-processed train/test CSVs; when either file is missing,
# fall back to running the DataProcessor on the raw dataset.
try:
    train_data = pd.read_csv('../data/processed/heart_disease_train.csv')
    test_data = pd.read_csv('../data/processed/heart_disease_test.csv')
except FileNotFoundError:
    print("Creating sample data for demonstration...")

    # Read the raw CSV, then let the processor produce the train/test split.
    processor = DataProcessor()
    data = processor.load_data('../data/raw/heart_disease.csv')
    processed_data = processor.preprocess_data(test_size=0.2, random_state=42)

    X_train, X_test = processed_data['X_train'], processed_data['X_test']
    y_train, y_test = processed_data['y_train'], processed_data['y_test']

    print(f"Created sample data: {X_train.shape[0]} training samples, {X_test.shape[0]} test samples")
else:
    # Both files were read: separate the 'target' label column from the features.
    X_train, y_train = train_data.drop(columns='target'), train_data['target']
    X_test, y_test = test_data.drop(columns='target'), test_data['target']

    print(f"Loaded existing data: {X_train.shape[0]} training samples, {X_test.shape[0]} test samples")

# Quick look at what was loaded: feature names and class balance per split.
print(f"Feature columns: {list(X_train.columns)}")
for _split_name, _labels in (("Train", y_train), ("Test", y_test)):
    print(f"Target distribution - {_split_name}: {_labels.value_counts().to_dict()}")

## 3. Train a Sample Model for Demonstration

In [None]:
# Train a sample Random Forest model on the SAME representation it will see at
# inference time.
#
# The scaler below is saved together with the model as the "complete pipeline".
# The model must therefore be fitted on the scaler's output; fitting it on raw
# features while serving scaled features would be a train/serve mismatch.
# NOTE(review): this assumes ModelPersistence.save_complete_pipeline applies the
# preprocessing pipeline before the model at predict time - confirm in src/.

# Create the preprocessing pipeline and fit it on the training data only, so no
# test-set statistics leak into the scaler.
preprocessing_pipeline = Pipeline([
    ('scaler', StandardScaler())
])
X_train_scaled = preprocessing_pipeline.fit_transform(X_train)
X_test_scaled = preprocessing_pipeline.transform(X_test)

print("Preprocessing pipeline created and fitted.")

print("Training a sample Random Forest model...")

# Create and train the model on the preprocessed features
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)

model.fit(X_train_scaled, y_train)

# Evaluate on the preprocessed features, mirroring what the saved pipeline feeds the model
train_accuracy = model.score(X_train_scaled, y_train)
test_accuracy = model.score(X_test_scaled, y_test)

print("Model trained successfully!")
print(f"Training accuracy: {train_accuracy:.4f}")
print(f"Test accuracy: {test_accuracy:.4f}")

## 4. Save Complete Pipeline

In [None]:
# Persist the trained model and its preprocessing pipeline under one name,
# together with a metadata record describing how the model was produced.
model_name = "random_forest_demo"

# Scores and dataset dimensions measured in the training cell.
performance_metrics = {
    'train_accuracy': train_accuracy,
    'test_accuracy': test_accuracy,
    'n_train_samples': len(X_train),
    'n_test_samples': len(X_test),
    'n_features': X_train.shape[1]
}

# Provenance: algorithm, its hyperparameters and the time this record was built.
training_info = {
    'algorithm': 'RandomForestClassifier',
    'hyperparameters': model.get_params(),
    'training_time': datetime.now().isoformat(),
    'data_source': 'heart_disease_dataset'
}

metadata = {
    'performance_metrics': performance_metrics,
    'training_info': training_info,
    'source': 'demonstration_notebook'
}

saved_files = persistence.save_complete_pipeline(
    model=model,
    preprocessing_pipeline=preprocessing_pipeline,
    model_name=model_name,
    metadata=metadata
)

# List every artifact reported by the persistence layer.
print("Complete pipeline saved successfully!")
for file_type, file_path in saved_files.items():
    print(f"  {file_type}: {file_path}")

## 5. Load and Validate Pipeline

In [None]:
# Reload the pipeline saved under `model_name` along with its metadata record.
loaded_pipeline, loaded_metadata = persistence.load_pipeline(model_name)

print("Pipeline loaded successfully!")
# Metadata fields may be absent, so each one falls back to 'Unknown'.
for _label, _key in (
    ("Model type", 'model_type'),
    ("Version", 'version'),
    ("Timestamp", 'timestamp'),
):
    print(f"{_label}: {loaded_metadata.get(_key, 'Unknown')}")

# Score the reloaded pipeline on the held-out split.
test_predictions = loaded_pipeline.predict(X_test.values)
prediction_is_correct = test_predictions == y_test
test_accuracy_loaded = prediction_is_correct.mean()

# A round-trip through disk should not change the accuracy at all.
accuracy_gap = abs(test_accuracy_loaded - test_accuracy)

print(f"\nLoaded pipeline test accuracy: {test_accuracy_loaded:.4f}")
print(f"Original model test accuracy: {test_accuracy:.4f}")
print(f"Accuracy match: {accuracy_gap < 1e-10}")

## 6. Model Validation

In [None]:
# Run the persistence layer's validation on the saved model, passing the
# held-out split as plain arrays, then print a structured report.
validation_results = persistence.validate_saved_model(
    model_name=model_name,
    test_data=(X_test.values, y_test.values)
)

print("Model Validation Results:")
overall_status = 'PASSED' if validation_results['is_valid'] else 'FAILED'
print(f"Overall validation: {overall_status}")

# The passed-checks section is always printed, even when it is empty.
passed_checks = validation_results['checks_passed']
print(f"\nChecks passed ({len(passed_checks)}):")
for check in passed_checks:
    print(f"  ✓ {check}")

# Failed checks and warnings are only reported when there is something to show.
for _heading, _key, _marker in (
    ("Checks failed", 'checks_failed', '✗'),
    ("Warnings", 'warnings', '⚠'),
):
    _entries = validation_results[_key]
    if _entries:
        print(f"\n{_heading} ({len(_entries)}):")
        for _entry in _entries:
            print(f"  {_marker} {_entry}")

# Performance figures are optional in the validation result.
if 'test_performance' in validation_results:
    perf = validation_results['test_performance']
    print(f"\nTest performance: {perf['accuracy']:.4f} accuracy on {perf['n_samples']} samples")

## 7. Create Prediction Pipeline

In [None]:
# Build the prediction wrapper for the saved model and exercise it with the
# same test row expressed as an array, a one-row DataFrame and a plain list.
prediction_pipeline = persistence.create_prediction_pipeline(model_name)

print("Prediction pipeline created!")
print(f"Pipeline info: {prediction_pipeline.get_info()}")

print("\nTesting prediction pipeline with different input formats:")

# One test row, in three containers.
first_test_row = X_test.iloc[0]
sample_data = first_test_row.values      # numpy array
sample_df = X_test.iloc[[0]]             # single-row DataFrame
sample_list = first_test_row.tolist()    # plain Python list

pred_array = prediction_pipeline.predict(sample_data)
print(f"Numpy array input: {pred_array}")

pred_df = prediction_pipeline.predict(sample_df)
print(f"DataFrame input: {pred_df}")

pred_list = prediction_pipeline.predict(sample_list)
print(f"List input: {pred_list}")

# Probability output is optional: an AttributeError is reported instead of
# stopping the notebook.
try:
    pred_proba = prediction_pipeline.predict_proba(sample_data)
except AttributeError as e:
    print(f"Probability predictions not available: {e}")
else:
    print(f"Probability predictions: {pred_proba}")

## 8. Model Versioning

In [None]:
# Create another version of the model with different parameters and save it
# under the same model name so the persistence layer records a new version.
print("Creating a second version of the model...")

# The model is saved together with `preprocessing_pipeline`, so it has to be
# trained and scored on that pipeline's output; fitting on raw features while
# serving scaled features would be a train/serve mismatch.
# NOTE(review): assumes the saved pipeline applies the preprocessing step
# before the model at predict time - confirm in ModelPersistence.
X_train_prepared = preprocessing_pipeline.transform(X_train)
X_test_prepared = preprocessing_pipeline.transform(X_test)

# Train a larger, deeper forest than the first version
model_v2 = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

model_v2.fit(X_train_prepared, y_train)
train_accuracy_v2 = model_v2.score(X_train_prepared, y_train)
test_accuracy_v2 = model_v2.score(X_test_prepared, y_test)

# Metadata for the second version (same layout as the first version's record)
metadata_v2 = {
    'performance_metrics': {
        'train_accuracy': train_accuracy_v2,
        'test_accuracy': test_accuracy_v2,
        'n_train_samples': len(X_train),
        'n_test_samples': len(X_test),
        'n_features': X_train.shape[1]
    },
    'training_info': {
        'algorithm': 'RandomForestClassifier',
        'hyperparameters': model_v2.get_params(),
        'training_time': datetime.now().isoformat(),
        'data_source': 'heart_disease_dataset',
        'version_notes': 'Increased n_estimators and max_depth'
    },
    'source': 'demonstration_notebook_v2'
}

saved_files_v2 = persistence.save_complete_pipeline(
    model=model_v2,
    preprocessing_pipeline=preprocessing_pipeline,
    model_name=model_name,
    metadata=metadata_v2
)

print(f"Second version saved with test accuracy: {test_accuracy_v2:.4f}")

# Get versioning information for every stored version of this model
version_info = persistence.model_versioning(model_name)

print("\nModel Versioning Information:")
print(f"Model name: {version_info['model_name']}")
print(f"Total versions: {version_info['total_versions']}")
print(f"Latest version: {version_info['latest_version']}")
print(f"Available versions: {version_info['available_versions']}")

print("\nVersion History:")
for version in version_info['version_history']:
    print(f"  Version {version['version']}:")
    print(f"    Timestamp: {version['timestamp']}")
    print(f"    Model type: {version['model_type']}")
    print(f"    File size: {version['file_size_mb']:.2f} MB")
    # Performance is only printed when the history entry carries a non-empty value
    if version['performance']:
        print(f"    Performance: {version['performance']}")
    print()

## 9. Export Model for Deployment

In [None]:
# Ask the persistence layer for a joblib deployment export of the model and
# list what it produced.
exported_files = persistence.export_model_for_deployment(
    model_name=model_name,
    export_format='joblib'
)

print("Model exported for deployment!")
for file_type, file_path in exported_files.items():
    print(f"  {file_type}: {file_path}")

# The exported model file's parent folder is treated as the deployment directory.
deployment_dir = Path(exported_files['model']).parent
print(f"\nDeployment directory contents ({deployment_dir}):")

bytes_per_mb = 1024 * 1024
for file_path in deployment_dir.iterdir():
    # Sub-directories are skipped; only regular files are sized.
    if not file_path.is_file():
        continue
    size_mb = file_path.stat().st_size / bytes_per_mb
    print(f"  {file_path.name}: {size_mb:.2f} MB")

## 10. Benchmark Model Loading Performance

In [None]:
# Benchmark how long it takes to load the saved model, print the summary
# statistics returned by the persistence layer, and plot them.
benchmark_results = persistence.benchmark_model_loading(
    model_name=model_name,
    n_iterations=10
)

mean_loading_time = benchmark_results['mean_loading_time']
std_loading_time = benchmark_results['std_loading_time']
min_loading_time = benchmark_results['min_loading_time']
max_loading_time = benchmark_results['max_loading_time']
median_loading_time = benchmark_results['median_loading_time']

print("Model Loading Benchmark Results:")
print(f"Model: {benchmark_results['model_name']} v{benchmark_results['version']}")
print(f"Iterations: {benchmark_results['n_iterations']}")
print(f"Mean loading time: {mean_loading_time:.4f}s")
print(f"Std loading time: {std_loading_time:.4f}s")
print(f"Min loading time: {min_loading_time:.4f}s")
print(f"Max loading time: {max_loading_time:.4f}s")
print(f"Median loading time: {median_loading_time:.4f}s")
print(f"Total benchmark time: {benchmark_results['total_benchmark_time']:.4f}s")

# Visualize loading times using only the statistics the benchmark reported.
# A histogram would need the per-iteration timings; repeating the mean N times
# (as an earlier version of this cell did) fabricates a distribution, so the
# left panel shows mean +/- std with min / median / max reference lines instead.
fig, (ax_spread, ax_stats) = plt.subplots(1, 2, figsize=(10, 6))

ax_spread.errorbar(
    [0], [mean_loading_time], yerr=[std_loading_time],
    fmt='o', capsize=8, color='steelblue',
    label=f'Mean ± std: {mean_loading_time:.4f}s ± {std_loading_time:.4f}s'
)
ax_spread.axhline(min_loading_time, color='orange', linestyle=':',
                  label=f'Min: {min_loading_time:.4f}s')
ax_spread.axhline(median_loading_time, color='green', linestyle='-.',
                  label=f'Median: {median_loading_time:.4f}s')
ax_spread.axhline(max_loading_time, color='red', linestyle='--',
                  label=f'Max: {max_loading_time:.4f}s')
ax_spread.set_xlim(-1, 1)
ax_spread.set_xticks([])  # the x position carries no meaning in this panel
ax_spread.set_ylabel('Loading Time (seconds)')
ax_spread.set_title('Model Loading Time Spread')
ax_spread.legend()

metrics = ['Mean', 'Median', 'Min', 'Max']
values = [
    mean_loading_time,
    median_loading_time,
    min_loading_time,
    max_loading_time
]
ax_stats.bar(metrics, values, color=['skyblue', 'lightgreen', 'orange', 'lightcoral'])
ax_stats.set_ylabel('Loading Time (seconds)')
ax_stats.set_title('Loading Time Statistics')
ax_stats.tick_params(axis='x', labelrotation=45)

fig.tight_layout()
plt.show()

## 11. Create Production Model Files

In [None]:
# Generate the production artifacts for the model and show the JSON info file
# that is written alongside them.
production_files = create_production_model(
    persistence=persistence,
    model_name=model_name
)

print("Production model files created!")
for file_type, file_path in production_files.items():
    print(f"  {file_type}: {file_path}")

# Parse the model-info JSON document produced above.
model_info = json.loads(Path(production_files['model_info']).read_text())

print("\nProduction Model Information:")
for _label, _key in (
    ("Model name", 'model_name'),
    ("Version", 'version'),
    ("Model type", 'model_type'),
    ("Production timestamp", 'production_timestamp'),
):
    print(f"{_label}: {model_info[_key]}")

# Performance metrics are optional in the info file.
if 'performance_metrics' in model_info:
    print(f"Performance metrics: {model_info['performance_metrics']}")

print("\nUsage instructions:")
for instruction_type, instruction in model_info['usage_instructions'].items():
    print(f"  {instruction_type}: {instruction}")

## 12. Test Production Model

In [None]:
# Load the production artifacts straight from disk with joblib and check that
# the complete pipeline still predicts on the held-out split.
print("Testing production model files...")

final_model = joblib.load(production_files['final_model'])
print(f"Final model loaded: {type(final_model).__name__}")

complete_pipeline = joblib.load(production_files['complete_pipeline'])
print(f"Complete pipeline loaded: {type(complete_pipeline).__name__}")

# Spot-check the first five test rows against their true labels.
sample_predictions = complete_pipeline.predict(X_test.iloc[:5].values)
actual_labels = y_test.iloc[:5].values

print("\nSample predictions vs actual:")
for _sample_no, (_predicted, _expected) in enumerate(zip(sample_predictions, actual_labels), start=1):
    _verdict = '✓' if _predicted == _expected else '✗'
    print(f"  Sample {_sample_no}: Predicted={_predicted}, Actual={_expected}, Match={_verdict}")

# Accuracy over the whole test split.
final_predictions = complete_pipeline.predict(X_test.values)
prediction_hits = final_predictions == y_test.values
final_accuracy = prediction_hits.mean()
print(f"\nFinal production model accuracy: {final_accuracy:.4f}")

## 13. Summary and Visualization

In [None]:
# Create a comprehensive 2x2 summary figure from results computed in earlier
# cells (accuracies, saved artifact paths, benchmark and validation results),
# then print a textual recap.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Model Persistence System Summary', fontsize=16, fontweight='bold')

# 1. Accuracy of the two models trained in this run.
# The bars are deliberately NOT keyed on version_info['available_versions']:
# that list grows every time the notebook is re-run against the same models
# directory, and pairing it with exactly two accuracies makes bar() fail.
model_labels = ['Baseline (run model 1)', 'Tuned (run model 2)']
accuracies = [test_accuracy, test_accuracy_v2]

axes[0, 0].bar(model_labels, accuracies, color=['skyblue', 'lightgreen'])
axes[0, 0].set_xlabel('Model (trained in this run)')
axes[0, 0].set_ylabel('Test Accuracy')
axes[0, 0].set_title('Model Versions Performance')
axes[0, 0].set_ylim(0, 1.1)  # leave headroom for the value labels above the bars
axes[0, 0].grid(True, alpha=0.3)

# Add accuracy values on bars
for _position, _accuracy in enumerate(accuracies):
    axes[0, 0].text(_position, _accuracy + 0.01, f'{_accuracy:.4f}', ha='center', va='bottom')

# 2. Real on-disk size of each artifact written when the second version was
# saved (no placeholder numbers). Entries that are not paths to existing files
# are skipped.
file_types = []
file_sizes = []
for _artifact_kind, _artifact_path in saved_files_v2.items():
    if not isinstance(_artifact_path, (str, os.PathLike)):
        continue
    _artifact = Path(_artifact_path)
    if _artifact.is_file():
        file_types.append(_artifact_kind)
        file_sizes.append(_artifact.stat().st_size / (1024 * 1024))

if sum(file_sizes) > 0:
    axes[0, 1].pie(file_sizes, labels=file_types, autopct='%1.1f%%', startangle=90)
else:
    # pie() cannot draw an all-zero / empty series; say so instead of failing.
    axes[0, 1].text(0.5, 0.5, 'No artifact files found on disk',
                    ha='center', va='center', transform=axes[0, 1].transAxes)
    axes[0, 1].axis('off')
axes[0, 1].set_title('Model File Size Distribution')

# 3. Loading performance
loading_metrics = ['Mean', 'Median', 'Min', 'Max']
loading_values = [
    benchmark_results['mean_loading_time'],
    benchmark_results['median_loading_time'],
    benchmark_results['min_loading_time'],
    benchmark_results['max_loading_time']
]

bars = axes[1, 0].bar(loading_metrics, loading_values,
                      color=['skyblue', 'lightgreen', 'orange', 'lightcoral'])
axes[1, 0].set_ylabel('Loading Time (seconds)')
axes[1, 0].set_title('Model Loading Performance')
axes[1, 0].grid(True, alpha=0.3)

# Add values on bars
for bar, value in zip(bars, loading_values):
    axes[1, 0].text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.0001,
                    f'{value:.4f}s', ha='center', va='bottom')

# 4. Validation results
validation_categories = ['Passed', 'Failed', 'Warnings']
validation_counts = [
    len(validation_results['checks_passed']),
    len(validation_results['checks_failed']),
    len(validation_results['warnings'])
]

colors = ['green', 'red', 'orange']
bars = axes[1, 1].bar(validation_categories, validation_counts, color=colors, alpha=0.7)
axes[1, 1].set_ylabel('Number of Checks')
axes[1, 1].set_title('Model Validation Results')
axes[1, 1].grid(True, alpha=0.3)

# Add values on bars (zero-height bars get no label)
for bar, count in zip(bars, validation_counts):
    if count > 0:
        axes[1, 1].text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.1,
                        str(count), ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Print final summary
print("\n" + "=" * 70)
print("MODEL PERSISTENCE DEMONSTRATION COMPLETED SUCCESSFULLY!")
print("=" * 70)
print(f"✓ Created and saved {version_info['total_versions']} model versions")
print(f"✓ Validated model integrity with {len(validation_results['checks_passed'])} checks passed")
print("✓ Created production-ready model files")
print("✓ Exported deployment package")
print(f"✓ Benchmarked loading performance: {benchmark_results['mean_loading_time']:.4f}s average")
print(f"✓ Final model accuracy: {final_accuracy:.4f}")
print("\nAll model persistence features demonstrated successfully!")