# Optimized Company Bankruptcy Prediction

This notebook demonstrates the optimized bankruptcy prediction pipeline with:
- Modular, production-ready code structure
- Advanced feature selection and hyperparameter optimization
- Ensemble methods and model optimization
- Comprehensive evaluation and visualization
- Configuration management and logging

## Key Improvements:
1. **Code Structure**: Separated concerns into modules (data, models, visualization, etc.)
2. **Performance**: Added hyperparameter tuning, feature selection, and ensemble methods
3. **Quality**: Proper error handling, logging, and configuration management
4. **Production-Ready**: API endpoints and model persistence (Docker containerization is still listed as a next step in Section 9)

In [None]:
# Import the optimized modules
import sys
import os

# Add src directory to path
# NOTE(review): 'src' is a relative entry, so it resolves against the current
# working directory. The imports below use the `src.` package prefix, which
# needs the directory *containing* `src` to be importable -- presumably the
# notebook is run from the project root. Confirm.
sys.path.append('src')

# Project-local modules. `Config` and `setup_logging` are imported here but
# are not referenced by any of the cells visible in this notebook.
from src.config import Config, get_default_config
from src.pipeline import BankruptcyPredictor
from src.optimization import ModelOptimizer, create_advanced_models
from src.utils import setup_logging

import warnings
# Installs an 'ignore' filter with no category/module restriction, i.e. it
# applies to every warning raised after this point in the session.
warnings.filterwarnings('ignore')

## 1. Configuration Setup

The optimized version uses centralized configuration management.

In [None]:
# Start from the project's default configuration object
config = get_default_config()

# Per-run overrides, applied in the order listed
overrides = {
    'feature_selection': True,
    'max_features': 50,  # Select top 50 features
    'save_models': True,
    'save_plots': True,
}
for option, setting in overrides.items():
    setattr(config, option, setting)

# Dump every attribute stored on the config instance
print("Configuration:")
for option, setting in vars(config).items():
    print(f"  {option}: {setting}")

## 2. Basic Pipeline Execution

Run the complete optimized pipeline with a single method call (`predictor.run_full_pipeline()`).

In [None]:
# Build the predictor from the configuration prepared above
predictor = BankruptcyPredictor(config)

# Run the pipeline end to end and keep whatever report it returns
print("Running optimized bankruptcy prediction pipeline...\n")
report = predictor.run_full_pipeline()

# Frame the completion message between two identical rules
banner = "=" * 60
print("\n" + banner)
print("PIPELINE COMPLETED SUCCESSFULLY")
print(banner)
print(report)

## 3. Advanced Model Optimization

Apply advanced optimization techniques including hyperparameter tuning and feature selection.

In [None]:
# Initialize model optimizer
optimizer = ModelOptimizer(config)

print("Starting advanced model optimization...")

# Model types to optimize one at a time. Each result is appended to
# `optimized_models`, which later cells read (ensemble creation, feature
# reporting, persistence).
model_types = ['logistic_regression', 'random_forest']
optimized_models = []

for model_type in model_types:
    print(f"\nOptimizing {model_type}...")

    # Both feature selection and hyperparameter search are requested.
    result = optimizer.optimize_single_model(
        model_type=model_type,
        X_train=predictor.X_train,
        y_train=predictor.y_train,
        X_test=predictor.X_test,
        optimize_features=True,
        optimize_hyperparams=True
    )

    optimized_models.append(result)

    # Only apply the numeric format when a score is actually present:
    # formatting the 'N/A' fallback string with ':.4f' raises ValueError,
    # which would abort the loop after the first model without a score.
    cv_score = result.get('best_cv_score')
    cv_score_text = 'N/A' if cv_score is None else f"{cv_score:.4f}"

    print(f"Best parameters for {model_type}: {result.get('best_params', 'N/A')}")
    print(f"Best CV score: {cv_score_text}")
    print(f"Selected features: {result.get('n_selected_features', 'N/A')}")

## 4. Ensemble Methods

Create ensemble models from the optimized individual models.

In [None]:
print("Creating ensemble models...")

# Hand the individually optimized models to the optimizer to build ensembles
ensemble_results = optimizer.create_optimized_ensemble(
    optimized_models=optimized_models,
    X_train=predictor.X_train,
    y_train=predictor.y_train,
)

# Only entries whose key contains 'ensemble' are listed and evaluated below
print("Ensemble models created:")
ensemble_models = {
    name: model for name, model in ensemble_results.items() if 'ensemble' in name
}
for name in ensemble_models:
    print(f"  - {name}")

# Metric helpers (also relied on by the XGBoost cell further down)
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

print("\nEvaluating ensemble models:")

for name, model in ensemble_models.items():
    # Hard predictions plus column 1 of predict_proba
    # (presumably the positive / bankrupt class -- confirm against the labels)
    predicted_labels = model.predict(predictor.X_test)
    class1_proba = model.predict_proba(predictor.X_test)[:, 1]

    # Compute every metric first, then print them under the model's title
    metric_lines = (
        ("  Accuracy: ", accuracy_score(predictor.y_test, predicted_labels)),
        ("  F1-Score: ", f1_score(predictor.y_test, predicted_labels)),
        ("  ROC-AUC:  ", roc_auc_score(predictor.y_test, class1_proba)),
    )

    print(f"\n{name.replace('_', ' ').title()}:")
    for prefix, score in metric_lines:
        print(f"{prefix}{score:.4f}")

## 5. Advanced Models (XGBoost)

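Train and evaluate additional models such as XGBoost on the same train/test split. If XGBoost is not installed, the cell prints installation instructions instead of failing.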

In [None]:
print("Adding advanced models...")

try:
    # Obtain whatever advanced models the project factory provides
    advanced_models = create_advanced_models(config)

    # XGBoost is only trained when the factory actually returned it
    if 'xgboost' in advanced_models:
        print("Training XGBoost model...")

        xgb_model = advanced_models['xgboost']
        xgb_model.fit(predictor.X_train, predictor.y_train)

        # Hard predictions and column 1 of predict_proba on the test split
        y_pred_xgb = xgb_model.predict(predictor.X_test)
        y_pred_proba_xgb = xgb_model.predict_proba(predictor.X_test)[:, 1]

        # Score everything before printing anything
        xgb_report = (
            ("  Accuracy: ", accuracy_score(predictor.y_test, y_pred_xgb)),
            ("  F1-Score: ", f1_score(predictor.y_test, y_pred_xgb)),
            ("  ROC-AUC:  ", roc_auc_score(predictor.y_test, y_pred_proba_xgb)),
        )

        print("\nXGBoost Results:")
        for prefix, score in xgb_report:
            print(f"{prefix}{score:.4f}")

except ImportError:
    # Optional dependency missing: tell the user how to get it
    print("XGBoost not available. Install with: pip install xgboost")
except Exception as exc:
    # Notebook-level boundary: report the problem and let later cells run
    print(f"Error with advanced models: {exc}")

## 6. Model Comparison and Best Model Selection

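Compare the models trained by the basic pipeline (`predictor.results`) and identify the best performer by F1-score. The optimized, ensemble, and XGBoost models from Sections 3–5 are evaluated in their own cells above.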

In [None]:
# Ask the pipeline for its top model, ranked by the 'f1_score' metric
best_model_name, best_model_result = predictor.get_best_model('f1_score')

print("Model Performance Summary:")
print("=" * 50)

# Output prefix and result key for each reported metric, in display order
metric_rows = (
    ("  Accuracy:  ", 'accuracy'),
    ("  Precision: ", 'precision'),
    ("  Recall:    ", 'recall'),
    ("  F1-Score:  ", 'f1_score'),
    ("  ROC-AUC:   ", 'roc_auc'),
)

# One section per basic-pipeline result; a metric that is absent prints as 0
for entry in predictor.results:
    print(f"\n{entry['model_name']}:")
    for prefix, metric_key in metric_rows:
        print(f"{prefix}{entry.get(metric_key, 0):.4f}")

rule = '=' * 50
print(f"\n{rule}")
print(f"BEST MODEL: {best_model_name}")
print(f"F1-Score: {best_model_result['f1_score']:.4f}")
print(rule)

## 7. Feature Importance Analysis

Analyze which features are most important for bankruptcy prediction.

In [None]:
print("Performing feature importance analysis...")

# Delegate to the pipeline's own feature-importance analysis
predictor.analyze_feature_importance()

# Report the selected features for every optimization result that recorded them
for position, outcome in enumerate(optimized_models, start=1):
    if 'selected_features' not in outcome:
        continue

    print(f"\nOptimized Model {position} - Selected Feature Indices:")
    print(f"  Total features selected: {outcome['n_selected_features']}")
    # Only the first ten entries are shown to keep the output short
    preview = outcome['selected_features'][:10]
    print(f"  Feature indices: {preview}...")

## 8. Model Persistence and API Setup

Save models and prepare for production deployment.

In [None]:
print("Saving optimized models and results...")

# Persist the optimization artefacts only when at least one model was optimized
if optimized_models:
    optimization_path = os.path.join(config.output_dir, "optimization_results.joblib")
    # The ensemble cell may not have been run; fall back to None in that case
    saved_ensembles = ensemble_results if 'ensemble_results' in locals() else None
    payload = {
        'optimized_models': optimized_models,
        'ensemble_results': saved_ensembles,
        'config': config.__dict__,
    }
    optimizer.save_optimization_results(payload, optimization_path)

# Print the output directory as an indented tree, capped per directory
print("\nGenerated files:")
if os.path.exists(config.output_dir):
    max_listed = 5
    for root, _subdirs, files in os.walk(config.output_dir):
        # Depth = number of path separators below the output directory
        depth = root.replace(config.output_dir, '').count(os.sep)
        print(f"{'  ' * depth}{os.path.basename(root)}/")

        file_indent = '  ' * (depth + 1)
        for filename in files[:max_listed]:
            print(f"{file_indent}{filename}")

        hidden = len(files) - max_listed
        if hidden > 0:
            print(f"{file_indent}... and {hidden} more files")

# Closing instructions for launching the API
for message in (
    "\nOptimization completed successfully!",
    "\nTo start the API server, run:",
    "  python -m uvicorn src.api:app --reload --host 0.0.0.0 --port 8000",
    "\nAPI will be available at: http://localhost:8000",
    "API documentation at: http://localhost:8000/docs",
):
    print(message)

## 9. Performance Summary

Summary of all optimizations and improvements made.

In [None]:
# Static report sections, kept as data so the printing logic stays short
completed_optimizations = (
    "   • Modular code structure with separated concerns",
    "   • Configuration management and logging",
    "   • Advanced feature selection (RFE, Model-based)",
    "   • Hyperparameter optimization (RandomizedSearchCV)",
    "   • Ensemble methods (Voting, Bagging)",
    "   • Advanced models (XGBoost support)",
    "   • Production-ready API with FastAPI",
    "   • Model persistence and loading",
    "   • Comprehensive visualization and reporting",
    "   • Error handling and logging",
    "   • Type hints and documentation",
)
production_features = (
    "   • REST API endpoints for predictions",
    "   • Batch prediction support",
    "   • Model management and reloading",
    "   • Health checks and monitoring",
    "   • CORS support for web integration",
    "   • Pydantic models for data validation",
    "   • Comprehensive error handling",
)
next_steps = (
    "   • Implement SHAP for model interpretability",
    "   • Add automated model retraining pipeline",
    "   • Implement A/B testing for model comparison",
    "   • Add model monitoring and drift detection",
    "   • Containerize with Docker",
    "   • Add unit tests and integration tests",
    "   • Implement caching for improved performance",
    "   • Add data validation and quality checks",
)
summary_rule = "=" * 60

print("OPTIMIZATION SUMMARY")
print(summary_rule)
print("\n✅ Completed Optimizations:")
for line in completed_optimizations:
    print(line)

# Best value of each metric across the basic-pipeline results
# (a result lacking a metric counts as 0); skipped when there are no results
print("\n📊 Performance Improvements:")
if predictor.results:
    best = {
        metric: max(entry.get(metric, 0) for entry in predictor.results)
        for metric in ('f1_score', 'accuracy', 'roc_auc')
    }
    print(f"   • Best F1-Score: {best['f1_score']:.4f}")
    print(f"   • Best Accuracy: {best['accuracy']:.4f}")
    print(f"   • Best ROC-AUC: {best['roc_auc']:.4f}")

print("\n🚀 Production Features:")
for line in production_features:
    print(line)

print("\n🔧 Next Steps for Further Optimization:")
for line in next_steps:
    print(line)

print("\n" + summary_rule)
print("PROJECT OPTIMIZATION COMPLETED SUCCESSFULLY! 🎉")
print(summary_rule)