# 📈 Healthcare Chatbot - Model Evaluation

In this notebook, you'll:
1. Load your trained healthcare chatbot
2. Run comprehensive evaluation metrics
3. Test conversational quality
4. Analyze performance and get improvement suggestions
5. Review next steps for deployment and further improvement

Let's see how well your AI doctor performs! 🩺📊

## 🔧 Step 1: Setup and Model Loading

First, let's set up the environment and load your trained model.

In [None]:
import sys
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/runtime warnings raised by the
# libraries used later in the notebook.
warnings.filterwarnings('ignore')

# Add src to path so that modules stored there can be imported by name
# (presumably where `evaluation` and `chatbot`, imported in a later cell, live).
sys.path.append('/workspace/src')

# Set up plotting: matplotlib's default style, seaborn's "husl" palette,
# and figures rendered inline in the notebook output.
plt.style.use('default')
sns.set_palette("husl")
%matplotlib inline

print("🔧 Evaluation environment setup complete!")

In [None]:
# Find available trained models
def find_trained_models(models_dir='/workspace/models'):
    """Find trained model directories, most recently modified first.

    Each immediate subdirectory of ``models_dir`` contributes at most one
    entry: its ``final_model`` child when that child directory exists,
    otherwise the subdirectory itself when it directly contains a ``.bin``
    or ``.safetensors`` file.

    Args:
        models_dir: Directory to scan. Defaults to '/workspace/models'.

    Returns:
        list[str]: Matching paths ordered by modification time, newest
        first (ties keep alphabetical order). Empty when ``models_dir`` is
        not a directory or nothing matches.
    """
    if not os.path.isdir(models_dir):
        return []

    trained_models = []
    for item in os.listdir(models_dir):
        model_path = os.path.join(models_dir, item)
        if not os.path.isdir(model_path):
            continue

        # Prefer the final_model subdirectory of a training run...
        final_model_path = os.path.join(model_path, 'final_model')
        if os.path.isdir(final_model_path):
            trained_models.append(final_model_path)
        # ...otherwise accept a directory that holds weight files directly.
        elif any(f.endswith(('.bin', '.safetensors')) for f in os.listdir(model_path)):
            trained_models.append(model_path)

    # os.listdir() returns entries in arbitrary order. Sort by name first so
    # ties are deterministic, then (stable sort) by modification time so that
    # index 0 really is the most recent model.
    trained_models.sort()
    trained_models.sort(key=os.path.getmtime, reverse=True)
    return trained_models

print("🔍 Searching for trained models...")
available_models = find_trained_models()

if available_models:
    print(f"📁 Found {len(available_models)} trained model(s):")
    for i, model_path in enumerate(available_models, 1):
        # A '.../final_model' path belongs to its parent run directory; any
        # other path is itself the run directory. The same directory is used
        # for both the display name and the summary lookup, so a model stored
        # directly under the models folder no longer looks for its summary in
        # the shared parent folder.
        if os.path.basename(model_path) == 'final_model':
            run_dir = os.path.dirname(model_path)
        else:
            run_dir = model_path
        model_name = os.path.basename(run_dir)

        # Check for training summary (Kaggle-specific name first)
        summary_path = os.path.join(run_dir, 'kaggle_training_summary.json')
        if not os.path.exists(summary_path):
            summary_path = os.path.join(run_dir, 'training_summary.json')

        timestamp = "Unknown"
        if os.path.exists(summary_path):
            try:
                with open(summary_path, 'r') as f:
                    summary = json.load(f)
                timestamp = summary.get('timestamp', 'Unknown')
                if timestamp != 'Unknown':
                    timestamp = datetime.fromisoformat(timestamp).strftime('%Y-%m-%d %H:%M')
            except (OSError, ValueError, TypeError, AttributeError):
                # Best effort only: an unreadable file, invalid JSON, a
                # non-dict summary or a non-ISO timestamp must not abort the
                # listing. Whatever `timestamp` holds at this point is shown.
                # Unlike a bare `except:`, KeyboardInterrupt still propagates.
                pass

        print(f"   {i}. {model_name} (trained: {timestamp})")
        print(f"      Path: {model_path}")
else:
    print("❌ No trained models found.")
    print("\n🔧 SOLUTIONS:")
    print("1. Train a model first using notebook 03_Model_Training.ipynb")
    print("2. Or run: python train_kaggle_chatbot.py --kaggle_dataset_path your-dataset.csv")
    print("3. Check that training completed successfully")

In [None]:
# Select model for evaluation
# MODIFY THIS: Set the path to your trained model (leave as None to auto-select)
MODEL_PATH = None

# Auto-select the first discovered model, but only when no path was set
# manually above - otherwise the manual choice would be silently overwritten.
if MODEL_PATH is None and available_models:
    MODEL_PATH = available_models[0]  # Use the first model found
    print(f"🤖 Auto-selected model: {MODEL_PATH}")

# Or manually set the path
# MODEL_PATH = '/workspace/models/my_healthcare_chatbot/final_model'

# Training metadata; stays empty unless a readable summary file is found.
model_info = {}

if MODEL_PATH and os.path.exists(MODEL_PATH):
    # A '.../final_model' path belongs to its parent run directory; any other
    # path is itself the run directory (normpath drops a trailing slash so
    # basename() is never empty).
    normalized_path = os.path.normpath(MODEL_PATH)
    if os.path.basename(normalized_path) == 'final_model':
        run_dir = os.path.dirname(normalized_path)
    else:
        run_dir = normalized_path

    print(f"✅ Model selected for evaluation: {os.path.basename(run_dir)}")

    # Load model info if available (Kaggle-specific name first)
    summary_path = os.path.join(run_dir, 'kaggle_training_summary.json')
    if not os.path.exists(summary_path):
        summary_path = os.path.join(run_dir, 'training_summary.json')

    if os.path.exists(summary_path):
        try:
            with open(summary_path, 'r') as f:
                loaded_summary = json.load(f)
        except (OSError, ValueError) as e:
            # The summary is optional: a corrupt file must not stop the cell
            # before `model_selected` is defined for the cells below.
            print(f"⚠️ Could not read training summary: {e}")
        else:
            if isinstance(loaded_summary, dict):
                model_info = loaded_summary
                print("📋 Training information loaded")
            else:
                print("⚠️ Training summary has an unexpected format - ignoring it")

    model_selected = True
else:
    print("❌ Please set MODEL_PATH to your trained model directory.")
    print("   Example: MODEL_PATH = '/workspace/models/my_healthcare_chatbot/final_model'")
    model_selected = False
    model_info = {}

In [None]:
# Load the trained model
# Start from the "nothing loaded" state; a successful load overwrites all three
# names, and every failure path leaves (or puts) them back in this state.
model_loaded = False
evaluator = None
chatbot = None

if not model_selected:
    print("⚠️ Cannot load model - no model selected")
else:
    print("🔄 LOADING TRAINED MODEL")
    print("="*35)

    try:
        # Project modules; imported here so an import failure is reported by
        # the troubleshooting branch below.
        from evaluation import HealthcareEvaluator
        from chatbot import HealthcareChatbot

        # Evaluator drives the quantitative metrics.
        print("🔄 Initializing evaluator...")
        evaluator = HealthcareEvaluator(MODEL_PATH)

        # Chatbot is used for the interactive/qualitative checks.
        print("🔄 Initializing chatbot...")
        chatbot = HealthcareChatbot(MODEL_PATH)

        print("✅ Model loaded successfully!")

        # Show whatever training metadata the summary file provided.
        if model_info:
            print("\n📊 Model Information:")
            training_config = model_info.get('training_config', {})
            if training_config:
                print(f"   Model type: {training_config.get('model_key', 'Unknown')}")
                print(f"   Training samples: {model_info.get('dataset_stats', {}).get('training_samples', 'Unknown')}")
                print(f"   Validation samples: {model_info.get('dataset_stats', {}).get('validation_samples', 'Unknown')}")

                hyperparams = training_config.get('hyperparameters', {})
                if hyperparams:
                    print(f"   Learning rate: {hyperparams.get('learning_rate', 'Unknown')}")
                    print(f"   Epochs: {hyperparams.get('num_train_epochs', 'Unknown')}")
                    print(f"   Batch size: {hyperparams.get('per_device_train_batch_size', 'Unknown')}")

        model_loaded = True

    except Exception as e:
        print(f"❌ Error loading model: {e}")
        print("\n🔧 TROUBLESHOOTING:")
        print("1. Check that the model path is correct")
        print("2. Ensure training completed successfully")
        print("3. Verify model files exist in the directory")

        # Discard anything that was created before the failure.
        model_loaded = False
        evaluator = None
        chatbot = None

## 📊 Step 2: Quantitative Evaluation

Let's run comprehensive metrics on your model's performance.

In [None]:
# Load evaluation dataset
# Produces `eval_dataset` (the loaded samples, or None) and `dataset_ready`.
if model_loaded:
    print("📊 PREPARING EVALUATION DATASET")
    print("="*40)

    # Try to find the dataset used for training; the first readable file wins.
    dataset_paths = [
        '/workspace/data/kaggle_medical_dataset.json',
        '/workspace/data/healthcare_qa_dataset.json'
    ]

    eval_dataset = None
    dataset_path = None

    for path in dataset_paths:
        if not os.path.exists(path):
            continue
        try:
            with open(path, 'r') as f:
                eval_dataset = json.load(f)
        except (OSError, ValueError) as e:
            # Unreadable file or invalid JSON (JSONDecodeError is a
            # ValueError): say why and fall through to the next candidate
            # instead of hiding the problem behind a bare `except:`.
            print(f"⚠️ Could not load {path}: {e}")
            continue
        dataset_path = path
        break

    if eval_dataset:
        print(f"✅ Evaluation dataset loaded: {os.path.basename(dataset_path)}")
        print(f"📊 Total samples: {len(eval_dataset)}")

        # Limit samples for faster evaluation if dataset is large
        max_eval_samples = 100
        if len(eval_dataset) > max_eval_samples:
            eval_dataset = eval_dataset[:max_eval_samples]
            print(f"📊 Limited to {max_eval_samples} samples for faster evaluation")

        dataset_ready = True
    else:
        print("❌ No evaluation dataset found")
        print("\n🔧 SOLUTIONS:")
        print("1. Make sure you've processed your dataset (notebook 02)")
        print("2. Check dataset paths above")
        print("3. You can still do qualitative evaluation below")

        dataset_ready = False
        eval_dataset = None
else:
    print("⚠️ Model not loaded - cannot prepare evaluation")
    dataset_ready = False
    eval_dataset = None

In [None]:
# Run quantitative evaluation
# Produces `eval_results` (metric dict, empty on failure) and
# `quantitative_completed` for the cells below.
if model_loaded and dataset_ready:
    print("🧪 RUNNING QUANTITATIVE EVALUATION")
    print("="*45)
    print("⏱️ This may take a few minutes...")

    # The evaluator is handed a file path, so the (possibly truncated) sample
    # is written to a temporary file first.
    temp_dataset_path = '/tmp/eval_dataset.json'

    try:
        # Save dataset temporarily for evaluator
        with open(temp_dataset_path, 'w') as f:
            json.dump(eval_dataset, f)

        # Run evaluation
        eval_results = evaluator.evaluate_on_dataset(
            temp_dataset_path,
            num_samples=len(eval_dataset)
        )

        print("\n✅ EVALUATION COMPLETED!")
        print("="*30)

        # Display key metrics (missing keys are shown as 0)
        metrics = {
            'BLEU Score': eval_results.get('corpus_bleu', 0),
            'ROUGE-1 F1': eval_results.get('rouge1_f1', 0),
            'ROUGE-2 F1': eval_results.get('rouge2_f1', 0),
            'ROUGE-L F1': eval_results.get('rougeL_f1', 0),
            'Semantic Similarity': eval_results.get('semantic_similarity', 0),
            'Perplexity': eval_results.get('perplexity', 0),
            'Avg Prediction Length': eval_results.get('avg_prediction_length', 0),
            'Avg Reference Length': eval_results.get('avg_reference_length', 0)
        }

        print("📊 PERFORMANCE METRICS:")
        print("-"*30)
        for metric, value in metrics.items():
            # np.floating covers NumPy scalars such as float32, which are not
            # instances of the built-in float.
            if isinstance(value, (float, np.floating)):
                print(f"{metric:.<25} {value:.4f}")
            else:
                print(f"{metric:.<25} {value}")

        # Performance assessment
        print("\n🎯 PERFORMANCE ASSESSMENT:")
        print("-"*35)

        bleu_score = eval_results.get('corpus_bleu', 0)
        rouge_l = eval_results.get('rougeL_f1', 0)
        similarity = eval_results.get('semantic_similarity', 0)

        assessments = []

        # BLEU assessment
        if bleu_score > 0.3:
            assessments.append("🟢 BLEU: Excellent (>0.30)")
        elif bleu_score > 0.2:
            assessments.append("🟡 BLEU: Good (0.20-0.30)")
        elif bleu_score > 0.1:
            assessments.append("🟠 BLEU: Fair (0.10-0.20)")
        else:
            assessments.append("🔴 BLEU: Needs improvement (<0.10)")

        # ROUGE assessment
        if rouge_l > 0.4:
            assessments.append("🟢 ROUGE-L: Excellent (>0.40)")
        elif rouge_l > 0.3:
            assessments.append("🟡 ROUGE-L: Good (0.30-0.40)")
        elif rouge_l > 0.2:
            assessments.append("🟠 ROUGE-L: Fair (0.20-0.30)")
        else:
            assessments.append("🔴 ROUGE-L: Needs improvement (<0.20)")

        # Similarity assessment
        if similarity > 0.4:
            assessments.append("🟢 Similarity: Excellent (>0.40)")
        elif similarity > 0.3:
            assessments.append("🟡 Similarity: Good (0.30-0.40)")
        elif similarity > 0.2:
            assessments.append("🟠 Similarity: Fair (0.20-0.30)")
        else:
            assessments.append("🔴 Similarity: Needs improvement (<0.20)")

        for assessment in assessments:
            print(assessment)

        # Overall score: unweighted mean of the three headline metrics
        overall_score = (bleu_score + rouge_l + similarity) / 3
        print(f"\n📊 Overall Score: {overall_score:.3f}")

        if overall_score > 0.35:
            print("🎉 Excellent performance! Your chatbot is working very well.")
        elif overall_score > 0.25:
            print("👍 Good performance! Your chatbot is functioning well.")
        elif overall_score > 0.15:
            print("👌 Fair performance. Consider improvements.")
        else:
            print("🔧 Performance needs improvement. See suggestions below.")

        quantitative_completed = True

    except Exception as e:
        print(f"❌ Error during evaluation: {e}")
        print("\n🔧 TROUBLESHOOTING:")
        print("1. Check that the model is properly loaded")
        print("2. Reduce evaluation dataset size")
        print("3. Check available memory")

        quantitative_completed = False
        eval_results = {}

    finally:
        # Clean up on success AND on failure, so a crashed evaluation does not
        # leave the temporary dataset behind.
        try:
            if os.path.exists(temp_dataset_path):
                os.remove(temp_dataset_path)
        except OSError as cleanup_error:
            print(f"⚠️ Could not remove temporary file {temp_dataset_path}: {cleanup_error}")

else:
    print("⚠️ Cannot run quantitative evaluation - prerequisites not met")
    quantitative_completed = False
    eval_results = {}

In [None]:
# Visualize evaluation results
# Draws a 2x2 grid of bar charts from `eval_results`; missing metrics plot as 0.
if quantitative_completed and eval_results:
    print("📊 CREATING EVALUATION VISUALIZATIONS")
    print("="*45)

    # Create metrics visualization
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    fig.suptitle('Healthcare Chatbot Evaluation Results', fontsize=16, fontweight='bold')

    # 1. Main metrics bar chart
    main_metrics = {
        'BLEU': eval_results.get('corpus_bleu', 0),
        'ROUGE-L': eval_results.get('rougeL_f1', 0),
        'Similarity': eval_results.get('semantic_similarity', 0)
    }

    bars = axes[0, 0].bar(main_metrics.keys(), main_metrics.values(),
                         color=['skyblue', 'lightgreen', 'lightcoral'])
    axes[0, 0].set_title('Main Performance Metrics')
    axes[0, 0].set_ylabel('Score')
    axes[0, 0].set_ylim(0, 1)

    # Add value labels on bars
    for bar, value in zip(bars, main_metrics.values()):
        axes[0, 0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                       f'{value:.3f}', ha='center', va='bottom')

    # 2. ROUGE metrics breakdown
    rouge_metrics = {
        'ROUGE-1': eval_results.get('rouge1_f1', 0),
        'ROUGE-2': eval_results.get('rouge2_f1', 0),
        'ROUGE-L': eval_results.get('rougeL_f1', 0)
    }

    bars = axes[0, 1].bar(rouge_metrics.keys(), rouge_metrics.values(),
                         color=['orange', 'yellow', 'lightgreen'])
    axes[0, 1].set_title('ROUGE Metrics Breakdown')
    axes[0, 1].set_ylabel('F1 Score')
    axes[0, 1].set_ylim(0, 1)

    for bar, value in zip(bars, rouge_metrics.values()):
        axes[0, 1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                       f'{value:.3f}', ha='center', va='bottom')

    # 3. Response length comparison
    pred_length = eval_results.get('avg_prediction_length', 0)
    ref_length = eval_results.get('avg_reference_length', 0)

    length_data = ['Predictions', 'References']
    length_values = [pred_length, ref_length]

    bars = axes[1, 0].bar(length_data, length_values, color=['purple', 'brown'])
    axes[1, 0].set_title('Average Response Length')
    axes[1, 0].set_ylabel('Words')

    for bar, value in zip(bars, length_values):
        axes[1, 0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                       f'{value:.1f}', ha='center', va='bottom')

    # 4. Performance radar chart (simplified as bar chart)
    # All four categories are "higher is better" scores clamped to [0, 1].
    # Length appropriateness is 1 when length_ratio is exactly 1 and falls
    # linearly to 0 as the ratio moves one unit away; previously the raw
    # deviation was plotted, so a perfect ratio showed as the worst score.
    length_deviation = abs(1 - eval_results.get('length_ratio', 1))
    performance_categories = {
        'Accuracy': min(eval_results.get('corpus_bleu', 0) * 2, 1),  # Scale BLEU
        'Fluency': min(eval_results.get('rougeL_f1', 0) * 1.5, 1),   # Scale ROUGE-L
        'Relevance': eval_results.get('semantic_similarity', 0),
        'Length': max(1 - length_deviation, 0)  # Length appropriateness
    }

    bars = axes[1, 1].bar(performance_categories.keys(), performance_categories.values(),
                         color=['red', 'blue', 'green', 'orange'])
    axes[1, 1].set_title('Performance Categories')
    axes[1, 1].set_ylabel('Score')
    axes[1, 1].set_ylim(0, 1)
    axes[1, 1].tick_params(axis='x', rotation=45)

    for bar, value in zip(bars, performance_categories.values()):
        axes[1, 1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                       f'{value:.3f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    print("✅ Visualizations created!")

else:
    print("⚠️ No evaluation results to visualize")

## 🗣️ Step 3: Qualitative Evaluation

Let's test your chatbot with real conversations and see how it performs!

In [None]:
# Comprehensive test questions
# Sends a fixed question set through the chatbot and records one result dict
# per question in `qualitative_results`.
if not model_loaded:
    print("⚠️ Cannot run qualitative evaluation - model not loaded")
    qualitative_completed = False
    qualitative_results = []
else:
    print("🗣️ QUALITATIVE EVALUATION")
    print("="*35)

    # Question set spanning several medical areas plus off-topic probes
    test_questions = [
        # Symptoms
        "What are the symptoms of diabetes?",
        "How do I know if I have high blood pressure?",
        "What causes chest pain?",

        # Treatment / medication
        "How is pneumonia treated?",
        "What are the side effects of aspirin?",
        "How long should I take antibiotics?",

        # Prevention / wellness
        "How can I prevent heart disease?",
        "What foods boost immunity?",
        "How much exercise do I need daily?",

        # Emergencies
        "What should I do if someone has a heart attack?",
        "When should I go to the emergency room?",

        # Off-topic probes (the chatbot is expected not to treat these as medical)
        "What's the weather like today?",
        "Can you help me with my math homework?",
        "What's the capital of France?"
    ]

    print(f"🧪 Testing with {len(test_questions)} questions...")
    print("\n🤖 CHATBOT RESPONSES:")
    print("="*50)

    qualitative_results = []

    for i, question in enumerate(test_questions, 1):
        print(f"\n{i:2d}. 👤 User: {question}")

        try:
            reply = chatbot.chat(question)
            answer = reply['response']
            response_type = reply['type']

            # The bot's own classification decides whether it was handled as medical
            is_medical = response_type == 'healthcare'

            # Icon per response type
            icon = "🏥" if is_medical else ("⚠️" if response_type == 'out_of_domain' else "💬")

            print(f"    {icon} Bot: {answer}")
            print(f"    📊 Type: {response_type}")

            # Off-topic probes are recognised by a keyword in the question text
            lowered = question.lower()
            record = {
                'question': question,
                'answer': answer,
                'type': response_type,
                'is_medical': is_medical,
                'is_out_of_domain': any(keyword in lowered for keyword in ('weather', 'math', 'capital'))
            }

        except Exception as e:
            print(f"    ❌ Error: {e}")
            # Failed questions are kept, tagged with type 'error'
            record = {
                'question': question,
                'answer': f"Error: {e}",
                'type': 'error',
                'is_medical': False,
                'is_out_of_domain': False
            }

        # Store result for analysis
        qualitative_results.append(record)
        print("-" * 50)

    qualitative_completed = True
In [None]:
# Analyze qualitative results
# Derives `medical_accuracy`, `ood_accuracy`, `avg_length` and
# `overall_accuracy` from `qualitative_results`, then plots a summary.
if not (qualitative_completed and qualitative_results):
    print("⚠️ No qualitative results to analyze")
else:
    print("📊 QUALITATIVE ANALYSIS")
    print("="*35)

    # Split the records: in-domain questions that did not error vs. off-topic probes
    medical_records = []
    ood_records = []
    for r in qualitative_results:
        if r['is_out_of_domain']:
            ood_records.append(r)
        elif r['type'] != 'error':
            medical_records.append(r)

    # In-domain questions count as correct when the bot classified them as medical
    medical_total = len(medical_records)
    medical_correct = len([r for r in medical_records if r['is_medical']])
    if medical_total > 0:
        medical_accuracy = medical_correct / medical_total
    else:
        medical_accuracy = 0

    # Off-topic probes count as correct when the bot did NOT classify them as medical
    ood_total = len(ood_records)
    ood_correct = len([r for r in ood_records if not r['is_medical']])
    if ood_total > 0:
        ood_accuracy = ood_correct / ood_total
    else:
        ood_accuracy = 0

    # Word counts of every answer that did not error
    word_counts = [len(r['answer'].split()) for r in qualitative_results if r['type'] != 'error']
    avg_length = np.mean(word_counts) if word_counts else 0

    print("📈 RESPONSE ANALYSIS:")
    print("-"*25)
    print(f"Medical Question Accuracy: {medical_accuracy:.1%} ({medical_correct}/{medical_total})")
    print(f"Out-of-Domain Handling: {ood_accuracy:.1%} ({ood_correct}/{ood_total})")
    print(f"Average Response Length: {avg_length:.1f} words")

    # Overall figure is the plain mean of the two accuracies
    overall_accuracy = (medical_accuracy + ood_accuracy) / 2
    print(f"\n🎯 Overall Conversational Accuracy: {overall_accuracy:.1%}")

    print("\n🔍 QUALITY ASSESSMENT:")
    print("-"*25)

    # Traffic-light bands: below 60% red, 60-80% yellow, 80% and above green
    if medical_accuracy < 0.6:
        print("🔴 Medical Response Quality: Needs Improvement")
    elif medical_accuracy < 0.8:
        print("🟡 Medical Response Quality: Good")
    else:
        print("🟢 Medical Response Quality: Excellent")

    if ood_accuracy < 0.6:
        print("🔴 Out-of-Domain Handling: Needs Improvement")
    elif ood_accuracy < 0.8:
        print("🟡 Out-of-Domain Handling: Good")
    else:
        print("🟢 Out-of-Domain Handling: Excellent")

    # 20-80 words (inclusive) is treated as an appropriate answer length
    if avg_length < 20:
        print("🟡 Response Length: Too Short")
    elif avg_length <= 80:
        print("🟢 Response Length: Appropriate")
    else:
        print("🟡 Response Length: Too Long")

    # Summary figure: response-type pie on the left, accuracy bars on the right
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Tally how often each response type occurred (errors included)
    type_counts = {}
    for result in qualitative_results:
        type_counts[result['type']] = type_counts.get(result['type'], 0) + 1

    ax1.pie(type_counts.values(), labels=type_counts.keys(), autopct='%1.1f%%', startangle=90)
    ax1.set_title('Response Type Distribution')

    accuracies = ['Medical Questions', 'Out-of-Domain']
    accuracy_values = [medical_accuracy, ood_accuracy]
    colors = ['lightblue', 'lightcoral']

    bars = ax2.bar(accuracies, accuracy_values, color=colors)
    ax2.set_title('Response Accuracy')
    ax2.set_ylabel('Accuracy')
    ax2.set_ylim(0, 1)

    # Percentage label just above each bar
    for bar, value in zip(bars, accuracy_values):
        ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
                f'{value:.1%}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

## 🎯 Step 4: Performance Summary and Recommendations

Let's summarize your model's performance and provide improvement suggestions.

In [None]:
# Generate comprehensive performance report
# Collects the metrics computed by the earlier cells into `report`, prints a
# summary with recommendations, and writes the report to a JSON file.
if model_loaded:
    print("📋 COMPREHENSIVE PERFORMANCE REPORT")
    print("="*50)

    # A '.../final_model' path is named after its parent run directory; any
    # other model path is named after itself. (Taking the parent
    # unconditionally would label a directly-stored model with the name of the
    # shared models folder.)
    normalized_model_path = os.path.normpath(MODEL_PATH)
    if os.path.basename(normalized_model_path) == 'final_model':
        report_model_name = os.path.basename(os.path.dirname(normalized_model_path))
    else:
        report_model_name = os.path.basename(normalized_model_path)

    # Collect all metrics
    report = {
        'model_info': {
            'model_path': MODEL_PATH,
            'model_name': report_model_name,
            'evaluation_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
    }

    # Add quantitative metrics if available
    if quantitative_completed:
        report['quantitative_metrics'] = {
            'bleu_score': eval_results.get('corpus_bleu', 0),
            'rouge_l_f1': eval_results.get('rougeL_f1', 0),
            'semantic_similarity': eval_results.get('semantic_similarity', 0),
            'perplexity': eval_results.get('perplexity', 0)
        }

    # Add qualitative metrics if available (values come from the qualitative
    # analysis cell)
    if qualitative_completed:
        report['qualitative_metrics'] = {
            'medical_accuracy': medical_accuracy,
            'ood_accuracy': ood_accuracy,
            'avg_response_length': avg_length,
            'overall_accuracy': overall_accuracy
        }

    # Performance summary
    print("🏆 PERFORMANCE SUMMARY:")
    print("-"*30)

    if quantitative_completed:
        bleu = eval_results.get('corpus_bleu', 0)
        rouge = eval_results.get('rougeL_f1', 0)
        similarity = eval_results.get('semantic_similarity', 0)

        print(f"📊 Quantitative Score: {(bleu + rouge + similarity)/3:.3f}")
        print(f"   - BLEU Score: {bleu:.3f}")
        print(f"   - ROUGE-L F1: {rouge:.3f}")
        print(f"   - Semantic Similarity: {similarity:.3f}")

    if qualitative_completed:
        print(f"\n🗣️ Qualitative Score: {overall_accuracy:.3f}")
        print(f"   - Medical Question Handling: {medical_accuracy:.1%}")
        print(f"   - Out-of-Domain Detection: {ood_accuracy:.1%}")

    # Overall grade: only when both evaluations ran. The final score is the
    # mean of the quantitative mean and the conversational accuracy.
    if quantitative_completed and qualitative_completed:
        quant_score = (bleu + rouge + similarity) / 3
        final_score = (quant_score + overall_accuracy) / 2

        print(f"\n🎯 FINAL SCORE: {final_score:.3f}")

        if final_score >= 0.4:
            grade = "A (Excellent)"
            emoji = "🏆"
        elif final_score >= 0.3:
            grade = "B (Good)"
            emoji = "👍"
        elif final_score >= 0.2:
            grade = "C (Fair)"
            emoji = "👌"
        else:
            grade = "D (Needs Improvement)"
            emoji = "🔧"

        print(f"🎓 GRADE: {grade} {emoji}")

        report['final_assessment'] = {
            'final_score': final_score,
            'grade': grade
        }

    # Improvement recommendations
    print("\n💡 IMPROVEMENT RECOMMENDATIONS:")
    print("-"*40)

    recommendations = []

    if quantitative_completed:
        if bleu < 0.2:
            recommendations.append("🔧 Low BLEU score - Consider more training epochs or larger model")
        if rouge < 0.3:
            recommendations.append("🔧 Low ROUGE score - Improve answer quality in training data")
        if similarity < 0.25:
            recommendations.append("🔧 Low similarity - Add more diverse training examples")

    if qualitative_completed:
        if medical_accuracy < 0.8:
            recommendations.append("🔧 Medical accuracy low - Fine-tune domain detection")
        if ood_accuracy < 0.8:
            recommendations.append("🔧 Out-of-domain handling poor - Improve rejection mechanism")
        if avg_length < 15:
            recommendations.append("🔧 Responses too short - Encourage longer, more detailed answers")
        elif avg_length > 100:
            recommendations.append("🔧 Responses too long - Encourage more concise answers")

    # General recommendations: praise when nothing was flagged, otherwise
    # append generic follow-ups to the specific findings.
    if not recommendations:
        recommendations.append("🎉 Great job! Your model is performing well.")
        recommendations.append("💡 Consider testing with more diverse medical questions")
        recommendations.append("🚀 Ready for deployment!")
    else:
        recommendations.extend([
            "📚 Consider adding more training data",
            "⚙️ Try different hyperparameters",
            "🤖 Experiment with different base models"
        ])

    for rec in recommendations:
        print(rec)

    report['recommendations'] = recommendations

    # Save report; default=str stringifies any value json cannot encode natively
    report_path = '/workspace/notebooks/evaluation_report.json'
    try:
        with open(report_path, 'w') as f:
            json.dump(report, f, indent=2, default=str)
        print(f"\n💾 Evaluation report saved to: {report_path}")
    except Exception as e:
        print(f"⚠️ Could not save report: {e}")

else:
    print("⚠️ Cannot generate report - model not loaded")

## 🚀 Step 5: Next Steps

Congratulations on evaluating your healthcare chatbot! Here's what to do next:

In [None]:
# Closing summary: next steps when a model was loaded, troubleshooting otherwise.
print("🎉 EVALUATION COMPLETE!")
print("="*40)

if model_loaded:
    # A '.../final_model' path is named after its parent run directory; any
    # other model path is named after itself.
    normalized_model_path = os.path.normpath(MODEL_PATH)
    if os.path.basename(normalized_model_path) == 'final_model':
        evaluated_model_name = os.path.basename(os.path.dirname(normalized_model_path))
    else:
        evaluated_model_name = os.path.basename(normalized_model_path)

    print(f"✅ Successfully evaluated your healthcare chatbot!")
    print(f"🤖 Model: {evaluated_model_name}")

    print("\n🚀 NEXT STEPS:")
    print("-"*20)

    print("1. 🌐 Deploy Your Chatbot:")
    print("   → Open: notebooks/05_Deployment.ipynb")
    print("   → Launch web interface for others to use")

    print("\n2. 🖥️ Quick Testing:")
    print(f"   → CLI: python -m src.chatbot {MODEL_PATH}")
    print(f"   → Web: python -m src.web_interface --model_path {MODEL_PATH}")

    print("\n3. 🔧 Model Improvement (if needed):")
    # `final_score` only exists when both the quantitative and qualitative
    # evaluations completed. Without it there is no basis for claiming the
    # model performs well, so that case gets its own message.
    score = globals().get('final_score')
    if score is None:
        print("   → No final score available yet")
        print("   → Complete both the quantitative and qualitative evaluation above")
    elif score < 0.3:
        print("   → Retrain with more epochs")
        print("   → Try a larger model (dialogpt-medium)")
        print("   → Add more training data")
        print("   → Adjust hyperparameters")
    else:
        print("   → Your model is performing well!")
        print("   → Consider fine-tuning for specific use cases")
        print("   → Test with real users for feedback")

    print("\n4. 📊 Share Your Results:")
    print("   → Show the evaluation visualizations")
    print("   → Demonstrate the chatbot to others")
    print("   → Document your findings")

else:
    print("❌ Evaluation was not completed successfully.")
    print("\n🔧 TROUBLESHOOTING:")
    print("-"*25)
    print("1. Make sure you have a trained model")
    print("2. Check model path is correct")
    print("3. Verify model files exist")
    print("4. Try retraining if model is corrupted")

print("\n📚 RESOURCES:")
print("-"*15)
print("• Evaluation Report: evaluation_report.json")
print("• Deployment Guide: 05_Deployment.ipynb")
print("• Documentation: README.md")
print("• Improvement Tips: KAGGLE_DATASET_GUIDE.md")

print("\n🏥 Your AI healthcare assistant is ready to help patients! 🤖")