# In [None]:
"""
# MediQuery AI - Data Exploration and Analysis

This notebook explores medical datasets and performs initial analysis for our multimodal healthcare AI system.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import requests
import json
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings that can
# matter while debugging — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set up plotting style
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release — confirm the minimum supported version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Banner printed once when the notebook starts.
print("📊 MediQuery AI - Data Exploration Notebook")
print("=" * 50)

# ## 1. Dataset Overview

# Create sample medical data for demonstration
def create_sample_medical_data(n_images=100, n_patients=500, seed=None):
    """Create sample medical datasets for exploration.

    Builds three synthetic tables: a fixed list of ten papers, randomly
    generated image metadata, and randomly generated clinical records.

    Args:
        n_images: Number of rows in the image-metadata table (default 100).
        n_patients: Number of rows in the clinical table (default 500).
        seed: Optional seed for a private ``numpy.random.RandomState``.
            When ``None`` (the default) values are drawn from the global
            ``numpy.random`` state, so the data changes between runs unless
            the caller seeded NumPy beforehand.

    Returns:
        Tuple ``(papers_df, images_df, clinical_df)`` of pandas DataFrames.
    """
    # A private generator keeps a seeded call from disturbing the global RNG;
    # the module itself is used otherwise so unseeded behaviour is unchanged.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Sample PubMed papers data (fixed, not random)
    papers_data = {
        'title': [
            'AI in Medical Imaging: Current Applications and Future Prospects',
            'Deep Learning for Electronic Health Records Analysis',
            'Natural Language Processing in Clinical Decision Support',
            'Computer Vision Applications in Radiology Diagnostics',
            'Multimodal AI Systems for Healthcare: A Comprehensive Review',
            'Machine Learning in Drug Discovery and Development',
            'Federated Learning for Healthcare Data Privacy',
            'Transformer Models for Medical Text Analysis',
            'Automated Medical Image Segmentation Using CNNs',
            'AI-Powered Clinical Trial Optimization Strategies',
        ],
        'journal': [
            'Nature Medicine', 'JAMA', 'New England Journal of Medicine',
            'Radiology', 'Nature Digital Medicine', 'Science Translational Medicine',
            'The Lancet Digital Health', 'Journal of Medical Internet Research',
            'Medical Image Analysis', 'Nature Biotechnology',
        ],
        'year': [2024, 2023, 2024, 2023, 2024, 2023, 2024, 2023, 2024, 2023],
        'citations': [156, 89, 203, 134, 245, 112, 78, 167, 91, 188],
        'impact_factor': [53.4, 51.3, 91.2, 7.9, 14.8, 17.1, 23.1, 5.4, 8.9, 43.1],
        'category': [
            'Medical Imaging', 'EHR Analysis', 'Clinical NLP', 'Radiology AI',
            'Multimodal AI', 'Drug Discovery', 'Privacy', 'Text Analysis',
            'Image Segmentation', 'Clinical Trials',
        ],
    }

    # Sample medical images metadata.
    # The draw order below is kept fixed so that a globally seeded run
    # produces the same values as it always did.
    images_data = {
        'image_id': [f'IMG_{i:04d}' for i in range(n_images)],
        'modality': rng.choice(['X-ray', 'MRI', 'CT', 'Ultrasound'], n_images),
        'body_part': rng.choice(['Chest', 'Brain', 'Abdomen', 'Heart', 'Lung'], n_images),
        'resolution': rng.choice(['512x512', '1024x1024', '256x256'], n_images),
        'file_size_mb': rng.uniform(0.5, 15.0, n_images).round(2),
        'quality_score': rng.uniform(0.6, 1.0, n_images).round(3),
        'has_annotation': rng.choice([True, False], n_images),
        'diagnosis': rng.choice(['Normal', 'Abnormal', 'Uncertain'], n_images),
    }

    # Sample clinical data
    clinical_data = {
        'patient_id': [f'PT_{i:05d}' for i in range(n_patients)],
        'age': rng.randint(18, 90, n_patients),  # upper bound is exclusive: 18..89
        'gender': rng.choice(['Male', 'Female'], n_patients),
        'diagnosis_category': rng.choice([
            'Cardiovascular', 'Respiratory', 'Neurological',
            'Oncology', 'Orthopedic', 'Endocrine',
        ], n_patients),
        'treatment_duration_days': rng.randint(1, 365, n_patients),
        'outcome': rng.choice(['Improved', 'Stable', 'Deteriorated'], n_patients),
        'ai_assisted': rng.choice([True, False], n_patients),
    }

    return pd.DataFrame(papers_data), pd.DataFrame(images_data), pd.DataFrame(clinical_data)

# Generate sample datasets
# The three frames become module-level globals; papers_df is read directly
# by analyze_literature_trends further down.
papers_df, images_df, clinical_df = create_sample_medical_data()

# Literature summary: row count, distinct journals, and publication-year span.
print("📚 Literature Dataset:")
print(f"  • Papers: {len(papers_df)}")
print(f"  • Journals: {papers_df['journal'].nunique()}")
print(f"  • Years: {papers_df['year'].min()}-{papers_df['year'].max()}")
print()

# Image summary: row count plus the distinct modalities and body parts,
# listed in order of first appearance.
print("🖼️ Medical Images Dataset:")
print(f"  • Images: {len(images_df)}")
print(f"  • Modalities: {list(images_df['modality'].unique())}")
print(f"  • Body Parts: {list(images_df['body_part'].unique())}")
print()

# Clinical summary: row count, observed age span, and distinct diagnoses.
print("🏥 Clinical Dataset:")
print(f"  • Patients: {len(clinical_df)}")
print(f"  • Age Range: {clinical_df['age'].min()}-{clinical_df['age'].max()}")
print(f"  • Diagnoses: {list(clinical_df['diagnosis_category'].unique())}")
print()

# ## 2. Literature Analysis

def analyze_literature_trends():
    """Plot a 2x2 overview of the literature dataset in ``papers_df``.

    Panels, left to right and top to bottom: publications per year,
    citations against journal impact factor (coloured by year), number of
    papers per research category, and total citations per research category.

    Reads the module-level ``papers_df`` and shows the figure; returns nothing.
    """

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('📊 Medical Literature Analysis', fontsize=16, fontweight='bold')

    # Publications by year
    year_counts = papers_df['year'].value_counts().sort_index()
    axes[0, 0].bar(year_counts.index, year_counts.values, color='steelblue', alpha=0.8)
    axes[0, 0].set_title('Publications by Year')
    axes[0, 0].set_xlabel('Year')
    axes[0, 0].set_ylabel('Number of Papers')

    # Citations vs Impact Factor
    axes[0, 1].scatter(papers_df['impact_factor'], papers_df['citations'],
                      c=papers_df['year'], cmap='viridis', alpha=0.7, s=60)
    axes[0, 1].set_title('Citations vs Journal Impact Factor')
    axes[0, 1].set_xlabel('Impact Factor')
    axes[0, 1].set_ylabel('Citations')

    # Research categories: the bar length is a paper count, so the axes are
    # labelled accordingly (they previously carried BLEU / ROUGE-L labels
    # that do not describe this chart).
    category_counts = papers_df['category'].value_counts()
    axes[1, 0].barh(category_counts.index, category_counts.values, color='coral', alpha=0.8)
    axes[1, 0].set_title('Research Categories')
    axes[1, 0].set_xlabel('Number of Papers')
    axes[1, 0].set_ylabel('Category')

    # Citations by category: built from papers_df only, so the whole figure
    # depends on nothing outside the literature dataset.
    category_citations = papers_df.groupby('category')['citations'].sum().sort_values()
    axes[1, 1].barh(category_citations.index, category_citations.values, color='gold', alpha=0.8)
    axes[1, 1].set_title('Total Citations by Category')
    axes[1, 1].set_xlabel('Citations')

    plt.tight_layout()
    plt.show()

# NOTE(review): evaluate_qa_models is not defined anywhere in the visible part
# of this file, so this call raises NameError unless the definition lives
# elsewhere — confirm.
evaluate_qa_models()

# ## 5. Cross-Model Performance Analysis

def cross_model_analysis():
    """Print a side-by-side comparison of the three model result tables.

    Reads the module-level ``text_df``, ``image_df`` and ``qa_df`` frames and
    prints three sections: the row with the highest headline metric
    (``accuracy`` for text and image, ``f1_score`` for QA), the row with the
    lowest ``inference_time_ms``, and the row with the highest ratio of
    headline metric to ``training_time_hrs``. Returns nothing.
    """

    print("\n🔬 Cross-Model Performance Analysis")
    print("=" * 40)

    # (results table, headline metric column) per model family, in print order.
    families = (
        (text_df, 'accuracy'),
        (image_df, 'accuracy'),
        (qa_df, 'f1_score'),
    )
    short_names = ('Text', 'Image', 'QA')

    # Best performing models: all three rows are looked up before printing.
    winners = [table.loc[table[metric].idxmax()] for table, metric in families]

    print("\n🏆 Best Performing Models:")
    headings = (
        ('Text Classification', 'Acc'),
        ('Image Classification', 'Acc'),
        ('Question Answering', 'F1'),
    )
    for (heading, tag), (_, metric), row in zip(headings, families, winners):
        print(f"  • {heading}: {row['model_name']} ({tag}: {row[metric]:.3f})")

    # Efficiency analysis: lowest inference latency per family.
    print("\n⚡ Efficiency Leaders:")
    quickest = [table.loc[table['inference_time_ms'].idxmin()] for table, _ in families]
    for kind, row in zip(short_names, quickest):
        print(f"  • Fastest {kind} Model: {row['model_name']} ({row['inference_time_ms']}ms)")

    # Training efficiency: headline metric earned per training hour.
    print("\n📚 Training Efficiency:")
    thriftiest = [
        table.loc[(table[metric] / table['training_time_hrs']).idxmax()]
        for table, metric in families
    ]
    for kind, row in zip(short_names, thriftiest):
        print(f"  • Most Efficient {kind}: {row['model_name']}")

# Print the cross-model report. The function reads the globals text_df,
# image_df and qa_df, none of which is defined in the visible part of this
# file — NOTE(review): confirm they are created before this line runs.
cross_model_analysis()

# ## 6. Confusion Matrix Analysis

def generate_confusion_matrices():
    """Draw three side-by-side heatmaps from hard-coded mock matrices.

    The panels show a 4x4 document-type count matrix, a 4x4 imaging-modality
    count matrix, and a 4x4 matrix of question-type fractions. All values are
    literals in this function. The figure is shown; nothing is returned.
    """

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    fig.suptitle('🎯 Confusion Matrix Analysis', fontsize=16, fontweight='bold')

    # Reseeds the global NumPy RNG. Nothing below draws random numbers, but
    # the reseed is a visible side effect on global state and is kept.
    np.random.seed(42)

    doc_types = ['Research Paper', 'Clinical Note', 'Drug Info', 'Diagnostic Report']
    modalities = ['X-ray', 'MRI', 'CT', 'Ultrasound']
    question_types = ['Definition', 'Procedure', 'Diagnosis', 'Treatment']

    # One entry per panel, in left-to-right order.
    panels = (
        {
            'matrix': np.array([[145, 8, 3, 2], [5, 138, 7, 1], [2, 4, 142, 6], [1, 2, 5, 147]]),
            'labels': doc_types,
            'fmt': 'd',
            'cmap': 'Blues',
            'title': 'Text Classification\n(Document Types)',
            'xlabel': 'Predicted',
            'ylabel': 'Actual',
        },
        {
            'matrix': np.array([[89, 3, 2, 1], [2, 91, 4, 0], [1, 5, 87, 2], [0, 1, 3, 92]]),
            'labels': modalities,
            'fmt': 'd',
            'cmap': 'Oranges',
            'title': 'Image Classification\n(Modality Types)',
            'xlabel': 'Predicted',
            'ylabel': 'Actual',
        },
        {
            'matrix': np.array([[0.82, 0.15, 0.02, 0.01], [0.08, 0.78, 0.12, 0.02],
                                [0.03, 0.09, 0.85, 0.03], [0.01, 0.05, 0.08, 0.86]]),
            'labels': question_types,
            'fmt': '.2f',
            'cmap': 'Greens',
            'title': 'QA Performance\n(Question Types)',
            'xlabel': 'Predicted Type',
            'ylabel': 'Actual Type',
        },
    )

    for ax, panel in zip(axes, panels):
        sns.heatmap(panel['matrix'], annot=True, fmt=panel['fmt'], cmap=panel['cmap'],
                    xticklabels=panel['labels'], yticklabels=panel['labels'], ax=ax)
        ax.set_title(panel['title'])
        ax.set_xlabel(panel['xlabel'])
        ax.set_ylabel(panel['ylabel'])

    plt.tight_layout()
    plt.show()

# Render the three mock confusion-matrix heatmaps.
generate_confusion_matrices()

# ## 7. Model Robustness Testing

def robustness_analysis():
    """Plot hard-coded accuracy curves for text and image models.

    Each model has one accuracy value per test condition (clean, noisy,
    low quality, domain shift, adversarial). Two line charts are drawn, one
    per model family, followed by a fixed textual summary. All numbers are
    literals in this function; nothing is returned.
    """

    print("\n🛡️ Model Robustness Analysis")
    print("=" * 35)

    # Reseeds the global NumPy RNG; no random draws follow, but the reseed
    # is a side effect on global state and is kept.
    np.random.seed(42)

    conditions = ['Clean Data', 'Noisy Data', 'Low Quality', 'Domain Shift', 'Adversarial']

    # Accuracy per condition, aligned with `conditions`.
    text_robustness = {
        'BioBERT': [0.924, 0.887, 0.854, 0.798, 0.723],
        'PubMedBERT': [0.937, 0.901, 0.867, 0.812, 0.745],
        'ClinicalBERT': [0.911, 0.876, 0.841, 0.789, 0.708],
    }
    image_robustness = {
        'Vision Transformer': [0.941, 0.889, 0.823, 0.756, 0.634],
        'EfficientNet-B4': [0.923, 0.878, 0.831, 0.782, 0.687],
        'ResNet50': [0.887, 0.845, 0.798, 0.743, 0.661],
    }

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    fig.suptitle('🛡️ Model Robustness Under Different Conditions', fontsize=14, fontweight='bold')

    # (axis, scores per model, marker, title) for the left and right chart.
    charts = (
        (axes[0], text_robustness, 'o', 'Text Models Robustness'),
        (axes[1], image_robustness, 's', 'Image Models Robustness'),
    )
    for ax, scores_by_model, marker, title in charts:
        for model in scores_by_model:
            ax.plot(conditions, scores_by_model[model], marker=marker, linewidth=2, label=model)
        ax.set_title(title)
        ax.set_ylabel('Accuracy')
        ax.tick_params(axis='x', rotation=45)
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Fixed summary text; it is not computed from the tables above.
    summary_lines = (
        "\n📊 Robustness Summary:",
        "  • All models show degradation under adverse conditions",
        "  • Vision Transformer most robust to clean data",
        "  • PubMedBERT shows best text robustness overall",
        "  • Adversarial examples cause 20-30% performance drop",
    )
    for line in summary_lines:
        print(line)

# Draw the robustness charts and print the fixed summary.
robustness_analysis()

# ## 8. Computational Requirements

def analyze_computational_requirements():
    """Chart hard-coded resource figures for five models and print the minima.

    Builds a small table (parameters, memory, GPU hours, inference latency,
    energy) from literals, draws a 2x2 figure from it, and prints which model
    has the smallest value in four of the columns. Returns nothing.
    """

    # Mock computational data
    comp_df = pd.DataFrame({
        'Model Type': ['Text (BioBERT)', 'Text (PubMedBERT)', 'Image (ViT)', 'Image (EfficientNet)', 'QA (RoBERTa)'],
        'Parameters (M)': [110, 110, 86, 19, 125],
        'Memory (GB)': [1.2, 1.2, 2.1, 0.8, 1.4],
        'GPU Hours': [2.3, 2.7, 8.3, 6.1, 5.1],
        'Inference (ms)': [45, 52, 67, 31, 78],
        'Energy (kWh)': [4.2, 4.8, 15.6, 11.2, 9.3],
    })
    model_names = comp_df['Model Type']
    gpu_hours = comp_df['GPU Hours']

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('💻 Computational Requirements Analysis', fontsize=16, fontweight='bold')
    (size_ax, training_ax), (latency_ax, energy_ax) = axes

    # Parameters vs memory, with point colour encoding GPU hours.
    points = size_ax.scatter(comp_df['Parameters (M)'], comp_df['Memory (GB)'],
                             s=100, alpha=0.7, c=gpu_hours, cmap='viridis')
    size_ax.set_title('Model Size vs Memory Usage')
    size_ax.set_xlabel('Parameters (Millions)')
    size_ax.set_ylabel('Memory (GB)')
    plt.colorbar(points, ax=size_ax, label='GPU Hours')

    # Training time per model, one colour per bar.
    bar_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']
    training_ax.bar(model_names, gpu_hours, color=bar_colors, alpha=0.8)
    training_ax.set_title('Training Time by Model Type')
    training_ax.set_ylabel('GPU Hours')
    training_ax.tick_params(axis='x', rotation=45)

    # Inference latency per model.
    latency_ax.barh(model_names, comp_df['Inference (ms)'], color='lightcoral', alpha=0.8)
    latency_ax.set_title('Inference Speed Comparison')
    latency_ax.set_xlabel('Inference Time (ms)')

    # Share of total energy per model.
    energy_ax.pie(comp_df['Energy (kWh)'], labels=model_names, autopct='%1.1f%%')
    energy_ax.set_title('Energy Consumption Distribution')

    plt.tight_layout()
    plt.show()

    print("\n⚡ Computational Insights:")
    # (printed headline, column whose minimum picks the model)
    minima = (
        ('Most Parameter-Efficient', 'Parameters (M)'),
        ('Fastest Inference', 'Inference (ms)'),
        ('Most Memory Efficient', 'Memory (GB)'),
        ('Lowest Energy Usage', 'Energy (kWh)'),
    )
    for headline, column in minima:
        print(f"  • {headline}: {comp_df.loc[comp_df[column].idxmin(), 'Model Type']}")

# Draw the resource charts and print the per-column minima.
analyze_computational_requirements()

# ## 9. Model Recommendations

def generate_model_recommendations():
    """Print the hard-coded model picks for four deployment scenarios.

    Each scenario lists a text, image and QA model plus a one-line rationale.
    The text is fixed; nothing is computed from data and nothing is returned.
    """

    banner = "\n🎯 Model Recommendations by Use Case"
    print(banner)
    print("=" * 45)

    # (dictionary key, label shown to the reader), in print order.
    fields = (
        ("Text", "Text"),
        ("Image", "Image"),
        ("QA", "QA"),
        ("Rationale", "Why"),
    )

    recommendations = {
        "High Accuracy Requirements": {
            "Text": "PubMedBERT (93.7% accuracy)",
            "Image": "Vision Transformer (94.1% accuracy)",
            "QA": "PubMedBERT-QA (83.1% F1)",
            "Rationale": "Best overall performance, suitable for critical applications",
        },
        "Real-time Applications": {
            "Text": "SciBERT (35ms inference)",
            "Image": "ResNet50 (23ms inference)",
            "QA": "BioBERT-QA (71ms inference)",
            "Rationale": "Optimized for speed, acceptable accuracy trade-off",
        },
        "Resource Constrained": {
            "Text": "ClinicalBERT (1.8hr training)",
            "Image": "DenseNet121 (7.9M parameters)",
            "QA": "BioBERT-QA (4.7hr training)",
            "Rationale": "Efficient training and deployment, lower compute requirements",
        },
        "Production Deployment": {
            "Text": "BioBERT (balanced performance)",
            "Image": "EfficientNet-B4 (good accuracy/speed balance)",
            "QA": "RoBERTa-BioASQ (proven track record)",
            "Rationale": "Reliable, well-tested, good performance-efficiency balance",
        },
    }

    for use_case in recommendations:
        picks = recommendations[use_case]
        print(f"\n🔹 {use_case}:")
        for key, label in fields:
            print(f"  • {label}: {picks[key]}")

# Print the fixed recommendation table.
generate_model_recommendations()

# ## 10. Export Evaluation Results

def export_evaluation_results():
    """Export model evaluation results to a JSON file.

    Collects the full ``text_df``, ``image_df`` and ``qa_df`` tables (read
    from module scope), the best row of each (highest ``accuracy`` for text
    and image, highest ``f1_score`` for QA), and a few averages, then writes
    them to ``../results/model_evaluation_results.json``.

    Returns:
        The dictionary that was written to disk.

    Raises:
        OSError: If the results directory cannot be created or the file
            cannot be written.
    """

    # Combine all results
    evaluation_results = {
        'evaluation_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
        'text_classification': text_df.to_dict('records'),
        'image_classification': image_df.to_dict('records'),
        'question_answering': qa_df.to_dict('records'),
        'best_models': {
            'text': text_df.loc[text_df['accuracy'].idxmax()].to_dict(),
            'image': image_df.loc[image_df['accuracy'].idxmax()].to_dict(),
            'qa': qa_df.loc[qa_df['f1_score'].idxmax()].to_dict()
        },
        'summary_metrics': {
            'avg_text_accuracy': text_df['accuracy'].mean(),
            'avg_image_accuracy': image_df['accuracy'].mean(),
            'avg_qa_f1': qa_df['f1_score'].mean(),
            'total_models_evaluated': len(text_df) + len(image_df) + len(qa_df)
        }
    }

    # Save results
    output_path = Path('../results/model_evaluation_results.json')
    # Opening a file for writing does not create missing parent directories,
    # so make sure the results folder exists before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open('w', encoding='utf-8') as f:
        # default=str turns any value json cannot encode natively into its
        # string form instead of raising.
        json.dump(evaluation_results, f, indent=2, default=str)

    print("\n💾 Evaluation Results Exported!")
    print("Results saved to: ../results/model_evaluation_results.json")

    return evaluation_results

# Run the export; the returned dictionary is kept as a module-level variable.
evaluation_results = export_evaluation_results()

# NOTE(review): the figures below are hard-coded strings, not derived from
# evaluation_results. The QA line credits RoBERTa-BioASQ with 83.1% F1, while
# generate_model_recommendations attributes 83.1% F1 to PubMedBERT-QA —
# confirm which model the number belongs to.
print("\n✅ Model Evaluation Complete!")
print("Key Findings:")
print("  • PubMedBERT leads text classification (93.7% accuracy)")
print("  • Vision Transformer best for images (94.1% accuracy)")
print("  • RoBERTa-BioASQ excels in QA (83.1% F1 score)")
print("  • Trade-offs exist between accuracy and efficiency")