# 📊 Healthcare Chatbot - Data Exploration

In this notebook, you'll:
1. Upload and explore your Kaggle ai-medical-chatbot dataset
2. Analyze data quality and statistics
3. Visualize the data distribution
4. Prepare data for training

Let's dive into your medical data! 🔍

## 📁 Step 1: Dataset Upload and Detection

First, let's check whether your Kaggle dataset is already in the workspace. If it isn't, upload it and re-run the search cell below.

In [None]:
import json
import os
import shlex
import sys
import warnings
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# NOTE(review): this silences every warning category for the whole kernel
# session, so warnings raised by later cells will not be shown either.
warnings.filterwarnings('ignore')

# Add src to path so modules stored under /workspace/src can be imported by
# later cells (presumably where `kaggle_data_loader` lives — TODO confirm).
sys.path.append('/workspace/src')

# Set up plotting: matplotlib's 'default' style sheet plus seaborn's "husl"
# colour palette for every figure created afterwards.
plt.style.use('default')
sns.set_palette("husl")
# IPython magic (not plain Python): render figures inline in the cell output.
%matplotlib inline

print("🔧 Environment setup complete!")

In [None]:
# Look for existing datasets
def find_datasets(workspace='/workspace'):
    """Find potential dataset files below a workspace directory.

    Walks ``workspace`` recursively and collects every regular file whose name
    matches ``*.csv``, ``*.json`` or ``*.jsonl``, skipping files whose name
    contains one of the known non-dataset markers.

    Args:
        workspace: Directory to search, as a ``str`` or ``pathlib.Path``.
            Defaults to ``'/workspace'``, the location this notebook uses.

    Returns:
        A sorted list of unique ``pathlib.Path`` objects. The list is empty
        when nothing matches.
    """
    root = Path(workspace)

    # Common dataset file patterns
    patterns = ('*.csv', '*.json', '*.jsonl')
    # Name fragments that identify known non-dataset files
    exclude = ('requirements.txt', 'package.json', 'test_')

    datasets = set()
    for pattern in patterns:
        # rglob() matches in the root directory as well as in every
        # subdirectory, so a separate non-recursive glob is not needed.
        for candidate in root.rglob(pattern):
            # Directories can match the pattern too (e.g. a folder named
            # "export.csv"); only regular files are usable as datasets.
            if not candidate.is_file():
                continue
            if any(marker in candidate.name for marker in exclude):
                continue
            datasets.add(candidate)

    return sorted(datasets)

# Scan the workspace once; later cells read `found_datasets` to offer a selection.
print("🔍 Searching for datasets in workspace...")
found_datasets = find_datasets()

if not found_datasets:
    # Nothing matched: tell the user what to upload.
    print("📂 No datasets found in workspace.")
    print("\n📤 Please upload your Kaggle ai-medical-chatbot dataset file to the workspace.")
    print("   Common names: ai-medical-chatbot.csv, medical_data.csv, healthcare_qa.json")
else:
    print(f"📁 Found {len(found_datasets)} potential dataset files:")
    # Numbered from 1 so the user can refer to an entry in the next step.
    for number, candidate in enumerate(found_datasets, start=1):
        megabytes = candidate.stat().st_size / (1024 * 1024)
        print(f"   {number}. {candidate.name} ({megabytes:.1f} MB) - {candidate.parent}")

    print("\n💡 If one of these is your Kaggle dataset, note the number for the next step.")

In [None]:
# Select your dataset
# MODIFY THIS: Set the path to your Kaggle dataset
DATASET_PATH = None  # e.g., '/workspace/ai-medical-chatbot.csv'

# If you found datasets above, you can select one:
if found_datasets:
    print("📋 Available datasets:")
    for i, dataset in enumerate(found_datasets, 1):
        print(f"   {i}. {dataset}")

    # Uncomment and modify the line below to select a dataset by number
    # DATASET_PATH = str(found_datasets[0])  # Change 0 to your dataset number - 1

# Or set the path directly
# DATASET_PATH = '/workspace/your-dataset-file.csv'

# Report the outcome. "Not set" and "set but missing" are distinguished so the
# message tells the user what actually needs fixing.
if not DATASET_PATH:
    print("❌ Please set DATASET_PATH to your Kaggle dataset file.")
    print("   Modify the DATASET_PATH variable in this cell and run it again.")
elif not os.path.exists(DATASET_PATH):
    print(f"❌ DATASET_PATH does not exist: {DATASET_PATH}")
    print("   Check the path in this cell (or upload the file) and run it again.")
else:
    print(f"✅ Dataset selected: {DATASET_PATH}")
    print(f"📊 File size: {os.path.getsize(DATASET_PATH) / (1024*1024):.1f} MB")

## 🔍 Step 2: Load and Analyze Your Dataset

Now let's load your dataset and see what we're working with.

In [None]:
# Read the selected file through the project's Kaggle loader.
def _describe_raw_data(data):
    """Print a short structural summary of the freshly loaded raw data.

    DataFrames are summarised by shape, columns and dtypes; lists by their
    length and the keys of their first item. Other types print nothing.
    """
    if isinstance(data, pd.DataFrame):
        print(f"📋 DataFrame shape: {data.shape}")
        print(f"📋 Columns: {list(data.columns)}")
        print("📋 Data types:")
        for column_name, column_dtype in data.dtypes.items():
            print(f"   {column_name}: {column_dtype}")
    elif isinstance(data, list):
        print(f"📋 List with {len(data)} items")
        if data:
            print(f"📋 Sample keys: {list(data[0].keys())}")


if not (DATASET_PATH and os.path.exists(DATASET_PATH)):
    # No usable path yet: leave both names defined so later cells can test them.
    print("⚠️ Please set DATASET_PATH first in the cell above.")
    raw_data = None
    loader = None
else:
    from kaggle_data_loader import KaggleMedicalDataLoader

    print(f"📂 Loading dataset: {os.path.basename(DATASET_PATH)}")
    print("=" * 60)

    try:
        loader = KaggleMedicalDataLoader(DATASET_PATH)

        # Ask the loader which file format it recognised.
        format_type = loader.detect_format()
        print(f"📁 Detected format: {format_type.upper()}")

        raw_data = loader.load_data()
        print("✅ Raw data loaded successfully!")

        _describe_raw_data(raw_data)

        print("\n✅ Dataset loaded successfully!")

    except Exception as e:
        # Any failure resets both names so downstream cells skip their work.
        print(f"❌ Error loading dataset: {e}")
        raw_data = None
        loader = None

In [None]:
# Preview raw data
if raw_data is not None:
    print("👀 RAW DATA PREVIEW")
    print("="*40)

    if isinstance(raw_data, pd.DataFrame):
        print("📊 First 3 rows:")
        display(raw_data.head(3))

        print("\n📊 Data info:")
        # DataFrame.info() writes its report to stdout itself and returns None,
        # so wrapping it in print() would add a stray "None" line.
        raw_data.info()

        # Check for missing values
        missing = raw_data.isnull().sum()
        if missing.any():
            print("\n⚠️ Missing values:")
            for col, count in missing[missing > 0].items():
                print(f"   {col}: {count} ({count/len(raw_data)*100:.1f}%)")
        else:
            print("\n✅ No missing values found!")

    elif isinstance(raw_data, list):
        print("📊 First 3 items:")
        for i, item in enumerate(raw_data[:3], 1):
            print(f"\nItem {i}:")
            if isinstance(item, dict):
                for key, value in item.items():
                    # Convert once, then truncate long values to 100 characters.
                    text = str(value)
                    preview = text[:100] + "..." if len(text) > 100 else text
                    print(f"   {key}: {preview}")
            else:
                # Items that are not dicts have no keys to iterate; show the
                # (truncated) value itself instead of failing on .items().
                text = str(item)
                preview = text[:100] + "..." if len(text) > 100 else text
                print(f"   {preview}")
else:
    print("⚠️ No data to preview. Please load your dataset first.")

## 🔧 Step 3: Process and Clean the Data

Now let's process the raw data into our standard Q&A format.

In [None]:
# Turn the raw records into Q&A pairs and print the loader's statistics.
if loader is None:
    print("⚠️ No loader available. Please load your dataset first.")
    processed_data = None
    stats = None
else:
    print("🔧 PROCESSING DATA")
    print("=" * 30)

    try:
        # Conversion to the Q&A format is delegated to the loader.
        processed_data = loader.process_data()

        print("✅ Data processed successfully!")
        print(f"📊 Processed {len(processed_data)} Q&A pairs")

        stats = loader.get_data_statistics()

        print("\n📈 DATASET STATISTICS:")
        print("-" * 30)
        for stat_name, stat_value in stats.items():
            if isinstance(stat_value, dict):
                print(f"{stat_name}: {len(stat_value)} categories")
                # Only the first five entries (in the dict's own order) are listed.
                for label, amount in list(stat_value.items())[:5]:
                    print(f"   - {label}: {amount}")
                remaining = len(stat_value) - 5
                if remaining > 0:
                    print(f"   ... and {remaining} more")
                continue

            if isinstance(stat_value, float):
                print(f"{stat_name}: {stat_value:.2f}")
            else:
                print(f"{stat_name}: {stat_value}")

    except Exception as e:
        # A failure anywhere above leaves both results unset for later cells.
        print(f"❌ Error processing data: {e}")
        processed_data = None
        stats = None

In [None]:
# Show the first few processed Q&A pairs.
if not processed_data:
    print("⚠️ No processed data to preview.")
else:
    print("👀 PROCESSED DATA PREVIEW")
    print("=" * 40)

    # Never show more than five samples.
    preview_samples = min(5, len(processed_data))

    for sample_no, pair in enumerate(processed_data[:preview_samples], start=1):
        print(f"\n📝 Sample {sample_no}:")
        print(f"   Q: {pair['question']}")

        # Answers are cut at 150 characters, with an ellipsis when truncated.
        answer_text = pair['answer']
        ellipsis = '...' if len(answer_text) > 150 else ''
        print(f"   A: {answer_text[:150]}{ellipsis}")

        if 'category' in pair:
            print(f"   Category: {pair['category']}")
        print("-" * 50)

    not_shown = len(processed_data) - preview_samples
    if not_shown > 0:
        print(f"\n... and {not_shown} more samples")

## 📊 Step 4: Data Visualization and Analysis

Let's create some visualizations to better understand your dataset.

In [None]:
# Create visualizations
if processed_data:
    print("📊 CREATING VISUALIZATIONS")
    print("="*40)

    # Extract data for analysis. These notebook-level names are kept unchanged
    # because other cells may read them.
    questions = [item['question'] for item in processed_data]
    answers = [item['answer'] for item in processed_data]
    categories = [item.get('category', 'Unknown') for item in processed_data]

    # Lengths are measured in whitespace-separated words.
    question_lengths = [len(q.split()) for q in questions]
    answer_lengths = [len(a.split()) for a in answers]

    # Create figure with subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Healthcare Dataset Analysis', fontsize=16, fontweight='bold')

    # 1. Question length distribution
    axes[0, 0].hist(question_lengths, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
    axes[0, 0].set_title('Question Length Distribution')
    axes[0, 0].set_xlabel('Number of Words')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].axvline(np.mean(question_lengths), color='red', linestyle='--',
                       label=f'Mean: {np.mean(question_lengths):.1f}')
    axes[0, 0].legend()

    # 2. Answer length distribution
    axes[0, 1].hist(answer_lengths, bins=20, alpha=0.7, color='lightgreen', edgecolor='black')
    axes[0, 1].set_title('Answer Length Distribution')
    axes[0, 1].set_xlabel('Number of Words')
    axes[0, 1].set_ylabel('Frequency')
    axes[0, 1].axvline(np.mean(answer_lengths), color='red', linestyle='--',
                       label=f'Mean: {np.mean(answer_lengths):.1f}')
    axes[0, 1].legend()

    # 3. Category distribution (if available)
    category_counts = Counter(categories)
    # Category information exists as soon as at least one item carries a real
    # category, i.e. anything other than the 'Unknown' placeholder used above.
    # (The previous condition reduced to "more than one category", which hid
    # datasets whose items all share a single real category.)
    has_category_info = any(cat != 'Unknown' for cat in category_counts)
    if has_category_info:
        # Show top 10 categories
        top_categories = category_counts.most_common(10)
        cats, counts = zip(*top_categories)

        axes[1, 0].bar(range(len(cats)), counts, color='lightcoral')
        axes[1, 0].set_title('Top Medical Categories')
        axes[1, 0].set_xlabel('Categories')
        axes[1, 0].set_ylabel('Number of Q&A Pairs')
        axes[1, 0].set_xticks(range(len(cats)))
        # str() keeps non-string category values (e.g. None) usable as tick labels.
        axes[1, 0].set_xticklabels([str(cat) for cat in cats], rotation=45, ha='right')
    else:
        axes[1, 0].text(0.5, 0.5, 'No category information\navailable',
                        ha='center', va='center', transform=axes[1, 0].transAxes,
                        fontsize=12)
        axes[1, 0].set_title('Medical Categories')

    # 4. Question vs Answer length scatter
    axes[1, 1].scatter(question_lengths, answer_lengths, alpha=0.6, color='purple')
    axes[1, 1].set_title('Question vs Answer Length')
    axes[1, 1].set_xlabel('Question Length (words)')
    axes[1, 1].set_ylabel('Answer Length (words)')

    # Add correlation coefficient. Pearson correlation is undefined when either
    # series is constant (which includes the single-sample case); np.corrcoef
    # would return nan there, so show "N/A" instead.
    if len(set(question_lengths)) > 1 and len(set(answer_lengths)) > 1:
        correlation = np.corrcoef(question_lengths, answer_lengths)[0, 1]
        correlation_label = f'Correlation: {correlation:.3f}'
    else:
        correlation = None
        correlation_label = 'Correlation: N/A'
    axes[1, 1].text(0.05, 0.95, correlation_label,
                    transform=axes[1, 1].transAxes, fontsize=10,
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    plt.tight_layout()
    plt.show()

    print("✅ Visualizations created!")

else:
    print("⚠️ No data available for visualization.")

In [None]:
# Detailed statistics table
if processed_data:
    print("📊 DETAILED STATISTICS")
    print("="*40)

    # Derive the inputs from processed_data here so this cell does not depend
    # on variables left behind by the visualization cell (same definitions:
    # word counts and the 'Unknown' placeholder for missing categories).
    question_lengths = [len(item['question'].split()) for item in processed_data]
    answer_lengths = [len(item['answer'].split()) for item in processed_data]
    categories = [item.get('category', 'Unknown') for item in processed_data]
    unique_categories = set(categories)

    # Create statistics DataFrame
    stats_data = {
        'Metric': [
            'Total Q&A Pairs',
            'Avg Question Length',
            'Avg Answer Length',
            'Min Question Length',
            'Max Question Length',
            'Min Answer Length',
            'Max Answer Length',
            'Unique Categories'
        ],
        'Value': [
            len(processed_data),
            f"{np.mean(question_lengths):.1f} words",
            f"{np.mean(answer_lengths):.1f} words",
            f"{min(question_lengths)} words",
            f"{max(question_lengths)} words",
            f"{min(answer_lengths)} words",
            f"{max(answer_lengths)} words",
            len(unique_categories) if categories else 'N/A'
        ]
    }

    stats_df = pd.DataFrame(stats_data)
    display(stats_df)

    # Quality assessment
    print("\n🔍 DATA QUALITY ASSESSMENT:")
    print("-"*30)

    # Check for very short questions/answers
    short_questions = sum(1 for q in question_lengths if q < 3)
    short_answers = sum(1 for a in answer_lengths if a < 5)

    # Each entry: (description, measured value, whether the check passes)
    quality_checks = [
        ("Questions too short (<3 words)", short_questions, short_questions == 0),
        ("Answers too short (<5 words)", short_answers, short_answers == 0),
        ("Dataset size adequate (>50 pairs)", len(processed_data), len(processed_data) > 50),
        ("Good variety (>5 categories)", len(unique_categories), len(unique_categories) > 5),
    ]

    for check, value, is_good in quality_checks:
        icon = "✅" if is_good else "⚠️"
        print(f"{icon} {check}: {value}")

    # Overall assessment
    good_checks = sum(1 for _, _, is_good in quality_checks if is_good)
    total_checks = len(quality_checks)

    print(f"\n📊 Overall Quality Score: {good_checks}/{total_checks} ({good_checks/total_checks*100:.0f}%)")

    if good_checks >= 3:
        print("🎉 Your dataset looks good for training!")
    elif good_checks >= 2:
        print("👍 Your dataset is usable but could be improved.")
    else:
        print("⚠️ Your dataset may need some improvements before training.")
else:
    # Mirror the other cells: say why nothing was shown instead of staying silent.
    print("⚠️ No processed data available for statistics.")

## 💾 Step 5: Save Processed Dataset

Let's save the processed dataset for training.

In [None]:
# Save processed dataset
if processed_data and loader:
    print("💾 SAVING PROCESSED DATASET")
    print("="*40)

    # Define output path
    output_path = '/workspace/data/kaggle_medical_dataset.json'

    try:
        # Make sure the target directory exists before writing; whether the
        # loader creates it itself is not visible from this notebook.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        # Save processed data
        loader.save_processed_data(output_path)

        print(f"✅ Processed dataset saved to: {output_path}")
        print(f"📊 Saved {len(processed_data)} Q&A pairs")

        # Verify the saved file by reading it back. JSON is decoded as UTF-8
        # explicitly so the result does not depend on the platform's default
        # encoding (assumes the loader writes UTF-8/ASCII JSON — TODO confirm).
        with open(output_path, 'r', encoding='utf-8') as f:
            saved_data = json.load(f)

        print(f"✅ Verification: {len(saved_data)} pairs loaded from saved file")

        # Show file size
        file_size = os.path.getsize(output_path) / 1024  # KB
        print(f"📁 File size: {file_size:.1f} KB")

    except Exception as e:
        # Any failure (directory creation, save or read-back) marks the save
        # as unsuccessful for the summary cell.
        print(f"❌ Error saving dataset: {e}")
        output_path = None

else:
    print("⚠️ No processed data to save.")
    output_path = None

## 🎯 Step 6: Next Steps

Great! That completes the data exploration. Run the cell below for a summary and what to do next (it shows troubleshooting tips instead if an earlier step did not succeed):

In [None]:
# Final summary: next steps on success, troubleshooting hints otherwise.
print("🎯 DATA EXPLORATION COMPLETE!")
print("="*50)

if processed_data and output_path:
    print(f"✅ Successfully processed {len(processed_data)} medical Q&A pairs")
    print(f"💾 Data saved to: {output_path}")

    # The commands below are meant to be copy-pasted into a shell, so the path
    # is shell-quoted; paths without special characters are printed unchanged.
    quoted_dataset_path = shlex.quote(str(DATASET_PATH))

    print("\n🚀 NEXT STEPS:")
    print("-"*20)
    print("1. 🎓 Train Your Model:")
    print("   → Open: notebooks/03_Model_Training.ipynb")
    print("   → Train a healthcare chatbot on your data")

    print("\n2. 🏃‍♂️ Quick Training (Alternative):")
    print(f"   → Run: python train_kaggle_chatbot.py --kaggle_dataset_path {quoted_dataset_path}")

    # Recommendations depend on the number of processed Q&A pairs.
    print("\n3. 📊 Training Recommendations:")
    if len(processed_data) < 100:
        print("   → Use: --model_key distilgpt2 (faster for small datasets)")
        print("   → Use: --epochs 2 (fewer epochs for small data)")
    elif len(processed_data) < 1000:
        print("   → Use: --model_key dialogpt-medium (good balance)")
        print("   → Use: --epochs 3 (standard training)")
    else:
        print("   → Use: --model_key dialogpt-medium (best quality)")
        print("   → Use: --batch_size 2 (for large datasets)")
        print("   → Consider: --max_samples 2000 (to limit training time)")

    print("\n💡 TIP: Start with a quick test training first!")
    print("   python train_kaggle_chatbot.py \\")
    print(f"     --kaggle_dataset_path {quoted_dataset_path} \\")
    print("     --epochs 1 --max_samples 100 --model_key distilgpt2")

else:
    print("❌ Data exploration incomplete.")
    print("\n🔧 TROUBLESHOOTING:")
    print("-"*20)
    print("1. Make sure DATASET_PATH is set correctly")
    print("2. Check that your dataset file exists and is readable")
    print("3. Verify your dataset has question and answer columns")
    print("4. Try uploading a different dataset file")

print("\n📚 RESOURCES:")
print("-"*15)
print("• Full Guide: KAGGLE_DATASET_GUIDE.md")
print("• Quick Start: YOUR_KAGGLE_DATASET_INSTRUCTIONS.md")
print("• Next Notebook: 03_Model_Training.ipynb")