### Install, Import, and Prepare

In [14]:
!pip install -q transformers accelerate nltk

import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Download NLTK data
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')  # Additional download needed for newer NLTK versions


[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bashi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\bashi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [9]:
import json
import os

# Load the dataset from a local JSON file.
# The location can be overridden with the ICLINIQ_QA_PATH environment variable so the
# notebook is not tied to one machine; the original path remains the default.
json_file_path = os.environ.get(
    "ICLINIQ_QA_PATH",
    r"c:\Users\bashi\Downloads\icliniqQAs.json",
)

# Number of leading entries used for the quick evaluation
NUM_EVAL_SAMPLES = 20

# Fail fast when the file is missing. Previously the cell only printed a hint and then
# fell through to `data[:20]`, which raised a NameError on a fresh kernel or silently
# reused a stale `data` left over from an earlier run.
if not os.path.exists(json_file_path):
    print(f"File not found: {json_file_path}")
    print("Please make sure the icliniqQAs.json file is in the correct location.")
    raise FileNotFoundError(json_file_path)

with open(json_file_path, "r", encoding='utf-8') as f:
    data = json.load(f)

print(f"Loaded {len(data)} entries from the dataset")

# Check the structure of the first few entries
print("Sample entries:")
for i, entry in enumerate(data[:2]):
    print(f"Entry {i+1}: {list(entry.keys())}")
    print(f"Question: {entry['question']}")
    print(f"Answer: {entry['answer'][:100]}...")  # Show first 100 chars
    print("---")

# Prepare QA pairs for testing; only the question and answer fields are kept
qa_samples = [
    {
        'question': entry['question'],
        'answer': entry['answer']
    }
    for entry in data[:NUM_EVAL_SAMPLES]
]

print(f"\nPrepared {len(qa_samples)} Q&A pairs for evaluation")

Loaded 465 entries from the dataset
Sample entries:
Entry 1: ['answer', 'question', 'question_text', 'tags', 'url']
Question: is it fine to exercise with knee pain?
Answer: from your description it appears that you may have anterior knee pain which sometimes presents as pa...
---
Entry 2: ['answer', 'question', 'question_text', 'tags', 'url']
Question: suffering from anxiety restlessness and taking clonazepam & mirtazapine. i had depression & panic attacks for the last 7 years. need a second opinion.
Answer: depression anxiety restlessness and panic attacks are best respond to a combination of a selective s...
---

Prepared 20 Q&A pairs for evaluation


In [None]:

# Candidate checkpoints as (model id, description) pairs. This list is informational
# only: the checkpoint actually loaded is selected by `model_name` below.
model_options = [
    ("microsoft/DialoGPT-medium", "DialoGPT Medium - Good for conversational AI"),
    ("gpt2", "GPT-2 - Classic generative model"),
    ("distilgpt2", "DistilGPT-2 - Smaller and faster version of GPT-2"),
    ("microsoft/DialoGPT-small", "DialoGPT Small - Faster but smaller model")
]

# Let's use GPT-2 as it's reliable and doesn't require authentication
model_name = "gpt2"

print(f"Loading model: {model_name}")
print("This model doesn't require authentication and should work out of the box.")
print("Note: For better medical QA performance, you'd want to use Llama 3.2 with proper HF authentication.")

try:
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Reuse the end-of-sequence token for padding when the tokenizer defines no pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Half precision and automatic device placement only when a GPU is present;
    # otherwise load in float32 with no device map.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
        low_cpu_mem_usage=True
    )

    print("Model loaded successfully!")
    print(f"Model device: {next(model.parameters()).device}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

except Exception as e:
    print(f"Error loading model: {e}")
    print("You may need to:")
    print("1. Install torch with CUDA support if you have a GPU")
    print("2. Ensure you have enough RAM/VRAM")
    print("3. Check your internet connection")
    # Re-raise after printing the hints. Swallowing the error let the notebook carry on
    # with `model`/`tokenizer` undefined (NameError in later cells) or, worse, with
    # stale objects from a previous run.
    raise

# Alternative: Instructions for Llama 3.2
print("\n" + "="*60)
print("To use Llama 3.2 (recommended for better performance):")
print("1. Create a Hugging Face account at https://huggingface.co/")
print("2. Request access to the Llama 3.2 model")
print("3. Install huggingface_hub: pip install huggingface_hub")
print("4. Login: huggingface-cli login")
print("5. Then change model_name to 'meta-llama/Llama-3.2-1B'")
print("="*60)

Loading model: gpt2
This model doesn't require authentication and should work out of the box.
Note: For better medical QA performance, you'd want to use Llama 3.2 with proper HF authentication.


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Model loaded successfully!
Model device: cpu
CUDA available: False
Model parameters: 124,439,808

To use Llama 3.2 (recommended for better performance):
1. Create a Hugging Face account at https://huggingface.co/
2. Request access to the Llama 3.2 model
3. Install huggingface_hub: pip install huggingface_hub
4. Login: huggingface-cli login
5. Then change model_name to 'meta-llama/Llama-3.2-1B'


In [12]:
def ask_llama(model, tokenizer, question, max_new_tokens=150):
    """
    Generate an answer to a medical question with a causal language model.

    Despite the name, nothing here is Llama-specific: the function only needs
    `tokenizer(...)`, `tokenizer.decode(...)`, `tokenizer.eos_token_id` and
    `model.generate(...)`.

    Args:
        model: Model object exposing `generate`; if it has a `device` attribute
            the tokenized inputs are moved to that device first.
        tokenizer: Tokenizer matching `model`.
        question: The question text inserted into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation as a stripped string, or the literal
        "Error: Could not generate response" if anything raised during
        tokenization, generation, or decoding (the error is printed).
    """
    # Create a more specific medical prompt
    prompt = f"""You are a helpful medical assistant. Answer the following medical question concisely and accurately.

Question: {question}
Answer:"""

    try:
        # Tokenize the single prompt. No padding is requested: one sequence never
        # needs it, and this avoids depending on the tokenizer having a pad token.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

        # Move to the same device as model
        if hasattr(model, 'device'):
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Remember how many tokens belong to the prompt so they can be cut off below
        prompt_length = inputs["input_ids"].shape[-1]

        # Sampling-based generation; gradients are not needed for inference
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1
            )

        # Decode only the newly generated tokens. Decoding prompt + continuation and
        # splitting on "Answer:" threw away most of the answer whenever the model
        # itself produced another "Answer:" in its continuation.
        new_token_ids = output_ids[0][prompt_length:]
        return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

    except Exception as e:
        print(f"Error generating response for question: {question[:50]}...")
        print(f"Error: {e}")
        return "Error: Could not generate response"

# ask_llama samples its output (do_sample=True), so seed PyTorch's random number
# generator first; without a seed the generated answers and every metric computed
# from them change on each run.
GENERATION_SEED = 42
torch.manual_seed(GENERATION_SEED)

# Smoke-test the function with one sample before the full run
if qa_samples:
    print("Testing the model with a sample question...")
    test_question = qa_samples[0]['question']
    test_answer = ask_llama(model, tokenizer, test_question)
    print(f"Question: {test_question}")
    print(f"Generated Answer: {test_answer}")
    print("---")

# Generate answers for all samples, keeping the question and reference answer
# alongside each model answer for the later evaluation cells
print(f"Generating answers for {len(qa_samples)} questions...")
results = []

for position, sample in enumerate(qa_samples, start=1):
    print(f"Processing question {position}/{len(qa_samples)}...")
    model_answer = ask_llama(model, tokenizer, sample['question'])
    results.append({
        'question': sample['question'],
        'reference': sample['answer'],
        'model_answer': model_answer
    })

print(f"Completed generating {len(results)} answers!")

Testing the model with a sample question...
Question: is it fine to exercise with knee pain?
Generated Answer: Yes, you may want surgery if your knees get hurt during this time of year when they should be at their optimal level for healing or recovery from arthritis disease (arthritis). You can try not using any other medications as well but don't overdo them unless there is good evidence that taking antibiotics will heal all aspects in one day! For example, I have been suffering joint problems since early February so far which has made me feel better than ever before because my back hurts quite much more now after having an operation last week - especially considering how hard work goes on everyday here...

 I've tried multiple different drugs daily while trying various treatments including Tylenol's anti-inflammatory medication Seroquel Zoster/Cortisone; however
---
Generating answers for 20 questions...
Processing question 1/20...
Question: is it fine to exercise with knee pain?
Gen

In [15]:
# Sentence-level BLEU of each model answer against its single reference answer.
# Both texts are lower-cased and tokenized with NLTK; method1 smoothing is applied.
smoother = SmoothingFunction().method1

bleu_scores = [
    sentence_bleu(
        [nltk.word_tokenize(item['reference'].lower())],
        nltk.word_tokenize(item['model_answer'].lower()),
        smoothing_function=smoother,
    )
    for item in results
]

# Average over all samples; 0 when there are no scores
if bleu_scores:
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
else:
    avg_bleu = 0
print(f"Average BLEU score over {len(results)} samples: {avg_bleu:.3f}")


Average BLEU score over 20 samples: 0.004


In [16]:
# Print the first five question / reference / model-answer triples with their BLEU score
for idx, item in enumerate(results[:5]):
    report_lines = [
        f"Question: {item['question']}",
        f"Reference: {item['reference']}",
        f"Model Answer: {item['model_answer']}",
        f"BLEU: {bleu_scores[idx]:.2f}",
        "-----",
    ]
    print("\n".join(report_lines))


Question: is it fine to exercise with knee pain?
Reference: from your description it appears that you may have anterior knee pain which sometimes presents as pain at the back of the knee. the second possibility is that you have over done your exercise and hamstrings are sore and the lower end of the knee cap is inflamed. either way you should rest and ice the area of pain and give it time. i suggest you add nsaids (nonsteroidal anti-inflammatory drugs) for a week for an additional anti-inflammatory action. for further information consult an orthopaedician and traumatologist online --> <link>
Model Answer: It depends on your position, but if you do not feel discomfort or need for rest during that time then don't try this!
BLEU: 0.00
-----
Question: suffering from anxiety restlessness and taking clonazepam & mirtazapine. i had depression & panic attacks for the last 7 years. need a second opinion.
Reference: depression anxiety restlessness and panic attacks are best respond to a combinat

In [17]:
## Comprehensive Evaluation Summary

import numpy as np
from collections import Counter
import re

# Report banner followed by the headline figures of the evaluation run
banner = "="*80
print(banner)
print("ZERO-SHOT MEDICAL QA EVALUATION RESULTS")
print(banner)

headline_lines = [
    f"\nDataset: iCliniq Medical Q&A",
    f"Model: {model_name}",
    f"Number of samples evaluated: {len(results)}",
    f"Average BLEU Score: {avg_bleu:.4f}",
]
for line in headline_lines:
    print(line)

# Additional metrics
def calculate_additional_metrics(results):
    """
    Compute simple length and vocabulary-overlap statistics per result.

    Args:
        results: Iterable of dicts with string values under the keys
            'model_answer' and 'reference'.

    Returns:
        A tuple of three lists, each aligned with `results`:
        - whitespace-separated word count of each model answer,
        - whitespace-separated word count of each reference,
        - fraction of the reference's distinct lower-cased words that also
          occur in the model answer (0 when the reference has no words).
    """
    answer_lengths, reference_lengths, overlap_scores = [], [], []

    for item in results:
        answer_text = item['model_answer']
        reference_text = item['reference']

        # Word counts based on plain whitespace splitting
        answer_lengths.append(len(answer_text.split()))
        reference_lengths.append(len(reference_text.split()))

        # Case-insensitive vocabulary overlap, normalised by the reference vocabulary
        answer_vocab = set(answer_text.lower().split())
        reference_vocab = set(reference_text.lower().split())
        if not reference_vocab:
            # Avoid dividing by zero for an empty reference
            overlap_scores.append(0)
            continue
        shared_vocab = answer_vocab & reference_vocab
        overlap_scores.append(len(shared_vocab) / len(reference_vocab))

    return answer_lengths, reference_lengths, overlap_scores

answer_lengths, reference_lengths, overlap_scores = calculate_additional_metrics(results)

# Length and overlap averages. Guarded because np.mean of an empty list yields nan
# with a RuntimeWarning.
print("\nAdditional Metrics:")
if results:
    print(f"Average model answer length: {np.mean(answer_lengths):.1f} words")
    print(f"Average reference answer length: {np.mean(reference_lengths):.1f} words")
    print(f"Average word overlap: {np.mean(overlap_scores):.3f}")
else:
    print("No results available.")

# Show distribution of BLEU scores. min()/max() raise ValueError on an empty list,
# so handle that case the same way the average BLEU computation does.
print("\nBLEU Score Distribution:")
if bleu_scores:
    print(f"Min BLEU: {min(bleu_scores):.4f}")
    print(f"Max BLEU: {max(bleu_scores):.4f}")
    print(f"Median BLEU: {np.median(bleu_scores):.4f}")
    print(f"Std BLEU: {np.std(bleu_scores):.4f}")
else:
    print("No BLEU scores available.")

print("\nNote: Low BLEU scores are expected for GPT-2 on medical QA without fine-tuning.")
print("For better performance, consider:")
print("1. Using Llama 3.2 with proper authentication")
print("2. Fine-tuning the model on medical data")
print("3. Using few-shot prompting with medical examples")

print("\n" + "="*80)

ZERO-SHOT MEDICAL QA EVALUATION RESULTS

Dataset: iCliniq Medical Q&A
Model: gpt2
Number of samples evaluated: 20
Average BLEU Score: 0.0041

Additional Metrics:
Average model answer length: 112.2 words
Average reference answer length: 139.1 words
Average word overlap: 0.218

BLEU Score Distribution:
Min BLEU: 0.0007
Max BLEU: 0.0073
Median BLEU: 0.0038
Std BLEU: 0.0017

Note: Low BLEU scores are expected for GPT-2 on medical QA without fine-tuning.
For better performance, consider:
1. Using Llama 3.2 with proper authentication
2. Fine-tuning the model on medical data
3. Using few-shot prompting with medical examples

