In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
import torch
from transformers import BertTokenizer, BertModel

# Download required NLTK packages
nltk.download('punkt')
nltk.download('wordnet')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arija\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arija\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Function to preprocess text
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'\d+%', '', text)  # Remove percentages
    text = re.sub(r'[^\w\s]', ' ', text)  # Replace punctuation with space
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

In [3]:
# Function to calculate cosine similarity between two texts
def calculate_cosine_similarity(text1, text2):
    if pd.isna(text1) or pd.isna(text2) or text1 == "" or text2 == "":
        return 0.0
    
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    except:
        return 0.0

# Function to calculate BLEU score between two texts
def calculate_bleu_score(reference_text, candidate_text):
    if pd.isna(reference_text) or pd.isna(candidate_text) or reference_text == "" or candidate_text == "":
        return 0.0
    
    # Tokenize texts
    reference_tokens = nltk.word_tokenize(reference_text.lower())
    candidate_tokens = nltk.word_tokenize(candidate_text.lower())
    
    # Apply smoothing function for short texts
    smoothie = SmoothingFunction().method1
    
    try:
        # Calculate BLEU score with different n-gram weights
        bleu1 = sentence_bleu([reference_tokens], candidate_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothie)
        bleu2 = sentence_bleu([reference_tokens], candidate_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
        bleu3 = sentence_bleu([reference_tokens], candidate_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothie)
        bleu4 = sentence_bleu([reference_tokens], candidate_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
        
        # Average BLEU scores
        avg_bleu = (bleu1 + bleu2 + bleu3 + bleu4) / 4
        return avg_bleu
    except:
        return 0.0

# Function to calculate METEOR score between two texts
def calculate_meteor_score(reference_text, candidate_text):
    if pd.isna(reference_text) or pd.isna(candidate_text) or reference_text == "" or candidate_text == "":
        return 0.0
    
    # Tokenize texts
    reference_tokens = nltk.word_tokenize(reference_text.lower())
    candidate_tokens = nltk.word_tokenize(candidate_text.lower())
    
    try:
        return meteor_score([reference_tokens], candidate_tokens)
    except:
        return 0.0

# Function to calculate BERT score
def calculate_bert_score(reference_text, candidate_text):
    if pd.isna(reference_text) or pd.isna(candidate_text) or reference_text == "" or candidate_text == "":
        return 0.0
    
    # Load pre-trained BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    
    # Move model to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()
    
    # Tokenize and encode texts
    inputs1 = tokenizer(reference_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs2 = tokenizer(candidate_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    
    # Move inputs to the same device as model
    inputs1 = {k: v.to(device) for k, v in inputs1.items()}
    inputs2 = {k: v.to(device) for k, v in inputs2.items()}
    
    # Get embeddings
    with torch.no_grad():
        outputs1 = model(**inputs1)
        outputs2 = model(**inputs2)
    
    # Use CLS token embeddings for sentence representation
    embeddings1 = outputs1.last_hidden_state[:, 0, :].cpu().numpy()
    embeddings2 = outputs2.last_hidden_state[:, 0, :].cpu().numpy()
    
    # Calculate cosine similarity between embeddings
    similarity = cosine_similarity(embeddings1, embeddings2)[0][0]
    return similarity

In [4]:
# Improved function to standardize disease names
def standardize_disease_name(name):
    if pd.isna(name):
        return ""
    
    name = str(name).lower().strip()
    # Remove percentages
    name = re.sub(r'\d+%', '', name).strip()
    
    # Common variations of disease names to standardize - EXPANDED
    mapping = {
        # Demam (Fever) related
        'dbd': 'demam berdarah dengue',
        'dengue fever': 'demam berdarah dengue',
        'demam dengue': 'demam berdarah dengue',
        'dengue': 'demam berdarah dengue',
        'demam berdarah': 'demam berdarah dengue',
        'demam biasa': 'common fever',
        'common fever': 'common fever',
        'demam umum': 'common fever',
        
        # Gastro related
        'gastroenteritis akut': 'gastroenteritis',
        'gastroenteritis (ge) akut': 'gastroenteritis',
        'ge akut': 'gastroenteritis',
        'gastroenteritis': 'gastroenteritis',
        'diare akut': 'gastroenteritis',
        
        # Respiratory related
        'infeksi saluran pernapasan atas': 'ispa',
        'ispa': 'ispa',
        'infeksi saluran napas atas': 'ispa',
        'infeksi saluran pernafasan atas': 'ispa',
        'infeksi saluran pernapasan': 'ispa',
        
        # GERD related
        'reflux gastroesofagus': 'gerd',
        'reflux asam lambung': 'gerd',
        'refleks asam lambung': 'gerd',
        'gastroesophageal reflux disease': 'gerd',
        'refleks gastroesofagus': 'gerd',
        'gerd': 'gerd',
        
        # Gastritis related
        'maag': 'gastritis',
        'gastritis akut': 'gastritis',
        'penyakit maag': 'gastritis',
        'penyakit maag akut': 'gastritis',
        'gastritis': 'gastritis',
        
        # Heart related
        'infark miokard akut': 'serangan jantung',
        'serangan jantung': 'serangan jantung',
        'angina pektoris': 'angina',
        'angina': 'angina',
        
        # Asthma related
        'asma bronkial': 'asma',
        'asma exacerbation': 'asma',
        'asma': 'asma',
        'pemburukan asma': 'asma',
        
        # Bronchitis related
        'bronkitis akut': 'bronkitis',
        'bronkitis': 'bronkitis',
        
        # Wound related
        'vulnus laceratum': 'luka robek',
        'luka robek': 'luka robek',
        'luka terbuka': 'luka robek',
        'laceration': 'luka robek',
        'vulnus excoriatum': 'luka lecet',
        'luka lecet': 'luka lecet',
        
        # Head injury related
        'cedera kepala ringan': 'ckr',
        'kepala cedera ringan': 'ckr',
        'ckr': 'ckr',
        
        # Dyspepsia related
        'dispepsia': 'dispepsia',
        'dispepsia fungsional': 'dispepsia',
        
        # Appendicitis related
        'appendisitis akut': 'appendisitis',
        'appendisitis': 'appendisitis',
        'apendisitis': 'appendisitis',
        
        # UTI related
        'infeksi saluran kemih': 'isk',
        'isk': 'isk',
        'infeksi saluran kemih akut': 'isk',
        
        # Additional mappings
        'intoleransi laktosa': 'intoleransi laktosa',
        'pneumonia': 'pneumonia',
        'hipertensi': 'hipertensi',
        'hipertensi akut': 'hipertensi',
        'hipertensi stage 1': 'hipertensi',
        'luka bakar': 'luka bakar',
        'burn injury': 'luka bakar',
        'tonsilitis': 'tonsilitis',
        'faringitis': 'faringitis',
        'vertigo': 'vertigo',
        'benign paroxysmal positional vertigo': 'vertigo',
        'bppv': 'vertigo',
        'influenza': 'influenza',
        'kolik renal': 'kolik renal',
        'batu ginjal': 'batu ginjal',
        'peritonitis': 'peritonitis'
    }
    
    # Apply mapping if available - using partial matching for more flexibility
    for key, value in mapping.items():
        if key in name:
            return value
    
    # If no mapping found, return the cleaned name
    return name

# Function to extract percentage from diagnosis text
def extract_percentage(text):
    match = re.search(r'(\d+)%', text)
    return float(match.group(1))/100 if match else 0.5  # Default to 50% if not found

In [5]:
# Function to parse diagnoses from raw answer text
def parse_diagnoses(answer_text):
    if pd.isna(answer_text):
        return []
    
    # Split by semicolon and process each diagnosis
    diagnoses = []
    percentages = []
    
    for item in answer_text.split(';'):
        item = item.strip()
        if not item:
            continue
            
        # Extract diagnosis name and percentage 
        match = re.search(r'(.*?)(?:\s+(\d+)%)?$', item)
        if match and match.group(1).strip():
            diagnoses.append(standardize_disease_name(match.group(1).strip()))
            percentages.append(extract_percentage(item))
    
    # Create a list of tuples (diagnosis, percentage)
    return list(zip(diagnoses, percentages))

# Main processing function to analyze and evaluate diagnoses
def process_medical_diagnoses():
    # Load Excel file with both models' outputs
    xlsx_path = '30 sample penyakit - hasil prompt LLM.xlsx'
    
    try:
        # Load raw data from both sheets
        claude_raw = pd.read_excel(xlsx_path, sheet_name='Claude 3.5 Haiku')
        qwen_raw = pd.read_excel(xlsx_path, sheet_name='Qwen 2.5 72B')
        
        # Fill forward the question IDs and questions
        claude_raw['No'] = claude_raw['No'].ffill()
        claude_raw['Question'] = claude_raw['Question'].ffill()
        qwen_raw['No'] = qwen_raw['No'].ffill()
        qwen_raw['Question'] = qwen_raw['Question'].ffill()
        
        # Group by question to organize multiple answers per question
        claude_grouped = claude_raw.groupby(['No', 'Question'])
        qwen_grouped = qwen_raw.groupby(['No', 'Question'])
        
        # Create the unified question list - ensure we have all questions
        all_questions = list(set(claude_grouped.groups.keys()) | set(qwen_grouped.groups.keys()))
        all_questions.sort(key=lambda x: float(x[0]) if x[0] is not None and not pd.isna(x[0]) else float('inf'))
        
        # Results container
        results = []
        
        # Process each question
        for idx, (no, question) in enumerate(all_questions):
            try:
                print(f"Processing question {idx+1}/{len(all_questions)}: {no}")
                
                # Get Claude answers (ground truth)
                if (no, question) in claude_grouped.groups:
                    claude_rows = claude_grouped.get_group((no, question))
                    claude_answers = claude_rows['Answer'].tolist()
                    claude_diagnoses = []
                    
                    # Parse all Claude answers to get diagnoses with percentages
                    for ans in claude_answers:
                        if not pd.isna(ans):
                            claude_diagnoses.extend(parse_diagnoses(ans))
                    
                    # Get the full text explanations
                    claude_explanations = [text for text in claude_rows['Full Answer'].tolist() if not pd.isna(text)]
                    claude_full_text = ' '.join(claude_explanations)
                else:
                    # Skip if no ground truth available
                    print(f"Warning: No Claude data for question {no}")
                    continue
                
                # Get Qwen candidate answers
                if (no, question) in qwen_grouped.groups:
                    qwen_rows = qwen_grouped.get_group((no, question))
                    qwen_answers = [ans for ans in qwen_rows['Answer'].tolist() if not pd.isna(ans)]
                    
                    # Select the best matching Qwen answer
                    best_qwen, similarity_score = select_best_qwen_answer(qwen_answers, claude_diagnoses)
                    
                    # Parse the best Qwen answer
                    qwen_diagnoses = parse_diagnoses(best_qwen)
                    
                    # Get Qwen full text matching the best answer
                    qwen_explanation_rows = qwen_rows[qwen_rows['Answer'] == best_qwen]
                    qwen_explanations = [text for text in qwen_explanation_rows['Full Answer'].tolist() if not pd.isna(text)]
                    qwen_full_text = ' '.join(qwen_explanations) if qwen_explanations else ""
                else:
                    print(f"Warning: No Qwen data for question {no}")
                    best_qwen = ""
                    similarity_score = 0.0
                    qwen_diagnoses = []
                    qwen_full_text = ""
                
                # Calculate diagnosis match metrics
                diagnosis_metrics = calculate_diagnosis_match_metrics(claude_diagnoses, qwen_diagnoses)
                
                # Calculate NLP similarity metrics
                # For answer (diagnosis names only)
                claude_answer_text = '; '.join([f"{d} {int(p*100)}%" for d, p in claude_diagnoses])
                
                nlp_metrics = {
                    'answer_cosine_similarity': calculate_cosine_similarity(
                        preprocess_text(claude_answer_text), 
                        preprocess_text(best_qwen)
                    ),
                    'full_answer_cosine_similarity': calculate_cosine_similarity(
                        preprocess_text(claude_full_text), 
                        preprocess_text(qwen_full_text)
                    ),
                    'bleu_score': calculate_bleu_score(
                        preprocess_text(claude_answer_text), 
                        preprocess_text(best_qwen)
                    ),
                    'meteor_score': calculate_meteor_score(
                        preprocess_text(claude_answer_text), 
                        preprocess_text(best_qwen)
                    ),
                    'bert_score': calculate_bert_score(
                        preprocess_text(claude_answer_text), 
                        preprocess_text(best_qwen)
                    )
                }
                
                # Combined all metrics into one result entry
                result = {
                    'No': no,
                    'Question': question,
                    'Claude_Diagnoses': claude_answer_text,
                    'Selected_Qwen_Answer': best_qwen,
                    'Similarity_Score': similarity_score,
                    **diagnosis_metrics,
                    **nlp_metrics
                }
                
                results.append(result)
                
            except Exception as e:
                print(f"Error processing question {no}: {str(e)}")
                continue
        
        # Create final results dataframe
        results_df = pd.DataFrame(results)
        
        # Calculate overall metrics
        overall_metrics = {
            'total_questions': len(results),
            'average_precision': results_df['precision'].mean(),
            'average_recall': results_df['recall'].mean(),
            'average_f1_score': results_df['f1_score'].mean(),
            'average_answer_cosine_similarity': results_df['answer_cosine_similarity'].mean(),
            'average_full_answer_cosine_similarity': results_df['full_answer_cosine_similarity'].mean(),
            'average_bleu_score': results_df['bleu_score'].mean(),
            'average_meteor_score': results_df['meteor_score'].mean(),
            'average_bert_score': results_df['bert_score'].mean()
        }
        
        return results_df, overall_metrics
    
    except Exception as e:
        print(f"Error processing Excel file: {str(e)}")
        return None, None

In [6]:
# Function to find the best matching Qwen answer for a given Claude answer
def select_best_qwen_answer(qwen_answers, claude_diagnoses):
    """
    Selects the best matching Qwen answer based on overlap with Claude diagnoses
    
    Args:
        qwen_answers: List of Qwen answer strings
        claude_diagnoses: List of (diagnosis, score) tuples from Claude
    
    Returns:
        best_answer: The Qwen answer with highest similarity score
    """
    if not qwen_answers or not claude_diagnoses:
        return ""
    
    best_score = -1
    best_answer = ""
    
    claude_diseases = [d[0] for d in claude_diagnoses]
    
    for qwen_ans in qwen_answers:
        # Parse the current Qwen answer
        qwen_parsed = parse_diagnoses(qwen_ans)
        qwen_diseases = [d[0] for d in qwen_parsed]
        
        # Calculate overlap score - how many diseases match
        match_count = sum(1 for d in qwen_diseases if d in claude_diseases)
        match_score = match_count / max(len(claude_diseases), len(qwen_diseases), 1)
        
        # Also consider text similarity as fallback
        text_sim = calculate_cosine_similarity(
            preprocess_text(qwen_ans), 
            preprocess_text('; '.join([d for d, _ in claude_diagnoses]))
        )
        
        # Combined score favoring matching diseases but also considering text similarity
        combined_score = (match_score * 0.7) + (text_sim * 0.3)
        
        if combined_score > best_score:
            best_score = combined_score
            best_answer = qwen_ans
            
    return best_answer, best_score

In [7]:
# Function to calculate diagnosis matching metrics
def calculate_diagnosis_match_metrics(claude_diagnoses, qwen_diagnoses):
    """
    Calculate precision, recall, and F1 score for disease matching
    
    Args:
        claude_diagnoses: List of (disease, score) tuples from Claude
        qwen_diagnoses: List of (disease, score) tuples from Qwen
    
    Returns:
        dict with precision, recall, f1_score, etc.
    """
    if not claude_diagnoses or not qwen_diagnoses:
        return {
            'precision': 0.0,
            'recall': 0.0,
            'f1_score': 0.0,
            'matched': 0,
            'total_ground_truth': len(claude_diagnoses),
            'total_predictions': len(qwen_diagnoses)
        }
    
    # Extract just the disease names
    claude_diseases = [d[0] for d in claude_diagnoses]
    qwen_diseases = [d[0] for d in qwen_diagnoses]
    
    # Count matches
    matches = sum(1 for d in qwen_diseases if d in claude_diseases)
    
    # Calculate precision: matches / qwen predictions
    precision = matches / len(qwen_diseases) if qwen_diseases else 0
    
    # Calculate recall: matches / claude diagnoses (ground truth)
    recall = matches / len(claude_diseases) if claude_diseases else 0
    
    # Calculate F1 score
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'matched': matches,
        'total_ground_truth': len(claude_diseases),
        'total_predictions': len(qwen_diseases)
    }

In [8]:
# Function to run the full evaluation and display results
def main():
    print("Starting medical diagnosis evaluation...")
    results_df, overall_metrics = process_medical_diagnoses()
    
    if results_df is not None:
        # Display summary statistics
        print("\n=== OVERALL EVALUATION METRICS ===")
        for metric, value in overall_metrics.items():
            print(f"{metric.replace('_', ' ').title()}: {value:.4f}")
        
        # Display per-question results
        print("\n=== TOP 5 QUESTIONS WITH HIGHEST F1 SCORES ===")
        top_questions = results_df.sort_values('f1_score', ascending=False).head(5)
        for _, row in top_questions.iterrows():
            print(f"Question {row['No']}: F1 = {row['f1_score']:.4f}, Precision = {row['precision']:.4f}, Recall = {row['recall']:.4f}")
            print(f"  Claude: {row['Claude_Diagnoses']}")
            print(f"  Qwen: {row['Selected_Qwen_Answer']}")
            print()
        
        # Display questions with lowest scores
        print("\n=== QUESTIONS WITH LOWEST F1 SCORES ===")
        bottom_questions = results_df.sort_values('f1_score').head(5)
        for _, row in bottom_questions.iterrows():
            print(f"Question {row['No']}: F1 = {row['f1_score']:.4f}, Precision = {row['precision']:.4f}, Recall = {row['recall']:.4f}")
            print(f"  Claude: {row['Claude_Diagnoses']}")
            print(f"  Qwen: {row['Selected_Qwen_Answer']}")
            print()
        
        # Save results to CSV 
        results_df.to_csv('medical_diagnosis_evaluation_results.csv', index=False)
        print("\nResults saved to medical_diagnosis_evaluation_results.csv")
    else:
        print("Error: Evaluation failed.")

if __name__ == "__main__":
    main()

Starting medical diagnosis evaluation...
Processing question 1/46: 1.0
Processing question 2/46: 2.0
Processing question 3/46: 3.0
Processing question 4/46: 4.0
Processing question 5/46: 5.0
Processing question 6/46: 6.0
Processing question 7/46: 7.0
Processing question 8/46: 7.0
Processing question 9/46: 8.0
Processing question 10/46: 9.0
Processing question 11/46: 10.0
Processing question 12/46: 11.0
Processing question 13/46: 12.0
Processing question 14/46: 13.0
Processing question 15/46: 14.0
Processing question 16/46: 15.0
Processing question 17/46: 16.0
Processing question 18/46: 17.0
Processing question 19/46: 18.0
Processing question 20/46: 19.0
Processing question 21/46: 20.0
Processing question 22/46: 21.0
Processing question 23/46: 22.0
Processing question 24/46: 23.0
Processing question 25/46: 24.0
Processing question 26/46: 25.0
Processing question 27/46: 26.0
Processing question 28/46: 27.0
Processing question 29/46: 28.0
Processing question 30/46: 29.0
Processing questio

In [1]:
"""
Analysis of Low Similarity Scores in Medical Diagnosis Evaluation

Summary of Scores
From the evaluation results, we see:
- Average Precision: 0.7778 (good)
- Average Recall: 0.3926 (low)
- Average F1 Score: 0.5074 (moderate)
- Text-based metrics:
  - Answer Cosine Similarity: 0.3470 (low)
  - BLEU Score: 0.1485 (very low)
  - METEOR Score: 0.2462 (low)
  - BERT Score: 0.8331 (good)

Why Are Some Scores Low?
1. Limitations of Text-Based Metrics

Text similarity metrics like BLEU, METEOR, and cosine similarity have inherent limitations for this task:

- BLEU (0.1485) measures n-gram overlap, which is very sensitive to word order and exact matches
- METEOR (0.2462) performs better but still depends on exact or stemmed word matches
- Cosine similarity (0.3470) works with bag-of-words but loses semantic relationships

Only BERT Score (0.8331) performs well because it's based on contextual embeddings that capture semantic meaning.

2. Medical Terminology Variation

Even with improved standardization, medical terms can be expressed in many ways:

```
# Example from Question 28.0:
Ground Truth: "ckr 90%; luka robek 100%; gangguan keseimbangan neurologis lain 70%"
Qwen: "Benign Paroxysmal Positional Vertigo (BPPV)"
```

BPPV is a specific type of neurological balance disorder, but our mapping doesn't capture this relationship.

3. Multilingual Context

The dataset contains a mix of Indonesian and English medical terms:
- "demam berdarah dengue" vs. "dengue fever"
- "luka robek" vs. "laceration"

These variations make text-based matching more challenging.

## Suggestions for Improving Scores

1. **Enhanced disease mapping**:
   - Add more semantic relationships between medical conditions
   - Map "BPPV" to "gangguan keseimbangan neurologis"
   - Map "Infeksi Lokal" to various infection types

2. **Weighted evaluation metrics**:
   - Consider the confidence scores when calculating matches
   - A diagnosis with 80% confidence should count more than one with 30%

3. **Domain-specific evaluation**:
   - Develop a medical ontology-based similarity metric
   - Use hierarchical relationships (e.g., BPPV is a subtype of balance disorder)

4. **Improve dataset alignment**:
   - Fix question format inconsistencies
   - Ensure both models have answers for all questions
"""

'\nAnalysis of Low Similarity Scores in Medical Diagnosis Evaluation\n\nSummary of Scores\nFrom the evaluation results, we see:\n- Average Precision: 0.7778 (good)\n- Average Recall: 0.3926 (low)\n- Average F1 Score: 0.5074 (moderate)\n- Text-based metrics:\n  - Answer Cosine Similarity: 0.3470 (low)\n  - BLEU Score: 0.1485 (very low)\n  - METEOR Score: 0.2462 (low)\n  - BERT Score: 0.8331 (good)\n\nWhy Are Some Scores Low?\n1. Limitations of Text-Based Metrics\n\nText similarity metrics like BLEU, METEOR, and cosine similarity have inherent limitations for this task:\n\n- BLEU (0.1485) measures n-gram overlap, which is very sensitive to word order and exact matches\n- METEOR (0.2462) performs better but still depends on exact or stemmed word matches\n- Cosine similarity (0.3470) works with bag-of-words but loses semantic relationships\n\nOnly BERT Score (0.8331) performs well because it\'s based on contextual embeddings that capture semantic meaning.\n\n2. Medical Terminology Variatio