In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
import re
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

# Download required NLTK packages
nltk.download('punkt')
nltk.download('wordnet')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arija\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arija\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Function to preprocess text
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'\d+%', '', text)  # Remove percentages
    text = re.sub(r'[^\w\s]', ' ', text)  # Replace punctuation with space
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

In [3]:
# Function to calculate cosine similarity between two texts
def calculate_cosine_similarity(text1, text2):
    if pd.isna(text1) or pd.isna(text2) or text1 == "" or text2 == "":
        return 0.0
    
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    except:
        return 0.0

In [4]:
# Function to calculate BLEU score between two texts
def calculate_bleu_score(reference_text, candidate_text):
    if pd.isna(reference_text) or pd.isna(candidate_text) or reference_text == "" or candidate_text == "":
        return 0.0
    
    # Tokenize texts
    reference_tokens = nltk.word_tokenize(reference_text.lower())
    candidate_tokens = nltk.word_tokenize(candidate_text.lower())
    
    # Apply smoothing function for short texts
    smoothie = SmoothingFunction().method1
    
    try:
        # Calculate BLEU score with different n-gram weights
        bleu1 = sentence_bleu([reference_tokens], candidate_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothie)
        bleu2 = sentence_bleu([reference_tokens], candidate_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
        bleu3 = sentence_bleu([reference_tokens], candidate_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothie)
        bleu4 = sentence_bleu([reference_tokens], candidate_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
        
        # Average BLEU scores
        avg_bleu = (bleu1 + bleu2 + bleu3 + bleu4) / 4
        return avg_bleu
    except:
        return 0.0

In [5]:
# Function to calculate METEOR score between two texts
def calculate_meteor_score(reference_text, candidate_text):
    if pd.isna(reference_text) or pd.isna(candidate_text) or reference_text == "" or candidate_text == "":
        return 0.0
    
    # Tokenize texts
    reference_tokens = nltk.word_tokenize(reference_text.lower())
    candidate_tokens = nltk.word_tokenize(candidate_text.lower())
    
    try:
        return meteor_score([reference_tokens], candidate_tokens)
    except:
        return 0.0

In [6]:
# Function to calculate BERT score
def calculate_bert_score(reference_text, candidate_text):
    if pd.isna(reference_text) or pd.isna(candidate_text) or reference_text == "" or candidate_text == "":
        return 0.0
    
    # Load pre-trained BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    
    # Move model to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()
    
    # Tokenize and encode texts
    inputs1 = tokenizer(reference_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs2 = tokenizer(candidate_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    
    # Move inputs to the same device as model
    inputs1 = {k: v.to(device) for k, v in inputs1.items()}
    inputs2 = {k: v.to(device) for k, v in inputs2.items()}
    
    # Get embeddings
    with torch.no_grad():
        outputs1 = model(**inputs1)
        outputs2 = model(**inputs2)
    
    # Use CLS token embeddings for sentence representation
    embeddings1 = outputs1.last_hidden_state[:, 0, :].cpu().numpy()
    embeddings2 = outputs2.last_hidden_state[:, 0, :].cpu().numpy()
    
    # Calculate cosine similarity between embeddings
    similarity = cosine_similarity(embeddings1, embeddings2)[0][0]
    return similarity

In [7]:
import pandas as pd

# Load Excel file
df = pd.read_excel('30 sample penyakit - hasil prompt LLM.xlsx', sheet_name='Claude 3.5 Haiku', header=0)
df.head()


Unnamed: 0,No,Question,Answer,Full Answer,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16373,Unnamed: 16374,Unnamed: 16375,Unnamed: 16376,Unnamed: 16377,Unnamed: 16378,Unnamed: 16379,Unnamed: 16380,Unnamed: 16381,Unnamed: 16382
0,1.0,Pasien mengalami demam pada waktu malam. Sebel...,Demam Tifoid 70%,"Berdasarkan gejala yang disampaikan, saya akan...",,,,,,,...,,,,,,,,,,
1,,,Demam Berdarah Dengue 30%,,,,,,,,...,,,,,,,,,,
2,2.0,Pasien mengalami buang air besar cair lebih da...,Gastroenteritis (GE) Akut 70%,"Berdasarkan informasi klinis yang diberikan, s...",,,,,,,...,,,,,,,,,,
3,,,Intoleransi Laktosa 30%,,,,,,,,...,,,,,,,,,,
4,3.0,Pasien datang dengan keluhan demam selama 6 ha...,DBD 80%,"Berdasarkan informasi yang diberikan, saya aka...",,,,,,,...,,,,,,,,,,


In [8]:
# Drop unnecessary Rows and columns
df['No'] = df['No'].ffill()
df['Question'] = df['Question'].ffill()

# Group by 'No' and 'Question', and aggregate answers and full answers
df_grouped = df.groupby(['No', 'Question'], as_index=False).agg({
    'Answer': lambda x: '; '.join(x.dropna().astype(str)),
    'Full Answer': lambda x: '\n\n---\n\n'.join(x.dropna().astype(str))
})

# Display
print(df_grouped)


      No                                           Question  \
0    1.0  Pasien mengalami demam pada waktu malam. Sebel...   
1    2.0  Pasien mengalami buang air besar cair lebih da...   
2    3.0  Pasien datang dengan keluhan demam selama 6 ha...   
3    4.0  Pasien menderita demam disertai munculnya brun...   
4    5.0  Pasien nyeri dada sejak 4 jam lalu seperti ter...   
5    6.0  Pasien mengalami sesak nafas sejak siang. #Has...   
6    7.0  Pasien mengalami sesak napas sejak sehari lalu...   
7    8.0  Pasien mengalami sesak nafas sekitar 3 hari, t...   
8    9.0  Pasien nyeri ulu hati/perut bagian atas sejak ...   
9   10.0  Pasien merasa nyeri di ulu hati sampai terasa ...   
10  11.0  Pasien mengalami buang air besar cair 5 kali l...   
11  12.0  Pasien nyeri pinggang kanan sejak 6 jam lalu, ...   
12  13.0  Pasien mengalami nyeri perut sejak sehari lalu...   
13  14.0  Pasien mengalami nyeri perut, mulas, mual, mun...   
14  15.0  Pasien mengalami nyeri hebat di perut bawah (

In [9]:
# Load Excel file 
df1 = pd.read_excel('30 sample penyakit - hasil prompt LLM.xlsx', sheet_name='Qwen 2.5 72B', header=0)
df1.head()

Unnamed: 0,No,Question,Answer,Full Answer
0,1.0,Pasien mengalami demam pada waktu malam. Sebel...,Demam Biasa,"Berdasarkan gejala yang disampaikan, yaitu dem..."
1,,,Dispepsia,
2,,,Asma,
3,,,Infeksi Saluran Pernapasan Atas,
4,,,Infeksi Saluran Kemih,


In [10]:
# Drop unnecessary Rows and columns
df1['No'] = df1['No'].ffill()
df1['Question'] = df1['Question'].ffill()

# Group by 'No' and 'Question', and aggregate answers and full answers
df1_grouped = df1.groupby(['No', 'Question'], as_index=False).agg({
    'Answer': lambda x: '; '.join(x.dropna().astype(str)),
    'Full Answer': lambda x: '\n\n---\n\n'.join(x.dropna().astype(str))
})

# Display
print(df1_grouped)

      No                                           Question  \
0    1.0  Pasien mengalami demam pada waktu malam. Sebel...   
1    2.0  Pasien mengalami buang air besar cair lebih da...   
2    3.0  Pasien datang dengan keluhan demam selama 6 ha...   
3    4.0  Pasien menderita demam disertai munculnya brun...   
4    5.0  Pasien nyeri dada sejak 4 jam lalu seperti ter...   
5    6.0  Pasien mengalami sesak nafas sejak siang. #Has...   
6    7.0  Pasien mengalami sesak napas sejak sehari lalu...   
7    8.0  Pasien mengalami sesak nafas sekitar 3 hari, t...   
8    9.0  Pasien nyeri ulu hati/perut bagian atas sejak ...   
9   10.0  Pasien merasa nyeri di ulu hati sampai terasa ...   
10  11.0  Pasien mengalami buang air besar cair 5 kali l...   
11  12.0  Pasien nyeri pinggang kanan sejak 6 jam lalu, ...   
12  13.0  Pasien mengalami nyeri perut sejak sehari lalu...   
13  14.0  Pasien mengalami nyeri perut, mulas, mual, mun...   
14  15.0  Pasien mengalami nyeri hebat di perut bawah, 

In [11]:
# Comparing Claude 3.5 Haiku with Qwen AI
# Normalize the data and make sure both dataframes have matching entries

# Create copies to avoid modifying the original dataframes
claude_df = df_grouped.copy()
qwen_df = df1_grouped.copy()

# Create a mapping function to match the standardized disease names
def standardize_disease_name(name):
    name = name.lower().strip()
    # Common variations of disease names to standardize
    mapping = {
        'dbd': 'demam berdarah dengue',
        'dengue fever': 'demam berdarah dengue',
        'demam dengue': 'demam berdarah dengue',
        'gastroenteritis akut': 'gastroenteritis',
        'gastroenteritis (ge) akut': 'gastroenteritis',
        'ge akut': 'gastroenteritis',
        'infeksi saluran pernapasan atas': 'ispa',
        'reflux gastroesofagus': 'gerd',
        'reflux asam lambung': 'gerd',
        'refleks asam lambung': 'gerd',
        'gastroesophageal reflux disease': 'gerd',
        'refleks gastroesofagus': 'gerd',
        'maag': 'gastritis',
        'gastritis akut': 'gastritis',
        'penyakit maag': 'gastritis',
        'penyakit maag akut': 'gastritis',
        'infark miokard akut': 'serangan jantung',
        'asma bronkial': 'asma',
        'asma exacerbation': 'asma',
        'angina pektoris': 'angina',
        'vulnus laceratum': 'luka robek',
        'vulnus excoriatum': 'luka lecet',
        'cedera kepala ringan': 'ckr',
        'kepala cedera ringan': 'ckr'
    }
    
    # Apply mapping if available
    for key, value in mapping.items():
        if key in name:
            return value
    
    # Remove common words that don't affect the diagnosis
    name = re.sub(r'\d+%', '', name)  # Remove percentages
    return name

In [12]:
def parse_diagnoses(answer_text):
    if pd.isna(answer_text):
        return []
    
    # Split by semicolon and process each diagnosis
    diagnoses = []
    for item in answer_text.split(';'):
        # Extract diagnosis name and remove percentage if present
        match = re.search(r'(.*?)(?:\s+\d+%)?$', item.strip())
        if match and match.group(1).strip():
            diagnoses.append(standardize_disease_name(match.group(1).strip()))
    
    return diagnoses

In [13]:
# Apply parsing to both dataframes
claude_df['parsed_diagnoses'] = claude_df['Answer'].apply(parse_diagnoses)
qwen_df['parsed_diagnoses'] = qwen_df['Answer'].apply(parse_diagnoses)

In [14]:
# Create a function to calculate diagnosis match rate
def calculate_diagnosis_match(claude_diagnoses, qwen_diagnoses):
    if not claude_diagnoses or not qwen_diagnoses:
        return 0.0
    
    # Count matches
    matches = sum(1 for d in qwen_diagnoses if any(std_d == standardize_disease_name(d) for std_d in claude_diagnoses))
    
    # Calculate precision: matches / qwen predictions
    precision = matches / len(qwen_diagnoses) if qwen_diagnoses else 0
    
    # Calculate recall: matches / claude diagnoses (ground truth)
    recall = matches / len(claude_diagnoses) if claude_diagnoses else 0
    
    # Calculate F1 score
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'matched': matches,
        'total_ground_truth': len(claude_diagnoses),
        'total_predictions': len(qwen_diagnoses)
    }

In [15]:
# Choose the best Qwen answer out of 5 candidates by evaluating against both Claude answers
def select_best_qwen_answer(qwen_answers, claude_answers, claude_scores):
    best_score = -1
    best_answer = ''
    
    for qwen_ans in qwen_answers:
        score = 0
        for ref_ans, weight in zip(claude_answers, claude_scores):
            score += calculate_cosine_similarity(preprocess_text(ref_ans), preprocess_text(qwen_ans)) * weight
        if score > best_score:
            best_score = score
            best_answer = qwen_ans
    return best_answer

In [16]:
# Load full Excel file
xlsx_path = '30 sample penyakit - hasil prompt LLM.xlsx'
claude_df = pd.read_excel(xlsx_path, sheet_name='Claude 3.5 Haiku')
qwen_df = pd.read_excel(xlsx_path, sheet_name='Qwen 2.5 72B')

# Fill NaNs
claude_df['No'] = claude_df['No'].ffill()
claude_df['Question'] = claude_df['Question'].ffill()
qwen_df['No'] = qwen_df['No'].ffill()
qwen_df['Question'] = qwen_df['Question'].ffill()

# Group Claude and Qwen answers by question
claude_grouped = claude_df.groupby(['No', 'Question'])
qwen_grouped = qwen_df.groupby(['No', 'Question'])

results = []

In [20]:
# find any missing keys up front
claude_keys = set(claude_grouped.groups)
qwen_keys   = set(qwen_grouped.groups)
missing     = claude_keys - qwen_keys
if missing:
    print(f"Skipping {len(missing)} keys missing in Qwen:", missing)

Skipping 5 keys missing in Qwen: {(4.0, 'Pasien menderita demam disertai munculnya bruntus kemerahan dan panas di telapak tangan kiri, lengan kiri dan leher kiri. Lengan nyeri, kaku dan sulit diangkat (apakah kemerahan juga ada di lengan?). Sudah berobat tapi belum ada membaik. #Hasil pemeriksaan TD = 108/88 mmHg, R = 20 brpm,  S= 38 C, Status generalis = batas normal, status lokalis = leher kiri, bahu kiri dan telapak tangan kiri (tampak apa? seharusnya tampak kemerahan dsb). Pasien didiagnosis penyakit apa?'), (7.0, 'Pasien mengalami sesak napas sejak sehari lalu, batuk berdahak, pilek, dan buang air kecil lebih dari 10 kali di malam hari. Pasien sudah  mengonsumsi obat tapi belum ada perubahan. #Hasil pemeriksaan TD = 135/70 mmHg, S = 36,5°C, RR= 24 x/mnt, N= 75 x/mnt, dan Sp02= 95%. Pasien didiagnosis penyakit apa?\n'), (30.0, 'Pasien terkena petasan di tangan kiri (bisa dispesifikkan bentuk luka selain luka bakar, apakah lecet/robek/terbuka/tembus?). Pasien didiagnosis penyakit ap

In [21]:
for key in tqdm(claude_grouped.groups.keys()):
    no, question = key
    try:
        # skip keys that Qwen never produced
        if key not in qwen_grouped.groups:
            print(f"Skipping question {no!r}: not found in Qwen output")
            continue

        qwen_rows = qwen_grouped.get_group(key)
        claude_rows = claude_grouped.get_group(key)
        
        # Extract two Claude answers and estimate their scores from text
        def extract_score_from_text(text):
            match = re.search(r'(\\d+)%', text)
            return float(match.group(1)) / 100 if match else 0.5  # default to 50% if not found

        claude_answers_raw = claude_rows['Answer'].tolist()[:2]
        claude_answers = [re.sub(r'\\(\\d+%\\)', '', ans).strip() for ans in claude_answers_raw]  # remove score text
        claude_scores = [extract_score_from_text(ans) for ans in claude_answers_raw]

        # Normalize scores
        total = sum(claude_scores)
        claude_scores = [s / total for s in claude_scores] if total else [0.5, 0.5]

        # Normalize scores if sum is not 1
        total = sum(claude_scores)
        if total != 0:
            claude_scores = [s / total for s in claude_scores]
        
        # Extract five Qwen answers
        qwen_answers = [str(ans) for ans in qwen_rows['Answer'].tolist()[:5]]
        qwen_best = select_best_qwen_answer(qwen_answers, claude_answers, claude_scores)

        # Get full answers for metrics
        full_claude_text = ' '.join([str(x) for x in claude_rows['Full Answer'].tolist()[:2]])
        full_qwen_text = ' '.join([str(x) for x in qwen_rows[qwen_rows['Answer'] == qwen_best]['Full Answer'].tolist()])


        metrics = {
            'No': no,
            'Question': question,
            'Selected_Qwen_Answer': qwen_best,
            'Answer_Cosine_Similarity': sum(
                calculate_cosine_similarity(preprocess_text(ref), preprocess_text(qwen_best)) * w
                for ref, w in zip(claude_answers, claude_scores)
            ),
            'Full_Answer_Cosine_Similarity': calculate_cosine_similarity(preprocess_text(full_claude_text), preprocess_text(full_qwen_text)),
            'BLEU': sum(
                calculate_bleu_score(preprocess_text(ref), preprocess_text(qwen_best)) * w
                for ref, w in zip(claude_answers, claude_scores)
            ),
            'METEOR': sum(
                calculate_meteor_score(preprocess_text(ref), preprocess_text(qwen_best)) * w
                for ref, w in zip(claude_answers, claude_scores)
            ),
            'BERTScore': sum(
                calculate_bert_score(preprocess_text(ref), preprocess_text(qwen_best)) * w
                for ref, w in zip(claude_answers, claude_scores)
            )
        }

        results.append(metrics)
    except Exception as e:
        print(f"Error processing question (‘{question}’): {e!r}")
        import traceback; traceback.print_exc()
        continue

results_df = pd.DataFrame(results)
results_df.head()

  7%|▋         | 3/45 [00:07<01:31,  2.18s/it]

Skipping question 4.0: not found in Qwen output


 11%|█         | 5/45 [00:09<01:05,  1.65s/it]

Skipping question 6.0: not found in Qwen output
Skipping question 7.0: not found in Qwen output


 31%|███       | 14/45 [00:27<01:11,  2.31s/it]

Skipping question 15.0: not found in Qwen output


 64%|██████▍   | 29/45 [00:54<00:28,  1.79s/it]

Skipping question 30.0: not found in Qwen output


100%|██████████| 45/45 [01:31<00:00,  2.04s/it]


Unnamed: 0,No,Question,Selected_Qwen_Answer,Answer_Cosine_Similarity,Full_Answer_Cosine_Similarity,BLEU,METEOR,BERTScore
0,1.0,Pasien mengalami demam pada waktu malam. Sebel...,Demam Biasa,0.298326,0.385307,0.210292,0.211207,0.92384
1,2.0,Pasien mengalami buang air besar cair lebih da...,Intoleransi Laktosa,0.5,0.037473,0.347995,0.46875,0.918498
2,3.0,Pasien datang dengan keluhan demam selama 6 ha...,Demam Biasa,0.0,0.405296,0.0,0.0,0.875557
3,5.0,Pasien nyeri dada sejak 4 jam lalu seperti ter...,Infark Miokard Akut (Serangan Jantung),0.5,0.41024,0.5,0.498,0.940037
4,8.0,"Pasien mengalami sesak nafas sekitar 3 hari, t...",Asma,0.289869,0.468026,0.078764,0.131579,0.915376


In [22]:
# Create results DataFrame
results_df = pd.DataFrame(results)

# Show average scores
avg_metrics = {
    'Average_Answer_Cosine_Similarity': results_df['Answer_Cosine_Similarity'].mean(),
    'Average_Full_Answer_Cosine_Similarity': results_df['Full_Answer_Cosine_Similarity'].mean(),
    'Average_BLEU': results_df['BLEU'].mean(),
    'Average_METEOR': results_df['METEOR'].mean(),
    'Average_BERTScore': results_df['BERTScore'].mean()
}
print("Average Metrics:")
for k, v in avg_metrics.items():
    print(f"{k}: {v:.4f}")


Average Metrics:
Average_Answer_Cosine_Similarity: 0.3177
Average_Full_Answer_Cosine_Similarity: 0.2556
Average_BLEU: 0.2003
Average_METEOR: 0.2714
Average_BERTScore: 0.8907
