In [1]:
import pandas as pd
import numpy as np
import json
import os
import re
import warnings
import openai
from dotenv import load_dotenv
warnings.filterwarnings('ignore')
from tqdm import tqdm

In [2]:
load_dotenv()
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY2"))

In [3]:
df = pd.read_csv('../cleaning_before_eval/ready_to_eval.csv')
df.head()

Unnamed: 0,No,Question,Dr Answer,Claude Answer,Qwen Answer,GPT Answer,Deepseek RAG Answer,Deepseek non RAG Answer,Claude Full Answer,Qwen Full Answer,GPT Full Answer,Deepseek RAG Full Answer,Deepseek non RAG Full Answer
0,1.0,Pasien mengalami demam pada waktu malam. Sebel...,"Demam Tifoid 70%, Demam Berdarah Dengue 30%","Demam Tifoid, Leptospirosis, Demam Berdarah De...","Demam Biasa, Dispepsia, Asma, Infeksi Saluran ...","Demam Biasa, Demam Tifoid, Demam Berdarah Deng...","Demam Tifoid (Typhoid Fever), Malaria, Leptosp...","Malaria, Infeksi Saluran Kemih (ISK) atau Piel...","Berdasarkan gejala yang disampaikan, saya akan...","Berdasarkan gejala yang disampaikan, yaitu dem...","Berdasarkan gejala yang disampaikan, yaitu dem...","Berdasarkan gejala yang dialami pasien, yaitu ...",Berdasarkan gejala yang Anda sebutkan—demam pa...
1,2.0,Pasien mengalami buang air besar cair lebih da...,"Gastroenteritis (GE) Akut 70%, Intoleransi Lak...","Gastroenteritis (GE) Akut, Disentri, Infeksi E...","Diare Akut, Gastroenteritis, Dispepsia, Infeks...","Gastroenteritis Akut, Infeksi Escherichia coli...","Gastroenteritis Akut, Keracunan Makanan, Infek...","Gastroenteritis Akut, Keracunan Makanan, Koler...","Berdasarkan informasi klinis yang diberikan, s...","Berdasarkan gejala yang disampaikan, yaitu bua...","Berdasarkan informasi yang diberikan, pasien m...","Berdasarkan informasi yang diberikan, pasien m...",Berdasarkan gejala yang Anda deskripsikan—diar...
2,3.0,Pasien datang dengan keluhan demam selama 6 ha...,DBD 80%,"Demam Tifoid, Malaria, DBD, Leptospirosis, Inf...","Demam Biasa, Dispepsia, Asma, Demam Dengue, Lu...","Demam Berdarah Dengue, Demam Tifoid, Malaria, ...","Demam Berdarah Dengue (DBD), Demam Tifoid, Lep...","Demam Berdarah Dengue (DBD) / Dengue Fever, Ch...","Berdasarkan informasi yang diberikan, saya aka...","Berdasarkan gejala yang disampaikan, yaitu dem...",Berdasarkan keluhan yang disampaikan oleh pasi...,Berdasarkan keluhan pasien yang meliputi demam...,Berdasarkan gejala yang Anda sebutkan—demam ≥6...
3,4.0,Pasien menderita demam disertai munculnya brun...,"Infeksi Bakteri/Virus (Sistemik) 50%, Infeksi ...","DBD, Infeksi Bakteri/Virus (Sistemik), Infeksi...","Demam Biasa, Dispepsia, Asma, Infeksi Lokal, A...","Infeksi Kulit (Dermatitis atau Selulitis), Kej...","Infeksi Bakteri (Selulitis), Reaksi Alergi (De...",Erisipelas (Infeksi Bakteri pada Kulit dan Jar...,"Berdasarkan informasi klinis yang diberikan, s...","Berdasarkan gejala yang disampaikan, yaitu dem...","Berdasarkan informasi yang diberikan, pasien m...",### Analisis Gejala dan Kemungkinan Diagnosis\...,Berdasarkan gejala dan temuan pemeriksaan yang...
4,5.0,Pasien nyeri dada sejak 4 jam lalu seperti ter...,"Angina Pekrotis 80%, Infark Miokard Akut (Sera...","Angina Pekrotis, Infark Miokard Akut (Serangan...","Infark Miokard Akut (Serangan Jantung), Dispep...","Angina Pekrotis, Infark Miokard, Penyakit Musk...","Angina Pectoris (Kardiovaskular), Infark Mioka...","Infark Miokard Akut (Serangan Jantung), Angina...",Berdasarkan informasi yang diberikan dalam kon...,"Berdasarkan gejala yang disampaikan, yaitu nye...","Berdasarkan informasi yang diberikan, pasien m...",### Analisis Gejala\r\n\r\nPasien mengeluhkan ...,Berdasarkan gejala nyeri dada seperti terhimpi...


In [4]:
with open('kamus.json', 'r') as file:
    SYNONYM_MAP = json.load(file)
    
def normalize_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    for abbr, canonical in SYNONYM_MAP.items():
        abb = abbr.lower()
        canon = canonical.lower()

        if (text == canon): break
        pattern = rf'\b{re.escape(abb)}\b'
        if re.search(pattern, text):
            text = re.sub(pattern, canon, text)
            break
        
    return text

In [5]:
import time

def llm_judge_answer(question, candidate_answer:str, gt_answers:str, model_name="gpt-3.5-turbo", max_retries=3):
    for attempt in range(max_retries):
        try:
            if attempt > 0:
                delay = 2 ** attempt  
                print(f"Retrying after {delay} seconds...")
                time.sleep(delay)
            
            normalized_candidate = normalize_text(candidate_answer)
            ground_truth_text = normalize_text(gt_answers)

            prompt = f"""
            You are a medical expert evaluating diagnosis answers. Please judge whether the candidate answer is appropriate, based on the ground truth diagnoses.

            Question: {question}
            Ground Truth Answers: {ground_truth_text}
            Candidate Answer: {normalized_candidate}

            Evaluation Instructions:
            1. Carefully read all the ground truth diagnoses and note the probability (%) associated with each. These percentages represent subjective probabilities assigned by a human doctor.
            2. Read the candidate answer(s). For each, determine whether it is:
            a. Correct and matches one of the ground truth diagnoses  
            b. Partially correct or related (e.g., a broader category, symptom-level, or similar condition)  
            c. Incorrect or unrelated

            3. If the candidate answer matches any ground truth diagnosis exactly, score it only based on the match and the probability assigned to that diagnosis. 
            - A match with a high-probability diagnosis (e.g., 70%) should result in a high score (close to 1).
            - A match with a low-probability diagnosis (e.g., 10%) should still be credited, but with a lower score.

            4. If the candidate answer is only partially related (e.g., a plausible but incorrect differential diagnosis), score accordingly with a moderate value.

            5. If the answer is incorrect or unrelated, assign a low score close to 0.

            6. Very important: Do NOT penalize the candidate for not mentioning all ground truth diagnoses.  
            If the candidate matches the top diagnosis (the one with the highest percentage), give it a high score (e.g., 0.9 to 1.0), even if other diagnoses are not mentioned.  
            Only score lower if the top diagnosis is not mentioned or the answer is incorrect.
            You are only evaluating the correctness and relevance of the candidate diagnosis, not how exhaustive it is.

            7. Important: If the candidate includes the top diagnosis (the one with the highest percentage), and also includes unrelated or incorrect diagnoses, still assign a high score. 
            Do not penalize the answer just because it includes extra incorrect diagnoses, as long as the top correct one is clearly present.

            Examples:
            - Candidate 1: 'Demam tifoid' → GT: 'Demam Tifoid 70%, Demam DBD 30%' → Score above 0.9 (correct match to top GT)
            - Candidate 2: 'Demam tifoid, demam biasa' → GT: same → Score above 0.8, lower than candidate 1 (top GT match + noise)
            - Candidate 3: 'Demam DBD, demam biasa' → GT: same → Score above 0.6, lower than candidate 2 (match to lower GT + noise)
            - Candidate 4: 'Demam biasa' → GT: same → Score: 0.2 (unrelated)

            Respond with a JSON object in the following format:
            {{
            "score": <float score from 0 to 1>,
            "reasoning": "<brief explanation of your judgment>",
            "category": "<correct|partial|incorrect>"
            }}

            Response:
            """

            response = client.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": "You are a medical expert evaluating diagnosis accuracy."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1,
                max_tokens=200
            )

            response_text = response.choices[0].message.content
            # print(response_text)
            try:
                start_idx = response_text.find('{')
                end_idx = response_text.rfind('}') + 1
                json_str = response_text[start_idx:end_idx]
                
                result = json.loads(json_str)
                if 'score' in result and 'reasoning' in result and 'category' in result:
                    return result
                else:
                    raise ValueError("Invalid response structure")
                    
            except Exception as parse_error:
                print(f"Parse error on attempt {attempt + 1}: {parse_error}")
                if attempt == max_retries - 1:
                    return {
                        "score": 0.5,
                        "reasoning": "Could not parse LLM response after multiple attempts",
                        "category": "partial"
                    }
                continue
                
        except Exception as e:
            print(f"API error on attempt {attempt + 1}: {e}")
            if "insufficient_quota" in str(e) or "quota" in str(e).lower():
                print("⚠️  OpenAI API quota exceeded. Please check your billing.")
                return {
                    "score": 0.0,
                    "reasoning": "API quota exceeded",
                    "category": "error"
                }
            elif attempt == max_retries - 1:
                return {
                    "score": 0.0,
                    "reasoning": f"API error after {max_retries} attempts: {str(e)}",
                    "category": "error"
                }
            continue

    return {
        "score": 0.0,
        "reasoning": "Unknown error occurred",
        "category": "error"
    }

In [6]:
sample_question = "Pasien mengalami demam pada waktu malam"
sample_candidate = "Demam berdarah, demam tifoid, demam biasa"
sample_gt = "Demam Tifoid 70%, Demam Berdarah Dengue 30%"

# test_result = llm_judge_answer(sample_question, sample_candidate, sample_gt)
# print(f"Test Result 3.5: {test_result}")
test_result = llm_judge_answer(sample_question, sample_candidate, sample_gt, model_name='gpt-4-turbo')
print(f"Test Result 4: {test_result}")

Test Result 4: {'score': 0.8, 'reasoning': "The candidate answer includes 'demam berdarah dengue' and 'demam tifoid', both of which are present in the ground truth diagnoses. However, it also includes 'demam biasa', which is not specified in the ground truth. This additional diagnosis is more general and not directly related to the specific conditions listed in the ground truth. Therefore, while the answer correctly identifies two specific conditions, the inclusion of a less relevant diagnosis slightly lowers the score.", 'category': 'partial'}


In [None]:
sample_question = "Pasien mengalami demam pada waktu malam"
sample_candidate = "Demam tifoid, demam biasa"
sample_gt = "Demam Tifoid 70%, Demam Berdarah Dengue 30%"

# test_result = llm_judge_answer(sample_question, sample_candidate, sample_gt)
# print(f"Test Result 3.5: {test_result}")
test_result = llm_judge_answer(sample_question, sample_candidate, sample_gt, model_name='gpt-4-turbo')
print(f"Test Result 4: {test_result}")

Test Result 4: {'score': 0.8, 'reasoning': "The candidate answer includes 'demam tifoid' which is a correct match to the ground truth diagnoses. However, it also includes 'common fever' which is not mentioned in the ground truth diagnoses, hence it is considered as noise.", 'category': 'partial'}


In [7]:
def find_all_round_judgement(candidate_answers, question, gt_answers, model_name):
    if not candidate_answers or not gt_answers:
        return None, 0.0, {}
        
    cands = ",".join(candidate_answers)
    all_judge = llm_judge_answer(question, cands, gt_answers, model_name)
    
    return all_judge

In [8]:
def process_row(row):
    print(f"Processing row {row.name}...")
    # truth_list = [item.strip() for item in row['Dr Answer'].split(',')]
    model_cols = {
        'Claude Answer': 'LLM_JUDGE_Claude',
        'Qwen Answer': 'LLM_JUDGE_Qwen',
        'GPT Answer': 'LLM_JUDGE_GPT',
        'Deepseek RAG Answer': 'LLM_JUDGE_Deepseek_RAG',
        'Deepseek non RAG Answer': 'LLM_JUDGE_Deepseek_nonRAG'
    }
    # gpt-3.5-turbo
    # 'gpt-4-0613'
    for col_name, result_col in tqdm(model_cols.items(), desc='Processing models'):
        model_list = [item.strip() for item in row[col_name].split(',')]
        result = find_all_round_judgement(model_list, row['Question'], row['Dr Answer'], model_name='gpt-4-turbo')

        row[f'{result_col}_score'] = result.get('score', 0.0)
        row[f'{result_col}_reasoning'] = result.get('reasoning', 0.0)
        row[f'{result_col}_category'] = result.get('category', 0.0)
    
    return row

In [9]:
df_new = df.copy()
df_new

Unnamed: 0,No,Question,Dr Answer,Claude Answer,Qwen Answer,GPT Answer,Deepseek RAG Answer,Deepseek non RAG Answer,Claude Full Answer,Qwen Full Answer,GPT Full Answer,Deepseek RAG Full Answer,Deepseek non RAG Full Answer
0,1.0,Pasien mengalami demam pada waktu malam. Sebel...,"Demam Tifoid 70%, Demam Berdarah Dengue 30%","Demam Tifoid, Leptospirosis, Demam Berdarah De...","Demam Biasa, Dispepsia, Asma, Infeksi Saluran ...","Demam Biasa, Demam Tifoid, Demam Berdarah Deng...","Demam Tifoid (Typhoid Fever), Malaria, Leptosp...","Malaria, Infeksi Saluran Kemih (ISK) atau Piel...","Berdasarkan gejala yang disampaikan, saya akan...","Berdasarkan gejala yang disampaikan, yaitu dem...","Berdasarkan gejala yang disampaikan, yaitu dem...","Berdasarkan gejala yang dialami pasien, yaitu ...",Berdasarkan gejala yang Anda sebutkan—demam pa...
1,2.0,Pasien mengalami buang air besar cair lebih da...,"Gastroenteritis (GE) Akut 70%, Intoleransi Lak...","Gastroenteritis (GE) Akut, Disentri, Infeksi E...","Diare Akut, Gastroenteritis, Dispepsia, Infeks...","Gastroenteritis Akut, Infeksi Escherichia coli...","Gastroenteritis Akut, Keracunan Makanan, Infek...","Gastroenteritis Akut, Keracunan Makanan, Koler...","Berdasarkan informasi klinis yang diberikan, s...","Berdasarkan gejala yang disampaikan, yaitu bua...","Berdasarkan informasi yang diberikan, pasien m...","Berdasarkan informasi yang diberikan, pasien m...",Berdasarkan gejala yang Anda deskripsikan—diar...
2,3.0,Pasien datang dengan keluhan demam selama 6 ha...,DBD 80%,"Demam Tifoid, Malaria, DBD, Leptospirosis, Inf...","Demam Biasa, Dispepsia, Asma, Demam Dengue, Lu...","Demam Berdarah Dengue, Demam Tifoid, Malaria, ...","Demam Berdarah Dengue (DBD), Demam Tifoid, Lep...","Demam Berdarah Dengue (DBD) / Dengue Fever, Ch...","Berdasarkan informasi yang diberikan, saya aka...","Berdasarkan gejala yang disampaikan, yaitu dem...",Berdasarkan keluhan yang disampaikan oleh pasi...,Berdasarkan keluhan pasien yang meliputi demam...,Berdasarkan gejala yang Anda sebutkan—demam ≥6...
3,4.0,Pasien menderita demam disertai munculnya brun...,"Infeksi Bakteri/Virus (Sistemik) 50%, Infeksi ...","DBD, Infeksi Bakteri/Virus (Sistemik), Infeksi...","Demam Biasa, Dispepsia, Asma, Infeksi Lokal, A...","Infeksi Kulit (Dermatitis atau Selulitis), Kej...","Infeksi Bakteri (Selulitis), Reaksi Alergi (De...",Erisipelas (Infeksi Bakteri pada Kulit dan Jar...,"Berdasarkan informasi klinis yang diberikan, s...","Berdasarkan gejala yang disampaikan, yaitu dem...","Berdasarkan informasi yang diberikan, pasien m...",### Analisis Gejala dan Kemungkinan Diagnosis\...,Berdasarkan gejala dan temuan pemeriksaan yang...
4,5.0,Pasien nyeri dada sejak 4 jam lalu seperti ter...,"Angina Pekrotis 80%, Infark Miokard Akut (Sera...","Angina Pekrotis, Infark Miokard Akut (Serangan...","Infark Miokard Akut (Serangan Jantung), Dispep...","Angina Pekrotis, Infark Miokard, Penyakit Musk...","Angina Pectoris (Kardiovaskular), Infark Mioka...","Infark Miokard Akut (Serangan Jantung), Angina...",Berdasarkan informasi yang diberikan dalam kon...,"Berdasarkan gejala yang disampaikan, yaitu nye...","Berdasarkan informasi yang diberikan, pasien m...",### Analisis Gejala\r\n\r\nPasien mengeluhkan ...,Berdasarkan gejala nyeri dada seperti terhimpi...
5,6.0,Pasien mengalami sesak nafas sejak siang. #Has...,"Asma 100%, Bronkitis Akut 70%, Penyakit Paru O...","Asma, Bronkitis Akut, Pnemonia, Penyakit Paru ...","Asma, Pneumonia, Dispepsia, Demam biasa, Kardi...","Asma, Penyakit Paru Obstruktif Kronis, Pnemoni...","Asma, Bronkitis Akut, Pneumotoraks, Gagal Jan...","Asma Akut Eksaserbasi, PPOK Eksaserbasi Akut...",Berdasarkan informasi klinis yang diberikan da...,"Berdasarkan gejala yang disampaikan, yaitu ses...","Berdasarkan informasi yang diberikan, pasien m...",Berdasarkan gejala dan hasil pemeriksaan yang ...,Berdasarkan gejala sesak napas disertai wheezi...
6,7.0,Pasien mengalami sesak napas sejak sehari lalu...,Bronkitis Akut 100%,"Pneumonia Komunitas, Bronkitis Akut, Asma, Inf...","Asma, Infeksi Saluran Pernapasan Atas (ISPA), ...","ISPA, Bronkitis Akut, Asma, Pneumonia, Diabete...","Infeksi Saluran Pernapasan Akut (ISPA), Bronki...",Infeksi Saluran Pernapasan Akut (ISPA) dengan ...,Berdasarkan informasi gejala dan pemeriksaan f...,Berdasarkan gejala yang disampaikan oleh pasie...,"Berdasarkan gejala yang disampaikan, pasien me...","""Berdasarkan gejala yang dialami pasien, yaitu...","Berdasarkan gejala dan hasil pemeriksaan, pasi..."
7,8.0,"Pasien mengalami sesak nafas sekitar 3 hari, t...","Bronkitis Akut 80%, Asma Bronkial 50%","Bronkitis Akut, Pneumonia , Asma Bronkial, Kar...","Asma, Dispepsia, Pneumonia, Anemia, Kardiomiopati","Asma, Pneumonia , Penyakit Paru Obstruktif Kro...","Asma, Pneumonia, Emboli Paru, Gagal Jantung Ak...","Asma (Tipe Non-Batuk), Pneumonia Atipik (Misal...",Berdasarkan informasi yang tersedia dalam kont...,"Berdasarkan gejala yang disampaikan, yaitu ses...","Berdasarkan informasi yang diberikan, pasien m...",""" Berdasarkan informasi yang diberikan, pasien...",Berdasarkan gejala sesak napas selama 3 hari t...
8,9.0,Pasien nyeri ulu hati/perut bagian atas sejak ...,"Dispepsia 100%, Pneumonia 50%, Infeksi Saluran...","Dispepsia, Pneumonia, Infeksi Saluran Pernapas...","Dispepsia, Demam Biasa, Asma, Infeksi saluran ...","Dispepsia, Demam Biasa, Gastroenteritis, Gastr...","Dispepsia, Gastroenteritis, Pneumonia, Gastrit...","Gastroenteritis Akut dengan Refluks Asam/GERD,...","Berdasarkan informasi gejala yang disampaikan,...",Berdasarkan gejala yang disampaikan oleh pasie...,Berdasarkan gejala yang disampaikan oleh pasie...,"""Berikut adalah analisis gejala yang dialami p...","Berdasarkan gejala nyeri ulu hati, mual-muntah..."
9,10.0,Pasien merasa nyeri di ulu hati sampai terasa ...,"Dispepsia 80%, refluk Gastroesofagus (GERD) 90%","Dispepsia, Gastritis Akut, Ulkus Peptikum, Ape...","Dispepsia Fungsional, Gastritis, Reflux Asam L...","Dispepsia, Gastritis Akut, Ulser Peptikum, Pen...","Dispepsia, Gastroesophageal Reflux Disease (GE...","Gastritis Akut, GERD (Refluks Asam Lambung), U...","Berdasarkan gejala yang Anda sebutkan, berikut...",Berdasarkan gejala yang disampaikan oleh pasie...,"Berdasarkan gejala yang disampaikan, pasien me...","""Berdasarkan gejala yang dialami pasien, yaitu...",Berdasarkan gejala nyeri ulu hati (epigastrium...


In [11]:
df_new = df_new.apply(process_row, axis=1)

Processing row 0...


Processing models: 100%|██████████| 5/5 [00:18<00:00,  3.78s/it]


Processing row 1...


Processing models: 100%|██████████| 5/5 [00:20<00:00,  4.18s/it]


Processing row 2...


Processing models: 100%|██████████| 5/5 [00:19<00:00,  3.81s/it]


Processing row 3...


Processing models: 100%|██████████| 5/5 [00:19<00:00,  3.85s/it]


Processing row 4...


Processing models: 100%|██████████| 5/5 [00:19<00:00,  3.83s/it]


Processing row 5...


Processing models: 100%|██████████| 5/5 [00:18<00:00,  3.72s/it]


Processing row 6...


Processing models: 100%|██████████| 5/5 [00:16<00:00,  3.27s/it]


Processing row 7...


Processing models: 100%|██████████| 5/5 [00:17<00:00,  3.43s/it]


Processing row 8...


Processing models: 100%|██████████| 5/5 [00:17<00:00,  3.50s/it]


Processing row 9...


Processing models: 100%|██████████| 5/5 [00:19<00:00,  3.92s/it]


Processing row 10...


Processing models: 100%|██████████| 5/5 [00:18<00:00,  3.68s/it]


Processing row 11...


Processing models: 100%|██████████| 5/5 [00:19<00:00,  3.83s/it]


Processing row 12...


Processing models: 100%|██████████| 5/5 [00:19<00:00,  3.83s/it]


Processing row 13...


Processing models: 100%|██████████| 5/5 [00:16<00:00,  3.40s/it]


Processing row 14...


Processing models: 100%|██████████| 5/5 [00:18<00:00,  3.79s/it]


Processing row 15...


Processing models: 100%|██████████| 5/5 [00:17<00:00,  3.46s/it]


Processing row 16...


Processing models: 100%|██████████| 5/5 [00:17<00:00,  3.48s/it]


Processing row 17...


Processing models: 100%|██████████| 5/5 [00:18<00:00,  3.73s/it]


Processing row 18...


Processing models: 100%|██████████| 5/5 [00:16<00:00,  3.33s/it]


Processing row 19...


Processing models: 100%|██████████| 5/5 [00:15<00:00,  3.11s/it]


Processing row 20...


Processing models: 100%|██████████| 5/5 [00:18<00:00,  3.77s/it]


Processing row 21...


Processing models: 100%|██████████| 5/5 [00:16<00:00,  3.25s/it]


Processing row 22...


Processing models: 100%|██████████| 5/5 [00:20<00:00,  4.08s/it]


Processing row 23...


Processing models: 100%|██████████| 5/5 [00:18<00:00,  3.65s/it]


Processing row 24...


Processing models: 100%|██████████| 5/5 [00:17<00:00,  3.53s/it]


Processing row 25...


Processing models: 100%|██████████| 5/5 [00:17<00:00,  3.42s/it]


Processing row 26...


Processing models: 100%|██████████| 5/5 [00:16<00:00,  3.25s/it]


Processing row 27...


Processing models: 100%|██████████| 5/5 [00:17<00:00,  3.56s/it]


Processing row 28...


Processing models: 100%|██████████| 5/5 [00:13<00:00,  2.71s/it]


Processing row 29...


Processing models: 100%|██████████| 5/5 [00:15<00:00,  3.00s/it]


Processing row 30...


Processing models: 100%|██████████| 5/5 [00:18<00:00,  3.77s/it]


Processing row 31...


Processing models: 100%|██████████| 5/5 [00:17<00:00,  3.49s/it]


Processing row 32...


Processing models: 100%|██████████| 5/5 [00:19<00:00,  3.82s/it]


Processing row 33...


Processing models: 100%|██████████| 5/5 [00:15<00:00,  3.13s/it]


Processing row 34...


Processing models: 100%|██████████| 5/5 [00:18<00:00,  3.60s/it]


Processing row 35...


Processing models: 100%|██████████| 5/5 [00:17<00:00,  3.48s/it]


Processing row 36...


Processing models:  60%|██████    | 3/5 [00:10<00:06,  3.49s/it]

Parse error on attempt 1: Expecting value: line 1 column 1 (char 0)
Retrying after 2 seconds...


Processing models: 100%|██████████| 5/5 [00:25<00:00,  5.10s/it]


Processing row 37...


Processing models: 100%|██████████| 5/5 [00:19<00:00,  3.87s/it]


Processing row 38...


Processing models: 100%|██████████| 5/5 [00:21<00:00,  4.38s/it]


Processing row 39...


Processing models: 100%|██████████| 5/5 [00:16<00:00,  3.26s/it]


Processing row 40...


Processing models: 100%|██████████| 5/5 [00:17<00:00,  3.54s/it]


Processing row 41...


Processing models: 100%|██████████| 5/5 [00:21<00:00,  4.36s/it]


Processing row 42...


Processing models: 100%|██████████| 5/5 [00:20<00:00,  4.11s/it]


Processing row 43...


Processing models: 100%|██████████| 5/5 [00:18<00:00,  3.63s/it]


Processing row 44...


Processing models: 100%|██████████| 5/5 [00:16<00:00,  3.26s/it]


In [13]:
df_new.describe()

Unnamed: 0,No,LLM_JUDGE_Claude_score,LLM_JUDGE_Qwen_score,LLM_JUDGE_GPT_score,LLM_JUDGE_Deepseek_RAG_score,LLM_JUDGE_Deepseek_nonRAG_score
count,45.0,45.0,45.0,45.0,45.0,45.0
mean,23.0,0.858889,0.628889,0.757778,0.771111,0.686667
std,13.133926,0.079264,0.287325,0.194806,0.209569,0.293567
min,1.0,0.7,0.0,0.1,0.0,0.0
25%,12.0,0.8,0.7,0.7,0.7,0.7
50%,23.0,0.9,0.7,0.8,0.8,0.8
75%,34.0,0.9,0.8,0.9,0.9,0.9
max,45.0,1.0,0.9,0.9,1.0,1.0
