In [2]:
import pandas as pd
import numpy as np
import json
import os
import re
import warnings
import openai
from dotenv import load_dotenv
warnings.filterwarnings('ignore')
from tqdm import tqdm

In [3]:
load_dotenv()
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY2"))

In [4]:
df = pd.read_csv('../cleaning_before_eval/ready_to_eval.csv')
df.head()

Unnamed: 0,No,Question,Dr Answer,Claude Answer,Qwen Answer,GPT Answer,Deepseek RAG Answer,Deepseek non RAG Answer,Claude Full Answer,Qwen Full Answer,GPT Full Answer,Deepseek RAG Full Answer,Deepseek non RAG Full Answer
0,1.0,Pasien mengalami demam pada waktu malam. Sebel...,"Demam Tifoid 70%, Demam Berdarah Dengue 30%","Demam Tifoid, Leptospirosis, Demam Berdarah De...","Demam Biasa, Dispepsia, Asma, Infeksi Saluran ...","Demam Biasa, Demam Tifoid, Demam Berdarah Deng...","Demam Tifoid (Typhoid Fever), Malaria, Leptosp...","Malaria, Infeksi Saluran Kemih (ISK) atau Piel...","Berdasarkan gejala yang disampaikan, saya akan...","Berdasarkan gejala yang disampaikan, yaitu dem...","Berdasarkan gejala yang disampaikan, yaitu dem...","Berdasarkan gejala yang dialami pasien, yaitu ...",Berdasarkan gejala yang Anda sebutkan—demam pa...
1,2.0,Pasien mengalami buang air besar cair lebih da...,"Gastroenteritis (GE) Akut 70%, Intoleransi Lak...","Gastroenteritis (GE) Akut, Disentri, Infeksi E...","Diare Akut, Gastroenteritis, Dispepsia, Infeks...","Gastroenteritis Akut, Infeksi Escherichia coli...","Gastroenteritis Akut, Keracunan Makanan, Infek...","Gastroenteritis Akut, Keracunan Makanan, Koler...","Berdasarkan informasi klinis yang diberikan, s...","Berdasarkan gejala yang disampaikan, yaitu bua...","Berdasarkan informasi yang diberikan, pasien m...","Berdasarkan informasi yang diberikan, pasien m...",Berdasarkan gejala yang Anda deskripsikan—diar...
2,3.0,Pasien datang dengan keluhan demam selama 6 ha...,DBD 80%,"Demam Tifoid, Malaria, DBD, Leptospirosis, Inf...","Demam Biasa, Dispepsia, Asma, Demam Dengue, Lu...","Demam Berdarah Dengue, Demam Tifoid, Malaria, ...","Demam Berdarah Dengue (DBD), Demam Tifoid, Lep...","Demam Berdarah Dengue (DBD) / Dengue Fever, Ch...","Berdasarkan informasi yang diberikan, saya aka...","Berdasarkan gejala yang disampaikan, yaitu dem...",Berdasarkan keluhan yang disampaikan oleh pasi...,Berdasarkan keluhan pasien yang meliputi demam...,Berdasarkan gejala yang Anda sebutkan—demam ≥6...
3,4.0,Pasien menderita demam disertai munculnya brun...,"Infeksi Bakteri/Virus (Sistemik) 50%, Infeksi ...","DBD, Infeksi Bakteri/Virus (Sistemik), Infeksi...","Demam Biasa, Dispepsia, Asma, Infeksi Lokal, A...","Infeksi Kulit (Dermatitis atau Selulitis), Kej...","Infeksi Bakteri (Selulitis), Reaksi Alergi (De...",Erisipelas (Infeksi Bakteri pada Kulit dan Jar...,"Berdasarkan informasi klinis yang diberikan, s...","Berdasarkan gejala yang disampaikan, yaitu dem...","Berdasarkan informasi yang diberikan, pasien m...",### Analisis Gejala dan Kemungkinan Diagnosis\...,Berdasarkan gejala dan temuan pemeriksaan yang...
4,5.0,Pasien nyeri dada sejak 4 jam lalu seperti ter...,"Angina Pekrotis 80%, Infark Miokard Akut (Sera...","Angina Pekrotis, Infark Miokard Akut (Serangan...","Infark Miokard Akut (Serangan Jantung), Dispep...","Angina Pekrotis, Infark Miokard, Penyakit Musk...","Angina Pectoris (Kardiovaskular), Infark Mioka...","Infark Miokard Akut (Serangan Jantung), Angina...",Berdasarkan informasi yang diberikan dalam kon...,"Berdasarkan gejala yang disampaikan, yaitu nye...","Berdasarkan informasi yang diberikan, pasien m...",### Analisis Gejala\r\n\r\nPasien mengeluhkan ...,Berdasarkan gejala nyeri dada seperti terhimpi...


In [5]:
with open('kamus.json', 'r') as file:
    SYNONYM_MAP = json.load(file)
    
def normalize_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    for abbr, canonical in SYNONYM_MAP.items():
        abb = abbr.lower()
        canon = canonical.lower()

        if (text == canon): break
        pattern = rf'\b{re.escape(abb)}\b'
        if re.search(pattern, text):
            text = re.sub(pattern, canon, text)
            break
        
    return text

In [10]:
import time

def llm_judge_answer(question, candidate_answer:str, gt_answers:str, model_name="gpt-3.5-turbo", max_retries=3):
    for attempt in range(max_retries):
        try:
            if attempt > 0:
                delay = 2 ** attempt  
                print(f"Retrying after {delay} seconds...")
                time.sleep(delay)
            
            normalized_candidate = normalize_text(candidate_answer)
            ground_truth_text = normalize_text(gt_answers)

            prompt = f"""
            You are a medical expert evaluating diagnosis answers. Please judge whether the candidate answer is appropriate, based on the ground truth diagnoses.

            Question: {question}
            Ground Truth Answers: {ground_truth_text}
            Candidate Answer: {normalized_candidate}

            Evaluation Instructions:
            1. Carefully read all the ground truth diagnoses and note the probability (%) associated with each. These percentages represent subjective probabilities assigned by a human doctor.
            2. Read the candidate answer(s). For each, determine whether it is:
            a. Correct and matches one of the ground truth diagnoses  
            b. Partially correct or related (e.g., a broader category, symptom-level, or similar condition)  
            c. Incorrect or unrelated

            3. If the candidate answer matches any ground truth diagnosis exactly, score it only based on the match and the probability assigned to that diagnosis. 
            - A match with a high-probability diagnosis (e.g., 70%) should result in a high score (close to 1).
            - A match with a low-probability diagnosis (e.g., 10%) should still be credited, but with a lower score.

            4. If the candidate answer is only partially related (e.g., a plausible but incorrect differential diagnosis), score accordingly with a moderate value.

            5. If the answer is incorrect or unrelated, assign a low score close to 0.

            6. Very important: Do NOT penalize the candidate for not mentioning all ground truth diagnoses.  
            If the candidate matches the top diagnosis (the one with the highest percentage), give it a high score (e.g., 0.9 to 1.0), even if other diagnoses are not mentioned.  
            Only score lower if the top diagnosis is not mentioned or the answer is incorrect.

            You are only evaluating the correctness and relevance of the candidate diagnosis, not how exhaustive it is.


            Respond with a JSON object in the following format:
            {{
            "score": <float score from 0 to 1>,
            "reasoning": "<brief explanation of your judgment>",
            "category": "<correct|partial|incorrect>"
            }}

            Response:
            """

            response = client.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": "You are a medical expert evaluating diagnosis accuracy."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1,
                max_tokens=200
            )

            response_text = response.choices[0].message.content
            # print(response_text)
            try:
                start_idx = response_text.find('{')
                end_idx = response_text.rfind('}') + 1
                json_str = response_text[start_idx:end_idx]
                
                result = json.loads(json_str)
                if 'score' in result and 'reasoning' in result and 'category' in result:
                    return result
                else:
                    raise ValueError("Invalid response structure")
                    
            except Exception as parse_error:
                print(f"Parse error on attempt {attempt + 1}: {parse_error}")
                if attempt == max_retries - 1:
                    return {
                        "score": 0.5,
                        "reasoning": "Could not parse LLM response after multiple attempts",
                        "category": "partial"
                    }
                continue
                
        except Exception as e:
            print(f"API error on attempt {attempt + 1}: {e}")
            if "insufficient_quota" in str(e) or "quota" in str(e).lower():
                print("⚠️  OpenAI API quota exceeded. Please check your billing.")
                return {
                    "score": 0.0,
                    "reasoning": "API quota exceeded",
                    "category": "error"
                }
            elif attempt == max_retries - 1:
                return {
                    "score": 0.0,
                    "reasoning": f"API error after {max_retries} attempts: {str(e)}",
                    "category": "error"
                }
            continue

    return {
        "score": 0.0,
        "reasoning": "Unknown error occurred",
        "category": "error"
    }

In [14]:
sample_question = "Pasien mengalami demam pada waktu malam"
sample_candidate = "Demam berdarah, demam tifoid, demam biasa"
sample_gt = "Demam Tifoid 70%, Demam Berdarah Dengue 30%"

test_result = llm_judge_answer(sample_question, sample_candidate, sample_gt)
print(f"Test Result 3.5: {test_result}")
test_result = llm_judge_answer(sample_question, sample_candidate, sample_gt, model_name='gpt-4-0613')
print(f"Test Result 4: {test_result}")


Test Result 3.5: {'score': 0.9, 'reasoning': "The candidate answer correctly matches 'demam berdarah dengue' which is one of the ground truth diagnoses with a high probability.", 'category': 'correct'}
Test Result 4: {'score': 0.9, 'reasoning': "The candidate answer correctly identifies two of the ground truth diagnoses: 'demam berdarah dengue' and 'demam tifoid'. However, it also includes 'demam biasa' which is not part of the ground truth diagnoses.", 'category': 'partial'}


In [7]:
def find_score_llm_judge(candidate_answers, question, gt_answers, model_name):
    if not candidate_answers or not gt_answers:
        return None, 0.0, {}
    
    best_score = 0.0
    average_score = 0.0
    
    for candidate in tqdm(candidate_answers, desc=f'Processing cands {model_name}'):
        judgment = llm_judge_answer(question, candidate, gt_answers, model_name)
        score = judgment.get('score', 0.0)
        average_score += score
        
        if score > best_score:
            best_score = score
    
    cands = ",".join(candidate_answers)
    all_round_score = llm_judge_answer(question, cands, gt_answers, model_name).get('score', 0.0)
    
    return (average_score/len(candidate_answers)), best_score, all_round_score

In [8]:
def find_judgement(candidate_answers, question, gt_answers, model_name):
    if not candidate_answers or not gt_answers:
        return None, 0.0, {}
    
    best_score = 0.0
    average_score = 0.0
    
    for candidate in tqdm(candidate_answers, desc=f'Processing cands {model_name}'):
        judgment = llm_judge_answer(question, candidate, gt_answers, model_name)
        print(f"Judgement {candidate}:", judgment)
        score = judgment.get('score', 0.0)
        average_score += score
        
        if score > best_score:
            best_score = score
    
    cands = ",".join(candidate_answers)
    all_judge = llm_judge_answer(question, cands, gt_answers, model_name)
    all_round_score = all_judge.get('score', 0.0)
    print(f'All judge: {all_judge}')
    
    return (average_score/len(candidate_answers)), best_score, all_round_score

In [14]:
cands = [item.strip() for item in df['Claude Answer'][0].split(',')]
gt = df['Dr Answer'][0]
q = df['Question'][0]
# cands
hasil1 = find_judgement(cands, q, gt, 'gpt-4-0613')
print('================================================')
hasil2 = find_judgement(cands, q, gt, 'gpt-3.5-turbo')

Processing cands gpt-4-0613:  20%|██        | 1/5 [00:03<00:13,  3.37s/it]

Judgement Demam Tifoid: {'score': 0.33, 'reasoning': "The candidate answer only mentions 'demam tifoid' which is one of the possible diagnoses listed in the ground truth answers. However, it does not mention 'demam berdarah' or 'demam berdarah dengue' which are also potential diagnoses.", 'category': 'partial'}


Processing cands gpt-4-0613:  40%|████      | 2/5 [00:07<00:11,  3.71s/it]

Judgement Leptospirosis: {'score': 0.0, 'reasoning': "The candidate answer of 'leptospirosis' does not match the ground truth answers of 'demam tifoid', 'demam berdarah', or 'demam berdarah dengue'. While all are diseases that can cause fever, nausea, and chills, they are distinct conditions with different causes and treatments.", 'category': 'incorrect'}


Processing cands gpt-4-0613:  60%|██████    | 3/5 [00:09<00:06,  3.24s/it]

Judgement Demam Berdarah Dengue: {'score': 0.33, 'reasoning': 'The candidate answer is partially correct. The symptoms described could be indicative of several diseases, including dengue fever, typhoid fever, and dengue hemorrhagic fever. The candidate answer only mentions dengue hemorrhagic fever, which is one of the possible diagnoses.', 'category': 'partial'}


Processing cands gpt-4-0613:  80%|████████  | 4/5 [00:13<00:03,  3.38s/it]

Judgement Hepatitis A: {'score': 0, 'reasoning': "The candidate answer of 'hepatitis a' does not match the ground truth answers of 'demam tifoid', 'demam berdarah', or 'demam berdarah dengue'. The symptoms described in the question are more consistent with the ground truth answers than with hepatitis A.", 'category': 'incorrect'}


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.45s/it]

Judgement Infeksi Saluran Kemih: {'score': 0, 'reasoning': "The candidate answer 'isk' does not match any of the ground truth answers 'demam tifoid', 'demam berdarah', or 'demam berdarah dengue'. The symptoms described could be indicative of these diseases, but 'isk' is not related to these diagnoses.", 'category': 'incorrect'}





All judge: {'score': 0.4, 'reasoning': "The candidate answer includes 'demam tifoid' and 'demam berdarah dengue' which are correct and match the ground truth. However, it also includes 'leptospirosis', 'hepatitis a', and 'infeksi saluran kemih' which are not mentioned in the ground truth answers.", 'category': 'partial'}


Processing cands gpt-3.5-turbo:  20%|██        | 1/5 [00:00<00:02,  1.42it/s]

Judgement Demam Tifoid: {'score': 1, 'reasoning': "The candidate answer 'demam tifoid' matches one of the ground truth answers 'demam tifoid'.", 'category': 'correct'}


Processing cands gpt-3.5-turbo:  40%|████      | 2/5 [00:01<00:02,  1.13it/s]

Judgement Leptospirosis: {'score': 0, 'reasoning': "The candidate answer 'leptospirosis' does not match any of the ground truth answers provided (demam tifoid, demam berdarah, demam berdarah dengue). Leptospirosis is not a common diagnosis for the symptoms described.", 'category': 'incorrect'}


Processing cands gpt-3.5-turbo:  60%|██████    | 3/5 [00:02<00:02,  1.06s/it]

Judgement Demam Berdarah Dengue: {'score': 1, 'reasoning': "The candidate answer 'demam berdarah dengue' perfectly matches the ground truth answers 'demam berdarah dengue'.", 'category': 'correct'}


Processing cands gpt-3.5-turbo:  80%|████████  | 4/5 [00:03<00:00,  1.02it/s]

Judgement Hepatitis A: {'score': 0, 'reasoning': 'Hepatitis A does not match the symptoms described in the question, which are more indicative of dengue fever or typhoid fever.', 'category': 'incorrect'}


Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.04it/s]

Judgement Infeksi Saluran Kemih: {'score': 0, 'reasoning': "The candidate answer 'isk' is incorrect and unrelated to the symptoms described in the question.", 'category': 'incorrect'}





All judge: {'score': 0.33, 'reasoning': 'The candidate answer includes demam tifoid and demam berdarah dengue, which match with the ground truth. However, the other diagnoses such as leptospirosis, hepatitis A, and urinary tract infection are unrelated to the symptoms described.', 'category': 'partial'}


In [15]:
gt

'Demam Tifoid 70%, Demam Berdarah Dengue 30%'

In [8]:
def process_row(row):
    print(f"Processing row {row.name}...")
    # truth_list = [item.strip() for item in row['Dr Answer'].split(',')]
    model_cols = {
        'Claude Answer': 'LLM_JUDGE_Claude',
        'Qwen Answer': 'LLM_JUDGE_Qwen',
        'GPT Answer': 'LLM_JUDGE_GPT',
        'Deepseek RAG Answer': 'LLM_JUDGE_Deepseek_RAG',
        'Deepseek non RAG Answer': 'LLM_JUDGE_Deepseek_nonRAG'
    }
    # gpt-3.5-turbo
    # 'gpt-4-0613'
    for col_name, result_col in tqdm(model_cols.items(), desc='Processing models'):
        model_list = [item.strip() for item in row[col_name].split(',')]
        row[f'{result_col}_avg_4'], row[f'{result_col}_best_4'], \
            row[f'{result_col}_all_4'] = find_score_llm_judge(model_list, row['Question'], row['Dr Answer'], model_name='gpt-4-0613')
        row[f'{result_col}_avg_3.5'], row[f'{result_col}_best_3.5'], \
            row[f'{result_col}_all_3.5'] = find_score_llm_judge(model_list, row['Question'], row['Dr Answer'], model_name='gpt-3.5-turbo')
    
    return row

In [9]:
df2 = df.head(10)
df2 = df2.apply(process_row, axis=1)

Processing row 0...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.49s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.01it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.79s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.06s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.54s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.03s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.46s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.03it/s]
Processing cands gpt-4-0613: 100%|██████████| 7/7 [00:24<00:00,  3.44s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 7/7 [00:07<00:00,  1.08s/it]
Processing models: 100%|██████████| 5/5 [02:28<00:00, 29.73s/it]


Processing row 1...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:20<00:00,  4.03s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.04it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.48s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:06<00:00,  1.25s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.48s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.00s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:21<00:00,  4.22s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.06s/it]
Processing cands gpt-4-0613: 100%|██████████| 6/6 [00:18<00:00,  3.07s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 6/6 [00:06<00:00,  1.06s/it]
Processing models: 100%|██████████| 5/5 [02:30<00:00, 30.08s/it]


Processing row 2...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:13<00:00,  2.68s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.03s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.86s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.10s/it]
Processing cands gpt-4-0613: 100%|██████████| 6/6 [00:18<00:00,  3.00s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 6/6 [00:05<00:00,  1.15it/s]
Processing cands gpt-4-0613: 100%|██████████| 6/6 [00:22<00:00,  3.81s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 6/6 [00:05<00:00,  1.03it/s]
Processing cands gpt-4-0613: 100%|██████████| 6/6 [00:18<00:00,  3.00s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 6/6 [00:05<00:00,  1.07it/s]
Processing models: 100%|██████████| 5/5 [02:14<00:00, 26.94s/it]


Processing row 3...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.77s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:06<00:00,  1.23s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.64s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:07<00:00,  1.42s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.75s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.06s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.87s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.20s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:20<00:00,  4.14s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.02s/it]
Processing models: 100%|██████████| 5/5 [02:31<00:00, 30.27s/it]


Processing row 4...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:16<00:00,  3.23s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.00it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:13<00:00,  2.70s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.03it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:16<00:00,  3.29s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.04s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.82s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.11s/it]
Processing cands gpt-4-0613: 100%|██████████| 6/6 [00:18<00:00,  3.09s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 6/6 [00:06<00:00,  1.07s/it]
Processing models: 100%|██████████| 5/5 [02:11<00:00, 26.34s/it]


Processing row 5...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.84s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.01it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:16<00:00,  3.27s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.02s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.47s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.01it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:16<00:00,  3.26s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.56s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.08it/s]
Processing models: 100%|██████████| 5/5 [02:10<00:00, 26.09s/it]


Processing row 6...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:15<00:00,  3.12s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:16<00:00,  3.32s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.20it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:13<00:00,  2.68s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.15s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:15<00:00,  3.11s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.04it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.95s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]
Processing models: 100%|██████████| 5/5 [01:59<00:00, 23.82s/it]


Processing row 7...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:15<00:00,  3.15s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.12it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.97s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.10it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.77s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.03it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.65s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.07it/s]
Processing cands gpt-4-0613: 100%|██████████| 6/6 [00:19<00:00,  3.19s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 6/6 [00:05<00:00,  1.05it/s]
Processing models: 100%|██████████| 5/5 [02:20<00:00, 28.16s/it]


Processing row 8...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.83s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.05s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:16<00:00,  3.26s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.07it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:15<00:00,  3.12s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.04s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.43s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.01it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.94s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:06<00:00,  1.24s/it]
Processing models: 100%|██████████| 5/5 [02:13<00:00, 26.74s/it]


Processing row 9...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.95s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.09it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.60s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.14it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.41s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.13s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:16<00:00,  3.38s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.03it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.94s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.02s/it]
Processing models: 100%|██████████| 5/5 [02:10<00:00, 26.20s/it]


In [14]:
df3 = df.iloc[10:20]
df3 = df3.apply(process_row, axis=1)

Processing row 10...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.85s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.11s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.95s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.10s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:15<00:00,  3.18s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.03it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:15<00:00,  3.18s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.15s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:15<00:00,  3.02s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]
Processing models: 100%|██████████| 5/5 [02:11<00:00, 26.20s/it]


Processing row 11...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.63s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.04s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.48s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.03s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.80s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.04s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.75s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.00s/it]
Processing cands gpt-4-0613: 100%|██████████| 3/3 [00:11<00:00,  3.81s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 3/3 [00:03<00:00,  1.10s/it]
Processing models: 100%|██████████| 5/5 [02:18<00:00, 27.66s/it]


Processing row 12...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.98s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.07it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.44s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.01s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:20<00:00,  4.20s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.04it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.63s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.10s/it]
Processing cands gpt-4-0613: 100%|██████████| 6/6 [00:22<00:00,  3.79s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 6/6 [00:06<00:00,  1.03s/it]
Processing models: 100%|██████████| 5/5 [02:22<00:00, 28.55s/it]


Processing row 13...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.54s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.17s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.69s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:20<00:00,  4.08s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.08s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.50s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.02s/it]
Processing cands gpt-4-0613: 100%|██████████| 4/4 [00:16<00:00,  4.15s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 4/4 [00:04<00:00,  1.11s/it]
Processing models: 100%|██████████| 5/5 [02:18<00:00, 27.69s/it]


Processing row 14...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.68s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.07it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:16<00:00,  3.33s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:08<00:00,  1.61s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.82s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.06s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.74s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.04it/s]
Processing cands gpt-4-0613: 100%|██████████| 4/4 [00:13<00:00,  3.48s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 4/4 [00:03<00:00,  1.15it/s]
Processing models: 100%|██████████| 5/5 [02:16<00:00, 27.32s/it]


Processing row 15...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:16<00:00,  3.23s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.10s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:16<00:00,  3.36s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.09s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.47s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.15s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.44s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.08s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:24<00:00,  4.92s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.11s/it]
Processing models: 100%|██████████| 5/5 [02:22<00:00, 28.57s/it]


Processing row 16...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.98s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.03it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.95s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.09it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.41s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:13<00:00,  2.64s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.11it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:15<00:00,  3.14s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.11it/s]
Processing models: 100%|██████████| 5/5 [02:03<00:00, 24.79s/it]


Processing row 17...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:15<00:00,  3.12s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.08it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:15<00:00,  3.11s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.02it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:16<00:00,  3.28s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.07it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.46s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.10s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.84s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]
Processing models: 100%|██████████| 5/5 [02:11<00:00, 26.29s/it]


Processing row 18...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:15<00:00,  3.17s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.15it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.81s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.09it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:13<00:00,  2.73s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.07s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.90s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.11it/s]
Processing cands gpt-4-0613: 100%|██████████| 4/4 [00:13<00:00,  3.37s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 4/4 [00:03<00:00,  1.12it/s]
Processing models: 100%|██████████| 5/5 [01:56<00:00, 23.38s/it]


Processing row 19...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.97s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.04it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:16<00:00,  3.35s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:06<00:00,  1.24s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.47s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.10it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.94s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.05s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:15<00:00,  3.13s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.03s/it]
Processing models: 100%|██████████| 5/5 [02:04<00:00, 24.97s/it]


In [9]:
df4 = df.iloc[20:30]
df4 = df4.apply(process_row, axis=1)

Processing row 20...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:23<00:00,  4.65s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:06<00:00,  1.28s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.81s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.15s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:21<00:00,  4.27s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]
Processing cands gpt-4-0613: 100%|██████████| 9/9 [00:29<00:00,  3.29s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 9/9 [00:08<00:00,  1.01it/s]
Processing cands gpt-4-0613: 100%|██████████| 2/2 [00:09<00:00,  4.68s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]
Processing models: 100%|██████████| 5/5 [02:39<00:00, 31.90s/it]


Processing row 21...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:22<00:00,  4.42s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.12it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.75s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.16s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:22<00:00,  4.60s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.20it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.95s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.13it/s]
Processing cands gpt-4-0613: 100%|██████████| 3/3 [00:11<00:00,  3.86s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 3/3 [00:02<00:00,  1.13it/s]
Processing models: 100%|██████████| 5/5 [02:21<00:00, 28.37s/it]


Processing row 22...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [10:18<00:00, 123.80s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.14it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.51s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.07s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:20<00:00,  4.06s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.08it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.93s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.08s/it]
Processing cands gpt-4-0613: 100%|██████████| 4/4 [00:17<00:00,  4.42s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 4/4 [00:03<00:00,  1.05it/s]
Processing models: 100%|██████████| 5/5 [12:24<00:00, 148.90s/it]


Processing row 23...


Processing cands gpt-4-0613: 100%|██████████| 7/7 [00:28<00:00,  4.11s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 7/7 [00:06<00:00,  1.03it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.83s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.04it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:22<00:00,  4.59s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.20s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.56s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.13s/it]
Processing cands gpt-4-0613: 100%|██████████| 6/6 [00:23<00:00,  3.91s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 6/6 [00:07<00:00,  1.32s/it]
Processing models: 100%|██████████| 5/5 [03:27<00:00, 41.44s/it]


Processing row 24...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.59s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:06<00:00,  1.21s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.96s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.15s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:42<00:00,  8.50s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.08s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:21<00:00,  4.22s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.01s/it]
Processing cands gpt-4-0613: 100%|██████████| 2/2 [00:11<00:00,  5.84s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
Processing models: 100%|██████████| 5/5 [02:50<00:00, 34.14s/it]


Processing row 25...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.44s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.04s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.48s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.07s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.75s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:13<00:00,  2.78s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.08s/it]
Processing cands gpt-4-0613: 100%|██████████| 1/1 [00:03<00:00,  3.95s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 1/1 [00:00<00:00,  1.10it/s]
Processing models: 100%|██████████| 5/5 [01:58<00:00, 23.77s/it]


Processing row 26...


Processing cands gpt-4-0613: 100%|██████████| 4/4 [00:08<00:00,  2.13s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 4/4 [00:04<00:00,  1.02s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.78s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.09it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.87s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.11s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:15<00:00,  3.00s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.09s/it]
Processing cands gpt-4-0613: 100%|██████████| 3/3 [00:11<00:00,  3.80s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 3/3 [00:03<00:00,  1.05s/it]
Processing models: 100%|██████████| 5/5 [02:01<00:00, 24.31s/it]


Processing row 27...


Processing cands gpt-4-0613: 100%|██████████| 6/6 [00:22<00:00,  3.68s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 6/6 [00:07<00:00,  1.18s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:21<00:00,  4.29s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.07s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.95s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.04s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:20<00:00,  4.15s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.05s/it]
Processing cands gpt-4-0613: 100%|██████████| 4/4 [00:17<00:00,  4.43s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 4/4 [00:04<00:00,  1.11s/it]
Processing models: 100%|██████████| 5/5 [02:38<00:00, 31.70s/it]


Processing row 28...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:13<00:00,  2.73s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.21it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.50s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.04it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.88s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.04it/s]
Processing cands gpt-4-0613: 100%|██████████| 1/1 [00:04<00:00,  4.83s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 1/1 [00:01<00:00,  1.51s/it]
Processing cands gpt-4-0613: 100%|██████████| 1/1 [00:02<00:00,  2.91s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 1/1 [00:01<00:00,  1.03s/it]
Processing models: 100%|██████████| 5/5 [01:35<00:00, 19.20s/it]


Processing row 29...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:10<00:00,  2.13s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:13<00:00,  2.66s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.01s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.97s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.02s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.96s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.13s/it]
Processing cands gpt-4-0613: 100%|██████████| 4/4 [00:16<00:00,  4.01s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 4/4 [00:04<00:00,  1.03s/it]
Processing models: 100%|██████████| 5/5 [02:07<00:00, 25.57s/it]


In [12]:
df5 = df.iloc[30:]
df5 = df5.apply(process_row, axis=1)

Processing row 30...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:23<00:00,  4.71s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:06<00:00,  1.28s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:20<00:00,  4.20s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.02s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:28<00:00,  5.76s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.03it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.66s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.13it/s]
Processing cands gpt-4-0613: 100%|██████████| 7/7 [00:24<00:00,  3.47s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 7/7 [00:07<00:00,  1.05s/it]
Processing models: 100%|██████████| 5/5 [02:51<00:00, 34.22s/it]


Processing row 31...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:15<00:00,  3.09s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.01it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:23<00:00,  4.61s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.03s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:26<00:00,  5.20s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.05s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.81s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.07s/it]
Processing cands gpt-4-0613: 100%|██████████| 1/1 [00:04<00:00,  4.28s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 1/1 [00:01<00:00,  1.19s/it]
Processing models: 100%|██████████| 5/5 [02:20<00:00, 28.01s/it]


Processing row 32...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.77s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.02it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:20<00:00,  4.06s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.20s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.89s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.14s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.68s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.03s/it]
Processing cands gpt-4-0613: 100%|██████████| 3/3 [00:12<00:00,  4.33s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 3/3 [00:03<00:00,  1.33s/it]
Processing models: 100%|██████████| 5/5 [02:22<00:00, 28.43s/it]


Processing row 33...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:16<00:00,  3.31s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.00s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:21<00:00,  4.26s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.12s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:16<00:00,  3.32s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.08s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.86s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.04s/it]
Processing cands gpt-4-0613: 100%|██████████| 3/3 [00:11<00:00,  3.78s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 3/3 [00:03<00:00,  1.13s/it]
Processing models: 100%|██████████| 5/5 [02:14<00:00, 26.93s/it]


Processing row 34...


Processing cands gpt-4-0613: 100%|██████████| 6/6 [00:21<00:00,  3.58s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 6/6 [00:06<00:00,  1.02s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:23<00:00,  4.71s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.17s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:21<00:00,  4.39s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:06<00:00,  1.26s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:25<00:00,  5.13s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.04s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.93s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.15s/it]
Processing models: 100%|██████████| 5/5 [02:55<00:00, 35.17s/it]


Processing row 35...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.82s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.00s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.78s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.10s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:15<00:00,  3.19s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.01it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.76s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.03s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.93s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.07it/s]
Processing models: 100%|██████████| 5/5 [02:21<00:00, 28.33s/it]


Processing row 36...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:22<00:00,  4.49s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.05s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:21<00:00,  4.28s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.12s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:22<00:00,  4.46s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:08<00:00,  1.76s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:21<00:00,  4.28s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.06s/it]
Processing cands gpt-4-0613: 100%|██████████| 2/2 [00:10<00:00,  5.03s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 2/2 [00:02<00:00,  1.31s/it]
Processing models: 100%|██████████| 5/5 [02:34<00:00, 30.87s/it]


Processing row 37...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:22<00:00,  4.41s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.08it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:22<00:00,  4.54s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.01it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:23<00:00,  4.71s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.17s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:21<00:00,  4.24s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.04s/it]
Processing cands gpt-4-0613: 100%|██████████| 6/6 [00:29<00:00,  4.95s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 6/6 [00:05<00:00,  1.01it/s]
Processing models: 100%|██████████| 5/5 [02:52<00:00, 34.45s/it]


Processing row 38...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:21<00:00,  4.38s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.08it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.65s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.18s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:21<00:00,  4.34s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.08s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:16<00:00,  3.29s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.17s/it]
Processing cands gpt-4-0613: 100%|██████████| 2/2 [00:06<00:00,  3.32s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 2/2 [00:01<00:00,  1.03it/s]
Processing models: 100%|██████████| 5/5 [02:12<00:00, 26.49s/it]


Processing row 39...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.59s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.07it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:23<00:00,  4.64s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.05s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.85s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.15s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.56s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.03it/s]
Processing cands gpt-4-0613: 100%|██████████| 7/7 [00:38<00:00,  5.50s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 7/7 [00:07<00:00,  1.04s/it]
Processing models: 100%|██████████| 5/5 [02:53<00:00, 34.71s/it]


Processing row 40...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.57s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:06<00:00,  1.23s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.86s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.01s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:21<00:00,  4.23s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.12s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.70s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.18s/it]
Processing cands gpt-4-0613: 100%|██████████| 3/3 [00:10<00:00,  3.60s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 3/3 [00:02<00:00,  1.04it/s]
Processing models: 100%|██████████| 5/5 [02:18<00:00, 27.67s/it]


Processing row 41...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.58s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.02it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.78s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.03s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.43s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.13s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:15<00:00,  3.16s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.03s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:17<00:00,  3.47s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.04s/it]
Processing models: 100%|██████████| 5/5 [02:21<00:00, 28.35s/it]


Processing row 42...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.64s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.00it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.67s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.10it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.92s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.03it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.76s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.02it/s]
Processing cands gpt-4-0613: 100%|██████████| 7/7 [00:29<00:00,  4.23s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 7/7 [00:06<00:00,  1.04it/s]
Processing models: 100%|██████████| 5/5 [02:33<00:00, 30.68s/it]


Processing row 43...


Processing cands gpt-4-0613: 100%|██████████| 6/6 [00:22<00:00,  3.68s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 6/6 [00:05<00:00,  1.00it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:16<00:00,  3.38s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.02it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:21<00:00,  4.37s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:07<00:00,  1.42s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:19<00:00,  3.83s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.03s/it]
Processing cands gpt-4-0613: 100%|██████████| 3/3 [00:12<00:00,  4.12s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 3/3 [00:02<00:00,  1.10it/s]
Processing models: 100%|██████████| 5/5 [02:26<00:00, 29.29s/it]


Processing row 44...


Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:14<00:00,  2.91s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:05<00:00,  1.04s/it]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:21<00:00,  4.37s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.16it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.63s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]
Processing cands gpt-4-0613: 100%|██████████| 5/5 [00:18<00:00,  3.71s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]
Processing cands gpt-4-0613: 100%|██████████| 1/1 [00:03<00:00,  3.20s/it]
Processing cands gpt-3.5-turbo: 100%|██████████| 1/1 [00:01<00:00,  1.18s/it]
Processing models: 100%|██████████| 5/5 [02:02<00:00, 24.50s/it]
