In [1]:
#!pip install -U langchain-openai python-dotenv

In [2]:
import os
import time
from pathlib import Path
from tqdm import tqdm
import pandas as pd
from langchain_openai import ChatOpenAI 
from langchain.evaluation import load_evaluator
from pprint import pprint

import os
from dotenv import load_dotenv

load_dotenv()
open_ai_auth = os.getenv("OPEN_AI")

os.environ["OPENAI_API_KEY"] = open_ai_auth

evaluation_llm = ChatOpenAI(model="gpt-3.5-turbo")

# Causal models on SQuADv2

## Get scores from GPT-3.5 tubo

In [6]:
results_path = Path("./results/")

evaluator = load_evaluator("labeled_criteria", criteria="correctness", llm=evaluation_llm,requires_reference=True)

results = dict()

for file in results_path.glob("*csv"):
    # read results csv
    df = pd.read_csv(file)
    df = df[["model", "question", "answer", "context"]]
    model = df["model"].values[0]
    results[model] = []
    result_rows = []
    with tqdm(total=len(df)) as pbar:
        for i, (model, question, answer, context) in df.iterrows():
            eval_result = evaluator.evaluate_strings(
                prediction=answer,
                input=question,
                reference=context
            )
            pbar.update(1)
            
            results[model].append(eval_result)
            result_rows.append(eval_result)
            # OpenAI API Tokens per minute limit
            if i+1 % 50 == 0:
                time.sleep(60)
            pd.DataFrame(result_rows).to_csv(f"{model.replace('/', '-')}.csv")

100%|██████████| 500/500 [20:17<00:00,  2.44s/it]
100%|██████████| 500/500 [24:00<00:00,  2.88s/it] 


## Evaluate results

In [18]:
from pathlib import Path

results_path = Path("llm-as-judge-results")

result_rows = []

for csv in results_path.glob("*.csv"):
    df = pd.read_csv(csv, index_col=0).fillna(0.0)
    correct = df.score.sum()
    total = len(df)
    percentage = correct / total
    model = csv.name
    
    result_rows.append(
        {
            "model": model,
            "correct": correct,
            "wrong": total - correct,
            "total": total,
            "percentage": percentage
        }
    )
result = pd.DataFrame(result_rows)
result.to_csv("squad-v2-llm-as-judge-results.csv")
result

Unnamed: 0,model,correct,wrong,total,percentage
0,HuggingFaceH4-zephyr-7b-gemma-v0.1.csv,337.0,163.0,500,0.674
1,google-gemma-2b-it.csv,387.0,113.0,500,0.774
2,HuggingFaceH4-zephyr-7b-beta.csv,427.0,73.0,500,0.854
3,mistralai-Mixtral-8x7B-Instruct-v0.1.csv,396.0,104.0,500,0.792
4,mistralai-Mistral-7B-Instruct-v0.2.csv,400.0,100.0,500,0.8


# Causal models on ml book dataset

## Get scores from GPT-3.5 tubo

In [3]:
df = pd.read_csv("./results/ML_BOOK_RESULTS.csv", index_col=0)

In [4]:
result_rows = []

evaluator = load_evaluator("labeled_criteria", criteria="correctness", llm=evaluation_llm,requires_reference=True)


with tqdm(total=len(df)) as pbar:
    for i, (model, context, answer, question) in df.iterrows():
        eval_result = evaluator.evaluate_strings(
            prediction=answer,
            input=question,
            reference=context
        )
        pbar.update(1)

        result_rows.append(eval_result)
        # OpenAI API Tokens per minute limit
        if i+1 % 50 == 0:
            time.sleep(60)

pd.concat([df, pd.DataFrame(result_rows)], axis = 1).to_csv("LLM-as-a-judge-ML-BOOK.csv")

100%|██████████| 108/108 [06:13<00:00,  3.45s/it]


## Evaluate results

In [5]:
df = pd.read_csv("LLM-as-a-judge-ML-BOOK.csv", index_col=0)

grouped = df.groupby(["model"])

result_rows = []

for name, values in grouped:
    model = name[0]
    correct = values["score"].sum()
    total = len(values["score"])
    percentage = correct / total
    
    
    result_rows.append(
        {
            "model": model,
            "correct": correct,
            "wrong": total - correct,
            "total": total,
            "percentage": percentage
        }
    )

pd.DataFrame(result_rows)

Unnamed: 0,model,correct,wrong,total,percentage
0,HuggingFaceH4/zephyr-7b-beta,25.0,2.0,27,0.925926
1,HuggingFaceH4/zephyr-7b-gemma-v0.1,23.0,4.0,27,0.851852
2,google/gemma-2b-it,21.0,6.0,27,0.777778
3,mistralai/Mistral-7B-Instruct-v0.2,25.0,2.0,27,0.925926
