### Import libs

In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [25]:
import os
import json
import time

import pandas as pd
from scipy.stats import ttest_rel, t
import numpy as np

from datasets import Dataset
from ragas.metrics import answer_correctness
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.run_config import RunConfig
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

### Call GPT-4o to Evaluate Answers

In [15]:
with open("./data/qna_results.json", "r") as f:
    results = json.load(f)
len(results["question"])

200

In [None]:
api_key = os.environ.get("OPENAI_API_KEY")
chat_model = ChatOpenAI(model="gpt-4o", api_key=api_key)
evaluator_llm = LangchainLLMWrapper(chat_model)
embedder = OpenAIEmbeddings(model="text-embedding-3-small", api_key=api_key)
run_config = RunConfig(max_workers=2) # reduce to 2, to not reach token/min throughput limit

In [None]:
answers_keys = [k for k in list(results.keys()) if k.endswith(("text", "image"))]
for answer_key in answers_keys:
    data_samples = {
        'question': results["question"],
        'answer': results[answer_key],
        'ground_truth': results["answer"],
    }
    
    dataset = Dataset.from_dict(data_samples)
    eval_results = evaluate(
        dataset, 
        llm=evaluator_llm, 
        embeddings=embedder, 
        metrics=[answer_correctness],
        run_config=run_config,
    )
    
    with open(f"./data/eval_{answer_key}.json", "w") as f:
        json.dump(eval_results.scores, f)
        
    time.sleep(10) # wait for 10sec just for rate limiting 
    print(f"{answer_key} | {eval_results}")

### Analyze Evaluation Results


In [14]:
# (1) Detailed results

with open("./data/qna_results.json", "r") as f:
    results = json.load(f) 

df = pd.DataFrame(results['img_fname'], columns = ['image_id'])
df['questions'] = json.dumps(["question_1" ,"question_2"])
df['questions'] = df['questions'].apply(json.loads)
df = df.explode('questions')

with open("./data/eval_gpt_text.json", "r") as f:
    gpt_txt = json.load(f) 
evals = [m['answer_correctness'] for m in gpt_txt]
df["gpt_text_metrics"] = evals

with open("./data/eval_gpt_image.json", "r") as f:
    gpt_img = json.load(f) 
evals = [m['answer_correctness'] for m in gpt_img]
df["gpt_text_img_metrics"] = evals


with open("./data/eval_claude_text.json", "r") as f:
    claude_txt = json.load(f) 
evals = [m['answer_correctness'] for m in claude_txt]
df["claude_text_metrics"] = evals

with open("./data/eval_claude_image.json", "r") as f:
    claude_img = json.load(f) 
evals = [m['answer_correctness'] for m in claude_img]
df["claude_text_img_metrics"] = evals


with open("./data/eval_qwen_text.json", "r") as f:
    qwen_txt = json.load(f) 
evals = [m['answer_correctness'] for m in qwen_txt]
df["qwen_text_metrics"] = evals

with open("./data/eval_qwen_image.json", "r") as f:
    qwen_img = json.load(f) 
evals = [m['answer_correctness'] for m in qwen_img]
df["qwen_text_img_metrics"] = evals

df.head()

Unnamed: 0,image_id,questions,gpt_text_metrics,gpt_text_img_metrics,claude_text_metrics,claude_text_img_metrics,qwen_text_metrics,qwen_text_img_metrics
0,2102.09837v1-Figure2-1.png,question_1,0.706173,0.524383,0.570533,0.44272,0.497537,0.71369
0,2102.09837v1-Figure2-1.png,question_2,0.713231,0.726113,0.498398,0.444328,0.708439,0.595063
1,2210.01528v1-Figure3-1.png,question_1,0.133522,1.0,0.134909,0.189027,0.15238,0.212937
1,2210.01528v1-Figure3-1.png,question_2,0.108546,0.551472,0.133453,0.503272,0.135934,0.21277
2,2205.13948v1-Figure4-1.png,question_1,0.170197,0.969128,0.587855,0.149822,0.163915,0.165061


In [16]:
# (2) Means for each img

df_per_question = df.groupby("image_id").mean(numeric_only=True).reset_index()
df_per_question.head()

Unnamed: 0,image_id,gpt_text_metrics,gpt_text_img_metrics,claude_text_metrics,claude_text_img_metrics,qwen_text_metrics,qwen_text_img_metrics
0,113902-Figure1-1.png,0.577814,0.631824,0.513554,0.560635,0.64243,0.729054
1,12030503-Figure3-1.png,0.407385,0.595792,0.614903,0.474205,0.295873,0.570915
2,1245438-Figure1-1.png,0.351573,0.736119,0.426176,0.567392,0.388087,0.741965
3,1339538-Figure3-1.png,0.592639,0.597931,0.435265,0.548215,0.591861,0.59187
4,1356505-Figure1-1.png,0.179031,0.374192,0.39008,0.529412,0.182323,0.409498


In [20]:
df_per_question.describe()

Unnamed: 0,gpt_text_metrics,gpt_text_img_metrics,claude_text_metrics,claude_text_img_metrics,qwen_text_metrics,qwen_text_img_metrics
count,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.435584,0.686431,0.37595,0.516647,0.4073,0.644342
std,0.237257,0.194232,0.196352,0.123554,0.220617,0.204615
min,0.045757,0.14832,0.050011,0.268063,0.086149,0.191939
25%,0.248163,0.555294,0.167165,0.426753,0.182073,0.492318
50%,0.418022,0.676003,0.385835,0.512457,0.391658,0.611778
75%,0.578456,0.853445,0.529812,0.574441,0.547722,0.834822
max,0.97215,0.999038,0.827742,0.886968,0.973796,1.0


In [24]:
# (3) Aggregation per model

gpt_text = df_per_question['gpt_text_metrics'].mean()
gpt_text_image = df_per_question['gpt_text_img_metrics'].mean()
claude_text = df_per_question['claude_text_metrics'].mean()
claude_text_image = df_per_question['claude_text_img_metrics'].mean()
qwen_text = df_per_question['qwen_text_metrics'].mean()
qwen_text_image = df_per_question['qwen_text_img_metrics'].mean()

results = pd.DataFrame(data = [[gpt_text, gpt_text_image], [claude_text, claude_text_image], [qwen_text, qwen_text_image]], columns = ["text", "text_image"], index=["gpt", "claude", "qwen"])

# Add uplifts
results['absolute_uplift'] = results.apply(lambda row: round(row["text_image"] - row["text"], 3), axis =1)
results['relative_uplift'] = results.apply(lambda row: round(row['absolute_uplift'] / row["text"], 3), axis =1)

results

Unnamed: 0,text,text_image,absolute_uplift,relative_uplift
gpt,0.435584,0.686431,0.251,0.576
claude,0.37595,0.516647,0.141,0.375
qwen,0.4073,0.644342,0.237,0.582


In [45]:
# (4) Get inferential stats

models = ["gpt", "claude", "qwen"]
paired_ttest_results = {}

std_devs = {
    "gpt": {"text": 0.237257, "text_img": 0.194232},
    "claude": {"text": 0.196352, "text_img": 0.123554},
    "qwen": {"text": 0.220617, "text_img": 0.204615}
}

sample_size = 100 

# Compute t-test and confidence intervals for each model
for model in models:
    text_mean = results.loc[model]["text"]
    text_img_mean = results.loc[model]["text_image"]

    # Compute the mean difference
    mean_diff = text_img_mean - text_mean

    # Compute standard deviation of the differences (assuming independence for approximation)
    std_diff = np.sqrt(std_devs[model]["text"]**2 + std_devs[model]["text_img"]**2)

    # Perform paired t-test
    t_stat, p_value = ttest_rel(
        np.random.normal(text_mean, std_devs[model]["text"], sample_size),
        np.random.normal(text_img_mean, std_devs[model]["text_img"], sample_size)
    )

    # Compute confidence interval (95% CI)
    t_critical = t.ppf(0.975, df=sample_size-1)  # 95% confidence, df = n-1
    margin_of_error = t_critical * (std_diff / np.sqrt(sample_size))
    confidence_interval = (round(mean_diff - margin_of_error, 3),round(mean_diff + margin_of_error, 3))

    # Store results
    paired_ttest_results[model] = {
        "t_statistic": t_stat,
        "p_value": p_value,
        "mean_diff": round(mean_diff, 3),
        "confidence_interval": confidence_interval
    }

stats_df = pd.DataFrame.from_dict(paired_ttest_results, orient='index')
stats_df

Unnamed: 0,t_statistic,p_value,mean_diff,confidence_interval
gpt,-6.953747,3.859439e-10,0.251,"(0.19, 0.312)"
claude,-4.765955,6.456427e-06,0.141,"(0.095, 0.187)"
qwen,-7.514773,2.587483e-11,0.237,"(0.177, 0.297)"


In [46]:
results_final = results.join(stats_df)
results_final

Unnamed: 0,text,text_image,absolute_uplift,relative_uplift,t_statistic,p_value,mean_diff,confidence_interval
gpt,0.435584,0.686431,0.251,0.576,-6.953747,3.859439e-10,0.251,"(0.19, 0.312)"
claude,0.37595,0.516647,0.141,0.375,-4.765955,6.456427e-06,0.141,"(0.095, 0.187)"
qwen,0.4073,0.644342,0.237,0.582,-7.514773,2.587483e-11,0.237,"(0.177, 0.297)"


p_value << 0.05 => all results are stat significant