In [63]:
import random
import json
import re

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud
import nltk

random.seed(6)
np.random.seed(6)


# Experiment 7 - Llama3.1 8b as a judge

In this experiment I implement the technique `LLM as judge` used in the paper `How good is my MML?` as evaluaiton metric for the correctness of the answers given by a model. The method 
consists of using a LLM to evaluate the correctness of the answers given by another model. The LLM acts as a judge by comparing the predicted answer with the ground truth answer and providing a score between 0 and 5, expressing the confidence in the answer. The judge also provides a reason for the score.


**Parameters:**
- model: llama3.1 8b
- system prompt: LLM_judge_system.txt
- temperature: 0.1
- sampling method: (default) top-p (p=0.9)
- the output is limited to 128 tokens (shouldn't influence the results, it only cutoffs the generation)


Let's load the ground truth from the `STAR_QA_question_and_stsg_val.json` file where we extracted QA and spatio-temporal scene graphs

In [84]:
ground_truth = []

with open('../data/datasets/STAR_QA_and_stsg_val.json') as f:
    data = json.load(f)
    ground_truth = [{
        'id': sample['question_id'],
        'question': sample['question'],
        'choices': sample['choices'],
        'num': sample['answer'],
        'text': sample['choices'][str(sample['answer'])]
    } for sample in data]


# we keep num as str so we can directrly access choices dictionaries
gt_df = pd.DataFrame(ground_truth).astype({'id': 'string', 'num': 'string', 'text': 'string'})
gt_df.set_index('id', inplace=True)
gt_df


Unnamed: 0_level_0,question,choices,num,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Interaction_T1_13,Which object was tidied up by the person?,"{'0': 'The closet/cabinet.', '1': 'The blanket...",2,The clothes.
Interaction_T1_14,Which object was tidied up by the person?,"{'0': 'The blanket.', '1': 'The table.', '2': ...",2,The clothes.
Interaction_T1_31,Which object was thrown by the person?,"{'0': 'The pillow.', '1': 'The bag.', '2': 'Th...",2,The clothes.
Interaction_T1_32,Which object was put down by the person?,"{'0': 'The food.', '1': 'The shoe.', '2': 'The...",1,The shoe.
Interaction_T1_40,Which object was tidied up by the person?,"{'0': 'The broom.', '1': 'The closet/cabinet.'...",1,The closet/cabinet.
...,...,...,...,...
Feasibility_T6_1453,What is the person able to do after taking the...,"{'0': 'Wash the table.', '1': 'Take the box.',...",0,Wash the table.
Feasibility_T6_1454,What is the person able to do after walking th...,"{'0': 'Take the towel.', '1': 'Throw the bag.'...",0,Take the towel.
Feasibility_T6_1455,What is the person able to do after walking th...,"{'0': 'Throw the bag.', '1': 'Wash the table.'...",1,Wash the table.
Feasibility_T6_1456,What is the person able to do after putting th...,"{'0': 'Hold the food.', '1': 'Open the closet/...",3,Wash the table.


## Loading the evaluation for the originally incorrect answers

In [72]:
# Load the scores for the wrong answers
predictions = []
with open('../outputs/llama8b_as_judge_wrong.jsonl') as f:
    predictions = [json.loads(line) for line in f.readlines()]

# Extract pred, score and reason from response dictionary into separate columns
judge_pred_df = pd.DataFrame(predictions).rename(columns={'qid':'id'})
judge_pred_df['pred'] = judge_pred_df['response'] \
    .apply(lambda x: eval(x)['pred']) \
    .astype('string') 

judge_pred_df['score'] = judge_pred_df['response'] \
    .apply(lambda x: int(eval(x)['score'])) \
    .astype('int32')

judge_pred_df['reason'] = judge_pred_df['response'] \
    .apply(lambda x: eval(x)['reason']) \
    .astype('string')

judge_pred_df.drop('response', axis=1, inplace=True)
judge_pred_df.set_index('id', inplace=True)
judge_pred_df


Unnamed: 0_level_0,pred,score,reason
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Interaction_T1_40,incorrect,2,The predicted answer is incorrect because it s...
Interaction_T1_71,incorrect,2,The predicted answer is incorrect because it i...
Interaction_T1_78,incorrect,0,The predicted answer is incorrect because it d...
Interaction_T1_95,incorrect,0,The predicted answer is incorrect because it c...
Interaction_T1_100,incorrect,2,The predicted answer is incorrect because it s...
...,...,...,...
Feasibility_T6_1414,incorrect,2,"The predicted answer ""throw the bag"" does not ..."
Feasibility_T6_1430,incorrect,0,The predicted answer 'Take the pillow' is not ...
Feasibility_T6_1454,incorrect,2,The predicted answer is wiping the table (acti...
Feasibility_T6_1455,incorrect,2,The predicted answer 'Take the shoe' does not ...


In [73]:
judge_pred_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3862 entries, Interaction_T1_40 to Feasibility_T6_1456
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   pred    3862 non-null   string
 1   score   3862 non-null   int32 
 2   reason  3862 non-null   string
dtypes: int32(1), string(2)
memory usage: 105.6+ KB


In [67]:
print("Percentage of predictions:")
print(judge_pred_df['pred'].value_counts(normalize=True) * 100)

Percentage of predictions:
incorrect    95.649922
correct       4.350078
Name: pred, dtype: Float64


In [74]:
judge_pred_df

Unnamed: 0_level_0,pred,score,reason
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Interaction_T1_40,incorrect,2,The predicted answer is incorrect because it s...
Interaction_T1_71,incorrect,2,The predicted answer is incorrect because it i...
Interaction_T1_78,incorrect,0,The predicted answer is incorrect because it d...
Interaction_T1_95,incorrect,0,The predicted answer is incorrect because it c...
Interaction_T1_100,incorrect,2,The predicted answer is incorrect because it s...
...,...,...,...
Feasibility_T6_1414,incorrect,2,"The predicted answer ""throw the bag"" does not ..."
Feasibility_T6_1430,incorrect,0,The predicted answer 'Take the pillow' is not ...
Feasibility_T6_1454,incorrect,2,The predicted answer is wiping the table (acti...
Feasibility_T6_1455,incorrect,2,The predicted answer 'Take the shoe' does not ...


Load the orignal model predictions to verify the judgement of the model.

In [69]:
original_pred = []
with open('../data/llama3b_wrongs.jsonl') as f:
    original_pred = [json.loads(line) for line in f.readlines()]

original_pred_df = pd.DataFrame(original_pred).astype({'question_id': 'string', 'response': 'string'})
original_pred_df.rename(columns={'question_id': 'id'}, inplace=True)
original_pred_df.set_index('id', inplace=True)

In [75]:
judged_correct_mask = judge_pred_df['pred'] == 'correct'

original_pred_df.loc[judged_correct_mask]

Unnamed: 0_level_0,response
id,Unnamed: 1_level_1
Interaction_T1_375,<Reasoning> The question asks which object was...
Interaction_T1_393,<Reasoning> To determine which object was tidi...
Interaction_T1_1312,"<Reasoning> To answer this question, we need t..."
Interaction_T1_1513,<Reasoning> To determine which object was clos...
Interaction_T1_1680,<Reasoning> To determine which object was take...
...,...
Feasibility_T5_902,<Reasoning> The question asks which object the...
Feasibility_T5_906,<Reasoning> The question asks which object the...
Feasibility_T5_928,"<Reasoning> To answer this question, we need t..."
Feasibility_T5_981,<Reasoning> The question asks which object the...


Let's analyze some of the answers

First we execute the usual pipeline to extract the answers with a regex.

In [76]:
ans_regex_pattern = r'A:\s*(?:\d\.)?\s*((?:\w+(?:\s|\/)?){,10}\.)'

extracted_ans_df = original_pred_df['response'].str.extract(ans_regex_pattern)
extracted_ans_df.rename(columns={0: 'text'}, inplace=True)

extracted_ans_df['text'] = extracted_ans_df['text'].str.strip()
extracted_ans_df

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
Interaction_T1_40,The blanket.
Interaction_T1_71,The shoe.
Interaction_T1_78,The refrigerator.
Interaction_T1_95,The clothes.
Interaction_T1_100,The clothes.
...,...
Feasibility_T6_1414,Throw the bag.
Feasibility_T6_1430,Take the pillow.
Feasibility_T6_1454,Wipe the table.
Feasibility_T6_1455,Take the shoe.


In [115]:
idx = random.choice(judged_correct_mask[judged_correct_mask].index)

print(f"Index: {idx}")
print(f"Question: {gt_df.loc[idx]['question']}")
print(f"Ground truth answer: {gt_df.loc[idx]['text']}")
print(f"Extracted answer: {extracted_ans_df.loc[idx]['text']}")
print(f"\nOriginal answer:\n\n{original_pred_df.loc[idx]['response']}")
print(f"\nJudge sentence:\n\n"
      f"sentence: {judge_pred_df.loc[idx]['pred']}\n"
      f"score: {judge_pred_df.loc[idx]['score']}\n"
      f"reason: {judge_pred_df.loc[idx]['reason']}\n")

Index: Sequence_T6_4444
Question: What did the person do to the food before holding the dish?
Ground truth answer: Took.
Extracted answer: Put down.

Original answer:

<Reasoning>
To answer this question, we need to analyze the sequence of events unfolding in the video. We are given a Spatio-Temporal Scene Graph (STSG) that represents the scene in chronological order.

From the STSG, we can see that the person is initially touching their laptop ('person - touching - laptop') and then moves to touch the food ('person - on_the_side_of - food'). After that, they move to touch the laptop again ('person - touching - laptop', 'person - in_front_of - laptop'), indicating that they are now holding the dish.

Therefore, before holding the dish, the person must have done something to the food. The only action mentioned in the STSG that involves the food is 'person - on_the_side_of - food'. This suggests that the person was standing next to the food before holding the dish.

Considering the alter

The model seems confident that the predicted answer mathces the ground truth but as we can see above this is not true. Can it be an error in the prompt? Or did the judge llm *"hallucinate"*?

In [90]:
prompts = []
with open('../data/wrong_prompts.jsonl') as f:
    prompts = [json.loads(line) for line in f.readlines()]

prompts_df = pd.DataFrame(prompts)
prompts_df.rename(columns={'qid': 'id'}, inplace=True)
prompts_df.set_index('id', inplace=True)
prompts_df

Unnamed: 0_level_0,prompt
id,Unnamed: 1_level_1
Interaction_T1_40,Please evaluate the following question-answer ...
Interaction_T1_71,Please evaluate the following question-answer ...
Interaction_T1_78,Please evaluate the following question-answer ...
Interaction_T1_95,Please evaluate the following question-answer ...
Interaction_T1_100,Please evaluate the following question-answer ...
...,...
Feasibility_T6_1414,Please evaluate the following question-answer ...
Feasibility_T6_1430,Please evaluate the following question-answer ...
Feasibility_T6_1454,Please evaluate the following question-answer ...
Feasibility_T6_1455,Please evaluate the following question-answer ...


In [95]:
print(prompts_df.loc[idx]['prompt'])

Please evaluate the following question-answer pair:
Question: Which object was tidied up by the person?

Ground truth correct Answer: 
[START ANSWER]
The blanket.
[END ANSWER]

Predicted Answer:
[START PREDICTION]
<Reasoning>
To determine which object was tidied up by the person, we need to analyze the Scene-Graphs and identify the relationships between the person and the objects.

From the given Spatio-Temporal Scene-Graphs, we can see that there are multiple instances where the person is holding or in front of the clothes. However, the question asks which object was tidied up by the person.

One possible interpretation is that the person is tidying up the clothes. This is because the person is often holding or in front of the clothes, and there are no other objects being held or placed in front of the person that would suggest they are tidying up something else.

Another possibility is that the person is tidying up the towel. However, this is not supported by the Scene-Graphs, as the

As we can see above, the prompt reported the ground truth accurately, thus is the model that missed completely the prediction.

In [106]:
actually_true = ['Prediction_T3_819', 'Sequence_T5_2830', 'Sequence_T1_3189']

## Loading the evaluation for the originally correct answers

In [20]:
# Load the scores for the wrong answers
predictions = []
with open('../outputs/llama8b_as_judge_correct.jsonl') as f:
    predictions = [json.loads(line) for line in f.readlines()]

# Extract pred, score and reason from response dictionary into separate columns
predictions_df = pd.DataFrame(predictions).rename(columns={'qid':'id'})
predictions_df['pred'] = predictions_df['response'] \
    .apply(lambda x: eval(x)['pred']) \
    .astype('string') 

predictions_df['score'] = predictions_df['response'] \
    .apply(lambda x: int(eval(x)['score'])) \
    .astype('int32')

predictions_df['reason'] = predictions_df['response'] \
    .apply(lambda x: eval(x)['reason']) \
    .astype('string')

predictions_df.drop('response', axis=1, inplace=True)
predictions_df.set_index('id', inplace=True)
predictions_df


Unnamed: 0_level_0,pred,score,reason
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Interaction_T1_13,correct,5,The predicted answer matches the ground truth ...
Interaction_T1_14,correct,5,The predicted answer matches the ground truth ...
Interaction_T1_32,correct,5,The predicted answer accurately identifies the...
Interaction_T1_43,correct,5,The predicted answer matches the ground truth ...
Interaction_T1_81,correct,5,The predicted answer matches the ground truth ...
...,...,...,...
Feasibility_T6_1428,correct,5,The predicted answer matches the ground truth ...
Feasibility_T6_1448,correct,5,The predicted answer matches the ground truth ...
Feasibility_T6_1449,correct,5,The predicted answer is correct because it dir...
Feasibility_T6_1453,incorrect,2,The predicted answer is close but not entirely...


In [21]:
predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3038 entries, Interaction_T1_13 to Feasibility_T6_1468
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   pred    3038 non-null   string
 1   score   3038 non-null   int32 
 2   reason  3038 non-null   string
dtypes: int32(1), string(2)
memory usage: 83.1+ KB


In [22]:
predictions_df.groupby('pred').count()

Unnamed: 0_level_0,score,reason
pred,Unnamed: 1_level_1,Unnamed: 2_level_1
correct,2361,2361
incorrect,677,677


In [23]:
predictions_df[predictions_df['pred'] == 'correct']

Unnamed: 0_level_0,pred,score,reason
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Interaction_T1_13,correct,5,The predicted answer matches the ground truth ...
Interaction_T1_14,correct,5,The predicted answer matches the ground truth ...
Interaction_T1_32,correct,5,The predicted answer accurately identifies the...
Interaction_T1_43,correct,5,The predicted answer matches the ground truth ...
Interaction_T1_81,correct,5,The predicted answer matches the ground truth ...
...,...,...,...
Feasibility_T6_1413,correct,5,The predicted answer matches the ground-truth ...
Feasibility_T6_1428,correct,5,The predicted answer matches the ground truth ...
Feasibility_T6_1448,correct,5,The predicted answer matches the ground truth ...
Feasibility_T6_1449,correct,5,The predicted answer is correct because it dir...


## Extracting the answers

Let's extract the the answers from the generated texts which do contain an answer:

In [259]:
ans_df = predictions_df.loc[contains_answer]['answer'].str.extract(ans_regex_pattern)
ans_df.rename(columns={0: 'text'}, inplace=True)

ans_df['text'] = ans_df['text'].str.strip()
ans_df


Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
Interaction_T1_13,The clothes.
Interaction_T1_14,The clothes.
Interaction_T1_32,The shoe.
Interaction_T1_40,The clothes.
Interaction_T1_43,None of the alternatives were thrown by the pe...
...,...
Feasibility_T6_1453,The person is wiping the table after taking th...
Feasibility_T6_1454,Take the towel.
Feasibility_T6_1455,None of the above
Feasibility_T6_1456,Hold the food.


In [None]:
# removing the full stop for answers that have it
ans_df['text'] = ans_df['text'].apply(lambda x: x[:-1] if x.endswith('.') else x)
ans_df


Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
Interaction_T1_13,The clothes
Interaction_T1_14,The clothes
Interaction_T1_32,The shoe
Interaction_T1_40,The clothes
Interaction_T1_43,None of the alternatives were thrown by the pe...
...,...
Feasibility_T6_1453,The person is wiping the table after taking th...
Feasibility_T6_1454,Take the towel
Feasibility_T6_1455,None of the above
Feasibility_T6_1456,Hold the food


In [261]:
freq = ans_df['text'].value_counts().reset_index()
freq.columns = ['word', 'freq']

print(f"We have {freq.size} unique answers")


We have 2570 unique answers


In [262]:
ans_df['text'].value_counts()


None of the above                                             936
Put down                                                      254
Opened                                                        184
The clothes                                                   166
Took                                                          161
                                                             ... 
The refrigerator was closed after the person took a cup         1
The question cannot be answered as it stands because there      1
The person tidied up their clothes after they opened the        1
The person put down a shoe after they sat on                    1
The person is wiping the table after taking the towel           1
Name: text, Length: 1285, dtype: int64

Let's do some visualization also for the ground truth data

In [263]:
print(f"We have {gt_df['text'].nunique()} unique answers")


We have 198 unique answers


In [264]:
# Inner join - keeps only indices present in both series

ans_df.rename(columns={'text': 'pred_text'}, inplace=True)
eval_df = gt_df.join(
    ans_df, 
    how='inner' # we account also for samples without answers
)

eval_df.shape


(6802, 4)

In [265]:
def accuracy(eval_df, on_what='text'):
    hits_text = (eval_df[f'pred_{on_what}'] == eval_df[on_what]).sum()
    

    return hits_text/eval_df.shape[0]


In [266]:
text_acc = accuracy(eval_df)

print(f"Text Accuracy: {text_acc:.2%}")


Text Accuracy: 29.43%


In [267]:
eval_df[eval_df['text'] == eval_df['pred_text']]


Unnamed: 0_level_0,choices,num,text,pred_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Interaction_T1_13,"{'0': 'The closet/cabinet.', '1': 'The blanket...",2,The clothes,The clothes
Interaction_T1_14,"{'0': 'The blanket.', '1': 'The table.', '2': ...",2,The clothes,The clothes
Interaction_T1_32,"{'0': 'The food.', '1': 'The shoe.', '2': 'The...",1,The shoe,The shoe
Interaction_T1_71,"{'0': 'The paper/notebook.', '1': 'The clothes...",0,The paper/notebook,The paper/notebook
Interaction_T1_100,"{'0': 'The table.', '1': 'The towel.', '2': 'T...",3,The blanket,The blanket
...,...,...,...,...
Feasibility_T6_1375,"{'0': 'Put down the clothes.', '1': 'Take the ...",2,Sit at the table,Sit at the table
Feasibility_T6_1412,"{'0': 'Put down the box.', '1': 'Hold the book...",1,Hold the book,Hold the book
Feasibility_T6_1414,"{'0': 'Throw the bag.', '1': 'Close the laptop...",3,Open the book,Open the book
Feasibility_T6_1428,"{'0': 'Close the refrigerator.', '1': 'Throw t...",3,Take the cup/glass/bottle,Take the cup/glass/bottle


check if the equality with NA result in a false or is skipped

In [268]:
real_acc = accuracy(eval_df) * eval_df.shape[0] / gt_df.shape[0]

print(f"Real Accuracy: {real_acc:.2%}")


Real Accuracy: 28.21%


In [269]:
def print_acc(eval_df, acc_fn):
    print(f"{'Question type':<15}{'Total':^15}{'Accuracy':^10}\n")

    total = eval_df.index.str.startswith('Interaction').sum()
    acc = acc_fn(eval_df[eval_df.index.str.startswith('Interaction')])
    print(f"{'Interaction':<15}{total:^15}{acc:^10.2%}")

    total = eval_df.index.str.startswith('Sequence').sum()
    acc = acc_fn(eval_df[eval_df.index.str.startswith('Sequence')])
    print(f"{'Sequence':<15}{total:^15}{acc:^10.2%}")

    total = eval_df.index.str.startswith('Prediction').sum()
    acc = acc_fn(eval_df[eval_df.index.str.startswith('Prediction')])
    print(f"{'Prediction':<15}{total:^15}{acc:^10.2%}")

    total = eval_df.index.str.startswith('Feasibility').sum()
    acc = acc_fn(eval_df[eval_df.index.str.startswith('Feasibility')])
    print(f"{'Feasibility':<15}{total:^15}{acc:^10.2%}")


In [270]:
print_acc(eval_df, lambda x: accuracy(x, on_what='text'))


Question type       Total      Accuracy 

Interaction         2310        25.80%  
Sequence            3409        31.68%  
Prediction           599        31.22%  
Feasibility          484        28.72%  


In [271]:
index_intersect = predictions_df[html_tags_mask].index.intersection(eval_df.index)
print_acc(eval_df.loc[index_intersect], accuracy)


Question type       Total      Accuracy 

Interaction         1179        44.61%  
Sequence            1792        47.88%  
Prediction           333        39.64%  
Feasibility          267        42.70%  


In [272]:
# % of answers conforming to template per category

def print_ans_perc(eval_df, gt_df):
    print(f"{'Question type':<15}{'Total':^15}{'Answered':^10}\n")

    total = gt_df.index.str.startswith('Interaction').sum()
    acc = len(gt_df.index.intersection(eval_df[eval_df.index.str.startswith('Interaction')].index))/total
    print(f"{'Interaction':<15}{total:^15}{acc:^10.2%}")

    total = gt_df.index.str.startswith('Sequence').sum()
    acc = len(gt_df.index.intersection(eval_df[eval_df.index.str.startswith('Sequence')].index))/total
    print(f"{'Sequence':<15}{total:^15}{acc:^10.2%}")

    total = gt_df.index.str.startswith('Prediction').sum()
    acc = len(gt_df.index.intersection(eval_df[eval_df.index.str.startswith('Prediction')].index))/total
    print(f"{'Prediction':<15}{total:^15}{acc:^10.2%}")

    total = gt_df.index.str.startswith('Feasibility').sum()
    acc = len(gt_df.index.intersection(eval_df[eval_df.index.str.startswith('Feasibility')].index))/total
    print(f"{'Feasibility':<15}{total:^15}{acc:^10.2%}")

    total = gt_df.shape[0]
    acc = eval_df.shape[0]/total
    print(f"{'Overall':<15}{total:^15}{acc:^10.2%}")

print_ans_perc(eval_df, gt_df)


Question type       Total      Answered 

Interaction         2398        96.33%  
Sequence            3586        95.06%  
Prediction           624        95.99%  
Feasibility          490        98.78%  
Overall             7098        95.83%  
