In [247]:
import random
import json
import re

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud
import nltk

random.seed(6)
np.random.seed(6)


# Experiment 6 - MCQ Long answer HTML tags

For this experiment I changed the system prompt, asking the LLM to generate more text (about "thinking") and then give the final answer.


**Parameters:**
- model: Phi3-small (3.8b)
- system prompt: MCQ_system_prompt_v3.txt
- temperature: 0.1
- sampling method: (default) top-p (p=0.9)
- the output is limited to 128 tokens (shouldn't influence the results, it only cutoffs the generation)


Let's load the ground truth from the `STAR_QA_question_and_stsg_val.json` file where we extracted QA and spatio-temporal scene graphs

In [None]:
ground_truth = []

with open('../data/datasets/STAR_QA_and_stsg_val.json') as f:
    data = json.load(f)
    ground_truth = [{
        'id': sample['question_id'],
        'choices': sample['choices'],
        'num': sample['answer'],
        'text': sample['choices'][str(sample['answer'])]
    } for sample in data]


# we keep num as str so we can directrly access choices dictionaries
gt_df = pd.DataFrame(ground_truth).astype({'id': 'string', 'num': 'string', 'text': 'string'})
gt_df.set_index('id', inplace=True)
gt_df


Unnamed: 0_level_0,choices,num,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Interaction_T1_13,"{'0': 'The closet/cabinet.', '1': 'The blanket...",2,The clothes.
Interaction_T1_14,"{'0': 'The blanket.', '1': 'The table.', '2': ...",2,The clothes.
Interaction_T1_31,"{'0': 'The pillow.', '1': 'The bag.', '2': 'Th...",2,The clothes.
Interaction_T1_32,"{'0': 'The food.', '1': 'The shoe.', '2': 'The...",1,The shoe.
Interaction_T1_40,"{'0': 'The broom.', '1': 'The closet/cabinet.'...",1,The closet/cabinet.
...,...,...,...
Feasibility_T6_1453,"{'0': 'Wash the table.', '1': 'Take the box.',...",0,Wash the table.
Feasibility_T6_1454,"{'0': 'Take the towel.', '1': 'Throw the bag.'...",0,Take the towel.
Feasibility_T6_1455,"{'0': 'Throw the bag.', '1': 'Wash the table.'...",1,Wash the table.
Feasibility_T6_1456,"{'0': 'Hold the food.', '1': 'Open the closet/...",3,Wash the table.


Loading the responses from the LLM

In [249]:
# needed to modify the output file because of the bug
# I introduced in the code
predictions = []
with open('../outputs/responses_phi3:3.8b_20250218_17:05:47.jsonl') as f:
    predictions = [json.loads(line) for line in f.readlines()]

# transforming the id key from `qid` to `id` for consistency and `response` to `answer`
predictions_df = pd.DataFrame(predictions, dtype='string').rename(columns={'qid':'id', 'response':'answer'})
predictions_df.set_index('id', inplace=True)
predictions_df


Unnamed: 0_level_0,answer
id,Unnamed: 1_level_1
Interaction_T1_13,<Reasoning> From the given Spatio-Temporal Sce...
Interaction_T1_14,<Reasoning> In the given Spatio-Temporal Scene...
Interaction_T1_31,<Reasoning> The given Spatio-Temporal Scene Gr...
Interaction_T1_32,<Reasoning> The sequence of Scene-Graphs shows...
Interaction_T1_40,<Reasoning> The question asks about an object ...
...,...
Feasibility_T6_1453,<Reasoning> From the given Spatio-Temporal Sce...
Feasibility_T6_1454,<Reasoning> From the Spatio-Temporal Scene Gra...
Feasibility_T6_1455,<Reasoning> The Scene-Graphs show that the per...
Feasibility_T6_1456,<Reasoning> The person is repeatedly interacti...


In [273]:
predictions_df.iloc[0:20]['answer'].apply(lambda x: print(x, '\n\n'))


<Reasoning>
From the given Spatio-Temporal Scene Graphs, we can see that initially a person is on one side of clothes and then moves to be in front of them. The subject 'person' continues interacting with these objects by being both above (on top) and below (beneath) the clothes as well as holding onto them multiple times throughout this sequence. There are also instances where they touch or hold a towel, which is on their side initially but later becomes in front of them too. However, there's no mention about cleaning these objects nor any interaction that suggests tidying up such an object like the closet/cabinet or table being involved at all throughout this sequence.
</Reasoning>
<Final answer>
A: 2. The clothes. 


<Reasoning>
In the given Spatio-Temporal Scene Graphs, we can see that there are multiple interactions involving a person and objects. The first interaction is with a towel where 'person' holds it ('holding - towel'), then interacts directly in front of another object w

id
Interaction_T1_13     None
Interaction_T1_14     None
Interaction_T1_31     None
Interaction_T1_32     None
Interaction_T1_40     None
Interaction_T1_43     None
Interaction_T1_70     None
Interaction_T1_71     None
Interaction_T1_76     None
Interaction_T1_78     None
Interaction_T1_81     None
Interaction_T1_95     None
Interaction_T1_100    None
Interaction_T1_101    None
Interaction_T1_104    None
Interaction_T1_106    None
Interaction_T1_111    None
Interaction_T1_137    None
Interaction_T1_139    None
Interaction_T1_147    None
Name: answer, dtype: object

In [251]:
# Check if the the template is somewhere in the answer
html_tags_pattern = r'<Final answer>\s*A:\s*\d\.\s*((?:\w+(?:\s|\/)?){,10}\.)\s*</Final answer>'
html_tags_mask = predictions_df['answer'].str.contains(html_tags_pattern, case=False, regex=True)


print(f"Answer following the template: {html_tags_mask.value_counts()[True]}\n"
      f"{html_tags_mask.value_counts()[True]/predictions_df.size:.2%} of the total")

print(f"\nOnly {html_tags_mask.value_counts()[False]} samples do not contain the answer in the response with the specified format")


  html_tags_mask = predictions_df['answer'].str.contains(html_tags_pattern, case=False, regex=True)


Answer following the template: 12
0.17% of the total

Only 7086 samples do not contain the answer in the response with the specified format


In [252]:
predictions_df.loc[~html_tags_mask].iloc[:20, 0].apply(print)


<Reasoning>
From the given Spatio-Temporal Scene Graphs, we can see that initially a person is on one side of clothes and then moves to be in front of them. The subject 'person' continues interacting with these objects by being both above (on top) and below (beneath) the clothes as well as holding onto them multiple times throughout this sequence. There are also instances where they touch or hold a towel, which is on their side initially but later becomes in front of them too. However, there's no mention about cleaning these objects nor any interaction that suggests tidying up such an object like the closet/cabinet or table being involved at all throughout this sequence.
</Reasoning>
<Final answer>
A: 2. The clothes.
<Reasoning>
In the given Spatio-Temporal Scene Graphs, we can see that there are multiple interactions involving a person and objects. The first interaction is with a towel where 'person' holds it ('holding - towel'), then interacts directly in front of another object whic

id
Interaction_T1_13     None
Interaction_T1_14     None
Interaction_T1_31     None
Interaction_T1_32     None
Interaction_T1_40     None
Interaction_T1_43     None
Interaction_T1_70     None
Interaction_T1_71     None
Interaction_T1_76     None
Interaction_T1_78     None
Interaction_T1_81     None
Interaction_T1_95     None
Interaction_T1_100    None
Interaction_T1_101    None
Interaction_T1_104    None
Interaction_T1_106    None
Interaction_T1_111    None
Interaction_T1_137    None
Interaction_T1_139    None
Interaction_T1_147    None
Name: answer, dtype: object

We can notice that the model follows the template only partially, more specifically it puts only the opening tag \<Final Answer\>. Let's see how many response follow this "partial" template:

In [253]:
# Check if the the template is somewhere in the answer
html_tags_pattern = r'<Final answer>\s*A:\s*\d\.\s*((?:\w+(?:\s|\/)?){,10}\.)\s*(?:</Final answer>)?'
html_tags_mask = predictions_df['answer'].str.contains(html_tags_pattern, case=False, regex=True)


print(f"Answer following the template: {html_tags_mask.value_counts()[True]}\n"
      f"{html_tags_mask.value_counts()[True]/predictions_df.size:.2%} of the total")

print(f"\nOnly {html_tags_mask.value_counts()[False]} samples do not contain the answer in the response with the specified format")


Answer following the template: 3571
50.31% of the total

Only 3527 samples do not contain the answer in the response with the specified format


  html_tags_mask = predictions_df['answer'].str.contains(html_tags_pattern, case=False, regex=True)


Let's start by viewing how many answers follow the template:

In [254]:
# Check if the the template is somewhere in the answer
ans_regex_pattern = r'A:\s*(?:\d\.\s+)((?:\w+(?:\s|\/)?){,10}\.)'
contains_answer = predictions_df['answer'].str.contains(ans_regex_pattern, case=False, regex=True, flags=re.DOTALL)


print(f"Answer following the template: {contains_answer.value_counts()[True]}\n"
      f"{contains_answer.value_counts()[True]/predictions_df.size:.2%} of the total")

print(f"\nOnly {contains_answer.value_counts()[False]} samples do not contain the answer in the response with the specified format")


Answer following the template: 3623
51.04% of the total

Only 3475 samples do not contain the answer in the response with the specified format


  contains_answer = predictions_df['answer'].str.contains(ans_regex_pattern, case=False, regex=True, flags=re.DOTALL)


In [255]:
# Check that contains answer is subset of html_tags
assert (html_tags_mask & contains_answer).sum() == html_tags_mask.sum()


In [256]:
predictions_df.loc[~contains_answer].iloc[:10]['answer'].apply(print)


<Reasoning>
The given Spatio-Temporal Scene Graphs do not provide any direct or indirect information about the person throwing an object. The scenes depicted in these graphs are primarily focused on a person's interaction with clothes and their position relative to various objects such as sofa/couch, bed, etc., but there is no mention of them picking up or moving anything that could be thrown like pillows, bags, boxes, etc.
</Reasoning>
<Final answer>
None of the above
<Reasoning>
This space is reserved for your reasoning about the question. Based on the given Spatio-Temporal Scene Graphs, it seems that there are no instances where a person has interacted with an object in such a way as to throw one. The only interactions mentioned involve holding and being on the side of clothes or standing next to them without any indication of throwing activity.
</Reasoning>
<Final answer>
A: None of the alternatives were thrown by the person according to the given Spatio-Temporal Scene Graphs.
<Rea

id
Interaction_T1_31     None
Interaction_T1_43     None
Interaction_T1_70     None
Interaction_T1_76     None
Interaction_T1_78     None
Interaction_T1_81     None
Interaction_T1_95     None
Interaction_T1_101    None
Interaction_T1_106    None
Interaction_T1_111    None
Name: answer, dtype: object

Not considering the fact that these non-captured answer actually do not contain an a pertitent answer, we can notice that many of the answer omit the number of the choice. Additionally, many answers also lack a full stop at the end. Let’s see if adjusting the regex to exclude these parts helps capture a greater number of answers..

In [257]:
# Check if the the template is somewhere in the answer
ans_regex_pattern = r'A:\s*(?:\d\.)?\s*((?:\w+(?:\s|\/)?){,10}\.?)'
contains_answer = predictions_df['answer'].str.contains(ans_regex_pattern, case=False, regex=True, flags=re.DOTALL)


print(f"Answer following the template: {contains_answer.value_counts()[True]}\n"
      f"{contains_answer.value_counts()[True]/predictions_df.size:.2%} of the total")

print(f"\nOnly {contains_answer.value_counts()[False]} samples do not contain the answer in the response with the specified format")


Answer following the template: 6802
95.83% of the total

Only 296 samples do not contain the answer in the response with the specified format


  contains_answer = predictions_df['answer'].str.contains(ans_regex_pattern, case=False, regex=True, flags=re.DOTALL)


In [258]:
# removing the full stop from the ground truth answers
gt_df['text'] = gt_df['text'].apply(lambda x: x[:-1])
gt_df


Unnamed: 0_level_0,choices,num,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Interaction_T1_13,"{'0': 'The closet/cabinet.', '1': 'The blanket...",2,The clothes
Interaction_T1_14,"{'0': 'The blanket.', '1': 'The table.', '2': ...",2,The clothes
Interaction_T1_31,"{'0': 'The pillow.', '1': 'The bag.', '2': 'Th...",2,The clothes
Interaction_T1_32,"{'0': 'The food.', '1': 'The shoe.', '2': 'The...",1,The shoe
Interaction_T1_40,"{'0': 'The broom.', '1': 'The closet/cabinet.'...",1,The closet/cabinet
...,...,...,...
Feasibility_T6_1453,"{'0': 'Wash the table.', '1': 'Take the box.',...",0,Wash the table
Feasibility_T6_1454,"{'0': 'Take the towel.', '1': 'Throw the bag.'...",0,Take the towel
Feasibility_T6_1455,"{'0': 'Throw the bag.', '1': 'Wash the table.'...",1,Wash the table
Feasibility_T6_1456,"{'0': 'Hold the food.', '1': 'Open the closet/...",3,Wash the table


## Extracting the answers

Let's extract the the answers from the generated texts which do contain an answer:

In [259]:
ans_df = predictions_df.loc[contains_answer]['answer'].str.extract(ans_regex_pattern)
ans_df.rename(columns={0: 'text'}, inplace=True)

ans_df['text'] = ans_df['text'].str.strip()
ans_df


Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
Interaction_T1_13,The clothes.
Interaction_T1_14,The clothes.
Interaction_T1_32,The shoe.
Interaction_T1_40,The clothes.
Interaction_T1_43,None of the alternatives were thrown by the pe...
...,...
Feasibility_T6_1453,The person is wiping the table after taking th...
Feasibility_T6_1454,Take the towel.
Feasibility_T6_1455,None of the above
Feasibility_T6_1456,Hold the food.


In [None]:
# removing the full stop for answers that have it
ans_df['text'] = ans_df['text'].apply(lambda x: x[:-1] if x.endswith('.') else x)
ans_df


Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
Interaction_T1_13,The clothes
Interaction_T1_14,The clothes
Interaction_T1_32,The shoe
Interaction_T1_40,The clothes
Interaction_T1_43,None of the alternatives were thrown by the pe...
...,...
Feasibility_T6_1453,The person is wiping the table after taking th...
Feasibility_T6_1454,Take the towel
Feasibility_T6_1455,None of the above
Feasibility_T6_1456,Hold the food


In [261]:
freq = ans_df['text'].value_counts().reset_index()
freq.columns = ['word', 'freq']

print(f"We have {freq.size} unique answers")


We have 2570 unique answers


In [262]:
ans_df['text'].value_counts()


None of the above                                             936
Put down                                                      254
Opened                                                        184
The clothes                                                   166
Took                                                          161
                                                             ... 
The refrigerator was closed after the person took a cup         1
The question cannot be answered as it stands because there      1
The person tidied up their clothes after they opened the        1
The person put down a shoe after they sat on                    1
The person is wiping the table after taking the towel           1
Name: text, Length: 1285, dtype: int64

Let's do some visualization also for the ground truth data

In [263]:
print(f"We have {gt_df['text'].nunique()} unique answers")


We have 198 unique answers


In [264]:
# Inner join - keeps only indices present in both series

ans_df.rename(columns={'text': 'pred_text'}, inplace=True)
eval_df = gt_df.join(
    ans_df, 
    how='inner' # we account also for samples without answers
)

eval_df.shape


(6802, 4)

In [265]:
def accuracy(eval_df, on_what='text'):
    hits_text = (eval_df[f'pred_{on_what}'] == eval_df[on_what]).sum()
    

    return hits_text/eval_df.shape[0]


In [266]:
text_acc = accuracy(eval_df)

print(f"Text Accuracy: {text_acc:.2%}")


Text Accuracy: 29.43%


In [267]:
eval_df[eval_df['text'] == eval_df['pred_text']]


Unnamed: 0_level_0,choices,num,text,pred_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Interaction_T1_13,"{'0': 'The closet/cabinet.', '1': 'The blanket...",2,The clothes,The clothes
Interaction_T1_14,"{'0': 'The blanket.', '1': 'The table.', '2': ...",2,The clothes,The clothes
Interaction_T1_32,"{'0': 'The food.', '1': 'The shoe.', '2': 'The...",1,The shoe,The shoe
Interaction_T1_71,"{'0': 'The paper/notebook.', '1': 'The clothes...",0,The paper/notebook,The paper/notebook
Interaction_T1_100,"{'0': 'The table.', '1': 'The towel.', '2': 'T...",3,The blanket,The blanket
...,...,...,...,...
Feasibility_T6_1375,"{'0': 'Put down the clothes.', '1': 'Take the ...",2,Sit at the table,Sit at the table
Feasibility_T6_1412,"{'0': 'Put down the box.', '1': 'Hold the book...",1,Hold the book,Hold the book
Feasibility_T6_1414,"{'0': 'Throw the bag.', '1': 'Close the laptop...",3,Open the book,Open the book
Feasibility_T6_1428,"{'0': 'Close the refrigerator.', '1': 'Throw t...",3,Take the cup/glass/bottle,Take the cup/glass/bottle


check if the equality with NA result in a false or is skipped

In [268]:
real_acc = accuracy(eval_df) * eval_df.shape[0] / gt_df.shape[0]

print(f"Real Accuracy: {real_acc:.2%}")


Real Accuracy: 28.21%


In [269]:
def print_acc(eval_df, acc_fn):
    print(f"{'Question type':<15}{'Total':^15}{'Accuracy':^10}\n")

    total = eval_df.index.str.startswith('Interaction').sum()
    acc = acc_fn(eval_df[eval_df.index.str.startswith('Interaction')])
    print(f"{'Interaction':<15}{total:^15}{acc:^10.2%}")

    total = eval_df.index.str.startswith('Sequence').sum()
    acc = acc_fn(eval_df[eval_df.index.str.startswith('Sequence')])
    print(f"{'Sequence':<15}{total:^15}{acc:^10.2%}")

    total = eval_df.index.str.startswith('Prediction').sum()
    acc = acc_fn(eval_df[eval_df.index.str.startswith('Prediction')])
    print(f"{'Prediction':<15}{total:^15}{acc:^10.2%}")

    total = eval_df.index.str.startswith('Feasibility').sum()
    acc = acc_fn(eval_df[eval_df.index.str.startswith('Feasibility')])
    print(f"{'Feasibility':<15}{total:^15}{acc:^10.2%}")


In [270]:
print_acc(eval_df, lambda x: accuracy(x, on_what='text'))


Question type       Total      Accuracy 

Interaction         2310        25.80%  
Sequence            3409        31.68%  
Prediction           599        31.22%  
Feasibility          484        28.72%  


In [271]:
index_intersect = predictions_df[html_tags_mask].index.intersection(eval_df.index)
print_acc(eval_df.loc[index_intersect], accuracy)


Question type       Total      Accuracy 

Interaction         1179        44.61%  
Sequence            1792        47.88%  
Prediction           333        39.64%  
Feasibility          267        42.70%  


In [272]:
# % of answers conforming to template per category

def print_ans_perc(eval_df, gt_df):
    print(f"{'Question type':<15}{'Total':^15}{'Answered':^10}\n")

    total = gt_df.index.str.startswith('Interaction').sum()
    acc = len(gt_df.index.intersection(eval_df[eval_df.index.str.startswith('Interaction')].index))/total
    print(f"{'Interaction':<15}{total:^15}{acc:^10.2%}")

    total = gt_df.index.str.startswith('Sequence').sum()
    acc = len(gt_df.index.intersection(eval_df[eval_df.index.str.startswith('Sequence')].index))/total
    print(f"{'Sequence':<15}{total:^15}{acc:^10.2%}")

    total = gt_df.index.str.startswith('Prediction').sum()
    acc = len(gt_df.index.intersection(eval_df[eval_df.index.str.startswith('Prediction')].index))/total
    print(f"{'Prediction':<15}{total:^15}{acc:^10.2%}")

    total = gt_df.index.str.startswith('Feasibility').sum()
    acc = len(gt_df.index.intersection(eval_df[eval_df.index.str.startswith('Feasibility')].index))/total
    print(f"{'Feasibility':<15}{total:^15}{acc:^10.2%}")

    total = gt_df.shape[0]
    acc = eval_df.shape[0]/total
    print(f"{'Overall':<15}{total:^15}{acc:^10.2%}")

print_ans_perc(eval_df, gt_df)


Question type       Total      Answered 

Interaction         2398        96.33%  
Sequence            3586        95.06%  
Prediction           624        95.99%  
Feasibility          490        98.78%  
Overall             7098        95.83%  
