In [1]:
import os, sys
from pathlib import Path
import json

import importlib

import pandas as pd

# Project root, taken to be the parent of the current working directory.
# NOTE(review): this only resolves correctly when the kernel is started from
# a direct sub-directory of the repo (e.g. `notebooks/`) — confirm.
WORK_DIR = Path.cwd().parent

# Expose the root so the `src` package below can be imported. The membership
# check keeps the cell idempotent: re-running it no longer stacks duplicate
# entries onto sys.path.
if str(WORK_DIR) not in sys.path:
    sys.path.append(str(WORK_DIR))

from src import (
    ollama_manager, main, datasets,
    prompt_formatters as pf
)

from src.STAR_utils.visualization_tools import qa_visualization as qaviz



In [2]:
# QA annotations of the STAR validation split.
STAR_val_filepath = WORK_DIR.joinpath("data/datasets/STAR/STAR_annotations/STAR_val.json")
# STSG file located under notebooks/outputs.
STSG_filepath = WORK_DIR.joinpath("notebooks/outputs/out_file.jsonl")
# Second STSG source, stored alongside the datasets.
STSG_filepath2 = WORK_DIR.joinpath("data/datasets/STAR_QA_and_stsg_val.json")



In [3]:
# Prompt template file used to format each multiple-choice question.
format_prompt_filepath = WORK_DIR.joinpath(
    "data/prompts/zero-shot-cot/MCQ_user_prompt_ZS_CoT_v2.txt"
)

# Wrap whatever the loader returns for that file in the MCQ prompt formatter.
pformatter = pf.MCQPrompt(
    main._load_prompt_fromfile(format_prompt_filepath)
)

# Positional arguments: QA annotation file, prompt formatter, STSG file.
star_dataset = datasets.STARDataset(STAR_val_filepath, pformatter, STSG_filepath)



Dataset Statistics:
QA File: STAR_val.json
Number of QA samples: 1
QA sample keys: question_id, question, video_id, start, end, answer, question_program, choices, situations, stsg

STSG File: out_file.jsonl
Number of unique video IDs with STSG: 1



In [4]:
star_dataset[0]


{'question_id': 'Interaction_T1_13',
 'question': 'Which object was tidied up by the person?',
 'video_id': '6H78U',
 'start': 11.1,
 'end': 19.6,
 'answer': 'The clothes.',
 'question_program': [{'function': 'Situations', 'value_input': []},
  {'function': 'Actions', 'value_input': []},
  {'function': 'Filter_Actions_with_Verb', 'value_input': ['tidy']},
  {'function': 'Unique', 'value_input': []},
  {'function': 'Query_Objs', 'value_input': []}],
 'choices': {'0': 'The closet/cabinet.',
  '1': 'The blanket.',
  '2': 'The clothes.',
  '3': 'The table.'},
 'situations': {'000206': {'rel_pairs': [['o000', 'o027'],
    ['o000', 'o019'],
    ['o000', 'o004'],
    ['o000', 'o004']],
   'rel_labels': ['r002', 'r002', 'r002', 'r003'],
   'actions': ['a004', 'a002', 'a056'],
   'bbox': [[167.51, 317.98, 226.33, 331.49],
    [140.16, 268.79, 211.72, 323.48],
    [162.93, 298.17, 228.31, 331.25]],
   'bbox_labels': ['o027', 'o019', 'o004']},
  '000212': {'rel_pairs': [['o000', 'o027'],
    ['o0

In [6]:
# Rebuild the formatter and point the dataset at the second STSG file.
pformatter = pf.MCQPrompt(
    main._load_prompt_fromfile(format_prompt_filepath)
)
star_dataset = datasets.STARDataset(STAR_val_filepath, pformatter, STSG_filepath2)



Dataset Statistics:
QA File: STAR_val.json
Number of QA samples: 7098
QA sample keys: question_id, question, video_id, start, end, answer, question_program, choices, situations, stsg

STSG File: STAR_QA_and_stsg_val.json
Number of unique video IDs with STSG: 7098



In [9]:
# Walk the whole dataset through the iterator protocol, then show the first sample.
list(star_dataset)[0]


{'question_id': 'Interaction_T1_13',
 'question': 'Which object was tidied up by the person?',
 'video_id': '6H78U',
 'start': 11.1,
 'end': 19.6,
 'answer': 'The clothes.',
 'question_program': [{'function': 'Situations', 'value_input': []},
  {'function': 'Actions', 'value_input': []},
  {'function': 'Filter_Actions_with_Verb', 'value_input': ['tidy']},
  {'function': 'Unique', 'value_input': []},
  {'function': 'Query_Objs', 'value_input': []}],
 'choices': {'0': 'The closet/cabinet.',
  '1': 'The blanket.',
  '2': 'The clothes.',
  '3': 'The table.'},
 'situations': {'000206': {'rel_pairs': [['o000', 'o027'],
    ['o000', 'o019'],
    ['o000', 'o004'],
    ['o000', 'o004']],
   'rel_labels': ['r002', 'r002', 'r002', 'r003'],
   'actions': ['a004', 'a002', 'a056'],
   'bbox': [[167.51, 317.98, 226.33, 331.49],
    [140.16, 268.79, 211.72, 323.48],
    [162.93, 298.17, 228.31, 331.25]],
   'bbox_labels': ['o027', 'o019', 'o004']},
  '000212': {'rel_pairs': [['o000', 'o027'],
    ['o0

In [10]:
# Fetch the sample once and reuse it for both prints instead of indexing
# the dataset twice for the same item.
first_sample = star_dataset[0]
print(f"question_id: {first_sample['qid']}")
print(first_sample['prompt'])


question_id: Interaction_T1_13
Please carefully read the following Spatio-Temporal Scene Graph delimited by the <STSG> tags and provide an answer to the question below:

<STSG>
[['person - on_the_side_of - clothes'], ['person - in_front_of - clothes'], ['person - in_front_of - clothes', 'person - in_front_of - blanket'], ['person - in_front_of - towel', 'person - in_front_of - clothes', 'person - in_front_of - blanket', 'person - on_the_side_of - blanket'], ['person - on_the_side_of - towel', 'person - in_front_of - towel', 'person - in_front_of - clothes', 'person - on_the_side_of - blanket'], ['person - in_front_of - towel', 'person - on_the_side_of - clothes', 'person - in_front_of - blanket', 'person - on_the_side_of - blanket'], ['person - in_front_of - clothes', 'person - in_front_of - blanket', 'person - on_the_side_of - blanket'], ['person - in_front_of - towel', 'person - in_front_of - clothes', 'person - in_front_of - blanket'], ['person - in_front_of - towel', 'person - in_f

In [11]:
star_dataset[0].keys()


dict_keys(['question_id', 'question', 'video_id', 'start', 'end', 'answer', 'question_program', 'choices', 'situations', 'stsg', 'qid', 'prompt'])

In [13]:
star_dataset[0]['answer']


'The clothes.'

In [14]:
# Load the raw QA annotations straight from disk. Reuses STAR_val_filepath
# rather than spelling out the same path a second time, and pins the encoding
# so that decoding does not depend on the platform locale.
with open(STAR_val_filepath, encoding="utf-8") as in_file:
    star_data = json.load(in_file)

print(len(star_data))


7098


In [15]:
star_data[0]['question_id']


'Interaction_T1_13'

In [16]:
qaviz.Vis_SituationGraph(star_data[0], 1_000)


0 Frame ID: 000198
Subgraph:
	 Actions:
		 tidy some clothes ,take some clothes from somewhere
	 Relationships:
		 person  ----  on_the_side_of  ----  clothes


1 Frame ID: 000202
Subgraph:
	 Actions:
		 tidy some clothes ,take some clothes from somewhere
	 Relationships:
		 person  ----  in_front_of  ----  clothes


2 Frame ID: 000205
Subgraph:
	 Actions:
		 tidy some clothes ,take some clothes from somewhere ,put a blanket somewhere
	 Relationships:
		 person  ----  in_front_of  ----  clothes
		 person  ----  in_front_of  ----  blanket


3 Frame ID: 000206
Subgraph:
	 Actions:
		 tidy some clothes ,take some clothes from somewhere ,put a blanket somewhere
	 Relationships:
		 person  ----  in_front_of  ----  towel
		 person  ----  in_front_of  ----  clothes
		 person  ----  in_front_of  ----  blanket
		 person  ----  on_the_side_of  ----  blanket


4 Frame ID: 000212
Subgraph:
	 Actions:
		 tidy some clothes ,take some clothes from somewhere ,put a blanket somewhere
	 Relationships:
	

In [17]:
# Third STSG source: a .jsonl file under outputs/.
# NOTE(review): the ':' characters in the file name are not valid on Windows file systems.
STSG_filepath3 = WORK_DIR.joinpath("outputs/gen_stsg_gemma3:12b-it-qat_20250522_14:30_99.jsonl")


In [18]:
# Parse the JSON-Lines file into a list with one parsed object per line.
# The file object is iterated directly (no intermediate readlines() list),
# blank lines (e.g. an empty last line) are skipped instead of crashing
# json.loads, and the encoding is pinned to UTF-8.
with open(STSG_filepath3, 'r', encoding='utf-8') as f:
    stsgs = [json.loads(line) for line in f if line.strip()]


In [19]:
stsgs[0]


{'video_id': '6H78U',
 'start': 15.6,
 'end': 22.7,
 'chat_history': [{'role': 'user',
   'content': 'Look carefully at this image and identify all objects and relationships present.\n\nFirst, list all distinct objects you can detect in the image. Be thorough and specific with your object labels (e.g., "young woman" rather than just "person", "wooden chair" rather than just "chair").\n\nThen, describe the key relationships between these objects in free-form text. Consider:\n- Spatial relationships (above, below, behind, inside, etc.)\n- Action-based relationships (holding, looking at, sitting on, etc.)\n- Physical connections (attached to, part of, touching, etc.)\n- Relative positions (next to, between, surrounding, etc.)\n\nThink step by step'},
  {'content': "Okay, let's break down this image.\n\n**1. Distinct Objects:**\n\n*   **Man:** A male individual, likely adult.\n*   **Spray Bottle:** A plastic spray bottle, possibly containing cleaning solution.\n*   **Sofa/Couch:** A fabric

In [20]:
stsgs_df = pd.DataFrame(stsgs)
stsgs_df.shape


(1094, 5)

In [21]:
stsgs_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   video_id      1094 non-null   object 
 1   start         1094 non-null   float64
 2   end           1094 non-null   float64
 3   chat_history  1094 non-null   object 
 4   stsg          1094 non-null   object 
dtypes: float64(2), object(3)
memory usage: 42.9+ KB


In [22]:
stsgs_df['video_id'] = stsgs_df['video_id'].astype('string')


In [23]:
stsgs_df[['video_id', 'start', 'end']].drop_duplicates().shape


(870, 3)

In [24]:
# Materialise every dataset sample and tabulate them, one row per sample.
star_df = pd.DataFrame(list(star_dataset))
star_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7098 entries, 0 to 7097
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   question_id       7098 non-null   object 
 1   question          7098 non-null   object 
 2   video_id          7098 non-null   object 
 3   start             7098 non-null   float64
 4   end               7098 non-null   float64
 5   answer            7098 non-null   object 
 6   question_program  7098 non-null   object 
 7   choices           7098 non-null   object 
 8   situations        7098 non-null   object 
 9   stsg              7098 non-null   object 
 10  qid               7098 non-null   object 
 11  prompt            7098 non-null   object 
dtypes: float64(2), object(10)
memory usage: 665.6+ KB


In [25]:
star_df['choices']


0       {'0': 'The closet/cabinet.', '1': 'The blanket...
1       {'0': 'The blanket.', '1': 'The table.', '2': ...
2       {'0': 'The pillow.', '1': 'The bag.', '2': 'Th...
3       {'0': 'The food.', '1': 'The shoe.', '2': 'The...
4       {'0': 'The broom.', '1': 'The closet/cabinet.'...
                              ...                        
7093    {'0': 'Wash the table.', '1': 'Take the box.',...
7094    {'0': 'Take the towel.', '1': 'Throw the bag.'...
7095    {'0': 'Throw the bag.', '1': 'Wash the table.'...
7096    {'0': 'Hold the food.', '1': 'Open the closet/...
7097    {'0': 'Wash the window.', '1': 'Hold the food....
Name: choices, Length: 7098, dtype: object

In [26]:
star_df['choices'].apply(lambda x: x['0']).value_counts()


choices
Put down.                   285
Took.                       253
The book.                   216
Threw.                      190
The clothes.                185
                           ... 
Throw the door.               1
Hold the mirror.              1
Wash the paper/notebook.      1
Hold the sandwich.            1
Hold the picture.             1
Name: count, Length: 232, dtype: int64

In [27]:
# Use the same nullable 'string' dtype that stsgs_df['video_id'] is cast to,
# so both merge keys share one dtype.
star_df['video_id'] = star_df['video_id'].astype('string')


In [28]:
star_df[['video_id', 'start', 'end']].drop_duplicates().shape


(3373, 3)

In [29]:
# Right join: keep every STSG row and attach the QA samples that share the
# same (video_id, start, end) key.
# NOTE(review): the outputs in this notebook show 1094 STSG rows but only 870
# distinct keys, so duplicate keys on the right multiply rows — confirm this
# is intended.
merged = pd.merge(
    star_df,
    stsgs_df,
    how='right',
    on=['video_id', 'start', 'end'],
)

merged.shape


(2255, 14)

In [30]:
merged[['video_id', 'start', 'end']].drop_duplicates().shape


(870, 3)

In [31]:
star_dataset[0]['choices']


{'0': 'The closet/cabinet.',
 '1': 'The blanket.',
 '2': 'The clothes.',
 '3': 'The table.'}

In [32]:
# Re-read the raw annotation JSON. The encoding is pinned so decoding does
# not depend on the platform's locale default.
with open(STAR_val_filepath, 'r', encoding='utf-8') as f:
    test_d = json.load(f)


In [33]:
len([i['choices'][0] for i in test_d])


7098

In [34]:
# Re-import the datasets module (only that module is reloaded here), then
# rebuild the formatter and the dataset against the third STSG file.
importlib.reload(datasets)

pformatter = pf.MCQPrompt(
    main._load_prompt_fromfile(format_prompt_filepath)
)
star_dataset = datasets.STARDataset(STAR_val_filepath, pformatter, STSG_filepath3)



Dataset Statistics:
QA File: STAR_val.json
Number of QA samples: 1694
QA sample keys: question_id, question, video_id, start, end, answer, question_program, choices, situations, stsg

STSG File: gen_stsg_gemma3:12b-it-qat_20250522_14:30_99.jsonl
Number of unique video IDs with STSG: 799



In [35]:
# Index every position explicitly, then display the first sample.
[star_dataset[idx] for idx in range(len(star_dataset))][0]


{'question_id': 'Interaction_T1_14',
 'question': 'Which object was tidied up by the person?',
 'video_id': '6H78U',
 'start': 15.6,
 'end': 22.7,
 'answer': 'The clothes.',
 'question_program': [{'function': 'Situations', 'value_input': []},
  {'function': 'Actions', 'value_input': []},
  {'function': 'Filter_Actions_with_Verb', 'value_input': ['tidy']},
  {'function': 'Unique', 'value_input': []},
  {'function': 'Query_Objs', 'value_input': []}],
 'choices': {'0': 'The blanket.',
  '1': 'The table.',
  '2': 'The clothes.',
  '3': 'The closet/cabinet.'},
 'situations': {'000289': {'rel_pairs': [['o000', 'o027'],
    ['o000', 'o027'],
    ['o000', 'o019'],
    ['o000', 'o019'],
    ['o000', 'o004'],
    ['o000', 'o004']],
   'rel_labels': ['r009', 'r002', 'r009', 'r002', 'r002', 'r003'],
   'actions': ['a004'],
   'bbox': [[188.53, 150.11, 223.07, 234.65],
    [186.43, 150.9, 224.52, 233.43],
    [192.04, 298.41, 207.44, 327.61],
    [182.59, 46.6, 318.39, 475.07]],
   'bbox_labels': [

In [37]:
star_df.iloc[324]['video_id']


'BN4VH'

In [40]:
sorted(star_df.iloc[324]['situations'].keys())


['000444', '000471', '000481', '000499', '000517', '000526', '000553']

In [36]:
star_df[star_df['video_id'] == '001YG']


Unnamed: 0,question_id,question,video_id,start,end,answer,question_program,choices,situations,stsg,qid,prompt
