In [3]:
import ast
import importlib
import json
import os, sys
from pathlib import Path

import pandas as pd

# Base directory (parent of the current working directory) used to resolve
# the `src` package and the data paths below.
WORK_DIR = Path.cwd().parent

# Make `src` importable. The membership check keeps re-runs of this cell
# from stacking duplicate entries on sys.path.
if str(WORK_DIR) not in sys.path:
    sys.path.append(str(WORK_DIR))

from src import (
    main,
    datasets,
    frames_tools,
    ollama_manager,
    prompt_formatters as pf
)
from src.STAR_utils.visualization_tools import qa_visualization as qaviz



In [2]:
# STAR annotation files, one per split (plus a reduced validation file).
STAR_TRAIN = WORK_DIR.joinpath("data/datasets/STAR/STAR_annotations/STAR_train.json")
STAR_VAL = WORK_DIR.joinpath("data/datasets/STAR/STAR_annotations/STAR_val.json")
STAR_TEST = WORK_DIR.joinpath("data/datasets/STAR/STAR_annotations/STAR_test.json")
STAR_SMALL = WORK_DIR.joinpath("data/datasets/STAR/STAR_annotations/STAR_val_small_1000.json")

# Frame directory and the CSV of keyframe ids.
RAW_FRAMES_DIR = WORK_DIR.joinpath("data/datasets/action-genome/frames")
KEYFRAMES_INFO_PATH = WORK_DIR.joinpath("data/datasets/STAR/Video_Keyframe_IDs.csv")

# Video directory and the directory used for dumped videos.
RAW_VIDEO_DIR = WORK_DIR.joinpath('data/datasets/action-genome/Charades_v1_480/')
SAVE_VIDEO_DIR = WORK_DIR.joinpath("experiments/video_dump")


In [11]:
# Read the keyframe table from disk, then print a column/dtype summary.
with KEYFRAMES_INFO_PATH.open('r') as csv_handle:
    keyframes_info = pd.read_csv(csv_handle)

keyframes_info.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60206 entries, 0 to 60205
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   question_id   60206 non-null  object
 1   video_id      60206 non-null  object
 2   Keyframe_IDs  60206 non-null  object
dtypes: object(3)
memory usage: 1.4+ MB


In [12]:
def _parse_keyframe_ids(raw):
    """Return the keyframe ids of one row as a sorted list.

    Strings are parsed with ``ast.literal_eval`` (which only accepts Python
    literals, unlike ``eval``, so file contents can never execute code).
    Values that are not strings are assumed to be already parsed and are only
    sorted, which makes re-running this cell harmless.
    """
    ids = ast.literal_eval(raw) if isinstance(raw, str) else raw
    return sorted(ids)


# Replace the serialized id lists read from the CSV with sorted Python lists.
keyframes_info['Keyframe_IDs'] = keyframes_info['Keyframe_IDs'].apply(_parse_keyframe_ids)


In [44]:
# Build one key per row: the video id followed by its keyframe ids, joined by "_".
keyframes_info['kid_str'] = [
    "_".join([vid, *frame_ids])
    for vid, frame_ids in zip(keyframes_info['video_id'], keyframes_info['Keyframe_IDs'])
]


In [45]:
# Show the key built for the first row as a sanity check.
keyframes_info['kid_str'].iloc[0]


'YSKX3_000205_000209_000221_000223_000224_000235_000237_000239_000247_000252_000254_000258_000268_000269_000270'

In [47]:
# Number of distinct (video, keyframe set) keys in the whole keyframe table.
keyframes_info.loc[:, 'kid_str'].nunique()


21052

In [18]:
# Read the STAR validation annotations, then print a column/dtype summary.
with STAR_VAL.open('r') as json_handle:
    star_val_df = pd.read_json(json_handle)

star_val_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7098 entries, 0 to 7097
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   question_id       7098 non-null   object 
 1   question          7098 non-null   object 
 2   video_id          7098 non-null   object 
 3   start             7098 non-null   float64
 4   end               7098 non-null   float64
 5   answer            7098 non-null   object 
 6   question_program  7098 non-null   object 
 7   choices           7098 non-null   object 
 8   situations        7098 non-null   object 
dtypes: float64(2), object(7)
memory usage: 499.2+ KB


In [20]:
# Count rows whose (video, start, end) triple already appeared in an earlier row.
clip_columns = ['video_id', 'start', 'end']
repeats_earlier_clip = star_val_df.duplicated(subset=clip_columns)
repeats_earlier_clip.value_counts()


True     3725
False    3373
Name: count, dtype: int64

In [49]:
# Same key format as the keyframe table: video id + sorted keys of `situations`.
star_val_df['kid_str'] = [
    "_".join([vid, *sorted(situations.keys())])
    for vid, situations in zip(star_val_df['video_id'], star_val_df['situations'])
]

star_val_df['kid_str'].iloc[0]


'6H78U_000198_000202_000205_000206_000212_000217_000218_000221_000223_000236_000240_000242_000243_000245_000247_000257_000267_000269_000270_000272_000274_000275_000286_000289_000294_000301_000303_000308_000313_000314_000316_000318_000320_000322_000331_000332_000336_000337_000339'

In [50]:
# Number of distinct (video, keyframe set) keys among the validation questions.
star_val_df.loc[:, 'kid_str'].nunique()


2881

In [62]:
def _count_distinct(values):
    """Return how many distinct entries ``values`` contains."""
    return len(set(values))


# For each (video, start, end) clip, count its distinct keyframe keys, then
# tabulate how many clips have 1, 2, ... distinct keys.
kid_strs_by_clip = star_val_df.groupby(by=['video_id', 'start', 'end'])['kid_str']
distinct_keys_per_clip = kid_strs_by_clip.agg(_count_distinct)
distinct_keys_per_clip.value_counts()


kid_str
1    3248
2     116
3       9
Name: count, dtype: int64

In [69]:
def _count_distinct_intervals(group):
    """Return the number of distinct rows in ``group``."""
    return len(group.drop_duplicates())


# For each keyframe key, count its distinct (start, end) intervals, then
# tabulate how many keys have 1, 2, ... intervals.
intervals_by_kid_str = star_val_df.groupby(by=['kid_str'])[['start', 'end']]
intervals_by_kid_str.apply(_count_distinct_intervals).value_counts()


1    2438
2     314
3      92
4      25
5       8
6       3
7       1
Name: count, dtype: int64

In [51]:
# Restrict the keyframe table to question ids present in the validation set
# and print a summary of that subset.
in_val_split = keyframes_info['question_id'].isin(star_val_df['question_id'])
keyframes_info.loc[in_val_split].info()


<class 'pandas.core.frame.DataFrame'>
Index: 7098 entries, 12 to 60202
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   question_id   7098 non-null   object
 1   video_id      7098 non-null   object
 2   Keyframe_IDs  7098 non-null   object
 3   kid_str       7098 non-null   object
dtypes: object(4)
memory usage: 277.3+ KB


In [52]:
# Distinct keyframe keys among the rows whose question id is in the validation set.
val_question_mask = keyframes_info['question_id'].isin(star_val_df['question_id'])
keyframes_info.loc[val_question_mask, 'kid_str'].nunique()


3101

----
# Scene graph generation

In [71]:
# File whose contents are passed to the model as the system prompt.
sys_file_path = WORK_DIR.joinpath("data/prompts/graph-gen/system_prompt.txt")
sys_prompt = main._load_prompt_fromfile(sys_file_path)

# Generation options are spelled out inline and nested under "options".
generation_options = {
    "num_ctx": 10240,
    "temperature": 0.1,
    "num_predict": 1024,
    "seed": 42,
}
ollama_params = {
    "model": "gemma3:4b-it-qat",
    "system": sys_prompt,
    "stream": True,
    "options": generation_options,
}

# The server address is taken from OLLAMA_URL, with a fixed fallback.
url = os.environ.get("OLLAMA_URL", "http://lusha_ollama:11435")
client = ollama_manager.OllamaRequestManager(url, ollama_params)


In [72]:
# pre-load the model
# Warm-up request: sent once up front to pre-load the model.
warmup_prompt = "hello"
client.generate_completion(warmup_prompt)


Hello! I’m ready to analyze an image and generate a scene graph. Please provide me with the image. I’ll do my best to provide a detailed and structured analysis. 😊 

Just let me know how you’d like to share the image – you can describe it in detail, or if possible, provide a link to it.

Response at: 51.7 tk/s


'Hello! I’m ready to analyze an image and generate a scene graph. Please provide me with the image. I’ll do my best to provide a detailed and structured analysis. 😊 \n\nJust let me know how you’d like to share the image – you can describe it in detail, or if possible, provide a link to it.'

In [15]:
prompt1 = """\
What's the image describing??

Think step by step.
"""

# NOTE(review): `video_encodings` is not defined in any cell visible here —
# confirm which cell creates it before relying on Restart & Run All.
first_video = video_encodings[0]
img = first_video['frames'][0]['encoding']

# One user message carrying the prompt and only the first frame of the first video.
msg1 = {"role": "user", "content": prompt1, "images": [img]}

# The payload names its own model and context size for this request.
single_frame_payload = {
    "model": "gemma3:12b",
    "messages": [msg1],
    "options": {"num_ctx": 10_000},
}
resp = client.ollama_completion_request(endpoint='chat', payload=single_frame_payload)


Okay, let's break down what's happening in this image step-by-step:

1. **Subject:** The main focus is a person, likely a young boy.
2. **Action:** The boy is actively writing or drawing on a paper that's in front of him. He's holding a writing utensil (pen/pencil) and appears focused.
3. **Position:** He's sitting directly on the floor, rather than on a chair.
4. **Setting:** The background is a simple, plain wall. There are some items on a table beside him, including what looks like a bag.

**Overall Description:**

The image shows a young boy sitting on the floor and writing or drawing on a piece of paper, likely studying or doing homework. The setting seems fairly simple and possibly quite humble.

Response at: 24.6 tk/s


In [18]:
# NOTE(review): `video_encodings` is not defined in any cell visible here —
# confirm which cell creates it before relying on Restart & Run All.
first_video_frames = video_encodings[0]['frames']
images = [entry['encoding'] for entry in first_video_frames]

# Same prompt as the single-frame request, now with every frame attached.
msg1 = {"role": "user", "content": prompt1, "images": images}

# The payload names its own model and context size for this request.
multi_frame_payload = {
    "model": "gemma3:12b-it-qat",
    "messages": [msg1],
    "options": {"num_ctx": 10_000},
}
resp = client.ollama_completion_request(endpoint='chat', payload=multi_frame_payload)


Okay, let's analyze the image step by step:

1.  **Subject:** The central figure is a young person, likely a boy or teenager.
2.  **Activity:** The person is sitting on the floor, leaning on a small table and writing on a piece of paper. They seem focused on their task.
3.  **Setting:** The setting appears to be a simple, perhaps makeshift, workspace. There's a small table, and a pile of clothes or bags behind them, suggesting a basic or temporary setup.
4.  **Overall impression:** The image conveys a sense of diligence or studying, despite the less-than-ideal workspace conditions.

**Therefore, the image describes a person (likely a student) studying or writing in a simple, perhaps resource-constrained, environment.**

Response at: 21.5 tk/s


In [77]:
# Take the question/video ids of the first five validation rows as a list of records.
id_columns = ['question_id', 'video_id']
dataset = star_val_df[id_columns].head(5).to_dict(orient='records')
dataset


[{'question_id': 'Interaction_T1_13', 'video_id': '6H78U'},
 {'question_id': 'Interaction_T1_14', 'video_id': '6H78U'},
 {'question_id': 'Interaction_T1_31', 'video_id': 'RNLTR'},
 {'question_id': 'Interaction_T1_32', 'video_id': 'RNLTR'},
 {'question_id': 'Interaction_T1_40', 'video_id': 'VNQTH'}]

In [80]:
# NOTE(review): this rebinds `keyframes_info`, which an earlier cell assigned
# from pd.read_csv — confirm that both uses expect the same kind of object.
keyframes_info = frames_tools.extract_video_keyframes_info(
    KEYFRAMES_INFO_PATH,
)

# Combine the sampled question records with the keyframe info.
videos_info = frames_tools.preprocess_videos_metadata(
    dataset,
    keyframes_info,
    filter=False,
)


In [93]:
# Re-import frames_tools so edits made to the module take effect in this kernel.
# `reload` is not a builtin in Python 3, so it is called through importlib.
importlib.reload(frames_tools)


<module 'src.frames_tools' from '/home/lusha/star_code/src/frames_tools.py'>

In [94]:
# Load the user prompt and the format-instruction text from the prompt files.
usr_prompt = main._load_prompt_fromfile(
    WORK_DIR.joinpath("data/prompts/graph-gen/usr_prompt.txt")
)
reply = main._load_prompt_fromfile(
    WORK_DIR.joinpath("data/prompts/graph-gen/format_instructions.txt")
)

# Run frame generation; the output path is relative to the current working directory.
generation_kwargs = {
    "videos_info": videos_info,
    "usr_prompt": usr_prompt,
    "reply": reply,
    "max_samples": 5,
}
frames_tools.streaming_frame_generation(
    client,
    RAW_FRAMES_DIR,
    "outputs/out_file.jsonl",
    **generation_kwargs,
)


File outputs/out_file.jsonl already exists!
Responses will be saved to: outputs/out_file_copy3.jsonl
Errors will be logged to: outputs/errors_20250623_21:09:16.txt

Video: 6H78U
 - interval: None-None
 - 5 frames.

Generating response for iteration 1 - id: 6H78U
Okay, let's break down the image and analyze the objects and their relationships.

**List of Distinct Objects:**

1.  **Person:** A young man is the primary subject.
2.  **Chair:** A large, ornate, upholstered armchair. It appears to be a dark brown or burgundy color.
3.  **Floor:** A tiled floor, likely ceramic or porcelain, with a neutral color (possibly beige or light gray).
4.  **Wall:** A plain, white wall.
5.  **Light Fixture:** A single, recessed light fixture is visible on the wall.
6.  **Tile:** Individual tiles that make up the floor.
7.  **Shadow:** A shadow cast by the person and the chair.

**Description of Relationships:**

The scene depicts a young man standing in front of a large armchair. Here’s a breakdown of 