In [1]:
import json
import pandas as pd

In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import io
import os
import random
import re

# --- Configuration ---

# Placeholder paths - Replace with actual file paths
VIDEO_EXPRESSO_DATA_PATH = '/home/jamesdin/MMML/STAR/videoespresso_train_video.json' # Path to the text file containing the Video Expresso markdown table
STAR_ACTION_CLASSES_PATH = '/data/user_data/jamesdin/STAR/data/classes/action_classes.txt'
STAR_OBJECT_CLASSES_PATH = '/data/user_data/jamesdin/STAR/data/classes/object_classes.txt'
STAR_VERB_CLASSES_PATH = '/data/user_data/jamesdin/STAR/data/classes/verb_classes.txt'
STAR_RELATIONSHIP_CLASSES_PATH = '/data/user_data/jamesdin/STAR/data/classes/relationship_classes.txt' # Optional, for more complex sentences

OUTPUT_CSV_PATH = 'selected_star_similar_examples.csv' # Path to save the results

NUM_STAR_SAMPLES_FOR_TARGET = 100


# Model Selection:
# 'all-mpnet-base-v2' -> High quality, slower
# 'all-MiniLM-L6-v2'  -> Good quality, much faster, smaller
MODEL_NAME = 'all-mpnet-base-v2'

# Number of examples to select
N_SELECT = 3 # Using 3 for demonstration, change as needed

# Number of synthetic STAR sentences to generate for the target representation
NUM_TARGET_SENTENCES = 100

# Device for computation ('cuda' for GPU, 'cpu' for CPU)
DEVICE = 'cuda' # Change to 'cpu' if no GPU is available

# --- Helper Functions ---




[2025-04-22 21:29:10,169] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  def forward(ctx, input, weight, bias=None):
  def backward(ctx, grad_output):


In [6]:
import pandas as pd
import numpy as np

def load_class_list(filepath):
    """Loads class descriptions from a STAR dataset file."""
    classes = []
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                # Extract text after the class ID (e.g., "a000 hold some clothes" -> "hold some clothes")
                match = re.match(r'^[a-z]\d{3}\s+(.*)', line.strip())
                if match:
                    classes.append(match.group(1))
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}. Please provide the correct path.")
        return None
    except Exception as e:
        print(f"Error reading file {filepath}: {e}")
        return None
    return classes

In [46]:
def load_class_map(filepath):
    """Loads STAR class IDs and descriptions into a dictionary."""
    class_map = {}
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split(maxsplit=1)
                if len(parts) == 2:
                    # Use class ID as key, description as value
                    class_map[parts[0]] = parts[1]
    except FileNotFoundError:
        print(f"Warning: Class mapping file not found at {filepath}. Descriptions will be missing.")
    except Exception as e:
        print(f"Warning: Error reading class mapping file {filepath}: {e}")
    return class_map
    
def format_star_choices(choices_str):
    """Formats the 'choices' string into a readable sentence."""
    try:
        choices_list = ast.literal_eval(choices_str)
        if isinstance(choices_list, list):
            choice_texts = [item.get('choice', '') for item in choices_list if isinstance(item, dict)]
            return "Choices are: " + "; ".join(filter(None, choice_texts)) + "."
        return "Choices could not be parsed."
    except (ValueError, SyntaxError, TypeError):
        # Handle cases where the string is not a valid list literal or structure is unexpected
        # print(f"Warning: Could not parse choices string: {choices_str}")
        return "Choices format unclear."


def format_star_situations(situations_str, action_map, object_map, relation_map):
    """Formats the 'situations' string into a readable paragraph."""
    situation_sentences = []
    try:
        situations_dict = ast.literal_eval(situations_str)
        if not isinstance(situations_dict, dict):
            return "Situations format unclear."

        # Aggregate information across all frames for simplicity, or process frame by frame
        all_actions = set()
        all_relations = []

        for frame_id, frame_data in situations_dict.items():
            if isinstance(frame_data, dict):
                # Collect unique actions
                for action_id in frame_data.get('actions',):
                    all_actions.add(action_id)

                # Collect unique relations
                rel_pairs = frame_data.get('rel_pairs',)
                rel_labels = frame_data.get('rel_labels',)
                if len(rel_pairs) == len(rel_labels):
                    for pair, label in zip(rel_pairs, rel_labels):
                        if isinstance(pair, list) and len(pair) == 2:
                            subj_id, obj_id = pair
                            relation_tuple = (subj_id, label, obj_id)
                            if relation_tuple not in all_relations:
                                 all_relations.append(relation_tuple)

        # Generate sentences for actions
        person_text = object_map.get('o000', 'person') # Default subject
        for action_id in all_actions:
            action_text = action_map.get(action_id, action_id) # Use ID if mapping fails
            situation_sentences.append(f"{person_text.capitalize()} {action_text}.")

        # Generate sentences for relations
        for subj_id, rel_id, obj_id in all_relations:
             subj_text = object_map.get(subj_id, subj_id)
             rel_text = relation_map.get(rel_id, rel_id)
             obj_text = object_map.get(obj_id, obj_id)
             # Handle cases like 'person on person' which might be less informative
             if subj_text!= obj_text or subj_id!= 'o000': # Avoid self-relations unless specific
                 situation_sentences.append(f"{subj_text.capitalize()} {rel_text} {obj_text}.")

        if not situation_sentences:
            return "No specific situations described."

        # Remove duplicates while preserving order (important for readability)
        seen = set()
        unique_sentences = [s for s in situation_sentences if not (s in seen or seen.add(s))]
        return "Situation: " + " ".join(unique_sentences)

    except (ValueError, SyntaxError, TypeError):
        # Handle cases where the string is not a valid dict literal or structure is unexpected
        # print(f"Warning: Could not parse situations string: {situations_str}")
        return "Situations format unclear."


def format_star_example(star_row, action_map, object_map, relation_map):
    """Combines question, answer, choices, and situations into one text block."""
    question = star_row.get('question', '')
    answer = star_row.get('answer', '')
    choices_str = star_row.get('choices', '')
    situations_str = str(star_row.get('situations', '{}'))

    formatted_choices = format_star_choices(choices_str)
    formatted_situations = format_star_situations(situations_str, action_map, object_map, relation_map)

    # Combine all parts
    full_text = f"Question: {question}\nAnswer: {answer}\n{formatted_choices}\n{formatted_situations}"
    return full_text.strip()



In [47]:


def synthesize_star_sentences(actions, objects, verbs, num_sentences=100):
    """Generates simple sentences representing STAR interactions."""
    sentences = []
    if not actions and (not verbs or not objects):
        print("Warning: Not enough class information to synthesize sentences.")
        return

    # Strategy 1: Use full action descriptions if available
    if actions:
        sentences.extend([f"A person {action}." for action in actions])

    # Strategy 2: Combine verbs and objects
    if verbs and objects:
        for _ in range(num_sentences):
            verb = random.choice(verbs)
            obj = random.choice(objects)
            # Basic sentence structure
            sentence = f"A person {verb} {obj}."
            # Avoid adding exact duplicates immediately, though some overlap is fine
            if sentence not in sentences:
                 sentences.append(sentence)

    # Ensure we have roughly the desired number, prioritizing unique combinations
    if len(sentences) > num_sentences:
        return random.sample(sentences, num_sentences)
    elif len(sentences) < num_sentences and verbs and objects:
         # Add more verb-object combinations if needed and possible
         needed = num_sentences - len(sentences)
         for _ in range(needed * 2): # Generate more to increase chance of new ones
             verb = random.choice(verbs)
             obj = random.choice(objects)
             sentence = f"A person {verb} {obj}."
             if sentence not in sentences:
                 sentences.append(sentence)
             if len(sentences) >= num_sentences:
                 break

    # Final shuffle and trim if slightly over
    random.shuffle(sentences)
    return sentences[:num_sentences]



In [48]:
# --- Main Execution ---

# 1. Load Video Expresso Data
print(f"Loading Video Expresso data from: {VIDEO_EXPRESSO_DATA_PATH}")

with open(VIDEO_EXPRESSO_DATA_PATH, "r") as f:
    data = json.load(f)
    df_expresso = pd.DataFrame(data)

print(f"Loaded {len(df_expresso)} examples from Video Expresso data.")

df_expresso['combined_text'] = df_expresso['question'] + "\nAnswer: " + df_expresso['answer']
video_expresso_texts = df_expresso['combined_text'].tolist()
print(f"Loaded and processed {len(df_expresso)} examples from Video Expresso data.")

df_expresso.head()

Loading Video Expresso data from: /home/jamesdin/MMML/STAR/videoespresso_train_video.json
Loaded 200766 examples from Video Expresso data.
Loaded and processed 200766 examples from Video Expresso data.


Unnamed: 0,question,answer,key_item,evidence,task,video_path,keyframes_path,combined_text
0,Compare and contrast the subjects and the focu...,"The first frame, at 203 seconds, focuses on a ...","[[bear], [flower, branch]]",The first frame highlights a <obj_start>bear i...,Narrative Analysis,Moviechat/videos/1/AWG-5.mp4,"[Moviechat/videos/1_image/AWG-5/6100.jpg, Movi...",Compare and contrast the subjects and the focu...
1,Connect George's daydream and subsequent slipp...,"George's daydream about floating bananas, duri...","[[joy, banana], [dog, banana peel], [puddle, m...",George's daydream about a <obj_start>banana in...,Event Dynamic Analysis,Storystream/George/000277/output_videos/000277...,[Storystream/George/000277/000277_keyframe_0-2...,Connect George's daydream and subsequent slipp...
2,How did George transition from his involvement...,George seems to transition from a contemplativ...,"[[easel], [desert, rock formations]]",George's transition from the basin near the <o...,Event Dynamic Analysis,Storystream/George/000140/output_videos/000140...,[Storystream/George/000140/000140_keyframe_0-1...,How did George transition from his involvement...
3,"Using the provided times, describe the steps i...",To prepare the mixture for Savoury Potato Panc...,"[[mashed potatoes, bowl], [butter], [bowl, milk]]",The mixture for Savoury Potato Pancakes starts...,Preparation Steps / Ingredient Analysis,Youcook2/merged/2/N35UyfIwhVI.mp4,"[Youcook2/merged/2_image/N35UyfIwhVI/525.jpg, ...","Using the provided times, describe the steps i..."
4,How do the two nocturnal foraging behaviors of...,The raccoons are depicted foraging on the grou...,"[[environment, raccoons], [echolocation, bats]]",The <obj_start>raccoons in frame 1<obj_end><bo...,Causal Analysis,Moviechat/videos/4/BWB-4.mp4,"[Moviechat/videos/4_image/BWB-4/9850.jpg, Movi...",How do the two nocturnal foraging behaviors of...


In [50]:


# 2. Load STAR Data and Class Mappings
print("Loading STAR dataset and class mappings...")

import pickle
# Load the .pkl file
with open('/data/user_data/jamesdin/STAR/data/STAR_val.pkl', 'rb') as f:
    data = pickle.load(f)
# Convert to DataFrame and set index
df_star = pd.DataFrame(data).set_index('question_id')

action_map = load_class_map(STAR_ACTION_CLASSES_PATH)
object_map = load_class_map(STAR_OBJECT_CLASSES_PATH)
relation_map = load_class_map(STAR_RELATIONSHIP_CLASSES_PATH)

relation_map

Loading STAR dataset and class mappings...


{'r000': 'on',
 'r001': 'behind',
 'r002': 'in_front_of',
 'r003': 'on_the_side_of',
 'r004': 'above',
 'r005': 'beneath',
 'r006': 'drinking_from',
 'r007': 'have_it_on_the_back',
 'r008': 'wearing',
 'r009': 'holding',
 'r010': 'lying_on',
 'r011': 'covered_by',
 'r012': 'carrying',
 'r013': 'eating',
 'r014': 'leaning_on',
 'r015': 'sitting_on',
 'r016': 'twisting',
 'r017': 'writing_on',
 'r018': 'standing_on',
 'r019': 'touching',
 'r020': 'wiping',
 'r021': 'at',
 'r022': 'under',
 'r023': 'near'}

In [58]:
import ast

# 3. Select and Format STAR Examples for Target Representation
print(f"Selecting and formatting {NUM_STAR_SAMPLES_FOR_TARGET} STAR examples for target representation...")
if len(df_star) < NUM_STAR_SAMPLES_FOR_TARGET:
    print(f"Warning: Requested {NUM_STAR_SAMPLES_FOR_TARGET} STAR samples, but only {len(df_star)} available. Using all available.")
    star_samples = df_star
else:
    star_samples = df_star.sample(n=NUM_STAR_SAMPLES_FOR_TARGET, random_state=42) # Use random_state for reproducibility

target_texts = [format_star_example(row, action_map, object_map, relation_map) for _, row in star_samples.iterrows()]
print(f"Formatted {len(target_texts)} STAR examples.")
# print("\nSample Formatted STAR Example:")
# print(target_texts) # Optional: view a sample

target_texts[0]

Selecting and formatting 100 STAR examples for target representation...
Formatted 100 STAR examples.


'Question: Which object did the person throw before they held the phone/camera?\nAnswer: The clothes.\nChoices format unclear.\nSituation: Person throw clothes somewhere. Person hold a phone/camera. Person holding clothes. Person in_front_of clothes. Person standing_on floor. Person beneath floor. Person touching clothes. Person holding phone/camera. Person in_front_of phone/camera.'

In [53]:
model = SentenceTransformer(MODEL_NAME, device=DEVICE)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [54]:
target_embeddings = model.encode(
    target_texts,
    normalize_embeddings=True,
    show_progress_bar=True,
    batch_size=128 # Adjust based on GPU memory
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [60]:
video_expresso_embeddings = model.encode(
    video_expresso_texts,
    normalize_embeddings=True,
    show_progress_bar=True,
    batch_size=32 # Adjust based on GPU memory
)

Batches:   0%|          | 0/6274 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 36.00 MiB. GPU 0 has a total capacity of 47.43 GiB of which 28.56 MiB is free. Process 1149652 has 11.73 GiB memory in use. Process 1149686 has 12.58 GiB memory in use. Including non-PyTorch memory, this process has 23.08 GiB memory in use. Of the allocated memory 22.74 GiB is allocated by PyTorch, and 34.14 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [64]:
 # 6. Calculate Average Target Embedding
print("Calculating average STAR target embedding...")
if target_embeddings.shape[0] > 0:
    target_embedding_avg = np.mean(target_embeddings, axis=0)
    norm = np.linalg.norm(target_embedding_avg)
    if norm > 0:
         target_embedding_avg = target_embedding_avg / norm
    else:
        print("Warning: Average target embedding has zero norm.")
        target_embedding_avg = np.zeros(model.get_sentence_embedding_dimension())

    target_embedding_avg_2d = target_embedding_avg.reshape(1, -1)
    print("Average STAR target embedding generated.")
else:
    print("Error: No target STAR embeddings were generated.")
    exit()


Calculating average STAR target embedding...
Average STAR target embedding generated.


In [65]:
# Compare Video Expresso embeddings against the average STAR target embedding
similarity_scores_matrix = cosine_similarity(target_embedding_avg_2d, video_expresso_embeddings)
similarity_scores = similarity_scores_matrix.flatten()

NameError: name 'video_expresso_embeddings' is not defined

In [None]:

# 8. Rank and Select Video Expresso Examples
print(f"Ranking Video Expresso examples and selecting top {N_SELECT}...")
df_expresso['similarity_to_star'] = similarity_scores
df_ranked = df_expresso.sort_values(by='similarity_to_star', ascending=False).reset_index(drop=True)

# Select the original columns plus the new similarity score for the output
output_columns = [col for col in df_expresso.columns if col not in ['combined_text', 'similarity_to_star']] + ['similarity_to_star']
selected_examples_df = df_ranked.head(N_SELECT)[output_columns]


print(f"\n--- Selected Top {N_SELECT} Video Expresso Examples (Most Similar to STAR Samples) ---")
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
    print(selected_examples_df)

# 9. Save Results (Optional)
try:
    selected_examples_df.to_csv(OUTPUT_CSV_PATH, index=False)
    print(f"\nSelected examples saved to: {OUTPUT_CSV_PATH}")
except Exception as e:
    print(f"\nError saving results to CSV: {e}")

print("\nProcess completed.")