In [1]:
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adagrad
from tqdm import tqdm
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
import random
from collections import defaultdict
import warnings
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import re
import bisect
import shutil
import json
from time import perf_counter

# warnings.filterwarnings("ignore")

from IPython.display import HTML
import os
import pickle
from sentence_transformers import SentenceTransformer
import av
from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor
from huggingface_hub import hf_hub_download




### Chain-of-Thought Reasoning

In [2]:

load_in_bits = 4
device = 'cuda'
compute_dtype = 'fp16'
double_quant = True
quant_type = 'nf4'

compute_dtype = (torch.float16 if compute_dtype == 'fp16' else (torch.bfloat16 if compute_dtype == 'bf16' else torch.float32))

bnb_model_from_pretrained_args = {}
if load_in_bits in [4, 8]:
    from transformers import BitsAndBytesConfig
    bnb_model_from_pretrained_args.update(dict(
        device_map={"": device},
        # load_in_4bit=load_in_bits == 4,
        # load_in_8bit=load_in_bits == 8,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=load_in_bits == 4,
            load_in_8bit=load_in_bits == 8,
            llm_int8_skip_modules=["mm_projector"],
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=double_quant,
            bnb_4bit_quant_type=quant_type # {'fp4', 'nf4'}
        )
    ))

model = VideoLlavaForConditionalGeneration.from_pretrained(
    "LanguageBind/Video-LLaVA-7B-hf",
    torch_dtype=compute_dtype,
    attn_implementation="flash_attention_2",
    **bnb_model_from_pretrained_args
)

processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
from IPython.display import HTML
import base64

def display_video_embed(video_id, video_dir="/data/user_data/jamesdin/STAR/data/Charades_v1_480"):
    video_path = os.path.join(video_dir, f"{video_id}.mp4")

    print("Embedding video:", video_path)
    
    with open(video_path, "rb") as f:
        video_encoded = base64.b64encode(f.read()).decode("utf-8")
    
    html = f'''
    <video width="320" height="240" controls>
        <source src="data:video/mp4;base64,{video_encoded}" type="video/mp4">
        Your browser does not support the video tag.
    </video>
    '''
    
    return HTML(html)

In [18]:
import pickle
import pandas as pd

# Load the .pkl file
with open('/data/user_data/jamesdin/STAR/data/STAR_val.pkl', 'rb') as f:
    data = pickle.load(f)

# Convert to DataFrame and set index
df = pd.DataFrame(data).set_index('question_id')



                                                    question video_id  start  \
question_id                                                                    
Interaction_T1_13  Which object was tidied up by the person?    6H78U   11.1   
Interaction_T1_14  Which object was tidied up by the person?    6H78U   15.6   
Interaction_T1_31     Which object was thrown by the person?    RNLTR    7.3   
Interaction_T1_32   Which object was put down by the person?    RNLTR   11.6   
Interaction_T1_40  Which object was tidied up by the person?    VNQTH    2.7   

                    end               answer  \
question_id                                    
Interaction_T1_13  19.6         The clothes.   
Interaction_T1_14  22.7         The clothes.   
Interaction_T1_31  13.5         The clothes.   
Interaction_T1_32  16.4            The shoe.   
Interaction_T1_40   9.2  The closet/cabinet.   

                                                    question_program  \
question_id                   

In [19]:
df

Unnamed: 0_level_0,question,video_id,start,end,answer,question_program,choices,situations
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Interaction_T1_13,Which object was tidied up by the person?,6H78U,11.1,19.6,The clothes.,"[{'function': 'Situations', 'value_input': []}...","[{'choice_id': 0, 'choice': 'The closet/cabine...","{'000206': {'rel_pairs': [['o000', 'o027'], ['..."
Interaction_T1_14,Which object was tidied up by the person?,6H78U,15.6,22.7,The clothes.,"[{'function': 'Situations', 'value_input': []}...","[{'choice_id': 0, 'choice': 'The blanket.', 'c...","{'000289': {'rel_pairs': [['o000', 'o027'], ['..."
Interaction_T1_31,Which object was thrown by the person?,RNLTR,7.3,13.5,The clothes.,"[{'function': 'Situations', 'value_input': []}...","[{'choice_id': 0, 'choice': 'The pillow.', 'ch...","{'000132': {'rel_pairs': [['o000', 'o019'], ['..."
Interaction_T1_32,Which object was put down by the person?,RNLTR,11.6,16.4,The shoe.,"[{'function': 'Situations', 'value_input': []}...","[{'choice_id': 0, 'choice': 'The food.', 'choi...","{'000202': {'rel_pairs': [['o000', 'o019'], ['..."
Interaction_T1_40,Which object was tidied up by the person?,VNQTH,2.7,9.2,The closet/cabinet.,"[{'function': 'Situations', 'value_input': []}...","[{'choice_id': 0, 'choice': 'The broom.', 'cho...","{'000110': {'rel_pairs': [['o000', 'o014'], ['..."
...,...,...,...,...,...,...,...,...
Feasibility_T6_1453,What is the person able to do after taking the...,L9ANI,7.3,12.9,Wash the table.,"[{'function': 'Situations', 'value_input': []}...","[{'choice_id': 0, 'choice': 'Wash the table.',...","{'000227': {'rel_pairs': [['o000', 'o006'], ['..."
Feasibility_T6_1454,What is the person able to do after walking th...,L9ANI,0.0,4.1,Take the towel.,"[{'function': 'Situations', 'value_input': []}...","[{'choice_id': 0, 'choice': 'Take the towel.',...","{'000033': {'rel_pairs': [['o000', 'o006']], '..."
Feasibility_T6_1455,What is the person able to do after walking th...,L9ANI,0.0,4.1,Wash the table.,"[{'function': 'Situations', 'value_input': []}...","[{'choice_id': 0, 'choice': 'Throw the bag.', ...","{'000033': {'rel_pairs': [['o000', 'o006']], '..."
Feasibility_T6_1456,What is the person able to do after putting th...,L9ANI,0.0,10.8,Wash the table.,"[{'function': 'Situations', 'value_input': []}...","[{'choice_id': 0, 'choice': 'Hold the food.', ...","{'000033': {'rel_pairs': [['o000', 'o006']], '..."


In [35]:

def read_video_pyav(container, indices):
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

def read_video_pyav2(video_path, start, end, num_frames=8):
        """Reads a video for given start-end timestamps interval and uniformly samples 8 frames of it"""
        container = av.open(video_path)
        video = container.streams.get(0)[0]

        av_timestamps = [
            int(packet.pts * video.time_base)
            for packet in container.demux(video)
            if packet.pts is not None
        ]

        av_timestamps.sort()
        start_id = bisect.bisect_left(av_timestamps, start)
        end_id = bisect.bisect_left(av_timestamps, end)

        # in case it is a very short video, lets take a longer duration and sample
        if end_id - start_id < 10:
            end_id += 10
            start_id -= 10

        end_id = min(len(av_timestamps) - 1, end_id)
        start_id = max(1, start_id)

        # We sample 8 frames for tuning following the original paper
        # But we can increase the number of frames for longer videos and check out if it helps performance
        # Change the below "8" to any number of frames you want, and note that more frames -> more computational resources needed
        indices = np.linspace(start_id, end_id, num_frames).astype(int)

        frames = []
        container.seek(0)
        for i, frame in enumerate(container.decode(video=0)):
            if i > end_id:
                break
            if i >= start_id and i in indices:
                frames.append(frame)
        assert (
            len(frames) == num_frames
        ), f"Got {len(frames)} frames but should be {num_frames}. Check the indices: {indices};, start_id: {start_id}, end_id: {end_id}. Len of video is {len(av_timestamps)} frames."
        return np.stack([x.to_ndarray(format="rgb24") for x in frames]), indices


In [74]:
question_id = 'Interaction_T1_43'

example = df.loc[question_id]

video_id = example['video_id']
question = example['question']
choices = [x['choice'] for x in example['choices']]
start = example['start']
end = example['end']

In [75]:
display_video_embed(video_id)

Embedding video: /data/user_data/jamesdin/STAR/data/Charades_v1_480/Y79PC.mp4


In [76]:
# sample uniformly 8 frames from the video

video_path = f"/data/user_data/jamesdin/STAR/data/Charades_v1_480/{video_id}.mp4"

# container = av.open(video_path)
# total_frames = container.streams.video[0].frames
# indices = np.arange(0, total_frames, total_frames / 8).astype(int)
# video_frames = read_video_pyav(container, indices)

total_frames = 8
video_frames, frame_idx = read_video_pyav2(video_path, start, end, num_frames=total_frames)


In [77]:
prompt = f"""USER: <video>
Question: {question}
Choices:
{''.join([f"{i+1}. {choice}" + chr(10) for i, choice in enumerate(choices)])}
Please carefully watch the video and reason through the visual content to answer the question based on the given choices.

First, explain your reasoning in a few sentences describing what you observed in the video and how it relates to the question. Then, clearly state your final answer using the format:
Answer: <index>

ASSISTANT:
"""


In [82]:

inputs = processor(text=prompt, videos=video_frames, return_tensors="pt").to('cuda')

# Generate
outputs = model.generate(**inputs, max_new_tokens=100, return_dict_in_generate=True, output_scores=True)

decoded = processor.batch_decode(outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

print(decoded)

USER: 
Question: Which object was thrown by the person?
Choices:
1. The box.
2. The pillow.
3. The broom.
4. The clothes.

Please carefully watch the video and reason through the visual content to answer the question based on the given choices.

First, explain your reasoning in a few sentences describing what you observed in the video and how it relates to the question. Then, clearly state your final answer using the format:
Answer: <index>

ASSISTANT:
In the video, a person is seen throwing a pillow at another person who is standing in front of a window. The person who threw the pillow is wearing a black dress, while the other person is wearing a striped shirt. The scene suggests that the person who threw the pillow might have been angry or frustrated with the other person.

Answer: 2


In [83]:
last_non_eos_token_probs = torch.nn.functional.softmax(outputs.scores[-2], dim=-1)
        
# Get token IDs for numbers 1-4
token_ids = [processor.tokenizer.convert_tokens_to_ids(str(i)) for i in [1,2,3,4]]

# Create probability dictionary for each sample in batch
prob_list = []
for batch_idx in range(last_non_eos_token_probs.shape[0]):
    probs = [
        last_non_eos_token_probs[batch_idx, token_ids[i]].item()
        for i in range(4)
    ]
    prob_list.append(probs)

logits_list = []
for batch_idx in range(outputs.scores[-2].shape[0]):
    logits = [
        outputs.scores[-2][batch_idx, token_ids[i]].item()
        for i in range(4)
    ]
    logits_list.append(logits)

In [84]:
prob_list

[[0.2286074310541153,
  0.343180388212204,
  0.2800956666469574,
  0.14531172811985016]]

In [85]:
logits_list

[[18.390625, 18.796875, 18.59375, 17.9375]]

In [86]:
match = re.search(r"Answer:\s*(\d+)", decoded)
if match:
    answer_index = int(match.group(1)) - 1

answer_index

1

## Video-of-Thought (Multi-staged reasoning)

In [66]:
import torch
import re
from typing import List, Dict, Any, Optional, Union
import numpy as np
from time import perf_counter
import json
from tqdm import tqdm
import os


class VideoOfThoughtPredictor:
    def __init__(self, video_llava_model, video_llava_processor):
        """
        Initialize the Video-of-Thought predictor with a VideoLLAVA model.
        
        Args:
            video_llava_model: An instance of the VideoLLAVA model class
        """
        self.model = video_llava_model
        self.processor = video_llava_processor

        self.show_intermediate_steps = False
        
    def _generate_response(self, video_frames, prompt, max_new_tokens=100):
        """
        Generate a response from the VideoLLAVA model.
        
        Args:
            video_frames: Video frame tensors
            prompt: Text prompt
            max_new_tokens: Maximum number of tokens to generate
            
        Returns:
            Generated text response
        """
        inputs = self.processor(
            text=prompt, 
            videos=video_frames, 
            return_tensors="pt", 
            max_length=4096
        ).to("cuda")
        
        # Use more controlled generation parameters
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # do_sample=False,  # Use greedy decoding for more consistent outputs
            # temperature=0.1,   # Lower temperature for more focused responses
            # num_beams=1,
            # early_stopping=True,
            # pad_token_id=self.model.processor.tokenizer.pad_token_id,
            # eos_token_id=self.model.processor.tokenizer.eos_token_id
        )

        
        decoded = self.processor.batch_decode(
            outputs,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )

        response = decoded[0]
        if "ASSISTANT:" in response:
            response = response.split("ASSISTANT:")[-1].strip()
        
        # Extract just the assistant's response, removing the prompt
        if self.show_intermediate_steps:
            print('#' * 50)
            print(prompt)
            print('-' * 20)
            print("ASSISTANT:" + response)
            print('#' * 50)
            print()
            
        return response
    
    def step_1_identify_targets(self, video_frames, question, is_multi_choice=True):
        """
        Step 1: Task Definition and Target Identification
        
        Args:
            video_frames: Video frame tensors
            question: The question text
            is_multi_choice: Whether the question is multiple choice
            
        Returns:
            The identified targets in the video relevant to the question
        """
        if is_multi_choice:
            task_definition = "You are an expert in video analysis."
        else:
            task_definition = "You are an expert in video analysis."
        
        prompt = f"USER: <video>\n{task_definition}\n\nGiven the question: \"{question}\", what are the key objects, people, or elements in the video that need to be tracked to answer this question?\n\nProvide a concise list of the key targets.\nASSISTANT:"
        
        response = self._generate_response(video_frames, prompt, max_new_tokens=100)
        return response
    
    def step_2_object_description(self, video_frames, targets, question):
        """
        Step 2: Object Description (adapted from Object Tracking in the original paper)
        
        Args:
            video_frames: Video frame tensors
            targets: The identified targets from step 1
            question: The original question
            
        Returns:
            Description of the targets throughout the video
        """
        prompt = f"USER: <video>\nDescribe in detail the following elements that are relevant to answering the question \"{question}\":\n\n{targets}\n\nFocus on their appearance, movement, and interactions in the video.\nASSISTANT:"
        
        response = self._generate_response(video_frames, prompt, max_new_tokens=150)
        return response
    
    def step_3_action_analysis(self, video_frames, object_descriptions, question):
        """
        Step 3: Action Analysis
        
        Args:
            video_frames: Video frame tensors
            object_descriptions: The object descriptions from step 2
            question: The original question
            
        Returns:
            Analysis of actions and implications
        """
        prompt = f"USER: <video>\nBased on the question \"{question}\" and these observations:\n\n{object_descriptions}\n\nAnalyze what actions are occurring in the video, their sequence, and their implications. Include both direct observations and reasonable inferences.\nASSISTANT:"
        
        response = self._generate_response(video_frames, prompt, max_new_tokens=200)
        return response


    def _get_first_token_logits(self, scores):

        first_token_probs = torch.nn.functional.softmax(scores[0], dim=-1)
        
        # Get token IDs for numbers 1-4
        token_ids = [self.processor.tokenizer.convert_tokens_to_ids(str(i)) for i in [1,2,3,4]]
        
        # Create probability dictionary for each sample in batch
        prob_list = []
        for batch_idx in range(first_token_probs.shape[0]):
            probs = [
                first_token_probs[batch_idx, token_ids[i]].item()
                for i in range(4)
            ]
            prob_list.append(probs)
        
        logits_list = []
        for batch_idx in range(scores[0].shape[0]):
            logits = [
                scores[0][batch_idx, token_ids[i]].item()
                for i in range(4)
            ]
            logits_list.append(logits)
    
        return prob_list, logits_list

    def _generate_response_and_get_first_token_logits(self, video_frames, prompt, max_new_tokens=100):
        
        inputs = self.processor(
            text=prompt, 
            videos=video_frames, 
            return_tensors="pt", 
            max_length=4096
        ).to("cuda")
        
        # Use more controlled generation parameters
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            return_dict_in_generate=True, 
            output_scores=True,
            # do_sample=False,  # Use greedy decoding for more consistent outputs
            # temperature=0.1,   # Lower temperature for more focused responses
            # num_beams=1,
            # early_stopping=True,
            # pad_token_id=self.model.processor.tokenizer.pad_token_id,
            # eos_token_id=self.model.processor.tokenizer.eos_token_id
        )

        
        decoded = self.processor.batch_decode(
            outputs.sequences,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )

        prob_list, logits_list = self._get_first_token_logits(outputs.scores)

        response = decoded[0]
        if "ASSISTANT:" in response:
            response = response.split("ASSISTANT:")[-1].strip()
        
        # Extract just the assistant's response, removing the prompt
        if self.show_intermediate_steps:
            print('#' * 20 + ' Generate ' + '#' * 20)
            print(prompt)
            print('-' * 20)
            print("ASSISTANT:" + response)
            print('#' * 50)
            print()

        return response, prob_list, logits_list
            
    
    def step_4_answer_scoring(self, video_frames, question, choices, action_analysis):
        """
        Step 4: Answer Scoring and Ranking for multi-choice questions
        
        Args:
            video_frames: Video frame tensors
            question: The question text
            choices: List of answer choices
            action_analysis: The action analysis from step 3
            
        Returns:
            Final answer with scores
        """
        # First, score each choice individually
        scores_and_rationales = []
        
        for i, choice in enumerate(choices):
            prompt = f"USER: <video>\nQuestion: {question}\nCandidate answer: {choice}\n\nBased on the video and this analysis:\n{action_analysis}\n\nRate the likelihood of this answer being correct (1-10) and explain why.\nASSISTANT:"
            
            response = self._generate_response(video_frames, prompt, max_new_tokens=150)
            scores_and_rationales.append(response)
        
        # Now do the final ranking and selection
        prompt = f"USER: <video>\nFor the question: \"{question}\", here are the ratings for each answer choice:\n\n"
        
        for i, (choice, rationale) in enumerate(zip(choices, scores_and_rationales)):
            prompt += f"Option {i+1}: {choice}\nRating: {rationale}\n\n"
        
        prompt += "Based on these ratings, which answer is most likely correct and why? Give the final answer option index (respond with a single number only).\nASSISTANT:"
        
        ranking_output, prob_list, logits_list = self._generate_response_and_get_first_token_logits(video_frames, prompt, max_new_tokens=100)

        # Extract the final answer index assuming the model outputs only a single integer
        answer_index = None
        try:
            match = re.search(r"\b([1-9][0-9]*)\b", ranking_output.strip())
            if match:
                idx = int(match.group(1)) - 1  # Convert to 0-based index
                if 0 <= idx < len(choices):
                    answer_index = idx
                    final_answer = choices[idx]
                else:
                    final_answer = None
            else:
                final_answer = None
        except Exception as e:
            print(f"Parsing error: {e}")
            final_answer = None

        return final_answer, ranking_output, scores_and_rationales, prob_list, logits_list
    
    def step_5_answer_verification(self, video_frames, question, final_answer, action_analysis):
        """
        Step 5: Answer Verification
        
        Args:
            video_frames: Video frame tensors
            question: The question text
            final_answer: The final answer from step 4
            action_analysis: The action analysis from step 3
            
        Returns:
            Verification of the answer
        """
        prompt = f"USER: <video>\nQuestion: {question}\nSelected answer: {final_answer}\n\nBased on the video evidence and this analysis:\n{action_analysis}\n\nVerify whether this answer is correct. Provide a final verdict (correct/incorrect) with justification.\nASSISTANT:"
        
        response = self._generate_response(video_frames, prompt, max_new_tokens=150)

        get_first_token_logits(scores)
        
        return response
    
    def video_qa_reasoning(self, video_frames, question, choices=None, output_intermediate_steps=False, show_intermediate_steps=False):
        """
        Complete video QA reasoning process using the Video-of-Thought approach
        
        Args:
            video_frames: Video frame tensors
            question: The question text
            choices: List of answer choices
            output_intermediate_steps: Whether to output intermediate reasoning steps
            
        Returns:
            Final answer and optionally intermediate steps
        """

        start_time = perf_counter()

        self.show_intermediate_steps = show_intermediate_steps

        is_multi_choice = (choices is not None)

        if show_intermediate_steps:
            print("Step 1: Identifying targets...")
        targets = self.step_1_identify_targets(video_frames, question, is_multi_choice)

        if show_intermediate_steps:
            print("Step 2: Describing objects...")
        object_descriptions = self.step_2_object_description(video_frames, targets, question)

        if show_intermediate_steps:
            print("Step 3: Analyzing actions...")
        action_analysis = self.step_3_action_analysis(video_frames, object_descriptions, question)

        if show_intermediate_steps:
            print("Step 4: Scoring and ranking answers...")
        final_answer, ranking_response, scores, prob_list, logits_list = self.step_4_answer_scoring(
            video_frames, question, choices, action_analysis
        )

        # if show_intermediate_steps:
        #     print("Step 5: Verifying answer...")
        # verification = self.step_5_answer_verification(
        #     video_frames, question, final_answer, action_analysis
        # )
        
        # Format the final result
        if is_multi_choice:
            # Try to extract the answer index
            answer_number = 1  # default value if no answer get extracted
            
            answer_number_match = re.search(r'(\d+)', ranking_response)
            if answer_number_match:
                answer_number = answer_number_match.group(1)
            else:
                print(f"No answer is matched, set to default answer: {1}")

        end_time = perf_counter()
        
        if output_intermediate_steps:
            return {
                "targets": targets,
                "object_descriptions": object_descriptions,
                "action_analysis": action_analysis,
                "scores": scores,
                "answer_index": int(answer_number),
                "answer": final_answer,
                "probs": prob_list,  # confidence for each option choice from the final reasoning step
                "logits": logits_list,  # raw logits of each option choice from the final reasoning step
                "inference_time": (end_time - start_time),
                # "verification": verification,
                # "final_result": final_result
            }
        else:
            return final_answer
    
    def video_qa_direct(self, video_frames, question, choices=None, max_new_tokens=100):
        """
        Standard video QA without the step-by-step reasoning process
        
        Args:
            video_frames: Video frame tensors
            question: The question text
            choices: List of answer choices or None for open-ended questions
            max_new_tokens: Maximum number of tokens to generate
            
        Returns:
            Direct answer without step-by-step reasoning
        """
        if choices:
            # Format multiple-choice question
            choice_with_idx = [f'"{i+1}": {choice}\n' for i, choice in enumerate(choices)]
            prompt = f"USER: <video>\n {question} \n {choice_with_idx} Answer with the option's index from the given choices directly. \n ASSISTANT: "
        else:
            # Open-ended question
            prompt = f"USER: <video>\n {question} \n Answer directly based on what you see in the video. \n ASSISTANT: "
        
        response = self._generate_response(video_frames, prompt, max_new_tokens)
        return response



In [70]:
predictor = VideoOfThoughtPredictor(model, processor)
result = predictor.video_qa_reasoning(
    video_frames=video_frames,
    question=question,
    choices=choices,
    show_intermediate_steps=True,
    output_intermediate_steps=True  # Set to True to see all reasoning steps
)

Step 1: Identifying targets...
##################################################
USER: <video>
You are an expert in video analysis.

Given the question: "What will the person do next?", what are the key objects, people, or elements in the video that need to be tracked to answer this question?

Provide a concise list of the key targets.
ASSISTANT:
--------------------
ASSISTANT:The key targets to track are the woman's body language, the position of the door, and the presence of any other people or objects in the room.
##################################################

Step 2: Describing objects...
##################################################
USER: <video>
Describe in detail the following elements that are relevant to answering the question "What will the person do next?":

The key targets to track are the woman's body language, the position of the door, and the presence of any other people or objects in the room.

Focus on their appearance, movement, and interactions in the vide

In [71]:
result

{'targets': "The key targets to track are the woman's body language, the position of the door, and the presence of any other people or objects in the room.",
 'object_descriptions': "The woman in the video is standing in front of a door, and she is holding a remote control. She is wearing a blue hoodie and glasses. The door is open, and there is a white door frame visible. The woman is looking at the camera, and her body language suggests that she is about to interact with the camera or the remote control. The presence of the remote control indicates that she might be preparing to change the channel, adjust the volume, or perform some other action related to the device. The woman's actions and expressions will provide clues about what she will do next.",
 'action_analysis': 'The woman in the video is holding a remote control and is standing in front of a door. She is wearing a blue hoodie and glasses. The door is open, and there is a white door frame visible. The woman is looking at th

In [72]:
result['answer_index']

2