In [32]:
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
import os
import pandas as pd

In [None]:
# def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
#     vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
#     max_frame = len(vr) - 1
#     fps = float(vr.get_avg_fps())

#     pixel_values_list, num_patches_list = [], []
#     transform = build_transform(input_size=input_size)
#     frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
#     for frame_index in frame_indices:
#         img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
#         img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
#         pixel_values = [transform(tile) for tile in img]
#         pixel_values = torch.stack(pixel_values)
#         num_patches_list.append(pixel_values.shape[0])
#         pixel_values_list.append(pixel_values)
#     pixel_values = torch.cat(pixel_values_list)
#     return pixel_values, num_patches_list

In [3]:
# Per-channel (R, G, B) mean / standard deviation used to normalize frames.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Build the per-tile preprocessing pipeline.

    The returned transform converts a PIL image to RGB if needed, resizes it
    to ``input_size`` x ``input_size`` with bicubic interpolation, converts it
    to a tensor, and normalizes it with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.

    Args:
        input_size: Side length, in pixels, of the square output.

    Returns:
        A ``torchvision.transforms.Compose`` applying the steps above in order.
    """
    def _to_rgb(img):
        # Only convert when necessary so RGB inputs pass through untouched.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_to_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the tiling grid whose aspect ratio is closest to the image's.

    Args:
        aspect_ratio: Width / height of the source image.
        target_ratios: Iterable of candidate grids; each candidate is indexed
            as ``candidate[0]`` (numerator) and ``candidate[1]`` (denominator).
        width: Source image width in pixels.
        height: Source image height in pixels.
        image_size: Side length of one square tile in pixels.

    Returns:
        The chosen candidate, or ``(1, 1)`` when no candidate improves on the
        initial infinite distance. On an exact tie in distance, a later
        candidate replaces the current one only if the source area exceeds
        half of that candidate's total tiled area.
    """
    source_area = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            # Strictly closer: always take it.
            smallest_gap, chosen = gap, candidate
            continue
        # Equal distance: prefer this candidate only if the source image is
        # large enough to fill more than half of its tiled canvas.
        if gap == smallest_gap and source_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            chosen = candidate

    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize an image onto a tile grid and cut it into square tiles.

    Args:
        image: Image object exposing ``size``, ``resize`` and ``crop``
            (used here with PIL images).
        min_num: Minimum number of tiles a candidate grid may contain.
        max_num: Maximum number of tiles a candidate grid may contain.
        image_size: Side length of each square tile in pixels.
        use_thumbnail: If true and more than one tile was produced, append a
            copy of the whole image resized to ``image_size`` x ``image_size``.

    Returns:
        List of tile images in row-major order, optionally followed by the
        thumbnail.
    """
    src_w, src_h = image.size
    src_ratio = src_w / src_h

    # Every (columns, rows) grid whose tile count lies in [min_num, max_num],
    # ordered by tile count.
    grids = {
        (cols, rows)
        for n in range(min_num, max_num + 1)
        for cols in range(1, n + 1)
        for rows in range(1, n + 1)
        if min_num <= cols * rows <= max_num
    }
    grids = sorted(grids, key=lambda grid: grid[0] * grid[1])

    # Grid whose aspect ratio best matches the source image.
    grid_cols, grid_rows = find_closest_aspect_ratio(
        src_ratio, grids, src_w, src_h, image_size)

    # Canvas size that holds the grid exactly.
    target_width = image_size * grid_cols
    target_height = image_size * grid_rows
    blocks = grid_cols * grid_rows

    resized_img = image.resize((target_width, target_height))

    # Walk the grid row by row and crop one tile per cell.
    tiles_per_row = target_width // image_size
    processed_images = []
    for tile_idx in range(blocks):
        row_idx, col_idx = divmod(tile_idx, tiles_per_row)
        crop_box = (
            col_idx * image_size,
            row_idx * image_size,
            (col_idx + 1) * image_size,
            (row_idx + 1) * image_size,
        )
        processed_images.append(resized_img.crop(crop_box))
    assert len(processed_images) == blocks

    # A single tile already shows the whole image, so no thumbnail is added then.
    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images

In [16]:
# Upgrade the Hugging Face Hub client, then download the OpenGVLab/InternVL2_5-8B
# checkpoint into ./InternVL2_5-8B (real files, no symlinks).
# NOTE(review): the model-loading cell reads `.../llm_experiments/InternVL3-8B`,
# not this directory — confirm which checkpoint is actually intended.
!pip install -U huggingface_hub
!huggingface-cli download --resume-download --local-dir-use-symlinks False OpenGVLab/InternVL2_5-8B --local-dir InternVL2_5-8B

Downloading 'config.json' to 'InternVL2_5-8B/.cache/huggingface/download/8_PA_wEVGiVa2goH2H4KQOQpvVY=.90be7fce58a2f69f59fb06159c8de0cfa5507888.incomplete'
Downloading 'added_tokens.json' to 'InternVL2_5-8B/.cache/huggingface/download/SeqzFlf9ZNZ3or_wZAOIdsM3Yxw=.35f5893c8e29d6102945a953529819a2d56c62a9.incomplete'

config.json: 4.05kB [00:00, 22.4MB/s]
Download complete. Moving file to InternVL2_5-8B/config.json

added_tokens.json: 100%|███████████████████████| 179/179 [00:00<00:00, 2.07MB/s]Downloading '.gitattributes' to 'InternVL2_5-8B/.cache/huggingface/download/wPaCkH-WbT7GsmxMKKrNZTV4nSM=.8146268512808cfd9623536c65bf0d0ef7efdc45.incomplete'

Download complete. Moving file to InternVL2_5-8B/added_tokens.json

.gitattributes: 1.63kB [00:00, 11.7MB/s]
Download complete. Moving file to InternVL2_5-8B/.gitattributes
Downloading 'configuration_internlm2.py' to 'InternVL2_5-8B/.cache/huggingface/download/pCb1JrLKfwPgD6-09aFLDP6Cnv0=.282b13b1e2066ecc074ecae87b35a19d251f0ed7.incomplete'
D

In [4]:
# Load the model and tokenizer from the local `InternVL3-8B` directory. The whole
# model is cast to bfloat16, switched to eval mode and moved to a single CUDA device.
# NOTE(review): this assumes one GPU has enough memory for the full model; if it
# does not, the model has to be sharded across several GPUs instead.
base_dir = "/n/fs/visualai-scr/temp_LLP/ellie/slowfast_kinetics"
path = os.path.join(base_dir, "llm_experiments", "InternVL3-8B")
# trust_remote_code=True runs the Python modeling code shipped with the checkpoint.
# NOTE(review): use_flash_attn=True is requested, but the recorded output of this
# cell reports "FlashAttention2 is not installed." — confirm the fallback is acceptable.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# Generation settings: no sampling, at most 1024 new tokens per reply.
# (The tile limit `max_num` is not set here; it is an argument of `load_video`.)
generation_config = dict(max_new_tokens=1024, do_sample=False)

FlashAttention2 is not installed.


Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 27.29it/s]


In [5]:
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    """Choose ``num_segments`` evenly spaced frame indices.

    Args:
        bound: Optional ``(start, end)`` pair in seconds restricting the
            sampled range; a falsy value means the whole clip.
        fps: Frames per second, used to convert ``bound`` to frame indices.
        max_frame: Largest valid frame index.
        first_idx: Smallest frame index that may be used as the range start.
        num_segments: Number of indices to return.

    Returns:
        A numpy array with one frame index per segment, each taken near the
        middle of its segment.
    """
    # Without a bound, use a window wide enough to be clamped to the clip.
    start, end = (bound[0], bound[1]) if bound else (-100000, 100000)

    # Convert seconds to frame indices and clamp to the valid range.
    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)

    seg_size = float(end_idx - start_idx) / num_segments
    half_seg = seg_size / 2
    return np.array([
        int(start_idx + half_seg + np.round(seg_size * segment))
        for segment in range(num_segments)
    ])

In [6]:
def load_video(frame_dir, bound=None, input_size=448, max_num=1, num_segments=32, fps=30):
    """Sample frames from a directory of extracted video frames and preprocess them.

    Frame files are the ``.jpg`` / ``.png`` entries of ``frame_dir``, ordered by
    lexicographic filename sort (so frame numbers should be zero-padded).
    ``num_segments`` frames are picked with ``get_index``, each is tiled with
    ``dynamic_preprocess`` (thumbnail enabled) and every tile is run through
    ``build_transform``.

    Args:
        frame_dir: Directory containing the extracted frame images.
        bound: Optional ``(start, end)`` pair in seconds forwarded to
            ``get_index``; ``None`` samples over all frames.
        input_size: Side length in pixels of each square tile.
        max_num: Maximum number of tiles per frame.
        num_segments: Number of frames to sample.
        fps: Frame rate used to convert ``bound`` into frame indices. The
            frame directory carries no timing information, so this must be
            supplied by the caller when it is not 30.

    Returns:
        ``(pixel_values, num_patches_list)`` where ``pixel_values`` is the
        concatenation of all tile tensors across the sampled frames and
        ``num_patches_list`` holds the number of tiles for each sampled frame.

    Raises:
        ValueError: If ``frame_dir`` contains no ``.jpg`` / ``.png`` files.
    """
    frame_paths = sorted(
        os.path.join(frame_dir, name)
        for name in os.listdir(frame_dir)
        if name.endswith(('.jpg', '.png'))
    )
    # Fail early with a clear message instead of an IndexError deep in the loop.
    if not frame_paths:
        raise ValueError(f"No .jpg/.png frames found in directory: {frame_dir}")
    max_frame = len(frame_paths) - 1

    transform = build_transform(input_size=input_size)
    frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)

    pixel_values_list, num_patches_list = [], []
    for idx in frame_indices:
        # convert() returns an independent in-memory copy, so the file handle
        # can be released immediately rather than lingering until GC.
        with Image.open(frame_paths[int(idx)]) as frame:
            img = frame.convert('RGB')
        tiles = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = torch.stack([transform(tile) for tile in tiles])
        pixel_values_list.append(pixel_values)
        num_patches_list.append(pixel_values.shape[0])

    pixel_values = torch.cat(pixel_values_list)
    return pixel_values, num_patches_list

In [9]:
import matplotlib.pyplot as plt

In [34]:
# Select one action-swap sample (a directory of extracted frames) for manual inspection.
base_dir = "/n/fs/visualai-scr/temp_LLP/ellie/slowfast_kinetics"
video_path = os.path.join(
    base_dir,
    "dataset",
    "action_swap",
    "original_action_swap",
    "sample_000000_deadlifting_to_juggling soccer ball_f8dp7wR4GWg_000003_000013_bg_MCV13GHst20_000002_000012",
)
print("Video Path = ", video_path)

# Optional visual check: show every 10th frame of the sample.
# for index in range(1, 100, 10):
#     pil_img = Image.open(os.path.join(video_path, f"{index:06d}.jpg"))
#     plt.imshow(pil_img)
#     plt.axis('off')  # optional: hides axis ticks
#     plt.show()

Video Path =  /n/fs/visualai-scr/temp_LLP/ellie/slowfast_kinetics/dataset/action_swap/original_action_swap/sample_000000_deadlifting_to_juggling soccer ball_f8dp7wR4GWg_000003_000013_bg_MCV13GHst20_000002_000012


In [48]:
# Load the action-swap multiple-choice table. The evaluation loop reads, among
# others, the columns `action_swap_path`, `human_choice`, `background_choice`
# and `choice_1` .. `choice_5` from each row.
csv_path = os.path.join(base_dir, "dataset/action_swap/action_swap_mcq.csv")
df = pd.read_csv(csv_path)

In [56]:
from tqdm import tqdm

In [57]:
# Evaluate the model on every row of the action-swap MCQ table and record
# whether its answer matches the "human" option or the "background" option.
pred_human = 0
pred_bg = 0
total = 0

# `human_choice` / `background_choice` are looked up here as 1-based option
# numbers and translated to the option letters used in the prompt.
mapping = {
    1: 'A',
    2: 'B',
    3: 'C',
    4: 'D',
    5: 'E'
}

rows = []

# iterrows() has no length, so pass total= to get a real progress bar and ETA.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    action_swap_path = row['action_swap_path']
    human_choice = row['human_choice']
    bg_choice = row['background_choice']
    choice_1 = row['choice_1']
    choice_2 = row['choice_2']
    choice_3 = row['choice_3']
    choice_4 = row['choice_4']
    choice_5 = row['choice_5']

    pixel_values, num_patches_list = load_video(frame_dir=action_swap_path, num_segments=8, max_num=1)
    pixel_values = pixel_values.to(torch.bfloat16).cuda()
    video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
    question = video_prefix + f'What is the action being performed? A) {choice_1} B) {choice_2} C) {choice_3} D) {choice_4} E) {choice_5}'

    response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list, history=None, return_history=True)

    # First non-whitespace character of the reply is taken as the chosen letter.
    # Slicing (rather than indexing) yields '' for an empty reply instead of
    # raising IndexError and aborting the whole run.
    choice = response.strip()[:1]

    human_letter = mapping[human_choice]
    bg_letter = mapping[bg_choice]
    choice_is_human = (choice == human_letter)
    choice_is_bg = (choice == bg_letter)

    # A reply is counted at most once; the human option takes precedence.
    if choice_is_human:
        pred_human += 1
    elif choice_is_bg:
        pred_bg += 1
    total += 1

    new_row = row.copy()
    # Keep the raw reply so the letter parsing can be audited or redone later.
    new_row['response'] = response
    new_row['choice'] = choice
    new_row['choice_is_human'] = choice_is_human
    new_row['choice_is_bg'] = choice_is_bg
    rows.append(new_row)

# Guard against an empty table, which would otherwise divide by zero.
if total:
    print(f"Predicted Human: {pred_human}/{total} = {pred_human/total:.2f}")
    print(f"Predicted Background: {pred_bg}/{total} = {pred_bg/total:.2f}")
else:
    print("No rows were evaluated.")

df_result = pd.DataFrame(rows)
df_result.to_csv(os.path.join(base_dir, "llm_experiments/internvl3_results.csv"), index=False)

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
1it [00:00,  1.07it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
2it [00:01,  1.13it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
3it [00:02,  1.12it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
4it [00:03,  1.11it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
5it [00:04,  1.19it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
6it [00:05,  1.21it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
7it [00:05,  1.23it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
8it [00:06,  1.21it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
9it [00:07,  1.17it/s]


KeyboardInterrupt: 

In [30]:
# Single-video sanity check: ask the multiple-choice question about `video_path`.
pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
pixel_values = pixel_values.to(torch.bfloat16).cuda()

# One "FrameN: <image>" line per sampled frame, followed by the question:
# Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
video_prefix = ''.join(
    f'Frame{frame_no}: <image>\n'
    for frame_no in range(1, len(num_patches_list) + 1)
)
question1 = video_prefix + 'What is the action being performed? A) playing basketball B) juggling soccer ball C) deadlifting D) opening bottle E) climbing a rope'

response, history = model.chat(
    tokenizer,
    pixel_values,
    question1,
    generation_config,
    num_patches_list=num_patches_list,
    history=None,
    return_history=True,
)
print(f'User: {question1}\nAssistant: {response}')

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


User: Frame1: <image>
Frame2: <image>
Frame3: <image>
Frame4: <image>
Frame5: <image>
Frame6: <image>
Frame7: <image>
Frame8: <image>
What is the action being performed? A) playing basketball B) juggling soccer ball C) deadlifting D) opening bottle E) climbing a rope
Assistant: B) juggling soccer ball


In [None]:
    # The 50 action-class names. Several of them (e.g. 'juggling soccer ball',
    # 'deadlifting', 'climbing a rope') appear as answer options in the
    # single-video question above.
    # NOTE(review): no visible cell reads `target_names`, and the cell keeps the
    # indentation it was pasted with — confirm whether it is still needed.
    target_names = ['playing guitar', 'bowling', 'playing saxophone', 'brushing teeth', 
                    'playing basketball', 'tying tie', 'skiing slalom', 'brushing hair', 
                    'punching person (boxing)', 'playing accordion', 'archery', 
                    'catching or throwing frisbee', 'drinking', 'reading book', 
                    'eating ice cream', 'flying kite', 'sweeping floor', 
                    'walking the dog', 'skipping rope', 'clean and jerk', 
                    'eating cake', 'catching or throwing baseball', 
                    'skiing (not slalom or crosscountry)', 'juggling soccer ball', 
                    'deadlifting', 'driving car', 'cleaning windows', 'shooting basketball', 
                    'canoeing or kayaking', 'surfing water', 'playing volleyball', 'opening bottle', 
                    'playing piano', 'writing', 'dribbling basketball', 'reading newspaper', 'playing violin', 
                    'juggling balls', 'playing trumpet', 'smoking', 'shooting goal (soccer)', 'hitting baseball', 
                    'sword fighting', 'climbing ladder', 'playing bass guitar', 'playing tennis', 'climbing a rope', 
                    'golf driving', 'hurdling', 'dunking basketball']