## Setup

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
import numpy as np
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
import json


# ─── Model Setup: InternVL3-14B, 8-bit quantized ─────────────────────────────────
# Checkpoint directory, shared by the model and the tokenizer below.
MODEL_PATH = 'pretrained/InternVL3-14B'
# Load with 8-bit weights (load_in_8bit=True) and switch to eval mode.
# trust_remote_code=True executes the modelling code shipped with the checkpoint.
model = AutoModel.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    load_in_8bit=True,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True).eval()

# Tokenizer loaded from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 7/7 [00:06<00:00,  1.01it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Video

In [None]:
# ─── Variables ───────────────────────────────────────────────────
# Number of frames sampled from each video by load_video / infer_video.
# NOTE(review): the recorded output of this notebook contains a tokenizer warning
# (sequence length 8628 > 8192) — presumably driven by this frame count; confirm.
num_frames = 32
# Sampling temperature forwarded to generation through GEN_CONFIG.
temp = 0.1
# ─── Prompt ─────────────────────────────────────────────────
# Report template appended after the image/frame placeholders in every request.
prompt_string = """
Be an objective Visual Evidence Analyst. Report on visual/audible data only. 
Use this template.

Summary:
- Overview and chronological events.

- Military/Police:
  - Appearance: Uniform colors, is there a visable insignia, gear.
  - Weapons: Firearm or sticks.
  - Force: All instances of physical force.
  - Interaction: Commands, arrests, aid etc.
- Civilians:
  - Actions: Protesting, fleeing, throwing objects, etc.
  - Condition: Visible injuries, distress, on ground.

- Vehicles:
  - Type: Civilian, police, military.
  - Markings: Transcribe & translate.
- Written Materials:
  - Content: Banners, signs, graffiti.
  - Translation: Quote & translate.
- Conditions:
  - Environment: Puddles on the ground indicating rain. 
"""

# ─── Constants ───────────────────────────────────────────────────

# Generation settings passed to model.chat by infer_video and infer_img.
GEN_CONFIG = dict(max_new_tokens=1024, do_sample=True, temperature=temp)
# Side length in pixels that every frame / tile is resized to.
INPUT_SIZE = 448
# Per-channel mean and std handed to T.Normalize
# (presumably the ImageNet statistics — confirm).
MEAN = (0.485, 0.456, 0.406)
STD = (0.229, 0.224, 0.225)
# Upper bound on tiles per still image (passed as max_num to load_image).
MAX_PATCHES  = 12
# File the batch results are written to.
OUTPUT_JSON = "video_descriptions.json"

## Helpers

In [None]:
# ─── Video  Process ──────────────────────────────────
def transform_video():
    """Build the per-frame preprocessing pipeline for video input.

    Returns:
        A torchvision ``Compose`` that converts a PIL image to RGB, resizes it
        to ``INPUT_SIZE`` x ``INPUT_SIZE`` with bicubic interpolation, turns it
        into a tensor and normalizes it with ``MEAN`` / ``STD``.
    """
    to_rgb = T.Lambda(lambda frame: frame.convert('RGB'))
    resize = T.Resize(
        (INPUT_SIZE, INPUT_SIZE),
        interpolation=InterpolationMode.BICUBIC,
    )
    normalize = T.Normalize(mean=MEAN, std=STD)
    return T.Compose([to_rgb, resize, T.ToTensor(), normalize])

def get_frame_indices(num_frames, total):
    """Pick ``num_frames`` evenly spaced frame indices from a video.

    Args:
        num_frames: How many indices to return.
        total: Number of frames in the video.

    Returns:
        Integer numpy array of length ``num_frames`` running from ``0`` to
        ``total - 1`` inclusive. Indices repeat when ``num_frames`` is larger
        than ``total``.

    Raises:
        ValueError: If ``total`` is smaller than 1. Without this check the
            spacing would run from 0 down to a negative index.
    """
    if total < 1:
        raise ValueError(f"video has no frames (total={total})")
    return np.linspace(0, total - 1, num_frames, dtype=int)

def load_video(video_path, num_frames):
    """Decode evenly spaced frames of a video into one preprocessed tensor.

    Args:
        video_path: Path handed to decord's ``VideoReader``.
        num_frames: Number of frames to sample across the whole video.

    Returns:
        The per-frame tensors produced by ``transform_video()`` stacked along
        a new leading dimension.
    """
    reader = VideoReader(video_path, ctx=cpu(0))
    preprocess = transform_video()
    frames = []
    for frame_idx in get_frame_indices(num_frames, len(reader)):
        pil_frame = Image.fromarray(reader[frame_idx].asnumpy())
        frames.append(preprocess(pil_frame))
    return torch.stack(frames)  # [num_frames, 3, H, W]

def infer_video(video_path):
    """Describe one video by sampling frames and prompting the model once.

    Args:
        video_path: Path of the video file to describe.

    Returns:
        The model's response text. If anything fails, a string of the form
        ``"Error: <message>"`` is returned instead of raising, so that a
        batch run can continue with the next file.
    """
    try:
        video_tensor = load_video(video_path, num_frames).to(torch.bfloat16).cuda()
        video_tensor = video_tensor.contiguous()

        # One "<image>" placeholder per frame that was actually loaded.
        frame_count = video_tensor.shape[0]
        prompt = ''.join([f'Frame{i+1}: <image>\n' for i in range(frame_count)])
        prompt += prompt_string

        response, _ = model.chat(
            tokenizer,
            video_tensor,
            prompt,
            GEN_CONFIG,
            # Each frame is a single tile. Per the InternVL chat API, omitting
            # this list makes chat() treat the whole tensor as one image, so
            # only the first "<image>" placeholder is expanded and the rest
            # stay in the prompt as literal text.
            num_patches_list=[1] * frame_count,
            history=None,
            return_history=True
        )
        return response
    except Exception as e:
        return f"Error: {str(e)}"



# ─── Image Preprocessing ─────────────────────────────────────────────────
def transform_img():
    """Build the preprocessing pipeline applied to every still-image tile.

    Returns:
        A torchvision ``Compose`` that makes sure the tile is RGB, resizes it
        to ``INPUT_SIZE`` x ``INPUT_SIZE`` (bicubic), converts it to a tensor
        and normalizes it with ``MEAN`` / ``STD``.
    """
    def _ensure_rgb(tile):
        # Leave tiles that are already RGB untouched.
        if tile.mode != 'RGB':
            return tile.convert('RGB')
        return tile

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((INPUT_SIZE, INPUT_SIZE), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD),
    ]
    return T.Compose(steps)

def dynamic_preprocess(image, image_size=448, max_num=12):
    """Resize an image onto a tile grid and cut it into square tiles.

    The grid (columns x rows) is the one whose columns/rows ratio is closest
    to the image's width/height ratio among all grids with at most ``max_num``
    cells. On ties the first grid found wins, scanning columns and then rows
    in ascending order.

    Args:
        image: Object exposing ``size``, ``resize`` and ``crop`` (a PIL image).
        image_size: Side length of each square tile in pixels.
        max_num: Maximum number of tiles.

    Returns:
        List of cropped tiles in row-major order (left to right, top to bottom).
    """
    width, height = image.size
    target_ratio = width / height

    # Candidates are listed in the scan order that decides ties.
    candidates = [
        (cols, rows)
        for cols in range(1, max_num + 1)
        for rows in range(1, max_num + 1)
        if cols * rows <= max_num
    ]
    # min() returns the earliest candidate among equals; (1, 1) is the
    # fallback when no grid is admissible.
    cols, rows = min(
        candidates,
        key=lambda grid: abs(target_ratio - (grid[0] / grid[1])),
        default=(1, 1),
    )

    resized = image.resize((image_size * cols, image_size * rows))

    return [
        resized.crop((
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        ))
        for row in range(rows)
        for col in range(cols)
    ]

def load_image(path, image_size=448, max_num=12):
    """Load an image file and return its preprocessed tiles as one tensor.

    Args:
        path: Image file to open.
        image_size: Tile side length forwarded to ``dynamic_preprocess``.
        max_num: Maximum number of tiles forwarded to ``dynamic_preprocess``.

    Returns:
        The tile tensors produced by ``transform_img()`` stacked along a new
        leading dimension.
    """
    rgb_image = Image.open(path).convert('RGB')
    tile_images = dynamic_preprocess(rgb_image, image_size=image_size, max_num=max_num)
    preprocess = transform_img()
    tile_tensors = []
    for tile in tile_images:
        tile_tensors.append(preprocess(tile))
    return torch.stack(tile_tensors)

# ─── Inference ────────────────────────────────────────────────────────────
def infer_img(image_path):
    """Describe one still image with the shared report prompt.

    Args:
        image_path: Path of the image file to describe.

    Returns:
        The model's response text, or ``"Error: <message>"`` if any step
        raised.
    """
    try:
        tiles = load_image(image_path, max_num=MAX_PATCHES)
        pixel_values = tiles.to(torch.bfloat16).cuda()
        pixel_values = pixel_values.contiguous()

        # A single placeholder stands for all tiles of the image.
        question = f"<image>\n{prompt_string}"
        answer, _history = model.chat(
            tokenizer,
            pixel_values,
            question,
            GEN_CONFIG,
            history=None,
            return_history=True,
        )
    except Exception as exc:
        return f"Error: {str(exc)}"
    return answer

# ─── Batch Loop ──────────────────────────────────────────────
def batch_infer(video_dir):
    """Describe every ``.mp4`` and ``.jpg`` file found directly in a directory.

    Files are visited in sorted name order; the extension check is
    case-insensitive. Other files are ignored.

    Args:
        video_dir: Directory to scan.

    Returns:
        Dict mapping each processed file name to the text returned by
        ``infer_video`` (for ``.mp4``) or ``infer_img`` (for ``.jpg``).
    """
    results = {}
    for filename in sorted(os.listdir(video_dir)):
        lowered = filename.lower()
        is_video = lowered.endswith(".mp4")
        if not (is_video or lowered.endswith('.jpg')):
            continue
        path = os.path.join(video_dir, filename)
        results[filename] = infer_video(path) if is_video else infer_img(path)
    return results

# ─── Main ───────────────────────────────────────────────────
if __name__ == "__main__":
    # Describe every supported file in the toy dataset and store the
    # {filename: description} mapping as JSON.
    results = batch_infer('toy_ds')
    # Explicit UTF-8 with ensure_ascii=False keeps transcribed non-English
    # text readable in the file instead of \uXXXX escapes.
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (8628 > 8192). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_

## Multiple GPUs

In [4]:
# import math
# from transformers import  AutoConfig

# # ─── Split Model Across GPUs ──────────────────────────────────────────────
# def split_model(model_name):
#     device_map = {}
#     world_size = torch.cuda.device_count()
#     config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
#     num_layers = config.llm_config.num_hidden_layers

#     # Distribute layers evenly, but give half of GPU0 to the vision part
#     num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
#     num_layers_per_gpu = [num_layers_per_gpu] * world_size
#     num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)

#     layer_cnt = 0
#     for gpu_idx, count in enumerate(num_layers_per_gpu):
#         for _ in range(count):
#             device_map[f'language_model.model.layers.{layer_cnt}'] = gpu_idx
#             layer_cnt += 1

#     # Pin all vision & shared embeddings to GPU0
#     vision_keys = [
#         'vision_model',
#         'mlp1',
#         'language_model.model.tok_embeddings',
#         'language_model.model.embed_tokens',
#         'language_model.output',
#         'language_model.model.norm',
#         'language_model.model.rotary_emb',
#         'language_model.lm_head',
#         f'language_model.model.layers.{num_layers - 1}'
#     ]
#     for key in vision_keys:
#         device_map[key] = 0

#     return device_map

# # ─── Inference ────────────────────────────────────────────────────────────
# def infer(image_path):
#     device_map = split_model(MODEL_PATH)
#     model     = AutoModel.from_pretrained(
#                     MODEL_PATH,
#                     torch_dtype=torch.bfloat16,
#                     low_cpu_mem_usage=True,
#                     use_flash_attn=True,
#                     trust_remote_code=True,
#                     device_map=device_map
#                 ).eval()
#     tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True, use_fast=False)


## Context window

In [None]:
"""
Chunk-by-chunk video description with context-passing
====================================================

▪ Uses the helpers defined further down in this cell (`transform_video`,
  `dynamic_preprocess`, `get_index`, etc.).
▪ Passes **only the previous chunk’s summary** to the next chunk, so the prompt
  length stays fixed.
▪ Saves all chunk summaries to a single JSON file (`video_descriptions.json`).

Drop this *below* your model / tokenizer setup. Nothing else to import.
"""

import json
import math
import traceback
from pathlib import Path

import numpy as np
import torch
from decord import VideoReader, cpu
from PIL import Image

# ─── Runtime parameters ────────────────────────────────────────────────
NUM_FRAMES   = 32      # frames sampled per chunk
CLIP_DUR     = 120     # length of each chunk in seconds (≈2 min)
TEMPERATURE  = 0.1     # sampling temperature used in GEN_CONFIG
INPUT_SIZE   = 448     # side length (pixels) frames are resized to
# Same path as the batch cell's OUTPUT_JSON, so running both cells overwrites the file.
OUTPUT_JSON  = "video_descriptions.json"
MEAN         = (0.485, 0.456, 0.406)   # per-channel mean for T.Normalize
STD          = (0.229, 0.224, 0.225)   # per-channel std for T.Normalize
MAX_PATCHES  = 12      # tile limit for still images; frames in this cell use max_num=1

# Report template used by describe_video for the chunk that starts at 0 s,
# i.e. when there is no earlier summary to build on.
first_prompt = """
Be an objective Visual Evidence Analyst. Report on visual data only.

Summary:
- Overview and chronological events.

- Military/Police:
  - Appearance: Uniform colors, is there a visable insignia, gear.
  - Weapons: Firearm or sticks.
  - Force: All instances of physical force.
  - Interaction: Commands, arrests, aid etc.

- Civilians:
  - Actions: Protesting, fleeing, throwing objects, etc.
  - Condition: Visible injuries, distress, on ground.

- Vehicles:
  - Type: Civilian, police, military.
  - Markings: Transcribe & translate.

- Written Materials:
  - Content: Banners, signs, graffiti.
  - Translation: Quote & translate.

- Conditions:
  - Environment: Puddles on the ground indicating rain.
"""

# Instructions used by describe_video for every chunk after the first: the
# model is asked to merge the current clip into the summary carried over from
# the previous chunk and to return one cumulative report.
second_prompt = """
Continue acting as an objective Visual Evidence Analyst. Your task is to produce a single, updated cumulative report.



**Instructions:**

1.  Take the analysis from the previous context as your starting point.
2.  Integrate any new visual information from the current clip into the appropriate categories below.
3.  Carry forward all previously seen details, even if they are not visible in the current clip. 
    The final output must be a complete record of everything seen so far.
4.  Do not create a separate "new observations" section.

- Overview and chronological events
- Military/Police
  - Appearance
  - Weapons
  - Force
  - Interaction
- Civilians
  - Actions
  - Condition
- Vehicles
  - Type
  - Markings
- Written Materials
  - Content
  - Translation
- Conditions
  - Puddles suggesting it might have rained
"""

# Generation settings handed to model.chat for every chunk.
GEN_CONFIG = dict(max_new_tokens=1024, do_sample=True, temperature=TEMPERATURE)

# ─── Frame transform (built once per configuration, then reused) ────────
# Maps (INPUT_SIZE, MEAN, STD) to the Compose built for those settings.
_VIDEO_TFM_CACHE = {}


def transform_video(img=None):
    """Preprocess one video frame, or return the preprocessing pipeline.

    An earlier cell of this notebook defines ``transform_video()`` without
    parameters, and ``load_video`` calls it that way. Making ``img`` optional
    lets this later definition serve both call styles instead of breaking the
    zero-argument caller once this cell has run.

    Args:
        img: PIL image to preprocess. When omitted, the pipeline itself is
            returned.

    Returns:
        The transformed tensor when ``img`` is given, otherwise the
        torchvision ``Compose`` (RGB conversion, bicubic resize to
        ``INPUT_SIZE``, tensor conversion, normalization with ``MEAN``/``STD``).
    """
    # Key on the constants so that editing them and re-calling rebuilds the pipeline.
    key = (INPUT_SIZE, MEAN, STD)
    tfm = _VIDEO_TFM_CACHE.get(key)
    if tfm is None:
        tfm = T.Compose([
            T.Lambda(lambda x: x.convert('RGB')),
            T.Resize((INPUT_SIZE, INPUT_SIZE), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=MEAN, std=STD)
        ])
        _VIDEO_TFM_CACHE[key] = tfm
    if img is None:
        return tfm
    return tfm(img)

# ─── Frame sampling + decoding ──────────────────────────────────────────
def get_index(clip_time_range, fps, max_frame, first_idx=0, num_segments=32):
    """Choose one frame index per segment inside a time range of a video.

    The frame range is split into ``num_segments`` equal segments and the
    frame near the middle of each segment is selected.

    Args:
        clip_time_range: ``[start_seconds, end_seconds]``; a falsy value
            selects the whole video.
        fps: Frames per second used to convert seconds to frame numbers.
        max_frame: Largest valid frame index.
        first_idx: Smallest frame index that may be used.
        num_segments: Number of indices to return.

    Returns:
        Integer numpy array with ``num_segments`` indices. The array is empty
        when ``num_segments`` is not positive or when the requested range
        starts after ``max_frame``; previously these cases raised
        ``ZeroDivisionError`` or produced a reversed index sequence.
    """
    if clip_time_range:
        start, end = clip_time_range[0], clip_time_range[1]
    else:  # no range given: bounds wide enough to be clamped to the whole video
        start, end = -100000, 100000
    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)

    # Nothing to sample: let the caller's empty check handle it.
    if num_segments <= 0 or end_idx < start_idx:
        return np.array([], dtype=int)

    seg_size = float(end_idx - start_idx) / num_segments
    return np.array([
        int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
        for idx in range(num_segments)
    ], dtype=int)

def dynamic_preprocess(image, image_size=448, max_num=12):
    """Split an image into square tiles laid out on an aspect-matched grid.

    Every grid of ``columns x rows`` with at most ``max_num`` cells is
    compared against the image's width/height ratio; the first grid with the
    smallest difference is used. The image is resized to fill that grid and
    cropped tile by tile.

    Args:
        image: Object exposing ``size``, ``resize`` and ``crop`` (a PIL image).
        image_size: Side length of each tile in pixels.
        max_num: Maximum number of tiles.

    Returns:
        List of tiles, ordered row by row from the top-left corner.
    """
    src_w, src_h = image.size
    ratio = src_w / src_h

    grid = (1, 1)
    smallest_gap = float('inf')
    for n_cols in range(1, max_num + 1):
        # Only row counts that keep n_cols * n_rows within max_num.
        for n_rows in range(1, max_num // n_cols + 1):
            gap = abs(ratio - (n_cols / n_rows))
            if gap < smallest_gap:
                grid = (n_cols, n_rows)
                smallest_gap = gap

    n_cols, n_rows = grid
    image = image.resize((image_size * n_cols, image_size * n_rows))

    tiles = []
    for row in range(n_rows):
        top, bottom = row * image_size, (row + 1) * image_size
        for col in range(n_cols):
            left, right = col * image_size, (col + 1) * image_size
            tiles.append(image.crop((left, top, right, bottom)))
    return tiles

def load_video_frames(video_path, start_s, end_s, num_segments=NUM_FRAMES):
    """Decode and preprocess the frames sampled from one time window.

    Args:
        video_path: Path handed to decord's ``VideoReader``.
        start_s: Window start in seconds.
        end_s: Window end in seconds.
        num_segments: Number of frames to sample inside the window.

    Returns:
        Tuple ``(frames, indices)``: the preprocessed frames stacked along a
        new leading dimension, and the frame indices from ``get_index``. When
        no indices are produced, ``frames`` is an empty tensor.
    """
    reader = VideoReader(video_path, ctx=(cpu(0)))
    frame_ids = get_index(
        [start_s, end_s],
        reader.get_avg_fps(),
        len(reader) - 1,
        num_segments=num_segments,
    )
    if len(frame_ids) == 0:
        return torch.empty(0), frame_ids

    # Decode all selected frames in one call.
    decoded = reader.get_batch(frame_ids.tolist()).asnumpy()

    def _to_tensor(frame_array):
        rgb = Image.fromarray(np.asarray(frame_array)).convert("RGB")
        # max_num=1 yields a single tile covering the whole frame.
        first_tile = dynamic_preprocess(rgb,
                                        image_size=INPUT_SIZE,
                                        max_num=1)[0]
        return transform_video(first_tile)

    return torch.stack([_to_tensor(frame) for frame in decoded]), frame_ids


def clip_as_prompt(n):
    """Build the placeholder block for ``n`` frames.

    Returns one ``FrameK: <image>`` line per frame (K counted from 1), joined
    by newlines without a trailing newline; an empty string for ``n == 0``.
    """
    lines = []
    for frame_no in range(1, n + 1):
        lines.append(f"Frame{frame_no}: <image>")
    return "\n".join(lines)


# ─── Main loop ─────────────────────────────────────────────────────────
def describe_video(video_path, clip_dur=CLIP_DUR):
    """Describe a video chunk by chunk, passing each summary on to the next.

    The video is cut into windows of ``clip_dur`` seconds. Frames are sampled
    from each window and sent to the model together with the summary of the
    previous window, so the prompt length does not grow with video length.
    All chunk summaries are written to ``OUTPUT_JSON`` at the end.

    Args:
        video_path: Path of the video file.
        clip_dur: Window length in seconds.

    Returns:
        List of dicts with ``start_sec``, ``end_sec`` and ``description``, one
        per processed chunk. An empty list if the video cannot be opened.
    """
    results, prev_output = [], ""
    video_path = Path(video_path)

    try:
        meta_vr  = VideoReader(str(video_path), ctx=cpu(0))
        fps      = meta_vr.get_avg_fps()
        last_frame = len(meta_vr) - 1
        duration_float = len(meta_vr) / fps
    except Exception:
        print("❌ Failed to open video:", traceback.format_exc())
        return []

    print(f"▶ Processing {video_path.name} ({duration_float:.1f}s)")

    # Round up so the trailing fraction of a second, and videos shorter than
    # one second, still get a segment. The small epsilon keeps float noise
    # just above a whole second from creating an extra, empty segment.
    duration_int = math.ceil(duration_float - 1e-6)

    for start in range(0, duration_int, clip_dur):
        end = min(start + clip_dur, duration_int)
        print(f" Segment {start:>5.1f}s – {end:>6.1f}s")

        # No frame exists at or after this start time: nothing to describe.
        if round(start * fps) > last_frame:
            continue

        # Load frames
        vid_t, _ = load_video_frames(str(video_path), start, end)

        # Skip chunks that produced no frames
        if vid_t.numel() == 0:
            continue

        n_frames = vid_t.shape[0]

        # Cast and move to GPU
        vid_t = vid_t.to(torch.bfloat16).cuda().contiguous()  # (T, C, H, W)

        # Build prompt: every chunk gets one "<image>" placeholder per frame.
        frame_block = clip_as_prompt(n_frames)
        if prev_output:
            context = f"The previous clip showed: {prev_output}\n\n"
            prompt = (
                f"This is what you found in a previous part of this video:\n{context}"
                f"{frame_block}\n"
                f"{second_prompt}"
            )
        else:
            # First processed chunk (or no earlier summary available).
            prompt = f"{frame_block}\n{first_prompt}"

        response, _ = model.chat(
            tokenizer, vid_t, prompt, GEN_CONFIG,
            # Each frame is a single tile. Per the InternVL chat API, omitting
            # this list makes chat() expand only the first "<image>"
            # placeholder with all frames.
            num_patches_list=[1] * n_frames,
            history=None, return_history=True
        )

        prev_output = response

        results.append({"start_sec": start,
                        "end_sec": end,
                        "description": response})

        # Release GPU memory before the next chunk is loaded.
        del vid_t
        torch.cuda.empty_cache()

    # ── Save JSON ────────────────────────────────────────────────────
    # ensure_ascii=False keeps transcribed non-English text readable.
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"✔ All chunk summaries saved to {OUTPUT_JSON}")

    return results

# ─── main ───────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Run the chunk-by-chunk description on one sample video from the toy dataset.
    describe_video("toy_ds/TNS_0001_V.mp4")

▶ Processing TNS_0001_V.mp4 (1167.7s)
 Segment   0.0s –  120.0s


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
