## Setup

In [None]:
# # Install the packages
# %pip install decord
# %pip install bitsandbytes==0.46.0
# %pip install flash-attn==2.8.0.post2 --no-build-isolation

In [None]:
# Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

# toy_ds = '/content/drive/My Drive/datasets/InternVL/toy_ds'
# MODEL_PATH = '/content/drive/My Drive/InterVL/InternVL3-78B'

## Video

In [None]:
import os
# Expose only the device with index "1" to CUDA for this process. The variable
# is assigned here, ahead of the `import torch` that follows.
# NOTE(review): presumably the ordering is deliberate so the CUDA runtime picks
# the setting up at initialisation — confirm the index matches the target host.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import math
import torch
import numpy as np
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer, AutoConfig

# ─── Prompt ─────────────────────────────────────────────────
# Instruction text appended after the per-frame `<image>` placeholders in infer().
prompt_string = """
Please write one detailed but concise paragraph describing the following, based on the video frames or images above:
- The overall scene and weather conditions (e.g., rain, puddles).
- Any visible weapons or use of force:
  • Guns (how many and where)
  • Are there people fighting, throwing punches, kicking, swinging sticks or bats?
  • Are people in uniform raising their arms?
- Vehicles in the scene:
  • Presence of military vehicles (specify type)
  • License plates or markings (quote exact text)
- Human activity:
  • Actions of uniformed people (include uniform colors)
  • Actions of non-uniformed people
  • Whether anyone appears to be injured, hurt, or lying on the ground
"""


# ─── Model Setup: InternVL3-38B, float16 ─────────────────────────────────────────
# NOTE(review): this section used to be labelled "8 bit quant", but the call
# below passes no quantization argument and requests float16 weights — confirm
# which of the two is actually intended.
MODEL_PATH = 'pretrained/InternVL3-38B'
# Load the checkpoint, switch it to eval mode and move it to the CUDA device.
model = AutoModel.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True).eval().cuda()

# The tokenizer is loaded from the same checkpoint path as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

# ─── Constants ───────────────────────────────────────────────────
# Side length, in pixels, that build_transform() resizes every frame to.
INPUT_SIZE = 448
# Number of frames sampled from each video and referenced in the prompt.
NUM_FRAMES = 32
# Generation settings handed to model.chat(). The sampling key has to be spelled
# `temperature`; a misspelled key is not a recognised generation option.
GEN_CONFIG = dict(max_new_tokens=1024, do_sample=True, temperature=0.2)
# Per-channel mean / std passed to T.Normalize in build_transform().
MEAN = (0.485, 0.456, 0.406)
STD = (0.229, 0.224, 0.225)

# ─── Transforms ─────────────────────────────────────────────────
def build_transform():
    """Assemble the preprocessing pipeline applied to each PIL frame.

    The returned ``T.Compose`` converts the image to RGB, resizes it to
    ``INPUT_SIZE`` x ``INPUT_SIZE`` with bicubic interpolation, turns it into
    a tensor and normalises it with ``MEAN`` / ``STD``.
    """
    def _as_rgb(img):
        return img.convert('RGB')

    steps = [
        T.Lambda(_as_rgb),
        T.Resize((INPUT_SIZE, INPUT_SIZE), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD),
    ]
    return T.Compose(steps)

# ─── Load and Preprocess Video ──────────────────────────────────
def get_frame_indices(num_frames, total):
    """Return ``num_frames`` integer frame positions spread evenly over a clip.

    Positions run from the first frame (0) to the last one (``total - 1``),
    both included, and come back as a NumPy integer array.
    """
    last_frame = total - 1
    return np.linspace(start=0, stop=last_frame, num=num_frames, dtype=int)

def load_video(video_path, num_frames):
    """Decode evenly spaced frames of a video and stack them into one tensor.

    Args:
        video_path: Path handed to decord's ``VideoReader``.
        num_frames: How many frame positions to request from
            ``get_frame_indices``.

    Returns:
        The preprocessed frames stacked along a new leading dimension.
    """
    reader = VideoReader(video_path, ctx=cpu(0))
    preprocess = build_transform()
    frames = []
    for idx in get_frame_indices(num_frames, len(reader)):
        rgb_frame = Image.fromarray(reader[idx].asnumpy())
        frames.append(preprocess(rgb_frame))
    return torch.stack(frames)  # [num_frames, 3, H, W]

# ─── Inference ──────────────────────────────────────────────────
def infer(video_path):
    """Describe one video with the module-level ``model`` and print the exchange.

    Frames are loaded with ``load_video``, cast to float16 and moved to the
    GPU. The prompt is built from one ``FrameN: <image>`` line per frame
    followed by ``prompt_string``. Both the prompt and the model's answer are
    printed.

    Args:
        video_path: Path of the video file to describe.
    """
    video_tensor = load_video(video_path, NUM_FRAMES).to(torch.float16).cuda()
    video_tensor = video_tensor.contiguous()

    # Every frame contributes exactly one tile. InternVL's chat() pairs one
    # `<image>` placeholder with each entry of num_patches_list; if the list is
    # omitted the whole tensor is treated as a single image, so only the first
    # placeholder would be expanded and the rest would stay as literal text.
    frame_count = video_tensor.shape[0]
    num_patches_list = [1] * frame_count

    prompt = ''.join(f'Frame{i+1}: <image>\n' for i in range(frame_count))
    prompt += prompt_string

    response, _ = model.chat(
        tokenizer,
        video_tensor,
        prompt,
        GEN_CONFIG,
        num_patches_list=num_patches_list,
        history=None,
        return_history=True
    )
    print("User:", prompt)
    print("Assistant:", response)

# ─── Entry ──────────────────────────────────────────────────────
# Run the single-video demo on one sample clip when executed as a script.
if __name__ == "__main__":
    infer("toy_ds/videos/TNS_0169_V.mp4")

## Image

In [None]:
import math
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer, AutoConfig

# ─── Prompt ─────────────────────────────────────────────────
# Question text sent after the `<image>` placeholder; one sentence per line here,
# joined into a single space-separated string.
prompt_string = (
    "Provide a detailed description of each image. "
    "Describe the foreground and background separately. "
    "Mention any people, objects, and actions clearly. "
    "What are the people doing? "
    "What expressions or activities are visible? "
    "What is the setting or context? "
    "Is there violence happening?"
)

# ─── Config ───────────────────────────────────────────────────────────────
# Checkpoint directory used for the config, the model and the tokenizer.
MODEL_PATH = "./pretrained/InternVL3-78B"
# Side length, in pixels, of each tile after build_transform().
INPUT_SIZE = 448
# Upper bound on the number of tiles requested from load_image().
MAX_PATCHES = 12
# Generation settings handed to model.chat().
GEN_CONFIG = {"max_new_tokens": 1024, "do_sample": True}
# Per-channel mean / std passed to T.Normalize.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

# ─── Split Model Across GPUs ──────────────────────────────────────────────
def split_model(model_name):
    """Build a ``device_map`` that spreads the language-model layers over GPUs.

    GPU 0 is counted as half a device when the per-GPU layer share is
    computed and receives half a share itself. After the layers are assigned,
    the vision tower, the projector, the embeddings, the output head, the
    final norm, the rotary embedding and the last decoder layer are all
    pinned to GPU 0.

    Args:
        model_name: Checkpoint path or id passed to ``AutoConfig.from_pretrained``.

    Returns:
        Dict mapping module-name prefixes to GPU indices.
    """
    gpu_count = torch.cuda.device_count()
    cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    num_layers = cfg.llm_config.num_hidden_layers

    # Share per full GPU, with GPU 0 weighted as 0.5 of a device.
    share = math.ceil(num_layers / (gpu_count - 0.5))
    quota = [share] * gpu_count
    quota[0] = math.ceil(quota[0] * 0.5)

    # Flatten the quotas into "which GPU owns layer k", in layer order.
    owners = [gpu for gpu, n_layers in enumerate(quota) for _ in range(n_layers)]
    device_map = {
        f'language_model.model.layers.{layer}': gpu
        for layer, gpu in enumerate(owners)
    }

    # Modules that are always placed on GPU 0, overriding any assignment above.
    pinned = (
        'vision_model',
        'mlp1',
        'language_model.model.tok_embeddings',
        'language_model.model.embed_tokens',
        'language_model.output',
        'language_model.model.norm',
        'language_model.model.rotary_emb',
        'language_model.lm_head',
        f'language_model.model.layers.{num_layers - 1}',
    )
    device_map.update(dict.fromkeys(pinned, 0))

    return device_map

# ─── Image Preprocessing ─────────────────────────────────────────────────
def build_transform():
    """Assemble the preprocessing pipeline applied to each image tile.

    The returned ``T.Compose`` converts non-RGB images to RGB, resizes to
    ``INPUT_SIZE`` x ``INPUT_SIZE`` with bicubic interpolation, converts to a
    tensor and normalises with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """
    def _ensure_rgb(img):
        # Images already in RGB mode are passed through untouched.
        if img.mode != 'RGB':
            return img.convert('RGB')
        return img

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((INPUT_SIZE, INPUT_SIZE), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def dynamic_preprocess(image, image_size=448, max_num=12):
    """Resize an image onto a tile grid and cut it into square tiles.

    The grid (columns x rows) is the one whose column/row ratio is closest to
    the image's width/height ratio among all grids with at most ``max_num``
    cells; on a tie the first grid in column-major search order wins.

    Args:
        image: Object exposing ``size``, ``resize`` and ``crop`` (a PIL image
            in this notebook).
        image_size: Side length of each tile in pixels.
        max_num: Maximum number of tiles.

    Returns:
        List of cropped tiles, ordered row by row, left to right.
    """
    width, height = image.size
    target_ratio = width / height

    # Candidate grids in the same order a nested 1..max_num scan visits them;
    # cols * rows <= max_num is equivalent to rows <= max_num // cols.
    grids = [
        (cols, rows)
        for cols in range(1, max_num + 1)
        for rows in range(1, max_num // cols + 1)
    ]
    # min() keeps the first of equally good candidates.
    cols, rows = min(
        grids,
        key=lambda grid: abs(target_ratio - (grid[0] / grid[1])),
        default=(1, 1),
    )

    resized = image.resize((image_size * cols, image_size * rows))

    return [
        resized.crop((
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        ))
        for row in range(rows)
        for col in range(cols)
    ]

def load_image(path, image_size=448, max_num=12):
    """Open an image, tile it and return the preprocessed tiles as one tensor.

    Args:
        path: Image file to open.
        image_size: Tile side length forwarded to ``dynamic_preprocess``.
        max_num: Maximum tile count forwarded to ``dynamic_preprocess``.

    Returns:
        The transformed tiles stacked along a new leading dimension.
    """
    rgb_image = Image.open(path).convert('RGB')
    pieces = dynamic_preprocess(rgb_image, image_size=image_size, max_num=max_num)
    to_tensor = build_transform()
    return torch.stack(list(map(to_tensor, pieces)))

# ─── Inference ────────────────────────────────────────────────────────────
def infer(image_path):
    """Load the model, describe a single image and print the exchange.

    The model is loaded inside this function using the ``device_map`` from
    ``split_model``. The image is tiled with ``load_image``, cast to bfloat16
    and moved to CUDA, then sent with ``prompt_string`` to ``model.chat``.

    Args:
        image_path: Path of the image file to describe.
    """
    load_kwargs = dict(
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        use_flash_attn=True,
        trust_remote_code=True,
        device_map=split_model(MODEL_PATH),
    )
    model = AutoModel.from_pretrained(MODEL_PATH, **load_kwargs).eval()
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_PATH, trust_remote_code=True, use_fast=False
    )

    tiles = load_image(image_path, max_num=MAX_PATCHES)
    pixel_values = tiles.to(torch.bfloat16).cuda()

    question = "<image>\n" + prompt_string
    response, _ = model.chat(
        tokenizer,
        pixel_values,
        question,
        GEN_CONFIG,
        history=None,
        return_history=True,
    )

    print("User:", question)
    print("Assistant:", response)

# Run the single-image demo on one sample picture when executed as a script.
if __name__ == "__main__":
    infer("toy_ds/images/TNS_3773_I.jpg")

In [None]:
import os
import torch
import json
import numpy as np
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# ─── Prompt ─────────────────────────────────────────────────
prompt_string = """
Please write one detailed but concise paragraph describing the following, based on the video frames or images above:
- The overall scene and weather conditions (e.g., rain, puddles).
- Any visible weapons or use of force:
  • Guns (how many and where)
  • Non-lethal force (e.g., hitting with fists, stones, or sticks)
- Vehicles in the scene:
  • Presence of military vehicles (specify type)
  • License plates or markings (quote exact text)
- Human activity:
  • Actions of uniformed personnel (include uniform colors)
  • Actions of protestors
  • Whether anyone appears to be injured, hurt, or lying on the ground
"""

# ─── Config ─────────────────────────────────────────────────
# Sampling temperature forwarded through GEN_CONFIG.
TEMP = 0.2
# Side length, in pixels, that build_transform() resizes every frame to.
INPUT_SIZE = 448
# Number of frames sampled from each video.
NUM_FRAMES = 28
# Generation settings handed to model.chat().
GEN_CONFIG = {"max_new_tokens": 1024, "do_sample": True, "temperature": TEMP}
# Per-channel mean / std passed to T.Normalize.
MEAN = (0.485, 0.456, 0.406)
STD = (0.229, 0.224, 0.225)
MODEL_PATH = "OpenGVLab/InternVL3-78B"  # Update if needed
VIDEO_DIR = "/content/drive/MyDrive/InternVL/toy_ds/videos"  # Update as needed
# File the batch results are written to.
OUTPUT_JSON = "video_descriptions.json"

# ─── Load Model ──────────────────────────────────────────────
# Load the checkpoint with 8-bit (bitsandbytes) weights and switch to eval mode.
# There is deliberately no `.cuda()` call: transformers rejects `.cuda()` on an
# 8-bit quantized model, because the quantized loader has already placed the
# weights on the GPU.
model = AutoModel.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    load_in_8bit=True,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
).eval()

# The tokenizer is loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

# ─── Transforms ──────────────────────────────────────────────
def build_transform():
    """Create the transform applied to every decoded video frame.

    Steps, in order: convert to RGB, bicubic resize to a square of side
    ``INPUT_SIZE``, convert to a tensor, normalise with ``MEAN`` and ``STD``.
    """
    def _to_rgb(frame):
        return frame.convert('RGB')

    square = (INPUT_SIZE, INPUT_SIZE)
    pipeline = [
        T.Lambda(_to_rgb),
        T.Resize(square, interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD),
    ]
    return T.Compose(pipeline)

# ─── Frame Selection ─────────────────────────────────────────
def get_frame_indices(num_frames, total):
    """Pick ``num_frames`` evenly spaced integer indices in ``[0, total - 1]``.

    Both end points are included; the result is a NumPy integer array.
    """
    first, last = 0, total - 1
    return np.linspace(first, last, num=num_frames, dtype=int)

def load_video(video_path, num_frames):
    """Sample frames from a video file and return them as a stacked tensor.

    Args:
        video_path: Path handed to decord's ``VideoReader``.
        num_frames: Number of frame positions requested from
            ``get_frame_indices``.

    Returns:
        The transformed frames stacked along a new leading dimension.
    """
    reader = VideoReader(video_path, ctx=cpu(0))
    preprocess = build_transform()
    positions = get_frame_indices(num_frames, len(reader))

    def _prepare(position):
        # Decode one frame to a NumPy array, wrap it as a PIL image, transform it.
        return preprocess(Image.fromarray(reader[position].asnumpy()))

    return torch.stack(list(map(_prepare, positions)))  # [num_frames, 3, H, W]

# ─── Inference ───────────────────────────────────────────────
def infer_video(video_path):
    """Return the model's description of one video, or an error string.

    Frames are loaded with ``load_video``, cast to bfloat16 and moved to the
    GPU. The prompt is built from one ``FrameN: <image>`` line per frame
    followed by ``prompt_string``.

    Args:
        video_path: Path of the video file to describe.

    Returns:
        The response text from ``model.chat``. If anything raises, the string
        ``"Error: <message>"`` is returned instead, so that one bad file does
        not abort a batch run.
    """
    try:
        video_tensor = load_video(video_path, NUM_FRAMES).to(torch.bfloat16).cuda()

        # Every frame contributes exactly one tile. InternVL's chat() pairs one
        # `<image>` placeholder with each entry of num_patches_list; if the list
        # is omitted the whole tensor is treated as a single image, so only the
        # first placeholder would be expanded.
        frame_count = video_tensor.shape[0]
        num_patches_list = [1] * frame_count

        prompt = ''.join(f'Frame{i+1}: <image>\n' for i in range(frame_count))
        prompt += prompt_string

        response, _ = model.chat(
            tokenizer,
            video_tensor,
            prompt,
            GEN_CONFIG,
            num_patches_list=num_patches_list,
            history=None,
            return_history=True
        )
        return response
    except Exception as e:
        # Best-effort by design: report the failure as this video's description.
        return f"Error: {str(e)}"

# ─── Batch Loop ──────────────────────────────────────────────
def batch_infer(video_dir):
    """Describe every ``.mp4`` file found directly inside ``video_dir``.

    File names are visited in sorted order and the extension check ignores
    case. Other files are skipped.

    Args:
        video_dir: Directory to scan.

    Returns:
        Dict mapping each video file name to what ``infer_video`` returned.
    """
    clip_names = [
        name
        for name in sorted(os.listdir(video_dir))
        if name.lower().endswith(".mp4")
    ]
    return {
        name: infer_video(os.path.join(video_dir, name))
        for name in clip_names
    }

# ─── Entry ───────────────────────────────────────────────────
if __name__ == "__main__":
    # Describe every video in VIDEO_DIR, then save the name -> description
    # mapping as indented JSON in OUTPUT_JSON.
    results = batch_infer(VIDEO_DIR)
    serialized = json.dumps(results, indent=2)
    with open(OUTPUT_JSON, "w") as f:
        f.write(serialized)