In [1]:
# Interactive Hugging Face Hub login; in a notebook this renders a token-entry widget.
# NOTE(review): presumably needed so the LLM checkpoint used later can be downloaded —
# confirm whether that repository is gated.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
# Force a fresh Weights & Biases CLI login; the command prompts for an API key.
# NOTE(review): presumably these are the credentials `wandb.Api()` picks up later — confirm.
!wandb login --relogin

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
import os, math, torch, cv2, wandb
import numpy as np
from PIL import Image
from torch import nn
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoProcessor,
    SiglipVisionModel
)

# ================= CONFIGURATION =================
# Weights & Biases coordinates of the training run whose adapter checkpoints are
# downloaded below. RUN_NAME is matched against the run's display name.
WANDB_ENTITY  = "eren23"
WANDB_PROJECT = "blipren-video-synthetic"
RUN_NAME      = "whole-glitter-6"

# Model Config
# Hugging Face identifiers of the language model and the vision encoder.
LLM_NAME    = "meta-llama/Llama-3.2-1B"
VISION_NAME = "google/siglip-so400m-patch14-384"
# Number of temporal positions the VideoBLIP time-embedding buffer is built for.
NUM_FRAMES  = 4
# Side length (pixels) of the square synthetic frames drawn by create_inference_video;
# the frames are passed through the vision processor before reaching the encoder.
IMG_SIZE    = 224

# Use the GPU when one is visible to torch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

# ================= 1. CONNECT & DOWNLOAD (FIXED) =================
# Locate the training run by display name, work out where its adapter
# checkpoints live, verify that BOTH files exist, then download them.

print(f"Connecting to WandB Run: {RUN_NAME}...")
api = wandb.Api()
runs = api.runs(f"{WANDB_ENTITY}/{WANDB_PROJECT}", filters={"display_name": RUN_NAME})

if not runs:
    raise ValueError(f"Run '{RUN_NAME}' not found!")
# If several runs share this display name, the first one returned is used.
run = runs[0]
print(f"Found Run ID: {run.id}")

# DEBUG: List files
print("Scanning files...")
available_files = [f.name for f in run.files()]

# Checkpoints are expected under 'video_checkpoints/'; the run root is accepted
# as a fallback (backward compatibility). The Q-Former file decides which
# layout is in use, and the projector is expected right next to it.
qformer_remote_path = None
projector_remote_path = None
for checkpoint_prefix in ("video_checkpoints/", ""):
    if f"{checkpoint_prefix}qformer_best.pt" in available_files:
        qformer_remote_path = f"{checkpoint_prefix}qformer_best.pt"
        projector_remote_path = f"{checkpoint_prefix}projector_best.pt"
        break

if qformer_remote_path is None:
    print(f"Available: {available_files}")
    raise FileNotFoundError("Could not find qformer_best.pt in WandB files!")

# Previously the projector was never checked, so a missing file only surfaced
# as an opaque failure during download. Fail early with the file listing instead.
if projector_remote_path not in available_files:
    print(f"Available: {available_files}")
    raise FileNotFoundError(f"Could not find {projector_remote_path} in WandB files!")

# Downloads go under the current directory, so the remote path doubles as the
# local path used when the weights are loaded later.
print(f"Downloading {qformer_remote_path}...")
run.file(qformer_remote_path).download(root=".", replace=True)

print(f"Downloading {projector_remote_path}...")
run.file(projector_remote_path).download(root=".", replace=True)

print("Weights Downloaded.")

# ================= 2. DEFINE ARCHITECTURE =================

def get_sinusoidal_embeddings(n_pos, d_model):
    """Build fixed sinusoidal position embeddings.

    Even feature indices ``2k`` hold ``sin(pos * w_k)`` and odd indices
    ``2k + 1`` hold ``cos(pos * w_k)``, with ``w_k = 10000 ** (-2k / d_model)``.

    Args:
        n_pos: Number of positions to encode.
        d_model: Embedding width. Odd widths are supported: the last column
            is then a sine column without a matching cosine partner.

    Returns:
        Float tensor of shape ``(1, n_pos, 1, d_model)``. The two singleton
        axes allow broadcasting against a ``(B, n_pos, N, d_model)`` tensor.
    """
    pe = torch.zeros(n_pos, d_model)
    position = torch.arange(0, n_pos, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
    angles = position * div_term
    pe[:, 0::2] = torch.sin(angles)
    # With an odd d_model there is one fewer odd column than even columns, so
    # trim the frequencies to fit; for even d_model the slice is a no-op.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    return pe.unsqueeze(0).unsqueeze(2)

class QFormer(nn.Module):
    """Learned-query resampler over visual tokens.

    A fixed bank of ``n_queries`` learned vectors is run through a stack of
    transformer decoder layers, using the linearly projected visual tokens as
    the decoder memory. The result is layer-normalised.

    Args:
        d_vis: Feature width of the incoming visual tokens.
        d_model: Width of the queries and of the output.
        n_queries: Number of learned query vectors (output sequence length).
        n_layers: Number of decoder layers.
    """

    def __init__(self, d_vis, d_model, n_queries=64, n_layers=6):
        super().__init__()
        # Attribute names are also the state_dict keys, so they must not be renamed.
        self.query = nn.Parameter(torch.randn(1, n_queries, d_model))
        self.vis_proj = nn.Linear(d_vis, d_model)
        self.transformer = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(
                d_model=d_model,
                nhead=8,
                dim_feedforward=d_model * 4,
                batch_first=True,
            ),
            num_layers=n_layers,
        )
        self.ln_out = nn.LayerNorm(d_model)

    def forward(self, vis_features):
        """Map ``(B, N, d_vis)`` visual tokens to ``(B, n_queries, d_model)``."""
        batch_size = vis_features.size(0)
        memory = self.vis_proj(vis_features)
        # Share the single learned query bank across the batch without copying.
        queries = self.query.expand(batch_size, -1, -1)
        decoded = self.transformer(queries, memory)
        return self.ln_out(decoded)

class Projector(nn.Module):
    """Residual two-layer MLP with layer norm before and after.

    Computes ``ln2(linear2(gelu(linear1(ln1(x)))) + x)``; input and output
    share the same width ``d_model``.
    """

    def __init__(self, d_model):
        super().__init__()
        # Attribute names are also the state_dict keys, so they must not be renamed.
        self.ln1 = nn.LayerNorm(d_model)
        self.linear1 = nn.Linear(d_model, d_model)
        self.gelu = nn.GELU()
        self.linear2 = nn.Linear(d_model, d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """Apply the residual MLP to ``x`` (last dimension ``d_model``)."""
        hidden = self.linear1(self.ln1(x))
        hidden = self.linear2(self.gelu(hidden))
        # The skip connection uses the un-normalised input.
        return self.ln2(hidden + x)

class VideoBLIP(nn.Module):
    """Video-to-text wrapper: vision encoder -> Q-Former -> projector -> LLM.

    Each frame is encoded by ``vision``, a per-frame sinusoidal time embedding
    is added, all frame tokens are concatenated and compressed by ``qformer``,
    and the projected queries are prepended to the prompt embeddings as a
    prefix for ``llm.generate``.

    Args:
        llm: Causal language model exposing ``get_input_embeddings``,
            ``generate`` and ``dtype``.
        vision: Vision encoder exposing ``config.hidden_size`` and returning an
            object with ``last_hidden_state``.
        qformer: Module mapping visual tokens to a fixed number of queries.
        projector: Module applied to the Q-Former output.
        num_frames: Number of temporal positions the time embedding covers.
        tokenizer: Optional tokenizer used by ``generate``. When omitted,
            ``generate`` falls back to a module-level variable named
            ``tokenizer`` (the original behaviour).
    """

    def __init__(self, llm, vision, qformer, projector, num_frames, tokenizer=None):
        super().__init__()
        self.llm = llm
        self.vision = vision
        self.qformer = qformer
        self.projector = projector
        self.num_frames = num_frames
        self.tokenizer = tokenizer

        d_vis = vision.config.hidden_size
        sinusoidal_embed = get_sinusoidal_embeddings(num_frames, d_vis)
        # Buffer (not a parameter): it follows .to(device) but is never trained.
        self.register_buffer("time_embed", sinusoidal_embed)

    def encode_video(self, pixel_values):
        """Encode a batch of clips into LLM-space prefix embeddings.

        Args:
            pixel_values: Tensor of shape ``(B, T, C, H, W)`` with
                ``T <= num_frames``. It need not be contiguous.

        Returns:
            The projector output cast to ``llm.dtype``.

        Raises:
            ValueError: If the clip has more frames than the time embedding
                was built for.
        """
        B, T, C, H, W = pixel_values.shape
        if T > self.num_frames:
            # Without this check the slice below silently yields fewer rows
            # than T and the addition fails with a cryptic broadcast error.
            raise ValueError(
                f"Got {T} frames but the time embedding only covers {self.num_frames}."
            )
        # reshape (not view) so non-contiguous inputs, e.g. after a transpose
        # or permute, are accepted.
        pixel_values_flat = pixel_values.reshape(B * T, C, H, W)
        # The vision encoder is only ever run without gradients here.
        with torch.no_grad():
            vout = self.vision(pixel_values=pixel_values_flat)
            vtoks = vout.last_hidden_state
        d_vis = vtoks.shape[-1]
        vtoks = vtoks.reshape(B, T, -1, d_vis)

        # (1, T, 1, d_vis) broadcasts over batch and over tokens within a frame.
        t_embed = self.time_embed[:, :T, :, :]
        vtoks = vtoks + t_embed.to(vtoks.dtype)

        # Concatenate the tokens of all frames into one sequence per clip.
        vtoks = vtoks.reshape(B, -1, d_vis)
        # The adapters are run on float32 input regardless of the backbone dtype.
        q = self.qformer(vtoks.to(torch.float32))
        return self.projector(q).to(self.llm.dtype)

    @torch.no_grad()
    def generate(self, pixel_values, prompts, max_new_tokens=30, repetition_penalty=1.2):
        """Greedy-decode text for each clip, conditioned on its prompt.

        Args:
            pixel_values: Tensor of shape ``(B, T, C, H, W)``.
            prompts: List of ``B`` prompt strings.
            max_new_tokens: Generation budget passed to ``llm.generate``.
            repetition_penalty: Repetition penalty passed to ``llm.generate``.

        Returns:
            List of decoded strings, one per batch element.
        """
        # Prefer the tokenizer bound at construction time; otherwise use the
        # module-level `tokenizer`, which must exist before the first call.
        tok = getattr(self, "tokenizer", None)
        if tok is None:
            tok = tokenizer

        q = self.encode_video(pixel_values)
        K = q.size(1)
        # NOTE(review): with several prompts of different lengths, the padding
        # side matters for decoder-only generation — confirm the tokenizer is
        # configured appropriately before batching.
        enc = tok(prompts, return_tensors="pt", padding=True).to(pixel_values.device)
        input_ids = enc.input_ids
        attn_mask = enc.attention_mask
        txt_emb = self.llm.get_input_embeddings()(input_ids)
        # The visual prefix comes first, followed by the prompt embeddings.
        all_emb = torch.cat([q, txt_emb], dim=1)
        # Every visual prefix position is attended to.
        prefix_mask = torch.ones(input_ids.size(0), K, device=input_ids.device, dtype=attn_mask.dtype)
        full_mask = torch.cat([prefix_mask, attn_mask], dim=1)

        out_ids = self.llm.generate(
            inputs_embeds=all_emb, attention_mask=full_mask, max_new_tokens=max_new_tokens,
            pad_token_id=tok.eos_token_id, repetition_penalty=repetition_penalty, do_sample=False
        )
        return tok.batch_decode(out_ids, skip_special_tokens=True)

# ================= 3. LOAD MODELS & RUN =================
# Load the frozen backbones in half precision, rebuild the adapters with the
# same hyper-parameters as the checkpoint, and restore the adapter weights.

print("Loading Backbones...")
tokenizer = AutoTokenizer.from_pretrained(LLM_NAME)
# Batched tokenisation with padding=True needs a pad token; reuse EOS if none is set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
processor = AutoProcessor.from_pretrained(VISION_NAME)
vision_model = SiglipVisionModel.from_pretrained(VISION_NAME, torch_dtype=torch.float16).to(device)
llm = AutoModelForCausalLM.from_pretrained(LLM_NAME, torch_dtype=torch.float16).to(device)

d_model = llm.config.hidden_size
d_vision = vision_model.config.hidden_size

# Init Adapters
qformer = QFormer(d_vis=d_vision, d_model=d_model, n_queries=64, n_layers=6)
projector = Projector(d_model)

# Load State Dicts from the path we downloaded to
print("Loading Weights...")
# Note: wandb download preserves folder structure if it was in a folder
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
qformer.load_state_dict(torch.load(qformer_remote_path, map_location=device, weights_only=True))
projector.load_state_dict(torch.load(projector_remote_path, map_location=device, weights_only=True))

# Build Final Model
model = VideoBLIP(llm, vision_model, qformer, projector, num_frames=NUM_FRAMES).to(device)
# Inference only: switch every submodule to eval mode.
model.eval()

# ================= 4. GENERATE VIDEO & PREDICT =================

def create_inference_video(sequence):
    """Render every item of ``sequence`` as one synthetic video frame.

    Each frame is a black ``IMG_SIZE x IMG_SIZE`` RGB canvas with the item's
    string form drawn in white, positioned from the measured text size so that
    it sits in the middle of the canvas.

    Args:
        sequence: Iterable of values; ``str()`` of each one is drawn.

    Returns:
        List of PIL images, one per item, in input order.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 6
    thickness = 10
    white = (255, 255, 255)

    def render_frame(value):
        # One frame: fresh black canvas with the label drawn onto it.
        canvas = np.zeros((IMG_SIZE, IMG_SIZE, 3), dtype=np.uint8)
        label = str(value)
        (label_w, label_h), _baseline = cv2.getTextSize(label, font, font_scale, thickness)
        # putText takes the bottom-left corner of the text as its origin.
        origin = ((IMG_SIZE - label_w) // 2, (IMG_SIZE + label_h) // 2)
        cv2.putText(canvas, label, origin, font, font_scale, white, thickness)
        return Image.fromarray(canvas)

    return [render_frame(value) for value in sequence]

# TEST CASE
# Render a short digit sequence as a clip, run the model on it, and print the
# prediction next to the ground truth.
TEST_SEQ = [8, 1, 5, 2]
# The model's time embedding only has NUM_FRAMES positions; fail with a clear
# message instead of a shape error deep inside the forward pass.
if len(TEST_SEQ) > NUM_FRAMES:
    raise ValueError(
        f"TEST_SEQ has {len(TEST_SEQ)} items but the model supports at most {NUM_FRAMES} frames."
    )
print(f"\n🎥 Generating Video for sequence: {TEST_SEQ}")

frames = create_inference_video(TEST_SEQ)
prompt = "Numbers in video:"

inputs = processor(images=frames, return_tensors="pt")
# Follow the vision backbone's dtype rather than repeating a hard-coded one,
# so this cell stays correct if the backbone is loaded in another precision.
pixel_values = inputs.pixel_values.to(device, dtype=vision_model.dtype)
# Add the batch axis: (T, C, H, W) -> (1, T, C, H, W).
pixel_values = pixel_values.unsqueeze(0)

print("Running Inference...")
generated_text = model.generate(
    pixel_values,
    [prompt],
    max_new_tokens=20,
    repetition_penalty=1.2
)

# Drop the prompt in case the decoded text echoes it, then trim whitespace.
result = generated_text[0].replace(prompt, "").strip()

print("-" * 30)
print(f"Prompt: {prompt}")
print(f"Truth : {', '.join(map(str, TEST_SEQ))}")
print(f"Model : {result}")
print("-" * 30)

Device: cuda
📥 Connecting to WandB Run: whole-glitter-6...
✅ Found Run ID: k6m9ikzj
🔎 Scanning files...
⬇️ Downloading video_checkpoints/qformer_best.pt...
⬇️ Downloading video_checkpoints/projector_best.pt...
✅ Weights Downloaded.
⏳ Loading Backbones...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/368 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/711 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


config.json:   0%|          | 0.00/576 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.51G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

🔄 Loading Weights...

🎥 Generating Video for sequence: [8, 1, 5, 2]
🧠 Running Inference...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


------------------------------
Prompt: Numbers in video:
Truth : 8, 1, 5, 2
Model : 8, 1, 5, 2
------------------------------
