In [2]:
# Run this cell first (single bash cell).
# Minimal, Kaggle-friendly installs. Adjust versions if Kaggle changes infra.
!pip install --upgrade pip

# PyTorch - use Kaggle's preinstalled torch if available; else, install a matching wheel.
# On Kaggle, the preinstalled torch should be fine. If you want specific CUDA wheel, uncomment appropriate line.
# pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118

# open_clip (LAION implementation)
!pip install open_clip_torch

# OCR + ASR + client
!pip install easyocr
!pip install faster-whisper
!pip install pillow numpy scipy opencv-python-headless psutil

# OpenAI client (used with Friendli style base_url)
!pip install openai

# Optional: if you later enable detectron2 / APE, install per repo instructions; heavy.


Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3
Collecting faster-whisper
  Downloading faster_whisper-1.2.1-py3-none-any.whl.metadata (16 kB)
Collecting ctranslate2<5,>=4.0 (from faster-whisper)
  Downloading ctranslate2-4.6.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (10 kB)
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting av>=11 (from faster-whisper)
  Downloading av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl.metada

In [25]:
import os, sys, json, time, math, base64
from pathlib import Path
import numpy as np
import cv2
import torch
import psutil

# Cache & data dirs (Kaggle working area)
BASE = Path("/kaggle/working/data/cache")
FRAMES_DIR = BASE / "frames"
CLIP_DIR = BASE / "clip_embeddings"
APE_DIR = BASE / "ape"
OCR_DIR = BASE / "ocr"
ASR_DIR = BASE / "asr"
RETR_DIR = BASE / "retrieval"
LOG_DIR = BASE / "logs"
VIDEOS = BASE / "videos"

for p in [FRAMES_DIR, CLIP_DIR, APE_DIR, OCR_DIR, ASR_DIR, RETR_DIR, LOG_DIR, VIDEOS]:
    p.mkdir(parents=True, exist_ok=True)

# Keep HF/torch caches local (avoid filling system cache)
os.environ["HF_HOME"] = "/kaggle/working/hf"
os.environ["TRANSFORMERS_CACHE"] = "/kaggle/working/hf"
os.environ["HF_DATASETS_CACHE"] = "/kaggle/working/hf"
os.environ["TORCH_HOME"] = "/kaggle/working/torch"
os.environ["FRIENDLI_API_KEY"] = "flp_q0uKwZrrCQKnzUIiqLxul0Nk2qRE3dEvkSBJ9O3hQGw9d"
# Friendli/OpenAI key from Kaggle Secrets or env
FRIENDLI_API_KEY = os.environ.get("FRIENDLI_API_KEY", None)
if FRIENDLI_API_KEY is None:
    print("WARNING: FRIENDLI_API_KEY not set. Add via Kaggle Secrets or export env var.")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)
print("RAM available (GB):", psutil.virtual_memory().available/1e9)


Device: cuda
RAM available (GB): 27.915464704


In [23]:
os.environ["FRIENDLI_API_KEY"] = "flp_Od6MuNzJzZQX3tNYu5OntJCKZxTrytnIvErxRJveGyHwd4"

In [5]:
import time, json
def save_json(obj, path):
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)
    return str(path)

def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def save_log(video_name, level, module, msg):
    p = LOG_DIR / f"{video_name}.log"
    rec = {"ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), "level":level, "module":module, "msg":msg}
    with open(p, "a", encoding="utf-8") as f:
        f.write(json.dumps(rec) + "\n")

def np_savez(path, **kwargs):
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    np.savez_compressed(path, **kwargs)
    return str(path)


In [6]:
def sample_frames(video_path, video_name, max_frames=32):
    out_dir = FRAMES_DIR / video_name
    out_dir.mkdir(parents=True, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
    if total == 0:
        raise RuntimeError("Video has zero frames or cannot be opened.")
    idxs = [min(total-1, int(i * total / max_frames)) for i in range(max_frames)]
    saved = []
    for i, idx in enumerate(idxs):
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ok, frame = cap.read()
        if not ok:
            continue
        path = out_dir / f"frame_{i:04d}.jpg"
        cv2.imwrite(str(path), frame)  # cv2 writes BGR, but downstream loads expect RGB or convert as needed
        saved.append(str(path))
    cap.release()
    save_log(video_name, "INFO", "frame_sampler", f"sampled {len(saved)} frames")
    return saved


In [7]:
# open_clip usage
import open_clip
from PIL import Image
import numpy as np
import torch

def run_openclip_and_save(video_name, frame_paths, model_name="ViT-L-14", pretrained="laion2b_s32b_b82k", batch_size=8):
    """
    Computes image embeddings using open_clip. Saves embeddings as .npz at CLIP_DIR/<video_name>.npz
    """
    device = DEVICE
    # model selection fallback: try large then base
    model_candidates = [(model_name, pretrained), ("ViT-H-14", "laion2b_s32b_b79k"), ("ViT-B-32", "laion2b_s34b_b79k")]
    model = None
    for name, pre in model_candidates:
        try:
            print(f"Trying open_clip model {name} pretrained={pre} on device={device}")
            model, _, preprocess = open_clip.create_model_and_transforms(name, pretrained=pre)
            tokenizer = open_clip.get_tokenizer(name)
            model.to(device)
            model.eval()
            break
        except Exception as e:
            print(f"Failed to load open_clip model {name} ({pre}): {e}")
            model = None
    if model is None:
        raise RuntimeError("Failed to load any open_clip model")

    if device == "cuda":
        model = model.half()  # use FP16 on GPU

    all_embs = []
    frame_idxs = []
    for i in range(0, len(frame_paths), batch_size):
        batch_paths = frame_paths[i:i+batch_size]
        imgs = [preprocess(Image.open(p).convert("RGB")).unsqueeze(0) for p in batch_paths]
        tensor = torch.cat(imgs, dim=0).to(device)
        if device == "cuda":
            tensor = tensor.half()
        with torch.no_grad():
            img_feats = model.encode_image(tensor)  # shape (B, D)
            # normalize
            img_feats = img_feats / img_feats.norm(dim=-1, keepdim=True)
            emb = img_feats.cpu().float().numpy()
        all_embs.append(emb)
        frame_idxs.extend(list(range(i, i+len(batch_paths))))
    embeddings = np.vstack(all_embs)
    np_savez(CLIP_DIR / f"{video_name}.npz", embeddings=embeddings.astype(np.float32), frame_indices=np.array(frame_idxs, dtype=int))
    save_log(video_name, "INFO", "open_clip", f"saved embeddings shape={embeddings.shape}")
    # return also tokenizer and model (caller can keep them in memory if desired)
    return embeddings, frame_idxs, model, tokenizer, preprocess

# Example usage:
# frames = sample_frames(VIDEO_PATH, VIDEO_NAME, max_frames=32)
# embeddings, idxs, oc_model, oc_tokenizer, oc_pre = run_openclip_and_save(VIDEO_NAME, frames, batch_size=8)




In [7]:
# !pip install pytesseract

In [8]:
import pytesseract
from PIL import Image

def run_ocr_and_save(video_name, frame_paths):
    # Optional: Point to tesseract executable if not in PATH
    # pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
    
    out = {}
    for i, p in enumerate(frame_paths):
        try:
            # PyTesseract works best with PIL Image objects
            img = Image.open(p)
            # lang='eng' is the default; use config for specific OCR modes
            text = pytesseract.image_to_string(img, lang='eng')
            
            # Splitting by newline to mimic EasyOCR's list output
            lines = [line.strip() for line in text.split('\n') if line.strip()]
            out[i] = lines
        except Exception as e:
            out[i] = []
            
    save_json({"frame_idx_to_text": out}, OCR_DIR / f"{video_name}.json")
    save_log(video_name, "INFO", "ocr", f"ocr frames processed={len(frame_paths)}")
    return out


In [9]:
from faster_whisper import WhisperModel
def run_asr_and_save(video_path, video_name, model_size="tiny"):
    device = "cuda" if DEVICE=="cuda" else "cpu"
    compute_type = "float16" if (device=="cuda") else "float32"
    model = WhisperModel(model_size, device=device, compute_type=compute_type)
    segments, info = model.transcribe(video_path, beam_size=5, vad_filter=True)
    segs = []
    full_text = []
    for seg in segments:
        segs.append({"start": seg.start, "end": seg.end, "text": seg.text})
        full_text.append(seg.text)
    out = {"segments": segs, "raw_text": " ".join(full_text)}
    save_json(out, ASR_DIR / f"{video_name}.json")
    save_log(video_name, "INFO", "asr", f"asr segments={len(segs)} model={model_size}")
    return out


In [11]:
# If you have APE/Detectron2 installed & configured, replace the body with actual inference.
# For modular flow, we only run APE on selected frames (retriever output).
def run_ape_on_frames(video_name, selected_frame_paths):
    # Placeholder: create minimal records for each path
    out = []
    for i, p in enumerate(selected_frame_paths):
        rec = {"frame_path": p, "frame_idx": int(Path(p).stem.split("_")[-1]), "objects": [{"label": "person", "bbox":[0,0,10,10], "confidence":0.9}]}
        out.append(rec)
    save_json(out, APE_DIR / f"{video_name}.json")
    save_log(video_name, "INFO", "ape", f"ape stub saved for {len(selected_frame_paths)} frames")
    return out


In [12]:
fr_model = "depvgl25ul3x6cv"
fr_api = "flp_Od6MuNzJzZQX3tNYu5OntJCKZxTrytnIvErxRJveGyHwd4"

In [13]:
from openai import OpenAI

def call_planner_vlm(question):
    if FRIENDLI_API_KEY is None:
        raise RuntimeError("FRIENDLI_API_KEY not set")
    client = OpenAI(api_key=fr_api, base_url="https://api.friendli.ai/dedicated/v1")
    retrieve_pmt_0 = "Question: " + question + "\nTo answer the question step by step, provide retrieve request in this JSON format: {\"ASR\": Optional[str], \"DET\": Optional[list], \"TYPE\": Optional[list]}.\nReturn only valid JSON."
    response = client.chat.completions.create(
        model=fr_model,
        messages=[{"role":"user","content":retrieve_pmt_0}],
        max_tokens=512,
        temperature=0.0
    )
    planner_text = response.choices[0].message.content
    save_log("planner", "INFO", "planner_call", "got planner response")
    return planner_text


In [14]:
# This uses the open_clip model/tokenizer loaded earlier (if you kept it in a long-running session).
# If not, we will reload a small text encoder variant.
import numpy as np
from pathlib import Path

def retriever_select_frames(video_name, planner_response_text, oc_model=None, oc_tokenizer=None, top_k=4):
    # parse planner response defensively
    try:
        plan = json.loads(planner_response_text)
    except Exception:
        plan = {}
    selected_indices = set()

    # 1) APE mapping (if APE exists)
    det_labels = plan.get("DET") if isinstance(plan.get("DET"), list) else None
    ape_file = APE_DIR / f"{video_name}.json"
    if det_labels and ape_file.exists():
        ape_list = load_json(ape_file)
        for rec in ape_list:
            for o in rec.get("objects", []):
                if o.get("label") in det_labels:
                    # If rec has frame_idx or frame_path parse it
                    if "frame_idx" in rec:
                        selected_indices.add(int(rec["frame_idx"]))
                    else:
                        # try parse from path
                        idx = int(Path(rec["frame_path"]).stem.split("_")[-1])
                        selected_indices.add(idx)

    # 2) CLIP text->image similarity fallback (requires embeddings file)
    if len(selected_indices) == 0:
        clip_npz = CLIP_DIR / f"{video_name}.npz"
        if clip_npz.exists():
            data = np.load(clip_npz)
            img_embs = data["embeddings"]  # (N, D), already normalized in run_openclip
            n_frames = img_embs.shape[0]
            # if planner asked DET labels, embed those labels and compute similarity
            labels = det_labels or []
            if len(labels) > 0:
                # ensure oc_model & tokenizer loaded
                reload_model = False
                if oc_model is None or oc_tokenizer is None:
                    try:
                        oc_model, _, _ = open_clip.create_model_and_transforms("ViT-B-32", pretrained="laion2b_s34b_b79k")
                        oc_tokenizer = open_clip.get_tokenizer("ViT-B-32")
                        oc_model.to(DEVICE)
                        if DEVICE == "cuda":
                            oc_model = oc_model.half()
                        oc_model.eval()
                        reload_model = True
                    except Exception as e:
                        print("Failed to load text encoder for fallback:", e)
                        oc_model = None
                        oc_tokenizer = None
                if oc_model is not None and oc_tokenizer is not None:
                    try:
                        toks = oc_tokenizer(labels)  # returns tensor
                        toks = toks.to(DEVICE)
                        with torch.no_grad():
                            text_feats = oc_model.encode_text(toks)
                            text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
                            t_np = text_feats.cpu().numpy()
                        img_norm = img_embs / np.linalg.norm(img_embs, axis=1, keepdims=True)
                        sims = img_norm @ (t_np.T)  # (N, len(labels))
                        # choose frames with highest max similarity
                        scores = sims.max(axis=1)
                        top_idx = np.argsort(scores)[::-1][:top_k]
                        for idx in top_idx:
                            selected_indices.add(int(idx))
                    except Exception as e:
                        print("Text->image sim fallback failed:", e)
                # if reload model was True and you want to free memory, you may del oc_model
                if reload_model:
                    try:
                        del oc_model
                        torch.cuda.empty_cache()
                    except:
                        pass

    # 3) final uniform fallback
    if len(selected_indices) == 0:
        clip_npz = CLIP_DIR / f"{video_name}.npz"
        if clip_npz.exists():
            data = np.load(clip_npz)
            n = data["embeddings"].shape[0]
            cand = [0, max(0,n//3), max(0,2*n//3), n-1]
            selected_indices.update([c for c in cand if c < n])
        else:
            frames = sorted((FRAMES_DIR / video_name).glob("*.jpg"))
            selected_indices.update(list(range(min(4, len(frames)))))

    sel = sorted(list(selected_indices))[:top_k]
    sel_paths = [str(FRAMES_DIR / video_name / f"frame_{i:04d}.jpg") for i in sel]
    out = {
        "selected_frame_indices": sel,
        "selected_frame_paths": sel_paths,
        "planner_response": planner_response_text
    }
    save_json(out, RETR_DIR / f"{video_name}.json")
    save_log(video_name, "INFO", "retriever", f"selected indices: {sel}")
    return out


In [15]:
from openai import OpenAI

def encode_image_b64(path):
    with open(path,"rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

def call_final_vlm(video_name, question, retrieval_out):
    if FRIENDLI_API_KEY is None:
        raise RuntimeError("FRIENDLI_API_KEY not set")
    client = OpenAI(api_key=fr_api, base_url="https://api.friendli.ai/dedicated/v1")
    selected_paths = retrieval_out["selected_frame_paths"]
    content = [{"type":"text","text": question}]
    for p in selected_paths:
        content.append({"type":"image_url", "image_url":{"url": f"data:image/jpeg;base64,{encode_image_b64(p)}"}})
    resp = client.chat.completions.create(
        model=fr_model,
        messages=[{"role":"user","content": content}],
        max_tokens=800,
        temperature=0.2
    )
    answer = resp.choices[0].message.content
    save_json({"answer": answer, "selected_frame_paths": selected_paths}, RETR_DIR / f"{video_name}_final_answer.json")
    save_log(video_name, "INFO", "final_vlm", "final answer saved")
    return answer


In [16]:
# planner_resp = call_planner_vlm(QUESTION)

In [17]:
# print(planner_resp)

In [18]:
# !touch '/kaggle/working/data/cache/videos/christmas_tree.mp4'

In [19]:
# !cp '/kaggle/input/test-video/christmas_tree.mp4' '/kaggle/working/data/cache/videos/christmas_tree.mp4'

In [20]:
# Inputs
VIDEO_PATH = "/kaggle/working/data/cache/videos/christmas_tree.mp4"  # change to your video
VIDEO_NAME = "christmas_tree"
QUESTION = "How many apples are on the chirstmas tree?"

In [35]:
# 1) sample frames
frames = sample_frames(VIDEO_PATH, VIDEO_NAME, max_frames=700)
print("Sampled frames:", len(frames))

# 2) CLIP embeddings using open_clip
# returns model/tokenizer/preprocess too, but we don't keep model global unless needed
embs, idxs, oc_model, oc_tokenizer, oc_pre = run_openclip_and_save(VIDEO_NAME, frames, model_name="ViT-L-14", pretrained="laion2b_s32b_b82k", batch_size=8)
print("CLIP embeddings shape:", embs.shape)

# 3) OCR
ocr_out = run_ocr_and_save(VIDEO_NAME, frames)
print("OCR done")

# 4) ASR (small recommended)
asr_out = run_asr_and_save(VIDEO_PATH, VIDEO_NAME, model_size="small")
print("ASR done")


Sampled frames: 700
Trying open_clip model ViT-L-14 pretrained=laion2b_s32b_b82k on device=cuda
CLIP embeddings shape: (700, 768)
OCR done
ASR done


In [36]:
QUESTION = "How many apples are on the chirstmas tree?"
# 5) Planner (remote)
planner_resp = call_planner_vlm(QUESTION)
print("Planner response:", planner_resp)

# 6) Retriever (uses APE if present else CLIP fallback)
retrieval = retriever_select_frames(VIDEO_NAME, planner_resp, oc_model=None, oc_tokenizer=None, top_k=4)
print("Selected frames:", retrieval["selected_frame_paths"])

# 7) Optionally run APE on selected frames (replace stub with real APE if available)
ape_res = run_ape_on_frames(VIDEO_NAME, retrieval["selected_frame_paths"])
print("APE (stub) done")

# 8) Final VLM call
answer = call_final_vlm(VIDEO_NAME, QUESTION, retrieval)
print("FINAL ANSWER:\n", answer)


Planner response: ```json
{
  "ASR": "There are 24 apples on the Christmas tree.",
  "DET": ["Christmas tree"],
  "TYPE": ["Number of apples"]
}
```
Selected frames: ['/kaggle/working/data/cache/frames/christmas_tree/frame_0000.jpg', '/kaggle/working/data/cache/frames/christmas_tree/frame_0233.jpg', '/kaggle/working/data/cache/frames/christmas_tree/frame_0466.jpg', '/kaggle/working/data/cache/frames/christmas_tree/frame_0699.jpg']
APE (stub) done
FINAL ANSWER:
 The Christmas tree in the video has 6 apples on it.


In [37]:
# NEW QUESTION
QUESTION = "What objects are visible near the center of the video?"

VIDEO_NAME = "christmas_tree"

# 1) Planner
planner_resp = call_planner_vlm(QUESTION)
print("Planner response:\n", planner_resp)

# 2) Retrieval
retrieval = retriever_select_frames(
    VIDEO_NAME,
    planner_resp,
    oc_model=None,
    oc_tokenizer=None,
    top_k=4
)
print("Selected frames:", retrieval["selected_frame_paths"])

# 3) Final VLM reasoning
answer = call_final_vlm(VIDEO_NAME, QUESTION, retrieval)
print("\nFINAL ANSWER:\n", answer)


Planner response:
 {"ASR": "a", "DET": ["a"], "TYPE": ["object"]}
Text->image sim fallback failed: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 512 is different from 768)
Selected frames: ['/kaggle/working/data/cache/frames/christmas_tree/frame_0000.jpg', '/kaggle/working/data/cache/frames/christmas_tree/frame_0233.jpg', '/kaggle/working/data/cache/frames/christmas_tree/frame_0466.jpg', '/kaggle/working/data/cache/frames/christmas_tree/frame_0699.jpg']

FINAL ANSWER:
 The objects visible near the center of the video are:

1. A Christmas tree decorated with ornaments and lights.
2. A wooden ornament with the "13abc NEWSNOW" logo on it.
3. A drawing of a Christmas tree with people gathered around it.


In [38]:
QUESTION = "When demonstrating the Germany modern Christmas tree is initially decorated with apples, candles and berries,which kind of the decoration has the largest number? OPTIONS: A. Apples. B. Candles. C. Berries. D. The three kinds are of the same number."

VIDEO_NAME = "christmas_tree"

# 1) Planner
planner_resp = call_planner_vlm(QUESTION)
print("Planner response:\n", planner_resp)

# 2) Retrieval
retrieval = retriever_select_frames(
    VIDEO_NAME,
    planner_resp,
    oc_model=None,
    oc_tokenizer=None,
    top_k=4
)
print("Selected frames:", retrieval["selected_frame_paths"])

# 3) Final VLM reasoning
answer = call_final_vlm(VIDEO_NAME, QUESTION, retrieval)
print("\nFINAL ANSWER:\n", answer)


Planner response:
 ```json
{
  "ASR": "berries",
  "DET": ["berries"],
  "TYPE": ["apples", "candles", "berries"]
}
```
Selected frames: ['/kaggle/working/data/cache/frames/christmas_tree/frame_0000.jpg', '/kaggle/working/data/cache/frames/christmas_tree/frame_0233.jpg', '/kaggle/working/data/cache/frames/christmas_tree/frame_0466.jpg', '/kaggle/working/data/cache/frames/christmas_tree/frame_0699.jpg']

FINAL ANSWER:
 The decoration that has the largest number is the apples.
