# Frame Context Indexer Colab Notebook

In [1]:
from google.colab import drive
# Mount Google Drive; later cells read keyframes from under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
!pip install openai fastapi uvicorn pyngrok transformers torch faiss-cpu

Collecting pyngrok
  Downloading pyngrok-7.2.12-py3-none-any.whl.metadata (9.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cu

In [3]:
from google.colab import userdata
from google.colab.userdata import SecretNotFoundError
from pyngrok import ngrok

# `os` is needed for os.environ below; this cell runs before the cell that
# imports it, so import it here to survive Restart & Run All.
import os

# ngrok auth token: optional, the notebook continues without it.
try:
    token = userdata.get("NGROK_AUTH_TOKEN")
    ngrok.set_auth_token(token)
    print("✅ NGROK_AUTH_TOKEN was loaded!")
except SecretNotFoundError:
    print("⚠️ NGROK_AUTH_TOKEN not found. Skipping auth-token setup.")

# Hugging Face token: exported as an environment variable; a missing secret
# aborts the cell with a RuntimeError.
try:
    hf_token = userdata.get("HF_TOKEN")
    os.environ["HF_TOKEN"] = hf_token
    print("✅ HF_TOKEN was loaded!")
except SecretNotFoundError:
    raise RuntimeError(
        "⚠️ HF_TOKEN not found. Add it to the Colab secrets and re-run this cell."
    )

✅ NGROK_AUTH_TOKEN was loaded!


In [4]:
import os
import json
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, CLIPProcessor, CLIPModel
import faiss
import numpy as np

class FrameContextIndexer:
    """Caption video keyframes, embed the captions, and index them for text search.

    For every video there is a directory ``<keyframes_dir>/<video_id>`` that holds
    the frame images and a ``metadata.json`` file. ``process_video`` writes
    ``frame_context.json`` (captions + embeddings) and ``faiss.index`` into that
    same directory; ``query`` reads them back.
    """

    def __init__(self, keyframes_dir):
        """Load the BLIP captioning model and the CLIP embedding model.

        Args:
            keyframes_dir: Root directory containing one sub-directory per video.
        """
        self.keyframes_dir = keyframes_dir
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # force safetensors path
        self.blip_model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base",
            torch_dtype=torch.float32,
            use_safetensors=True
        ).to(self.device)

        # The captioning methods read `self.blip_processor`; the attribute was
        # previously stored only as `self.processor`, which made caption_image
        # fail with AttributeError.
        self.blip_processor = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-base",
            use_fast=True
        )
        # Keep the old attribute name as an alias for any existing callers.
        self.processor = self.blip_processor

        # CLIP for embeddings
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(self.device)

    def caption_image(self, image_path):
        """Return a BLIP-generated caption for the image at ``image_path``."""
        # Use a context manager so the underlying file handle is closed promptly.
        with Image.open(image_path) as img:
            image = img.convert('RGB')
        inputs = self.blip_processor(image, return_tensors="pt").to(self.device)
        out = self.blip_model.generate(**inputs)
        caption = self.blip_processor.decode(out[0], skip_special_tokens=True)
        return caption

    def embed_text(self, text):
        """Return the CLIP text embedding of ``text`` as a 1-D NumPy array."""
        # truncation=True keeps over-long text within the tokenizer's maximum
        # length instead of failing inside the model.
        inputs = self.clip_processor(
            text=[text], return_tensors="pt", padding=True, truncation=True
        ).to(self.device)
        with torch.no_grad():
            emb = self.clip_model.get_text_features(**inputs)
        return emb.cpu().numpy()[0]

    def process_video(self, video_id):
        """Caption and embed every listed keyframe of a video, then build its index.

        Reads ``metadata.json`` from the video's directory; each entry of its
        ``"timestamps"`` list must provide ``"filename"`` and ``"timestamp"``.
        Frames whose file does not exist are skipped.

        Args:
            video_id: Name of the video's sub-directory under ``keyframes_dir``.

        Returns:
            A list of dicts with keys ``filename``, ``timestamp``, ``caption``
            and ``embedding`` (a list of floats). May be empty.

        Raises:
            FileNotFoundError: If ``metadata.json`` is missing for the video.
        """
        frame_dir = os.path.join(self.keyframes_dir, video_id)
        metadata_path = os.path.join(frame_dir, "metadata.json")
        if not os.path.exists(metadata_path):
            raise FileNotFoundError(f"Metadata not found for video {video_id}")

        with open(metadata_path, "r") as f:
            metadata = json.load(f)
        timestamps = metadata.get("timestamps", [])

        records = []
        for ts in timestamps:
            filename = ts["filename"]
            timestamp = ts["timestamp"]
            frame_path = os.path.join(frame_dir, filename)
            if not os.path.exists(frame_path):
                continue
            caption = self.caption_image(frame_path)
            embedding = self.embed_text(caption)
            records.append({
                "filename": filename,
                "timestamp": timestamp,
                "caption": caption,
                "embedding": embedding.tolist()
            })
        # Save records for later use
        with open(os.path.join(frame_dir, "frame_context.json"), "w") as f:
            json.dump(records, f, indent=2)
        # Build FAISS index (a no-op apart from cleanup when there are no records)
        self.build_faiss_index(video_id, records)
        return records

    def build_faiss_index(self, video_id, records):
        """Write an L2 FAISS index over the records' embeddings.

        With an empty ``records`` list nothing can be indexed: any index file
        left over from a previous run is removed so it cannot be paired with
        the new, empty ``frame_context.json``, and the method returns.
        """
        index_path = os.path.join(self.keyframes_dir, video_id, "faiss.index")
        if not records:
            if os.path.exists(index_path):
                os.remove(index_path)
            return
        dim = len(records[0]["embedding"])
        index = faiss.IndexFlatL2(dim)
        embeddings = np.array([r["embedding"] for r in records]).astype("float32")
        index.add(embeddings)
        faiss.write_index(index, index_path)

    def query(self, video_id, query_text, top_k=3):
        """Return up to ``top_k`` records whose captions are closest to ``query_text``.

        Args:
            video_id: Name of the video's sub-directory under ``keyframes_dir``.
            query_text: Free-text query, embedded with CLIP.
            top_k: Maximum number of records to return.

        Returns:
            A list of record dicts, nearest first. Empty when ``top_k`` is not
            positive or nothing is indexed.

        Raises:
            FileNotFoundError: If the context file or the index is missing.
        """
        frame_dir = os.path.join(self.keyframes_dir, video_id)
        context_path = os.path.join(frame_dir, "frame_context.json")
        index_path = os.path.join(frame_dir, "faiss.index")
        if not os.path.exists(context_path) or not os.path.exists(index_path):
            raise FileNotFoundError("Frame context or index not found. Run process_video first.")

        with open(context_path, "r") as f:
            records = json.load(f)
        index = faiss.read_index(index_path)

        # Never ask for more neighbours than exist: FAISS pads missing results
        # with label -1, which would silently select records[-1].
        k = min(int(top_k), index.ntotal, len(records))
        if k <= 0:
            return []
        query_emb = self.embed_text(query_text).astype("float32").reshape(1, -1)
        _, neighbor_ids = index.search(query_emb, k)
        # Defensive filter in case the index and the context file are out of sync.
        results = [records[i] for i in neighbor_ids[0] if 0 <= i < len(records)]
        return results

In [None]:
# Example: index the keyframes of one video, then run a text query against it.
video_id = 'sample'
indexer = FrameContextIndexer('/content/drive/MyDrive/keyframes')
records = indexer.process_video(video_id)
# Each record carries a full embedding vector; leave it out of the printout so
# the cell output stays readable.
print("Indexed records:", [{k: v for k, v in rec.items() if k != "embedding"} for rec in records])

query = "A person is running"
results = indexer.query(video_id, query, top_k=3)
print("Query results:", [{k: v for k, v in rec.items() if k != "embedding"} for rec in results])

FileNotFoundError: Metadata not found for video sample

In [5]:
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
import uvicorn
import nest_asyncio
from pyngrok import ngrok

# One shared indexer instance serves every request handled by the app below.
KEYFRAMES_ROOT = '/content/drive/MyDrive/keyframes'
indexer = FrameContextIndexer(KEYFRAMES_ROOT)
app = FastAPI()

@app.post("/frame_context/{video_id}/index")
async def index_frame_context(video_id: str):
    """Caption, embed and index the keyframes of ``video_id``.

    Returns the generated records on success, HTTP 404 when the indexer reports
    a missing file for the video (FileNotFoundError), and HTTP 500 for any
    other failure.
    """
    try:
        records = indexer.process_video(video_id)
    except FileNotFoundError as e:
        # Unknown video / missing metadata is a client-side problem, not a server fault.
        return JSONResponse({"error": str(e)}, status_code=404)
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
    return {"message": f"Frame context indexed for {video_id}", "records": records}

@app.post("/frame_context/{video_id}/query")
async def query_frame_context(video_id: str, request: Request):
    """Search the indexed frames of ``video_id`` with a text query.

    Expects a JSON object body with ``"query"`` (non-empty string) and an
    optional ``"top_k"`` (positive integer, default 3).

    Returns ``{"results": [...]}`` on success, HTTP 400 for a malformed
    request, HTTP 404 when the indexer reports missing files
    (FileNotFoundError), and HTTP 500 for any other failure.
    """
    # Parsing sits inside its own try: an empty or invalid body raises a
    # ValueError subclass, which previously escaped as an unhandled error.
    try:
        data = await request.json()
    except ValueError:
        return JSONResponse({"error": "Request body must be valid JSON."}, status_code=400)
    if not isinstance(data, dict):
        return JSONResponse({"error": "Request body must be a JSON object."}, status_code=400)

    query = data.get("query")
    if not isinstance(query, str) or not query.strip():
        return JSONResponse({"error": "'query' must be a non-empty string."}, status_code=400)

    top_k = data.get("top_k", 3)
    # bool is a subclass of int, so reject it explicitly.
    if isinstance(top_k, bool) or not isinstance(top_k, int) or top_k < 1:
        return JSONResponse({"error": "'top_k' must be a positive integer."}, status_code=400)

    try:
        results = indexer.query(video_id, query, top_k)
    except FileNotFoundError as e:
        return JSONResponse({"error": str(e)}, status_code=404)
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
    return {"results": results}

# Open the public ngrok tunnel first, because uvicorn.run() blocks this cell.
SERVER_PORT = 8000
ngrok_tunnel = ngrok.connect(SERVER_PORT)
print("Public URL:", ngrok_tunnel.public_url)

# Patch asyncio before starting the server from inside the notebook.
nest_asyncio.apply()
uvicorn.run(app, port=SERVER_PORT)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/599M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Public URL: https://388fc4a05f02.ngrok-free.app


INFO:     Started server process [747]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)
INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [747]


In [6]:
# Print ready-to-paste curl commands for both endpoints of the running tunnel.
public_url = ngrok_tunnel.public_url
query_payload = '{"query": "A person is running", "top_k": 3}'

print("Example curl (index):")
print(f"curl -X POST {public_url}/frame_context/sample/index")
print("Example curl (query):")
print(
    f"curl -X POST {public_url}/frame_context/sample/query"
    f" -H 'Content-Type: application/json' -d '{query_payload}'"
)

Example curl (index):
curl -X POST https://388fc4a05f02.ngrok-free.app/frame_context/sample/index
Example curl (query):
curl -X POST https://388fc4a05f02.ngrok-free.app/frame_context/sample/query -H 'Content-Type: application/json' -d '{"query": "A person is running", "top_k": 3}'
