<a href="https://colab.research.google.com/github/Dhanush-sai-reddy/llm-runtime-local/blob/main/qwen3multimediaembeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import cv2
import numpy as np
from PIL import Image
from typing import TypedDict
from langgraph.graph import StateGraph, END
from sentence_transformers import SentenceTransformer

# Checkpoint shared by the vision-language model and its processor, so the
# two can never drift apart.
VL_CHECKPOINT = "Qwen/Qwen2-VL-2B-Instruct"
# Checkpoint of the sentence-embedding model used for descriptions and queries.
EMBED_CHECKPOINT = 'sentence-transformers/all-MiniLM-L6-v2'

# Vision-language model: bfloat16 weights, with device placement delegated to
# `device_map="auto"`.
vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
    VL_CHECKPOINT,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
# Processor matching the model above (chat template + image preprocessing).
vl_proc = AutoProcessor.from_pretrained(VL_CHECKPOINT, trust_remote_code=True)

# Text encoder that turns frame descriptions and the query into vectors.
embed_model = SentenceTransformer(EMBED_CHECKPOINT)

# Shared state passed between the graph nodes. Each node reads the keys it
# needs and returns a partial dict with the keys it produces.
State = TypedDict(
    "State",
    {
        # Path of the video file to analyse (input).
        "video_path": str,
        # Free-text question to search the frame descriptions with (input).
        "query": str,
        # Sampled frames, written by `extract_frames`.
        "frames": list,
        # One text description per frame, written by `describe_frames`.
        "descriptions": list,
        # One embedding per description, written by `create_embeddings`.
        "embeddings": list,
        # Formatted top matches, written by `search_and_answer`.
        "results": str,
    },
)

def extract_frames(state: State):
    """Sample roughly one frame per second from the video in ``state``.

    Reads the file at ``state["video_path"]`` with OpenCV and keeps every
    ``int(fps)``-th frame, converted from BGR to an RGB ``PIL.Image``.

    Args:
        state: Pipeline state; only ``"video_path"`` is read.

    Returns:
        A partial state update ``{"frames": [...]}`` with the sampled frames
        in playback order. The list is empty when the video cannot be opened
        or contains no decodable frames.
    """
    cap = cv2.VideoCapture(state["video_path"])
    frames = []
    try:
        video_fps = cap.get(cv2.CAP_PROP_FPS)
        # The reported FPS may be 0, fractional (< 1) or not a finite number
        # when metadata is missing; fall back to keeping every frame instead
        # of dividing by zero below.
        try:
            interval = max(1, int(video_fps))
        except (TypeError, ValueError, OverflowError):
            interval = 1

        count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if count % interval == 0:
                # OpenCV decodes to BGR; PIL expects RGB.
                frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
            count += 1
    finally:
        # Always free the capture handle, even if frame conversion fails.
        cap.release()

    print(f"Extracted {len(frames)} frames")
    return {"frames": frames}

def describe_frames(state: State):
    """Generate a short text description for every sampled frame.

    Each frame in ``state["frames"]`` is sent to the vision-language model
    together with the prompt "Describe this."; up to 100 new tokens are
    generated per frame.

    Args:
        state: Pipeline state; only ``"frames"`` is read.

    Returns:
        A partial state update ``{"descriptions": [...]}`` with one string
        per frame, in the same order as the frames.
    """
    descriptions = []
    for i, frame in enumerate(state["frames"]):
        msgs = [{"role": "user", "content": [{"type": "image", "image": frame}, {"type": "text", "text": "Describe this."}]}]
        txt = vl_proc.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
        img_in, vid_in = process_vision_info(msgs)
        # Follow the model's own device instead of assuming CUDA: the model is
        # loaded with device_map="auto", so it may live on CPU or another device.
        inputs = vl_proc(text=[txt], images=img_in, videos=vid_in, padding=True, return_tensors="pt").to(vl_model.device)

        with torch.no_grad():
            out = vl_model.generate(**inputs, max_new_tokens=100)

        # `generate` returns prompt + continuation. Slice off the prompt tokens
        # rather than splitting the decoded text on "assistant", which would
        # truncate any description that itself contains that word.
        prompt_len = inputs["input_ids"].shape[1]
        desc = vl_proc.batch_decode(out[:, prompt_len:], skip_special_tokens=True)[0].strip()
        descriptions.append(desc)
        print(f"Frame {i}: {desc[:80]}...")

    return {"descriptions": descriptions}

def create_embeddings(state: State):
    """Embed all frame descriptions with the sentence-embedding model.

    Args:
        state: Pipeline state; only ``"descriptions"`` is read.

    Returns:
        A partial state update ``{"embeddings": ...}`` holding whatever
        ``embed_model.encode`` returns for the list of descriptions.
    """
    texts = state["descriptions"]
    vectors = embed_model.encode(texts)
    return {"embeddings": vectors}

def search_and_answer(state: State):
    """Rank frame descriptions against the query by cosine similarity.

    The query is embedded with the same model as the descriptions, compared
    with every entry of ``state["embeddings"]``, and the three best matches
    (fewer if there are fewer frames) are formatted into a text report.

    Args:
        state: Pipeline state; ``"query"``, ``"embeddings"`` and
            ``"descriptions"`` are read.

    Returns:
        A partial state update ``{"results": str}``: a ``Query: ...`` header
        followed by one ``Frame <idx>: <first 100 chars>`` / ``Similarity``
        entry per match, best first.
    """
    header = f"Query: {state['query']}\n\n"

    query_emb = np.asarray(embed_model.encode([state["query"]])[0], dtype=np.float64)
    frame_embs = np.asarray(state["embeddings"], dtype=np.float64)
    if frame_embs.size == 0:
        # Nothing to rank (e.g. no frames were extracted).
        return {"results": header}

    # Vectorised cosine similarity; the query norm is computed once.
    dots = frame_embs @ query_emb
    norms = np.linalg.norm(frame_embs, axis=1) * np.linalg.norm(query_emb)
    # A zero-norm vector would yield NaN, which argsort ranks last (i.e. as the
    # "best" match after the reversal below); score such pairs as 0 instead.
    sims = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    top3 = np.argsort(sims)[-3:][::-1]

    entries = [
        f"Frame {idx}: {state['descriptions'][idx][:100]}\nSimilarity: {sims[idx]:.3f}\n\n"
        for idx in top3
    ]
    return {"results": header + "".join(entries)}

# Assemble the linear pipeline: extract -> describe -> embed -> search -> END.
workflow = StateGraph(State)

# Node name / handler pairs, listed in execution order.
pipeline_steps = [
    ("extract", extract_frames),
    ("describe", describe_frames),
    ("embed", create_embeddings),
    ("search", search_and_answer),
]
for step_name, step_fn in pipeline_steps:
    workflow.add_node(step_name, step_fn)

workflow.set_entry_point("extract")

# Chain each step to its successor; the last step hands over to END.
step_names = [step_name for step_name, _ in pipeline_steps]
for src, dst in zip(step_names, step_names[1:] + [END]):
    workflow.add_edge(src, dst)

app = workflow.compile()

# Run the pipeline on a sample video and print the ranked matches.
initial_state = {
    "video_path": "/content/video.mp4",
    "query": "What is happening?",
}
result = app.invoke(initial_state)
print(result["results"])