In [1]:
import os
import json
import cv2
import re
import torch
import clip
from PIL import Image
import torchvision.transforms as transforms

# Choose the compute device: GPU when PyTorch can see one, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load returns the network together with its matching image transform
model, preprocess = clip.load("ViT-B/32", device=device)

# Dataset layout: videos and captions live side by side under data_dir
data_dir = "data"
videos_dir, captions_dir = (
    os.path.join(data_dir, subdir) for subdir in ("videos", "captions")
)

# Function to extract YouTube video ID from filename
def extract_video_id(filename):
    """Return the video ID embedded in a filename as ``[videoID]``.

    The ID is taken from the LAST bracketed token made only of the characters
    ``A-Z a-z 0-9 _ -``, so a bracketed tag earlier in the title
    (e.g. ``"Talk [HD] [abc123].mp4"``) is not mistaken for the ID.

    Args:
        filename: File name (or path) to inspect.

    Returns:
        The text inside the last matching pair of brackets, or ``None`` when
        the name contains no such token.
    """
    bracketed = re.findall(r"\[([A-Za-z0-9_-]+)\]", filename)
    return bracketed[-1] if bracketed else None

# Get first available video file
# os.listdir() returns entries in arbitrary order; sorting makes "first" reproducible.
video_files = sorted(f for f in os.listdir(videos_dir) if f.endswith(".mp4"))
if not video_files:
    # Raise instead of calling exit(): in a notebook exit() shuts the kernel down
    # and discards all state, whereas an exception just stops the run.
    raise FileNotFoundError("No video files found!")

video_file = video_files[0]  # Grab the first video (in sorted order)
video_path = os.path.join(videos_dir, video_file)

# Find corresponding caption file
video_id = extract_video_id(video_file)
if not video_id:
    # Raise instead of exit() so the notebook kernel (and its state) survives.
    raise ValueError(f"Could not extract video ID from {video_file}")

# First .json file (in sorted order, for reproducibility) whose name contains the ID
caption_file = next(
    (
        f
        for f in sorted(os.listdir(captions_dir))
        if video_id in f and f.endswith(".json")
    ),
    None,
)
if not caption_file:
    raise FileNotFoundError(f"No matching caption file found for {video_file}")

caption_path = os.path.join(captions_dir, caption_file)

# Load the caption JSON
with open(caption_path, "r", encoding="utf-8") as f:
    captions = json.load(f)

# Grab the first timestamp and its frames
if not captions:
    # Raise instead of exit() so the notebook kernel (and its state) survives.
    raise ValueError("Caption file is empty!")

# Only the first entry is used; it must carry start_time, end_time and caption,
# while "frames" is optional in the file but required for the steps below.
first_entry = captions[0]
start_time = first_entry["start_time"]
end_time = first_entry["end_time"]
caption_text = first_entry["caption"]
frame_indices = first_entry.get("frames", [])

if not frame_indices:
    raise ValueError("No frame indices found in the first caption entry!")

# Echo the selected video/caption pairing so it can be checked by eye
print(
    f"\nSanity Check: {video_file}",
    f'Caption: "{caption_text}"',
    f"Timestamp: {start_time} → {end_time}",
    f"Frame Indices: {frame_indices}",
    sep="\n",
)

# Extract frames and convert them into tensors
cap = cv2.VideoCapture(video_path)
frame_tensors = []

try:
    # Fail early with a clear message instead of one "failed frame" line per index.
    if not cap.isOpened():
        raise IOError(f"Could not open video file: {video_path}")

    for frame_idx in frame_indices:
        # Seek to the requested frame, then decode it
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
        ret, frame = cap.read()

        if not ret:
            print(f"❌ Failed to retrieve frame {frame_idx}")
            continue

        # Convert BGR (OpenCV) to RGB (PIL)
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(frame_rgb)

        # Apply CLIP preprocessing
        frame_tensor = preprocess(pil_image).unsqueeze(0).to(device)  # Add batch dimension
        frame_tensors.append(frame_tensor)
finally:
    # Always give the capture handle back, even if decoding/preprocessing raised.
    cap.release()

# Stack frames into a single tensor (batch)
if not frame_tensors:
    # Stop here: continuing would leave frames_tensor undefined (or stale from an
    # earlier run), and the training cells would fail later with a confusing error.
    raise RuntimeError("❌ No valid frames were processed.")

# Each element already has a leading batch dimension of 1, so concatenate on dim 0
frames_tensor = torch.cat(frame_tensors, dim=0)
print(f"Frames tensor shape: {frames_tensor.shape}")  # (batch_size, 3, 224, 224)



Sanity Check: Cal Poly Survivor： S3 E10： Loved Ones [Kggc-m8ntVQ].mp4
Caption: "previously on C paully"
Timestamp: 00:00:00,359 → 00:00:02,350
Frame Indices: [10, 25, 40, 55, 70]
Frames tensor shape: torch.Size([5, 3, 224, 224])


In [4]:
import torch.nn.functional as F
import torch.optim as optim

# Define optimizer
# On CUDA, clip.load() leaves the weights in fp16; Adam updates in half precision
# are numerically unstable, so fine-tune in fp32 (a no-op if already fp32).
model.float()
optimizer = optim.Adam(model.parameters(), lr=1e-6)  # Small LR to fine-tune

num_epochs = 2

# The caption is identical for every frame, so tokenize it once outside the loops.
caption = clip.tokenize([caption_text]).to(device)

# Fine-tuning loop: one optimizer step per frame
for epoch in range(num_epochs):
    total_loss = 0  # Sum of the per-frame losses within this epoch
    for frame in frames_tensor:
        frame = frame.unsqueeze(0).to(device)  # Restore the batch dimension

        optimizer.zero_grad()  # Reset gradients

        # Encode image and text (text is re-encoded each step because the
        # text encoder's weights are being updated too)
        image_features = model.encode_image(frame)
        text_features = model.encode_text(caption)

        # Normalize
        image_features = F.normalize(image_features, dim=-1)
        text_features = F.normalize(text_features, dim=-1)

        # Alignment loss: negative cosine similarity of the (frame, caption) pair.
        # NOTE: there are no negative pairs, so this is not a true contrastive loss.
        loss = -F.cosine_similarity(image_features, text_features).mean()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {total_loss:.4f}")


Epoch 1/2 - Loss: -1.6611
Epoch 2/2 - Loss: -2.1896


In [None]:
import torch.nn.functional as F
import torch.optim as optim

# Define optimizer
# NOTE: `model` is updated in place by any fine-tuning cell run earlier, so the
# losses printed here depend on which cells were executed before this one.
# On CUDA, clip.load() leaves the weights in fp16; Adam updates in half precision
# are numerically unstable, so fine-tune in fp32 (a no-op if already fp32).
model.float()
optimizer = optim.Adam(model.parameters(), lr=1e-6)  # Small LR to fine-tune

num_epochs = 10
frames_tensor = frames_tensor.to(device)  # Move all frames to device once
# One tokenized copy of the caption per frame so image/text batches line up row by row
captions_tokenized = clip.tokenize([caption_text] * frames_tensor.shape[0]).to(device)  # Repeat caption

# Fine-tuning loop: one optimizer step per epoch over the whole batch of frames
for epoch in range(num_epochs):
    optimizer.zero_grad()  # Reset gradients

    # Encode all images and text in a single batch
    image_features = model.encode_image(frames_tensor)
    text_features = model.encode_text(captions_tokenized)

    # Normalize
    image_features = F.normalize(image_features, dim=-1)
    text_features = F.normalize(text_features, dim=-1)

    # Alignment loss: mean negative cosine similarity over the (frame, caption) pairs.
    # NOTE: there are no negative pairs, so this is not a true contrastive loss.
    #TODO add contrastive loss as this was mentioned in the github for this model
    loss = -F.cosine_similarity(image_features, text_features).mean()
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {loss.item():.4f}")


Epoch 1/10 - Loss: -0.6126
Epoch 2/10 - Loss: -0.6434
Epoch 3/10 - Loss: -0.6847
Epoch 4/10 - Loss: -0.7088
Epoch 5/10 - Loss: -0.7417
Epoch 6/10 - Loss: -0.7683
Epoch 7/10 - Loss: -0.7877
Epoch 8/10 - Loss: -0.8100
Epoch 9/10 - Loss: -0.8290
Epoch 10/10 - Loss: -0.8451
