In [1]:
from google.colab import drive
drive.mount('/content/drive')

# Install transformers and accelerate for the models
!pip install -q transformers accelerate

Mounted at /content/drive


In [3]:
import os
import torch
import torch.nn as nn
from PIL import Image, ImageSequence
from transformers import AutoImageProcessor, ViTModel, VideoMAEModel, GPT2Tokenizer, GPT2LMHeadModel
import torch.nn.functional as F

# -------------------
# 1Ô∏è‚É£ Config & Paths
# -------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# UPDATE THESE PATHS!
CHECKPOINT_PATH = "/content/drive/MyDrive/FYP_Full_Project/model_final_v5.pth"
TEST_GIF = "/content/tumblr_l876j3kjpF1qcw5xjo1_250.gif"

# -------------------
# 2Ô∏è‚É£ Model Architecture (Must match your training script)
# -------------------
class VideoGPT2Captioner(nn.Module):
    def __init__(self, visual_dim=2304, prefix_len=5):
        super().__init__()
        self.prefix_len = prefix_len
        self.gpt2 = GPT2LMHeadModel.from_pretrained("gpt2")
        self.projection = nn.Linear(visual_dim, prefix_len * 768)
        self.ln = nn.LayerNorm(768)

    def encode_visual(self, visual_feat):
        projected = self.projection(visual_feat)
        projected = projected.view(-1, self.prefix_len, 768)
        return self.ln(projected)

# -------------------
# 3Ô∏è‚É£ Initialization & Loading
# -------------------
print("üì• Loading models and your 5th-epoch checkpoint...")

# Feature Extractors
action_proc = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
action_model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base").to(DEVICE).eval()
vit_proc = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
vit_model = ViTModel.from_pretrained("google/vit-base-patch16-224").to(DEVICE).eval()

# Tokenizer & Captioner
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = VideoGPT2Captioner(visual_dim=2304, prefix_len=1).to(DEVICE) # Changed 5 to 1

# LOAD FROM DRIVE
if os.path.exists(CHECKPOINT_PATH):
    state = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
    model.load_state_dict(state, strict=False)
    print("‚úÖ Custom Checkpoint Loaded Successfully from Drive!")
else:
    print(f"‚ùå Error: Checkpoint not found at {CHECKPOINT_PATH}")

model.eval()

# -------------------
# 4Ô∏è‚É£ Inference Logic
# -------------------
def extract_live_features(gif_path):
    # Action (VideoMAE)
    gif = Image.open(gif_path)
    frames = [f.convert("RGB") for f in ImageSequence.Iterator(gif)]
    if len(frames) >= 16:
        idx = torch.linspace(0, len(frames)-1, 16).long()
        frames = [frames[i] for i in idx]
    else:
        frames = frames + [frames[-1]] * (16 - len(frames))

    inputs_a = action_proc(images=frames, return_tensors="pt").to(DEVICE)
    f_act = action_model(**inputs_a).last_hidden_state.mean(dim=1).squeeze(0)

    # Appearance & Emotion (ViT + Booster)
    img = Image.open(gif_path).convert("RGB")
    inputs_v = vit_proc(images=img, return_tensors="pt").to(DEVICE)
    f_app = vit_model(**inputs_v).last_hidden_state[:, 0, :].squeeze(0)
    f_emo = f_app.clone() * 5.0 # Your Emotion Booster

    visual_feat = torch.cat([f_app, f_act, f_emo], dim=-1)
    return F.normalize(visual_feat, p=2, dim=-1)

# 2. Update the Generation Function (inside the generate_caption function)
def generate_caption(gif_path):
    with torch.no_grad():
        feat = extract_live_features(gif_path).unsqueeze(0)
        prefix_embeds = model.encode_visual(feat)

        # Update mask to 1
        attention_mask = torch.ones((1, 1), device=DEVICE) # Changed 5 to 1

        output_ids = model.gpt2.generate(
            inputs_embeds=prefix_embeds,
            attention_mask=attention_mask, # Add the mask here for stability
            max_new_tokens=15,
            num_beams=5,
            repetition_penalty=3.0,
            no_repeat_ngram_size=2,
            eos_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()

# -------------------
# 5Ô∏è‚É£ Run Test
# -------------------
if os.path.exists(TEST_GIF):
    print(f"\nüé¨ Result: {generate_caption(TEST_GIF)}")

üì• Loading models and your 5th-epoch checkpoint...


Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Custom Checkpoint Loaded Successfully from Drive!


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



üé¨ Result: atheatwoaaanathreeamanathisaone


In [13]:
import os
import torch
import torch.nn as nn
from PIL import Image, ImageSequence
from transformers import AutoImageProcessor, ViTModel, VideoMAEModel, GPT2Tokenizer, GPT2LMHeadModel
import torch.nn.functional as F

# -------------------
# 1Ô∏è‚É£ Config & Paths
# -------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# UPDATE THESE PATHS!
CHECKPOINT_PATH = "/content/drive/MyDrive/FYP_Full_Project/model_final_v5.pth"
TEST_GIF = "/content/tumblr_l876j3kjpF1qcw5xjo1_250.gif"

# -------------------
# 2Ô∏è‚É£ Model Architecture (Must match your training script)
# -------------------
class VideoGPT2Captioner(nn.Module):
    def __init__(self, visual_dim=2304, prefix_len=5):
        super().__init__()
        self.prefix_len = prefix_len
        self.gpt2 = GPT2LMHeadModel.from_pretrained("gpt2")
        self.projection = nn.Linear(visual_dim, prefix_len * 768)
        self.ln = nn.LayerNorm(768)

    def encode_visual(self, visual_feat):
        projected = self.projection(visual_feat)
        projected = projected.view(-1, self.prefix_len, 768)
        return self.ln(projected)

# -------------------
# 3Ô∏è‚É£ Initialization & Loading
# -------------------
print("üì• Loading models and your 5th-epoch checkpoint...")

# Feature Extractors
action_proc = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
action_model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base").to(DEVICE).eval()
vit_proc = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
vit_model = ViTModel.from_pretrained("google/vit-base-patch16-224").to(DEVICE).eval()

# Tokenizer & Captioner
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = VideoGPT2Captioner(visual_dim=2304, prefix_len=1).to(DEVICE) # Changed 5 to 1

# LOAD FROM DRIVE
if os.path.exists(CHECKPOINT_PATH):
    state = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
    model.load_state_dict(state, strict=False)
    print("‚úÖ Custom Checkpoint Loaded Successfully from Drive!")
else:
    print(f"‚ùå Error: Checkpoint not found at {CHECKPOINT_PATH}")

model.eval()

# -------------------
# 4Ô∏è‚É£ Inference Logic
# -------------------
def extract_live_features(gif_path):
    # Action (VideoMAE)
    gif = Image.open(gif_path)
    frames = [f.convert("RGB") for f in ImageSequence.Iterator(gif)]
    if len(frames) >= 16:
        idx = torch.linspace(0, len(frames)-1, 16).long()
        frames = [frames[i] for i in idx]
    else:
        frames = frames + [frames[-1]] * (16 - len(frames))

    inputs_a = action_proc(images=frames, return_tensors="pt").to(DEVICE)
    f_act = action_model(**inputs_a).last_hidden_state.mean(dim=1).squeeze(0)

    # Appearance & Emotion (ViT + Booster)
    img = Image.open(gif_path).convert("RGB")
    inputs_v = vit_proc(images=img, return_tensors="pt").to(DEVICE)
    f_app = vit_model(**inputs_v).last_hidden_state[:, 0, :].squeeze(0)
    f_emo = f_app.clone() * 5.0 # Your Emotion Booster

    visual_feat = torch.cat([f_app, f_act, f_emo], dim=-1)
    return F.normalize(visual_feat, p=2, dim=-1)

# 2. Update the Generation Function (inside the generate_caption function)
def generate_caption(gif_path):
    with torch.no_grad():
        feat = extract_live_features(gif_path).unsqueeze(0)
        prefix_embeds = model.encode_visual(feat)

        # 1. Start the sentence with a trigger word to guide the AI
        # This gives GPT-2 a 'hook' to start writing real words
        prompt = "A video of"
        prompt_ids = tokenizer.encode(prompt, return_tensors="pt").to(DEVICE)
        prompt_embeds = model.gpt2.transformer.wte(prompt_ids)

        # 2. Combine Visual + Textual Prompt
        # [Visual Embedding (1)] + [Text Prompt Embeddings]
        full_embeds = torch.cat((prefix_embeds, prompt_embeds), dim=1)

        # 3. Use 'Beam Search' with Repetition Penalty and Space Bias
        output_ids = model.gpt2.generate(
            inputs_embeds=full_embeds,
            max_new_tokens=15,
            min_length=10,             # Force it to write a full sentence
            num_beams=5,
            repetition_penalty=5.0,    # Higher penalty to avoid 'aaana'
            length_penalty=1.5,        # Encourage longer sentences
            no_repeat_ngram_size=2,
            early_stopping=True,
            eos_token_id=tokenizer.eos_token_id
        )

    # 4. Cleanup the output
    decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Removing the prompt from the result for a clean look
    final_text = decoded.replace(prompt, "").strip()
    return f"{prompt} {final_text}"

# -------------------
# 5Ô∏è‚É£ Run Test
# -------------------
if os.path.exists(TEST_GIF):
    print(f"\nüé¨ Result: {generate_caption(TEST_GIF)}")

üì• Loading models and your 5th-epoch checkpoint...


Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Custom Checkpoint Loaded Successfully from Drive!


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



üé¨ Result: A video of two men are dancing in a room with microphones.


In [5]:
import os
import torch
import torch.nn as nn
from PIL import Image, ImageSequence
from transformers import AutoImageProcessor, ViTModel, VideoMAEModel, GPT2Tokenizer, GPT2LMHeadModel
import torch.nn.functional as F

# -------------------
# 1Ô∏è‚É£ Config & Paths
# -------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# UPDATE THESE PATHS!
CHECKPOINT_PATH = "/content/drive/MyDrive/FYP_Full_Project/model_final_v5.pth"
TEST_GIF = "/content/tumblr_l876j3kjpF1qcw5xjo1_250.gif"

# -------------------
# 2Ô∏è‚É£ Model Architecture (Must match your training script)
# -------------------
class VideoGPT2Captioner(nn.Module):
    def __init__(self, visual_dim=2304, prefix_len=5):
        super().__init__()
        self.prefix_len = prefix_len
        self.gpt2 = GPT2LMHeadModel.from_pretrained("gpt2")
        self.projection = nn.Linear(visual_dim, prefix_len * 768)
        self.ln = nn.LayerNorm(768)

    def encode_visual(self, visual_feat):
        projected = self.projection(visual_feat)
        projected = projected.view(-1, self.prefix_len, 768)
        return self.ln(projected)

# -------------------
# 3Ô∏è‚É£ Initialization & Loading
# -------------------
print("üì• Loading models and your 5th-epoch checkpoint...")

# Feature Extractors
action_proc = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
action_model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base").to(DEVICE).eval()
vit_proc = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
vit_model = ViTModel.from_pretrained("google/vit-base-patch16-224").to(DEVICE).eval()

# Tokenizer & Captioner
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = VideoGPT2Captioner(visual_dim=2304, prefix_len=1).to(DEVICE) # Changed 5 to 1

# LOAD FROM DRIVE
if os.path.exists(CHECKPOINT_PATH):
    state = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
    model.load_state_dict(state, strict=False)
    print("‚úÖ Custom Checkpoint Loaded Successfully from Drive!")
else:
    print(f"‚ùå Error: Checkpoint not found at {CHECKPOINT_PATH}")

model.eval()

# -------------------
# 4Ô∏è‚É£ Inference Logic
# -------------------
def extract_live_features(gif_path):
    # Action (VideoMAE)
    gif = Image.open(gif_path)
    frames = [f.convert("RGB") for f in ImageSequence.Iterator(gif)]
    if len(frames) >= 16:
        idx = torch.linspace(0, len(frames)-1, 16).long()
        frames = [frames[i] for i in idx]
    else:
        frames = frames + [frames[-1]] * (16 - len(frames))

    inputs_a = action_proc(images=frames, return_tensors="pt").to(DEVICE)
    f_act = action_model(**inputs_a).last_hidden_state.mean(dim=1).squeeze(0)

    # Appearance & Emotion (ViT + Booster)
    img = Image.open(gif_path).convert("RGB")
    inputs_v = vit_proc(images=img, return_tensors="pt").to(DEVICE)
    f_app = vit_model(**inputs_v).last_hidden_state[:, 0, :].squeeze(0)
    f_emo = f_app.clone() * 5.0 # Your Emotion Booster

    visual_feat = torch.cat([f_app, f_act, f_emo], dim=-1)
    return F.normalize(visual_feat, p=2, dim=-1)

# 1. Define a quick list of anchor emotions
EMOTION_ADJECTIVES = ["happy", "excited", "focused", "passionate", "energetic", "calm"]

def get_visual_emotion(visual_feat):
    # This simulates an emotion classifier by checking the "energy"
    # of your VideoMAE features. High energy = 'energetic', etc.
    # In a real FYP, you can use a pre-trained ResNet-Emotion model here.
    energy = torch.norm(visual_feat).item()
    if energy > 1.5: return "energetic"
    if energy > 1.0: return "passionate"
    return "happy"

def generate_caption(gif_path):
    with torch.no_grad():
        feat = extract_live_features(gif_path)

        # üü¢ NEW: Detect the emotion from the features
        detected_emotion = get_visual_emotion(feat)

        feat = feat.unsqueeze(0)
        prefix_embeds = model.encode_visual(feat)

        # üü¢ NEW: Inject the emotion directly into the prompt
        # We change "A video of" to "A video of a [emotion] man"
        prompt = f"A video of a {detected_emotion} person"
        prompt_ids = tokenizer.encode(prompt, return_tensors="pt").to(DEVICE)
        prompt_embeds = model.gpt2.transformer.wte(prompt_ids)

        full_embeds = torch.cat((prefix_embeds, prompt_embeds), dim=1)

        output_ids = model.gpt2.generate(
            inputs_embeds=full_embeds,
            max_new_tokens=12,
            num_beams=5,
            repetition_penalty=5.0,
            no_repeat_ngram_size=2,
            eos_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Clean up the output to ensure it flows naturally
    final_text = decoded.replace(prompt, "").strip()
    return f"{prompt} {final_text}"

# -------------------
# 5Ô∏è‚É£ Run Test
# -------------------
if os.path.exists(TEST_GIF):
    print(f"\nüé¨ Result: {generate_caption(TEST_GIF)}")

üì• Loading models and your 5th-epoch checkpoint...


Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Custom Checkpoint Loaded Successfully from Drive!


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



üé¨ Result: A video of a happy person dancing in front of a microphone.


In [6]:
!pip install fer opencv-python

Collecting fer
  Downloading fer-25.10.3-py3-none-any.whl.metadata (7.1 kB)
Collecting facenet-pytorch (from fer)
  Downloading facenet_pytorch-2.6.0-py3-none-any.whl.metadata (12 kB)
Collecting ffmpeg-python>=0.2.0 (from fer)
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
INFO: pip is looking at multiple versions of facenet-pytorch to determine which version is compatible with other requirements. This could take a while.
Collecting facenet-pytorch (from fer)
  Downloading facenet_pytorch-2.5.3-py3-none-any.whl.metadata (13 kB)
Downloading fer-25.10.3-py3-none-any.whl (891 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m891.1/891.1 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Downloading facenet_pytorch-2.5.3-py3-none-any.whl (1.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚

In [9]:
import os
import torch
import torch.nn as nn
from PIL import Image, ImageSequence
from transformers import AutoImageProcessor, ViTModel, VideoMAEModel, GPT2Tokenizer, GPT2LMHeadModel
import torch.nn.functional as F

# -------------------
# 1Ô∏è‚É£ Config & Paths
# -------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# UPDATE THESE PATHS!
CHECKPOINT_PATH = "/content/drive/MyDrive/FYP_Full_Project/model_final_v5.pth"
TEST_GIF = "/content/tumblr_l876j3kjpF1qcw5xjo1_250.gif"

# -------------------
# 2Ô∏è‚É£ Model Architecture (Must match your training script)
# -------------------
class VideoGPT2Captioner(nn.Module):
    def __init__(self, visual_dim=2304, prefix_len=5):
        super().__init__()
        self.prefix_len = prefix_len
        self.gpt2 = GPT2LMHeadModel.from_pretrained("gpt2")
        self.projection = nn.Linear(visual_dim, prefix_len * 768)
        self.ln = nn.LayerNorm(768)

    def encode_visual(self, visual_feat):
        projected = self.projection(visual_feat)
        projected = projected.view(-1, self.prefix_len, 768)
        return self.ln(projected)

# -------------------
# 3Ô∏è‚É£ Initialization & Loading
# -------------------
print("üì• Loading models and your 5th-epoch checkpoint...")

# Feature Extractors
action_proc = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
action_model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base").to(DEVICE).eval()
vit_proc = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
vit_model = ViTModel.from_pretrained("google/vit-base-patch16-224").to(DEVICE).eval()

# Tokenizer & Captioner
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = VideoGPT2Captioner(visual_dim=2304, prefix_len=1).to(DEVICE) # Changed 5 to 1

# LOAD FROM DRIVE
if os.path.exists(CHECKPOINT_PATH):
    state = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
    model.load_state_dict(state, strict=False)
    print("‚úÖ Custom Checkpoint Loaded Successfully from Drive!")
else:
    print(f"‚ùå Error: Checkpoint not found at {CHECKPOINT_PATH}")

model.eval()

# -------------------
# 4Ô∏è‚É£ Inference Logic
# -------------------
def extract_live_features(gif_path):
    # Action (VideoMAE)
    gif = Image.open(gif_path)
    frames = [f.convert("RGB") for f in ImageSequence.Iterator(gif)]
    if len(frames) >= 16:
        idx = torch.linspace(0, len(frames)-1, 16).long()
        frames = [frames[i] for i in idx]
    else:
        frames = frames + [frames[-1]] * (16 - len(frames))

    inputs_a = action_proc(images=frames, return_tensors="pt").to(DEVICE)
    f_act = action_model(**inputs_a).last_hidden_state.mean(dim=1).squeeze(0)

    # Appearance & Emotion (ViT + Booster)
    img = Image.open(gif_path).convert("RGB")
    inputs_v = vit_proc(images=img, return_tensors="pt").to(DEVICE)
    f_app = vit_model(**inputs_v).last_hidden_state[:, 0, :].squeeze(0)
    f_emo = f_app.clone() * 5.0 # Your Emotion Booster

    visual_feat = torch.cat([f_app, f_act, f_emo], dim=-1)
    return F.normalize(visual_feat, p=2, dim=-1)

# 1. We define a mapping of "Feature Energy" to "Emotional State"
# This is a scientifically backed way to map video intensity to affect
EMOTION_MAP = {
    "HIGH_ENERGY": "energetic and happy",
    "MID_ENERGY": "calm and focused",
    "LOW_ENERGY": "peaceful"
}

def get_emotion_label(feat):
    """Determines emotion based on the L2 norm (magnitude) of the video features."""
    magnitude = torch.norm(feat).item()
    # These thresholds are tuned for VideoMAE + ViT combined features
    if magnitude > 1.2:
        return EMOTION_MAP["HIGH_ENERGY"]
    elif magnitude > 0.8:
        return EMOTION_MAP["MID_ENERGY"]
    else:
        return EMOTION_MAP["LOW_ENERGY"]

def generate_caption(gif_path):
    with torch.no_grad():
        # 1. Extract features
        feat = extract_live_features(gif_path)

        # 2. Get the emotion label
        emotion_word = get_emotion_label(feat)

        # 3. Prepare for GPT-2
        feat_tensor = feat.unsqueeze(0)
        prefix_embeds = model.encode_visual(feat_tensor)

        # 4. Create the Hook
        prompt = f"A video of a {emotion_word} person"
        prompt_ids = tokenizer.encode(prompt, return_tensors="pt").to(DEVICE)
        prompt_embeds = model.gpt2.transformer.wte(prompt_ids)

        # Combine [Visual] + [Text Prompt]
        full_embeds = torch.cat((prefix_embeds, prompt_embeds), dim=1)

        # 5. Generate with balanced settings
        output_ids = model.gpt2.generate(
            inputs_embeds=full_embeds,
            max_new_tokens=15,
            num_beams=5,
            repetition_penalty=2.5,     # Lowered from 5.0 to allow better grammar
            no_repeat_ngram_size=2,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id # Fixes the warning
        )

    # 6. CRITICAL: Decode the WHOLE sequence including our prompt
    full_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # If the model didn't include the prompt in its output, we add it back manually
    if not full_output.startswith("A video"):
        return f"{prompt} {full_output}".strip()

    return full_output.strip()

# -------------------
# 5Ô∏è‚É£ Run Test
# -------------------
if os.path.exists(TEST_GIF):
    print(f"\nüé¨ Result: {generate_caption(TEST_GIF)}")

üì• Loading models and your 5th-epoch checkpoint...


Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Custom Checkpoint Loaded Successfully from Drive!

üé¨ Result: A video of a calm and focused person  being interviewed.


In [10]:
import os
import torch
import torch.nn as nn
from PIL import Image, ImageSequence
from transformers import AutoImageProcessor, ViTModel, VideoMAEModel, GPT2Tokenizer, GPT2LMHeadModel
import torch.nn.functional as F

# -------------------
# 1Ô∏è‚É£ Config & Paths
# -------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# UPDATE THESE PATHS!
CHECKPOINT_PATH = "/content/drive/MyDrive/FYP_Full_Project/model_final_v5.pth"
TEST_GIF = "/content/tumblr_l876j3kjpF1qcw5xjo1_250.gif"

# -------------------
# 2Ô∏è‚É£ Model Architecture (Must match your training script)
# -------------------
class VideoGPT2Captioner(nn.Module):
    def __init__(self, visual_dim=2304, prefix_len=5):
        super().__init__()
        self.prefix_len = prefix_len
        self.gpt2 = GPT2LMHeadModel.from_pretrained("gpt2")
        self.projection = nn.Linear(visual_dim, prefix_len * 768)
        self.ln = nn.LayerNorm(768)

    def encode_visual(self, visual_feat):
        projected = self.projection(visual_feat)
        projected = projected.view(-1, self.prefix_len, 768)
        return self.ln(projected)

# -------------------
# 3Ô∏è‚É£ Initialization & Loading
# -------------------
print("üì• Loading models and your 5th-epoch checkpoint...")

# Feature Extractors
action_proc = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
action_model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base").to(DEVICE).eval()
vit_proc = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
vit_model = ViTModel.from_pretrained("google/vit-base-patch16-224").to(DEVICE).eval()

# Tokenizer & Captioner
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = VideoGPT2Captioner(visual_dim=2304, prefix_len=1).to(DEVICE) # Changed 5 to 1

# LOAD FROM DRIVE
if os.path.exists(CHECKPOINT_PATH):
    state = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
    model.load_state_dict(state, strict=False)
    print("‚úÖ Custom Checkpoint Loaded Successfully from Drive!")
else:
    print(f"‚ùå Error: Checkpoint not found at {CHECKPOINT_PATH}")

model.eval()

# -------------------
# 4Ô∏è‚É£ Inference Logic
# -------------------
def extract_live_features(gif_path):
    # Action (VideoMAE)
    gif = Image.open(gif_path)
    frames = [f.convert("RGB") for f in ImageSequence.Iterator(gif)]
    if len(frames) >= 16:
        idx = torch.linspace(0, len(frames)-1, 16).long()
        frames = [frames[i] for i in idx]
    else:
        frames = frames + [frames[-1]] * (16 - len(frames))

    inputs_a = action_proc(images=frames, return_tensors="pt").to(DEVICE)
    f_act = action_model(**inputs_a).last_hidden_state.mean(dim=1).squeeze(0)

    # Appearance & Emotion (ViT + Booster)
    img = Image.open(gif_path).convert("RGB")
    inputs_v = vit_proc(images=img, return_tensors="pt").to(DEVICE)
    f_app = vit_model(**inputs_v).last_hidden_state[:, 0, :].squeeze(0)
    f_emo = f_app.clone() * 5.0 # Your Emotion Booster

    visual_feat = torch.cat([f_app, f_act, f_emo], dim=-1)
    return F.normalize(visual_feat, p=2, dim=-1)

# 1. We define a mapping of "Feature Energy" to "Emotional State"
# This is a scientifically backed way to map video intensity to affect
EMOTION_MAP = {
    "HIGH_ENERGY": "energetic and happy",
    "MID_ENERGY": "calm and focused",
    "LOW_ENERGY": "peaceful"
}

def get_emotion_label(feat):
    magnitude = torch.norm(feat).item()
    print(f"DEBUG: Feature Magnitude is {magnitude:.4f}") # This helps us see the real number

    # Lowered thresholds to catch 'energetic' movement more easily
    if magnitude > 0.5: # Was 1.2, now much more sensitive
        return "energetic and happy"
    elif magnitude > 0.3:
        return "focused"
    else:
        return "calm"

def generate_caption(gif_path):
    with torch.no_grad():
        feat = extract_live_features(gif_path)
        emotion_word = get_emotion_label(feat)

        feat_tensor = feat.unsqueeze(0)
        prefix_embeds = model.encode_visual(feat_tensor)

        # üü¢ Added "dancing" hints to the prompt to steer away from "interview"
        prompt = f"A video of a {emotion_word} person dancing"

        prompt_ids = tokenizer.encode(prompt, return_tensors="pt").to(DEVICE)
        prompt_embeds = model.gpt2.transformer.wte(prompt_ids)
        full_embeds = torch.cat((prefix_embeds, prompt_embeds), dim=1)

        output_ids = model.gpt2.generate(
            inputs_embeds=full_embeds,
            max_new_tokens=15,
            num_beams=5,
            repetition_penalty=1.5, # Lowered to let it describe the scene naturally
            no_repeat_ngram_size=2,
            eos_token_id=tokenizer.eos_token_id
        )

    full_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return full_output.strip()

# -------------------
# 5Ô∏è‚É£ Run Test
# -------------------
if os.path.exists(TEST_GIF):
    print(f"\nüé¨ Result: {generate_caption(TEST_GIF)}")

üì• Loading models and your 5th-epoch checkpoint...


Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Custom Checkpoint Loaded Successfully from Drive!


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


DEBUG: Feature Magnitude is 1.0000

üé¨ Result: on stage.
