In [5]:
import os
import cv2
import torch
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

# Download NLTK data: fetch the 'punkt' package into the local nltk_data directory.
# NOTE(review): the cells visible here tokenize captions with str.split(), so
# punkt may not actually be needed — confirm before removing.
nltk.download('punkt')


[nltk_data] Downloading package punkt to C:\Users\Abhishek
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Run on the GPU when PyTorch reports one is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Function to extract frames from video
def extract_frames(video_path, num_frames=16):
    """Sample `num_frames` RGB frames, resized to 224x224, spread over a video.

    Frame positions are spaced evenly across the frame count reported by
    OpenCV, so the samples cover the whole clip rather than only its start
    when the count is not an exact multiple of `num_frames`.

    Args:
        video_path: Path of the video file to read.
        num_frames: Number of frames to return; must be positive.

    Returns:
        A list of exactly `num_frames` frames (BGR converted to RGB, then
        resized to 224x224), or None when the file does not exist, cannot be
        opened, or fewer than `num_frames` frames could be read.

    Raises:
        ValueError: If `num_frames` is not positive.
    """
    if num_frames <= 0:
        raise ValueError(f"num_frames must be positive, got {num_frames}")
    if not os.path.exists(video_path):
        print(f"Error: Video file does not exist {video_path}")
        return None

    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Error: Could not open video {video_path}")
            return None

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames >= num_frames:
            # Evenly spaced positions: 0, total/num, 2*total/num, ...
            frame_indices = [(k * total_frames) // num_frames for k in range(num_frames)]
        else:
            # Too short to satisfy the request; still read what exists so the
            # warning below reports how many frames were actually available.
            frame_indices = range(total_frames)

        for index in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, index)
            ret, frame = cap.read()
            if ret:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(cv2.resize(frame, (224, 224)))
    finally:
        # Release the capture handle on every exit path, including errors.
        cap.release()

    if len(frames) < num_frames:
        print(f"Warning: Video {video_path} has fewer frames than expected ({len(frames)} out of {num_frames})")
        return None
    return frames


In [7]:
# Restore the tokenizer, feature extractor, and fine-tuned model from one
# checkpoint directory, then put the model on `device` in inference mode
model_save_path = 'copy_fine_tuned_vit_gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_save_path)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_save_path)
model = VisionEncoderDecoderModel.from_pretrained(model_save_path).to(device)
model.eval()

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [30]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
# Load the checkpoint stored under `fine_tuned_vit_gpt2`; this (re)binds the
# notebook-level `model`, `feature_extractor`, `tokenizer`, and `device` names
model_save_path = 'fine_tuned_vit_gpt2'
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = AutoTokenizer.from_pretrained(model_save_path)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_save_path)
model = VisionEncoderDecoderModel.from_pretrained(model_save_path).to(device)

# Switch to inference mode
model.eval()

def generate_caption(model, feature_extractor, tokenizer, frames):
    """Generate a text caption for a sequence of video frames.

    Args:
        model: Captioning model exposing `.device` and `.generate(pixel_values)`.
        feature_extractor: Callable that turns `frames` into an object with a
            `pixel_values` tensor (called with `return_tensors="pt"`).
        tokenizer: Tokenizer whose `decode` turns generated ids into text.
        frames: The frames to caption, passed to the feature extractor as
            `images`.

    Returns:
        The decoded caption string, with special tokens stripped.
    """
    inputs = feature_extractor(images=frames, return_tensors="pt")
    # Send the inputs to whatever device the given model lives on, instead of
    # relying on a notebook-level `device` variable that may be stale or
    # refer to a different device than this model.
    pixel_values = inputs.pixel_values.to(model.device)
    output_ids = model.generate(pixel_values)
    # NOTE(review): all frames go through generate() together, but only the
    # first generated sequence is decoded — presumably that is the caption of
    # the first frame only; confirm this is the intended video-level caption.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Example usage: caption a single clip end to end
video_path = 'videos_1000/1033277738.mp4'
frames = extract_frames(video_path)
if not frames:
    print("Could not generate caption due to insufficient frames or read error.")
else:
    caption = generate_caption(model, feature_extractor, tokenizer, frames)
    print(f"Generated Caption: {caption}")

Generated Caption: A beautiful young girl in the field with her hands in the air. beautiful girl in the field


In [31]:
# %% Custom BLEU score calculation without using Fraction
from nltk.util import ngrams
from collections import Counter
import math

def custom_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25)):
    """Compute an unsmoothed sentence-level BLEU score against one reference.

    The score is the brevity penalty times the weighted geometric mean of the
    clipped n-gram precisions, where the i-th weight applies to i-grams.

    Args:
        reference: Iterable of reference tokens.
        candidate: Iterable of candidate (generated) tokens.
        weights: Per-order weights; position i (1-based) weights the i-gram
            precision. Orders with a weight of 0 are ignored entirely, so
            e.g. (1, 0, 0, 0) yields BLEU-1.

    Returns:
        A float in [0, 1]. Returns 0.0 when the candidate is empty, when no
        order has a non-zero weight, or when any weighted order has no
        n-gram in common with the reference (no smoothing is applied).
    """
    # Materialize so len() and slicing work for any iterable input.
    reference = list(reference)
    candidate = list(candidate)
    if not candidate:
        # Nothing was generated; also avoids dividing by len(candidate) below.
        return 0.0

    def _ngram_counts(tokens, n):
        # Count each run of n consecutive tokens, keyed by tuple.
        return Counter(zip(*(tokens[k:] for k in range(n))))

    log_precision = 0.0
    has_active_order = False
    for n, weight in enumerate(weights, start=1):
        if weight == 0:
            continue
        has_active_order = True
        candidate_counts = _ngram_counts(candidate, n)
        # Counter intersection clips each n-gram's count to its count in the
        # reference, giving the matched n-gram count for this order.
        matched = sum((candidate_counts & _ngram_counts(reference, n)).values())
        if matched == 0:
            # A zero precision makes the geometric mean zero.
            return 0.0
        log_precision += weight * math.log(matched / sum(candidate_counts.values()))

    if not has_active_order:
        return 0.0

    # Brevity penalty: 1 when the candidate is at least as long as the
    # reference, exp(1 - r/c) otherwise.
    bp = math.exp(1 - max(1.0, len(reference) / len(candidate)))
    return bp * math.exp(log_precision)

# %% Function to evaluate model using BLEU score
def evaluate_model(model, feature_extractor, tokenizer, video_paths, actual_captions, device):
    """Average the BLEU score of generated captions over a set of videos.

    For each (video, caption) pair, frames are extracted, a caption is
    generated, both captions are split on whitespace, and `custom_bleu` is
    computed. Videos for which no frames could be extracted are skipped and
    do not count toward the average.

    Args:
        model: Captioning model; switched to eval mode before scoring.
        feature_extractor: Passed through to `generate_caption`.
        tokenizer: Passed through to `generate_caption`.
        video_paths: Sequence of video file paths.
        actual_captions: Sequence of ground-truth caption strings, aligned
            with `video_paths`.
        device: Not used by this function; kept so existing calls still work.

    Returns:
        The mean BLEU score over the videos that were scored, or 0 when none
        were scored.

    Raises:
        ValueError: If `video_paths` and `actual_captions` differ in length.
    """
    if len(video_paths) != len(actual_captions):
        # zip() would silently drop the unmatched tail and skew the average.
        raise ValueError(
            f"Got {len(video_paths)} video paths but {len(actual_captions)} captions"
        )

    model.eval()
    bleu_scores = []

    for video_path, true_caption in zip(video_paths, actual_captions):
        frames = extract_frames(video_path)
        if not frames:
            # extract_frames already reported why this video is unusable.
            continue

        generated_caption = generate_caption(model, feature_extractor, tokenizer, frames)
        reference = true_caption.split()
        gen_caption = generated_caption.split()

        # Print generated and reference captions for debugging
        print(f"Reference: {reference}")
        print(f"generated_caption: {gen_caption}")

        bleu_scores.append(custom_bleu(reference, gen_caption))

    if not bleu_scores:
        return 0
    # Average only over videos that were actually scored, so skipped videos
    # are not counted as zero-score results.
    return sum(bleu_scores) / len(bleu_scores)


In [33]:
# Example usage: score one clip against its ground-truth caption
video_paths = [
    'videos_1000/1033277738.mp4',  # Add more paths as needed
]
actual_captions = [
    "Happy girl in short skirt runs skipping in wheat field at sunset. concept of freedom.",  # actual captions
]
avg_bleu_score = evaluate_model(
    model, feature_extractor, tokenizer, video_paths, actual_captions, device
)
print(f'Average BLEU Score: {avg_bleu_score}')

Reference: ['Happy', 'girl', 'in', 'short', 'skirt', 'runs', 'skipping', 'in', 'wheat', 'field', 'at', 'sunset.', 'concept', 'of', 'freedom.']
generated_caption: ['A', 'beautiful', 'young', 'girl', 'in', 'the', 'field', 'with', 'her', 'hands', 'in', 'the', 'air.', 'beautiful', 'girl', 'in', 'the', 'field']
Average BLEU Score: 0.0
