In [1]:
import os 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
import ast
import json
import torch.nn.utils.rnn as rnn_utils
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch
import random
import nltk

2025-06-04 09:18:19.814146: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-04 09:18:19.844860: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-06-04 09:18:20.529334: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
device

device(type='cuda')

### Load the preprocessed video corpus and frames metadata

In [4]:
# Caption table: 'PaddedSequence' is stored as JSON text in the CSV, so each
# cell is decoded back into a Python list after reading.
video_corpus = pd.read_csv('data/final_video_corpus.csv')
video_corpus['PaddedSequence'] = video_corpus['PaddedSequence'].map(json.loads)

# Per-video frame bookkeeping table.
frames_metadata = pd.read_csv('data/frames_metadata.csv')

In [5]:
video_corpus.head()

Unnamed: 0,VideoID,Description,PaddedSequence
0,mv89psg6zh4_33_46,<bos> a bird is bathing in a sink <eos>,"[3, 2, 253, 5, 554, 9, 2, 465, 4, 0, 0, 0, 0, ..."
1,mv89psg6zh4_33_46,<bos> a bird is splashing around under a runni...,"[3, 2, 253, 5, 1, 81, 318, 2, 47, 903, 4, 0, 0..."
2,mv89psg6zh4_33_46,<bos> a bird is bathing in a sink <eos>,"[3, 2, 253, 5, 554, 9, 2, 465, 4, 0, 0, 0, 0, ..."
3,mv89psg6zh4_33_46,<bos> a faucet is running while a bird stands ...,"[3, 2, 903, 5, 47, 90, 2, 253, 1087, 9, 6, 465..."
4,mv89psg6zh4_33_46,<bos> a bird is playing in a sink with running...,"[3, 2, 253, 5, 11, 9, 2, 465, 15, 47, 32, 4, 0..."


In [6]:
frames_metadata.head()

Unnamed: 0,VideoID,Key_Frames,Total_Frames
0,-4wsuPCjDBc_5_15,0,50
1,-7KMZQEsJW4_205_208,0,50
2,-8y1Q0rA3n8_108_115,3,50
3,-8y1Q0rA3n8_95_102,4,50
4,-9CUm-2cui8_39_44,2,50


#### Expect no duplicates in the frames metadata

In [10]:
# Report whether any VideoID occurs more than once in the frame metadata.
has_duplicate_videos = frames_metadata.duplicated(subset=['VideoID']).any()
print(
    "There are duplicates present in the frames_metadata"
    if has_duplicate_videos
    else "There are no duplicates present in the frames_metadata"
)

There are no duplicates present in the frames_metadata


### Perform train test split 

In [5]:
# Split on video ids (not caption rows) so every caption of a video ends up in
# the same partition.
all_video_ids = frames_metadata['VideoID'].unique()
train_ids, test_ids = train_test_split(all_video_ids, test_size=0.2, random_state=42)

in_train = video_corpus['VideoID'].isin(train_ids)
in_test = video_corpus['VideoID'].isin(test_ids)
train_corpus = video_corpus.loc[in_train].reset_index(drop=True)
test_corpus = video_corpus.loc[in_test].reset_index(drop=True)

### Load visual features

In [6]:
features_dir = 'data/ExtractedFeatures/'
x_data = {}

# Expected layout: <features_dir>/<video_id>/<video_id>.npy
# Entries that are not directories are ignored; directories without the
# expected .npy file only produce a warning.
for video_id in os.listdir(features_dir):
    subdir = os.path.join(features_dir, video_id)
    if os.path.isdir(subdir):
        feature_path = os.path.join(subdir, f"{video_id}.npy")
        if not os.path.exists(feature_path):
            print(f"Warning: Feature file missing for {video_id}")
        else:
            x_data[video_id] = np.load(feature_path)

In [7]:
# Inspect the first loaded video's feature array to confirm its shape.
video_id = list(x_data)[0]
features = x_data[video_id]
print("Feature shape:", features.shape)

Feature shape: (50, 4096)


In [8]:
# Keep only captions whose video actually has a loaded feature array.
# The test split is filtered as well: later cells index x_data directly with a
# VideoID drawn from test_corpus, which would raise KeyError for a video whose
# features were never loaded.
valid_video_ids = set(x_data.keys())
train_corpus = train_corpus[train_corpus['VideoID'].isin(valid_video_ids)].reset_index(drop=True)
test_corpus = test_corpus[test_corpus['VideoID'].isin(valid_video_ids)].reset_index(drop=True)

### Model Architecture and Dataset

In [9]:
class EncoderLSTM(nn.Module):
    """LSTM encoder that reduces a feature sequence to its final LSTM state.

    Args:
        input_size: Size of each feature vector fed to the LSTM.
        hidden_size: Size of the LSTM hidden and cell states.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)

    def forward(self, features):
        """Encode ``features`` and return the LSTM's final state.

        Args:
            features: Tensor laid out as (batch, seq_len, input_size).

        Returns:
            Tuple ``(hidden, cell)`` with the final hidden and cell states.
            The per-step LSTM outputs are discarded.
        """
        _, final_state = self.lstm(features)
        hidden, cell = final_state
        return hidden, cell

class DecoderLSTM(nn.Module):
    """Caption decoder: token ids -> embeddings -> LSTM -> vocabulary logits.

    Args:
        embed_size: Dimension of the token embeddings.
        hidden_size: Size of the LSTM hidden and cell states.
        vocab_size: Number of rows in the embedding table and number of
            logits produced per step.
    """

    def __init__(self, embed_size, hidden_size, vocab_size):
        super().__init__()
        # Token id 0 is registered as the padding index of the embedding.
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_seq, hidden, cell):
        """Decode ``input_seq`` starting from the given LSTM state.

        Args:
            input_seq: Token ids laid out as (batch, seq_len).
            hidden: Initial LSTM hidden state.
            cell: Initial LSTM cell state.

        Returns:
            Tuple ``(logits, hidden, cell)``: one row of vocabulary logits per
            input position, plus the LSTM state after the last position.
        """
        token_vectors = self.embedding(input_seq)
        step_outputs, (next_hidden, next_cell) = self.lstm(token_vectors, (hidden, cell))
        logits = self.linear(step_outputs)
        return logits, next_hidden, next_cell

class VideoCaptioningModel(nn.Module):
    """Encoder-decoder wrapper: the encoder's final state seeds the decoder.

    Args:
        encoder: Module whose forward returns ``(hidden, cell)``.
        decoder: Module whose forward takes ``(input_seq, hidden, cell)`` and
            returns a 3-tuple whose first element is the per-step output.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, features, input_seq):
        """Return the decoder's per-step outputs for ``input_seq``.

        The decoder's returned hidden and cell states are dropped.
        """
        init_hidden, init_cell = self.encoder(features)
        logits, _, _ = self.decoder(input_seq, init_hidden, init_cell)
        return logits

In [10]:
class VideoCaptionDataset(Dataset):
    """Dataset yielding one (features, decoder input, decoder target) triple per caption row.

    Args:
        corpus_df: DataFrame with columns 'VideoID' and 'PaddedSequence'
            (a list of ints per row).
        x_data: dict mapping VideoID -> np.array of per-frame features.
        tokenizer: Optional tokenizer object. It is only stored on the
            instance (e.g. for looking up the vocabulary size) and is not used
            when building items, so it may be omitted.
    """

    def __init__(self, corpus_df, x_data, tokenizer=None):
        self.corpus_df = corpus_df
        self.x_data = x_data
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of caption rows (not the number of distinct videos)."""
        return len(self.corpus_df)

    def __getitem__(self, idx):
        """Build the training triple for caption row ``idx``.

        Returns:
            Tuple ``(features, input_seq, target_seq)``:
            features is the video's feature array as a float tensor;
            input_seq is the caption without its last token;
            target_seq is the caption without its first token.

        Raises:
            KeyError: If the row's VideoID has no entry in ``x_data``.
        """
        row = self.corpus_df.iloc[idx]

        # Features: (seq_len, feature_dim)
        features = torch.tensor(self.x_data[row['VideoID']], dtype=torch.float)

        # Caption sequence, e.g. [<bos>, w1, w2, ..., <eos>, 0, 0, ...]
        caption = torch.tensor(row['PaddedSequence'], dtype=torch.long)

        # Shift by one position: the decoder reads token t and is trained to
        # produce token t+1.
        return features, caption[:-1], caption[1:]

In [11]:
def collate_fn(batch):
    """Assemble a list of dataset items into padded batch tensors.

    Args:
        batch: Sequence of ``(features, input_seq, target_seq)`` triples.

    Returns:
        Tuple ``(padded_features, input_seqs, target_seqs)``. Features are
        zero-padded along their first dimension to the longest item and
        stacked to (batch, max_len, feat_dim); both caption tensors are padded
        with 0 and laid out batch-first.
    """
    features, input_seqs, target_seqs = zip(*batch)

    # Bring every feature sequence up to the length of the longest one by
    # appending zero rows; sequences already at that length are used as-is.
    longest = max(item.shape[0] for item in features)
    padded_features = torch.stack([
        item if item.shape[0] == longest
        else torch.cat(
            [item, torch.zeros(longest - item.shape[0], item.shape[1], device=item.device)],
            dim=0,
        )
        for item in features
    ])  # (batch, max_len, feat_dim)

    # Captions: pad with 0, batch dimension first.
    return (
        padded_features,
        rnn_utils.pad_sequence(input_seqs, batch_first=True, padding_value=0),
        rnn_utils.pad_sequence(target_seqs, batch_first=True, padding_value=0),
    )

## Training 

In [13]:
import pickle

# Restore the fitted tokenizer object from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load a tokenizer.pkl that comes from a trusted source.
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

In [11]:
# One dataset item per caption row of the training split; batches of 16 are
# reshuffled every epoch and assembled by collate_fn.
train_dataset = VideoCaptionDataset(train_corpus, x_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

In [None]:
# Model dimensions. input_size matches the 4096-wide feature rows printed above.
input_size = 4096
hidden_size = 512
embed_size = 256
# presumably word_index ids start at 1, leaving id 0 for padding - TODO confirm
vocab_size = len(tokenizer.word_index) + 1

# Build the sub-modules, then move the whole model to the target device in one
# step (moving the wrapper moves the encoder and decoder it contains).
encoder = EncoderLSTM(input_size, hidden_size)
decoder = DecoderLSTM(embed_size, hidden_size, vocab_size)
model = VideoCaptioningModel(encoder, decoder).to(device)

# Positions whose target id is 0 do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [13]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(features, input_seq)`` and returning
            (batch, seq_len, vocab_size) logits.
        dataloader: Sized iterable of ``(features, input_seq, target_seq)``
            tensor triples.
        criterion: Loss called with flattened logits and flattened targets.
        optimizer: Optimizer stepping the model's parameters.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        The mean of the per-batch losses (sum of batch losses divided by the
        number of batches).
    """
    model.train()
    running_loss = 0
    for batch in dataloader:
        features, input_seq, target_seq = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        logits = model(features, input_seq)  # (batch, seq_len, vocab_size)

        # The criterion receives one row of logits and one target id per
        # token position.
        vocab_dim = logits.size(-1)
        loss = criterion(logits.view(-1, vocab_dim), target_seq.view(-1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(dataloader)

In [49]:
# Stage 1 (epochs 1-10). train_epoch is the function defined in the previous
# cell; the identical second definition that used to sit here only shadowed it.
for epoch in range(10):
    loss = train_epoch(model, train_loader, criterion, optimizer, device)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")

# Persist the weights reached after this stage: the next training cell starts
# by loading exactly this file, so it has to be written here for the notebook
# to run top to bottom.
torch.save(model.state_dict(), 'video_caption_model_1.pth')

Epoch 1, Loss: 3.1743
Epoch 2, Loss: 2.2833
Epoch 3, Loss: 1.9604
Epoch 4, Loss: 1.7886
Epoch 5, Loss: 1.6800
Epoch 6, Loss: 1.6017
Epoch 7, Loss: 1.5378
Epoch 8, Loss: 1.4850
Epoch 9, Loss: 1.4391
Epoch 10, Loss: 1.3978


In [65]:
# Stage 2 (epochs 11-20): resume from the weights saved as checkpoint 1.
model.load_state_dict(torch.load('video_caption_model_1.pth', map_location=device))
model.to(device)

# A new Adam instance is created here, so its running statistics start over.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(10, 20):
    loss = train_epoch(model, train_loader, criterion, optimizer, device)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")

# Write the checkpoint that the following training stage and the evaluation
# cells load; without this save the file is never produced by the notebook.
torch.save(model.state_dict(), 'video_caption_model_2.pth')

Epoch 11, Loss: 1.3647
Epoch 12, Loss: 1.3234
Epoch 13, Loss: 1.2896
Epoch 14, Loss: 1.2567
Epoch 15, Loss: 1.2262
Epoch 16, Loss: 1.1980
Epoch 17, Loss: 1.1695
Epoch 18, Loss: 1.1432
Epoch 19, Loss: 1.1180
Epoch 20, Loss: 1.0922


In [20]:
# Stage 3 (epochs 21-30): resume from the weights saved as checkpoint 2.
model.load_state_dict(torch.load('video_caption_model_2.pth', map_location=device))
model.to(device)

# A new Adam instance is created here, so its running statistics start over.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(20, 30):
    loss = train_epoch(model, train_loader, criterion, optimizer, device)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")

# Write the checkpoint that the following training stage and the evaluation
# cells load; without this save the file is never produced by the notebook.
torch.save(model.state_dict(), 'video_caption_model_3.pth')

Epoch 21, Loss: 1.0733
Epoch 22, Loss: 1.0465
Epoch 23, Loss: 1.0235
Epoch 24, Loss: 1.0012
Epoch 25, Loss: 0.9806
Epoch 26, Loss: 0.9606
Epoch 27, Loss: 0.9412
Epoch 28, Loss: 0.9224
Epoch 29, Loss: 0.9044
Epoch 30, Loss: 0.8874


In [46]:
# Stage 4 (epochs 31-40): resume from the weights stored in checkpoint 3.
model.load_state_dict(torch.load('video_caption_model_3.pth', map_location=device))
model.to(device)

# A new Adam instance is created here, so its running statistics start over.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(30, 40):  
    loss = train_epoch(model, train_loader, criterion, optimizer, device)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")

Epoch 31, Loss: 0.8756
Epoch 32, Loss: 0.8562
Epoch 33, Loss: 0.8410
Epoch 34, Loss: 0.8262
Epoch 35, Loss: 0.8122
Epoch 36, Loss: 0.8000
Epoch 37, Loss: 0.7876
Epoch 38, Loss: 0.7758
Epoch 39, Loss: 0.7645
Epoch 40, Loss: 0.7542


In [47]:
# Persist the current model weights as checkpoint 4 (loaded by the evaluation cells).
torch.save(model.state_dict(), 'video_caption_model_4.pth')

## Evaluation

In [15]:
def generate_caption(model, features, tokenizer, max_len=20):
    """Greedily decode a caption for a single video.

    Args:
        model: Captioning model exposing ``encoder`` and ``decoder``
            sub-modules. It is switched to eval mode and left in that mode.
        features: Feature tensor for one video without a batch dimension; the
            batch dimension is added here.
        tokenizer: Object with a ``word_index`` dict mapping word -> id. It
            must contain 'bos'; 'eos' is looked up with a fallback of 0.
        max_len: Maximum number of decoding steps.

    Returns:
        The generated words joined by single spaces. Decoding stops early when
        the end-of-sequence id is predicted (that id is not included). Ids
        with no entry in ``word_index`` are rendered as '<unk>'.

    Raises:
        KeyError: If 'bos' is missing from ``tokenizer.word_index``.
    """
    model.eval()

    # Decode on the device the model's parameters live on instead of relying
    # on a module-level `device` variable; a parameter-free model keeps the
    # features where they already are.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else features.device
    features = features.unsqueeze(0).to(target_device)  # add batch dimension

    # The tokenizer stores the markers without angle brackets.
    bos_id = tokenizer.word_index['bos']
    eos_id = tokenizer.word_index.get('eos', 0)

    generated = []
    with torch.no_grad():
        hidden, cell = model.encoder(features)
        input_seq = torch.tensor([[bos_id]], device=target_device)

        for _ in range(max_len):
            output, hidden, cell = model.decoder(input_seq, hidden, cell)
            # Greedy choice: highest-scoring id at the single decoded step.
            predicted_id = output.squeeze(1).argmax(1).item()

            if predicted_id == eos_id:
                break

            generated.append(predicted_id)
            # Feed the prediction back in as the next decoder input.
            input_seq = torch.tensor([[predicted_id]], device=target_device)

    inv_map = {v: k for k, v in tokenizer.word_index.items()}
    return ' '.join(inv_map.get(i, '<unk>') for i in generated)

In [43]:
input_size = 4096
hidden_size = 512
embed_size = 256
vocab_size = len(tokenizer.word_index) + 1

# Rebuild the architecture and load the checkpoint-4 weights into it.
model = VideoCaptioningModel(EncoderLSTM(input_size, hidden_size), DecoderLSTM(embed_size, hidden_size, vocab_size)).to(device)
model.load_state_dict(torch.load('video_caption_model_4.pth', map_location=device))

# Only sample among test videos that actually have features loaded, so the
# x_data lookup below cannot raise KeyError.
candidate_ids = [vid for vid in test_corpus['VideoID'].unique() if vid in x_data]
random_video_id = random.choice(candidate_ids)
print(f"Random video ID: {random_video_id}")

features = torch.tensor(x_data[random_video_id], dtype=torch.float)
caption = generate_caption(model, features, tokenizer)
print("Generated caption:", caption)

Random video ID: 1N_Ic2pBM1o_2_23
Generated caption: a young man is playing the guitar


In [44]:
input_size = 4096
hidden_size = 512
embed_size = 256
vocab_size = len(tokenizer.word_index) + 1

# Rebuild the architecture and load the checkpoint-4 weights into it.
model = VideoCaptioningModel(EncoderLSTM(input_size, hidden_size), DecoderLSTM(embed_size, hidden_size, vocab_size)).to(device)
model.load_state_dict(torch.load('video_caption_model_4.pth', map_location=device))

# Only sample among test videos that actually have features loaded, so the
# x_data lookup below cannot raise KeyError.
candidate_ids = [vid for vid in test_corpus['VideoID'].unique() if vid in x_data]
random_video_id = random.choice(candidate_ids)
print(f"Random video ID: {random_video_id}")

features = torch.tensor(x_data[random_video_id], dtype=torch.float)
caption = generate_caption(model, features, tokenizer)
print("Generated caption:", caption)

Random video ID: _6OTzzK7t9Y_158_170
Generated caption: a man is playing a piano


In [19]:
input_size = 4096
hidden_size = 512
embed_size = 256
vocab_size = len(tokenizer.word_index) + 1

# Rebuild the architecture and load the checkpoint-1 weights into it.
model = VideoCaptioningModel(EncoderLSTM(input_size, hidden_size), DecoderLSTM(embed_size, hidden_size, vocab_size)).to(device)
model.load_state_dict(torch.load('video_caption_model_1.pth', map_location=device))

# Computed once outside the loop: test videos that actually have features
# loaded, so the x_data lookup below cannot raise KeyError.
candidate_ids = [vid for vid in test_corpus['VideoID'].unique() if vid in x_data]

for _ in range(10):
    random_video_id = random.choice(candidate_ids)
    print(f"Random video ID: {random_video_id}")

    features = torch.tensor(x_data[random_video_id], dtype=torch.float)
    caption = generate_caption(model, features, tokenizer)
    print("Generated caption:", caption)

Random video ID: tHLiYTS9Iz8_1_16
Generated caption: a boy is playing with a ball
Random video ID: 2mUMTFnQWaw_1_9
Generated caption: a man is riding a horse
Random video ID: ZvJvNcukZ4w_0_10
Generated caption: a <unk> is eating
Random video ID: eZLxohGP4IE_15_25
Generated caption: a man is cutting a <unk>
Random video ID: 04Gt01vatkk_308_321
Generated caption: a woman is slicing an onion
Random video ID: HZ-BuDDmvVk_0_10
Generated caption: a woman is <unk> a small animal
Random video ID: 5x_OGEdO6Z8_0_21
Generated caption: a man is <unk> a <unk> on a <unk>
Random video ID: kWLNZzuo3do_24_31
Generated caption: a woman is cutting a <unk>
Random video ID: nULE40HEWpA_5_11
Generated caption: a woman is playing with a <unk>
Random video ID: 3SKyc0aKx70_46_52
Generated caption: a man is playing a <unk>


In [16]:
input_size = 4096
hidden_size = 512
embed_size = 256
vocab_size = len(tokenizer.word_index) + 1

# Rebuild the architecture and load the checkpoint-4 weights into it.
model = VideoCaptioningModel(EncoderLSTM(input_size, hidden_size), DecoderLSTM(embed_size, hidden_size, vocab_size)).to(device)
model.load_state_dict(torch.load('video_caption_model_4.pth', map_location=device))

# Computed once outside the loop: test videos that actually have features
# loaded, so the x_data lookup below cannot raise KeyError.
candidate_ids = [vid for vid in test_corpus['VideoID'].unique() if vid in x_data]

for _ in range(10):
    random_video_id = random.choice(candidate_ids)
    print(f"Random video ID: {random_video_id}")

    features = torch.tensor(x_data[random_video_id], dtype=torch.float)
    caption = generate_caption(model, features, tokenizer)
    print("Generated caption:", caption)

Random video ID: gGDtPJzh_0s_30_45
Generated caption: a person is slicing a piece of bread
Random video ID: lv8d_qLLqsk_1_20
Generated caption: a <unk> <unk> <unk> <unk> <unk> <unk> <unk> to be <unk> to be singing
Random video ID: io2dbV-Qbus_215_247
Generated caption: a man is cutting something
Random video ID: 08pVpBq706k_175_212
Generated caption: a cat is playing with a <unk> of white
Random video ID: dtwXtwJByYk_5_14
Generated caption: a man is <unk>
Random video ID: 08pVpBq706k_175_212
Generated caption: a cat is playing with a <unk> of white
Random video ID: nTasT5h0LEg_12_14
Generated caption: a funny animal
Random video ID: HO_ovIrLWfQ_1_11
Generated caption: a woman is swimming in a water
Random video ID: 9BScZRpF7SI_31_36
Generated caption: a man is playing with <unk>
Random video ID: c_XV7nPoRg8_2_12
Generated caption: a man is playing a <unk> on the <unk>


In [72]:
video_corpus.iloc[62927]

VideoID                                         8MVo7fje_oE_130_136
Description       <bos> a man drains water out of a container of...
PaddedSequence    [3, 2, 7, 1, 32, 57, 14, 2, 319, 14, 336, 4, 0...
Name: 62927, dtype: object

In [None]:
# Fetch the 'punkt_tab' NLTK resource.
# presumably needed by word_tokenize in the BLEU evaluation below - TODO confirm
nltk.download('punkt_tab')

In [63]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize

def evaluate_bleu_score(model, test_corpus, x_data, tokenizer, device):
    """Average smoothed BLEU (unigram + bigram) of generated captions.

    Every row of ``test_corpus`` contributes one score: the caption generated
    for the row's video is compared against that row's single reference
    description. Rows whose video has no entry in ``x_data`` are skipped.

    Args:
        model: Captioning model passed to ``generate_caption``.
        test_corpus: DataFrame with 'VideoID' and 'Description' columns.
        x_data: dict mapping VideoID -> feature array.
        tokenizer: Tokenizer passed to ``generate_caption``.
        device: Device the feature tensors are created on.

    Returns:
        Mean sentence-level BLEU over the scored rows, or 0 when no row could
        be scored.
    """
    model.eval()
    smoothie = SmoothingFunction().method4
    total_bleu = 0
    n_samples = 0

    # The same video appears on many rows (one per reference caption), and
    # decoding picks the arg-max token at every step, so the candidate for a
    # video is generated and tokenised once and then reused for its other rows.
    candidate_cache = {}

    for _, row in test_corpus.iterrows():
        video_id = row['VideoID']
        if video_id not in x_data:
            continue

        if video_id not in candidate_cache:
            # generate_caption adds the batch dimension itself.
            features = torch.tensor(x_data[video_id], dtype=torch.float, device=device)
            generated = generate_caption(model, features, tokenizer)  # returns a sentence string
            candidate_cache[video_id] = word_tokenize(generated.strip().lower())
        candidate = candidate_cache[video_id]

        # Strip the sequence markers from the reference before tokenising.
        reference = word_tokenize(row['Description'].replace("<bos>", "").replace("<eos>", "").strip().lower())

        # Compute BLEU score (unigram + bigram)
        bleu_score = sentence_bleu([reference], candidate, smoothing_function=smoothie, weights=(0.5, 0.5))
        total_bleu += bleu_score
        n_samples += 1

    # max(..., 1) avoids a division by zero when every row was skipped.
    return total_bleu / max(n_samples, 1)

In [64]:
# Compare the four saved checkpoints on the test split.
# Each checkpoint gets its own freshly built model so all four stay available
# afterwards under the names model1..model4.
checkpoint_models = []
for stage in range(1, 5):
    stage_model = VideoCaptioningModel(EncoderLSTM(input_size, hidden_size),
                                       DecoderLSTM(embed_size, hidden_size, vocab_size)).to(device)
    stage_model.load_state_dict(torch.load(f'video_caption_model_{stage}.pth', map_location=device))
    stage_model.eval()
    checkpoint_models.append(stage_model)

model1, model2, model3, model4 = checkpoint_models

bleu_scores = [
    evaluate_bleu_score(checkpoint_model, test_corpus, x_data, tokenizer, device)
    for checkpoint_model in checkpoint_models
]
bleu1, bleu2, bleu3, bleu4 = bleu_scores

# The label is derived from the loop index, so the fourth line is reported as
# model4 (it was previously mislabelled as model3).
for stage, score in enumerate(bleu_scores, start=1):
    print(f"Average BLEU score on test set using model{stage}: {score:.4f}")

Average BLEU score on test set using model1: 0.1734
Average BLEU score on test set using model2: 0.1685
Average BLEU score on test set using model3: 0.1586
Average BLEU score on test set using model3: 0.1548
