In [2]:
print("Hello World!!")

Hello World!!


# Imports

In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import ViTFeatureExtractor, ViTModel, BertTokenizer, BertModel
import os
import json
import cv2
import pandas as pd
from torchvision import transforms
from PIL import Image

# Define Classes

In [7]:
class VideoEncoder(nn.Module):
    def __init__(self, pretrained_model_name, hidden_size):
        super(VideoEncoder, self).__init__()
        self.feature_extractor = ViTFeatureExtractor.from_pretrained(pretrained_model_name)
        self.vit_model = ViTModel.from_pretrained(pretrained_model_name)
        self.hidden_size = hidden_size
    
    def forward(self, video_frames):
        # video_frames: (batch_size, num_frames, channels, height, width)
        
        batch_size, num_frames, _, _, _ = video_frames.size()
        
        # Reshape video_frames to (batch_size * num_frames, channels, height, width)
        video_frames = video_frames.view(-1, *video_frames.shape[2:])
        
        # Extract features using ViT
        inputs = self.feature_extractor(images=video_frames, return_tensors="pt")
        inputs = {key: value.to(video_frames.device) for key, value in inputs.items()}
        
        with torch.no_grad():
            outputs = self.vit_model(**inputs)
        
        # Extract the features from the model's output
        features = outputs.last_hidden_state  # (batch_size * num_frames, seq_len, hidden_size)
        
        # Reshape features to (batch_size, num_frames, seq_len, hidden_size)
        features = features.view(batch_size, num_frames, *features.shape[1:])
        
        return features


In [15]:
class TextEncoder(nn.Module):
    def __init__(self, pretrained_model_name, hidden_size):
        super(TextEncoder, self).__init__()
        self.bert_tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
        self.bert_model = BertModel.from_pretrained(pretrained_model_name)
        self.hidden_size = hidden_size
    
    def forward(self, captions):
        # captions: (batch_size, seq_len)
        
        # Tokenize captions and get BERT embeddings
        input_ids = captions
        attention_mask = (input_ids != 0).float()  # Create attention mask (0 indicates padding)
        
        with torch.no_grad():
            outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Extract the BERT embeddings from the model's output
        embeddings = outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)
        
        return embeddings

In [24]:
def similarity_loss(video_encoded, text_encoded):
    # Normalize the encodings
    video_encoded = F.normalize(video_encoded, p=2, dim=-1)
    text_encoded = F.normalize(text_encoded, p=2, dim=-1)

    # Calculate cosine similarity
    similarity = torch.matmul(video_encoded, text_encoded.transpose(1, 2))

    # Calculate cross-entropy loss
    # In this case, you want to maximize similarity, so use -log(probability) as the loss
    loss = -torch.log(similarity + 1e-8)  # Adding a small epsilon to avoid log(0)

    # Calculate the mean loss over the batch
    loss = torch.mean(loss)

    return loss

In [25]:
class VideoCaptioningModel(nn.Module):
    def __init__(self, video_encoder, text_encoder):
        super(VideoCaptioningModel, self).__init__()
        self.video_encoder = video_encoder
        self.text_encoder = text_encoder

    def forward(self, video_features, captions):
        video_encoded = self.video_encoder(video_features)
        text_encoded = self.text_encoder(captions)
        similarity = similarity_loss(video_encoded, text_encoded)
        return similarity

model = VideoCaptioningModel(video_encoder, text_encoder)

In [76]:
# Define a custom dataset class for video-caption pairs
class VideoCaptionDataset(Dataset):
    def __init__(self, json_path, video_folder, transform=None):
        self.video_folder = video_folder
        self.transform = transform
        self.data = pd.DataFrame(self.load_json_data(json_path)["sentences"])
        self.data.set_index('video_id', inplace=True)

    def __len__(self):
        return len(self.data.index.unique())

    def __getitem__(self, idx):
        video_idx = f'video{idx}'
        video_path = os.path.join(self.video_folder, f'{video_idx}.mp4')
        captions = self.data.loc[video_idx]["caption"].tolist()

        # Load video frames and apply transformations
        video_frames = self.load_video_frames(video_path)

        if self.transform:
            video_frames = [self.transform(frame) for frame in video_frames]

        return video_frames, captions

    def load_json_data(self, json_path):
        with open(json_path, 'r') as json_file:
            data = json.load(json_file)
        return data

    def load_video_frames(self, video_path):
        frames = []
        cap = cv2.VideoCapture(videopath)

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # Convert frame to PIL image
            frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.BGR2RGB))
            frames.append(frame_pil)

        cap.release()
        return frames

# Setup video encoder

In [77]:
# Define transformations for video frames (you can customize these)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Define paths and create data loaders for training and validation
json_path = 'train_val_annotation/train_val_videodatainfo.json'  # Path to your JSON file
video_folder = 'TrainValVideo'  # Path to the folder containing video files

dataset = VideoCaptionDataset(json_path, video_folder, transform=transform)

# Split the dataset into training and validation
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create data loaders
batch_size = 32  # Adjust as needed
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [78]:
video_encoder = VideoEncoder(pretrained_model_name="google/vit-base-patch16-224-in21k", hidden_size=768)
text_encoder = TextEncoder(pretrained_model_name="bert-base-uncased", hidden_size=768) 



In [65]:
model = VideoCaptioningModel(video_encoder, text_encoder)
criterion = nn.MSELoss()  # You can use any suitable loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [66]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        video_features, captions = batch
        optimizer.zero_grad()

        similarity = model(video_features, captions)

        # Backpropagation
        similarity.backward()
        optimizer.step()

        total_loss += similarity.item()

        average_loss = total_loss / len(train_loader)
        print(f'Epoch [{epoch+1}/{num_epochs}, Loss: {average_loss:.4f}')



KeyError: 'video69796'

In [30]:
json_path = "train_val_annotation/train_val_videodatainfo.json"
with open(json_path, 'r') as json_file:
    data = json.load(json_file)


In [42]:
data["sentences"][:5]

[{'caption': 'a cartoon animals runs through an ice cave in a video game',
  'video_id': 'video2960',
  'sen_id': 0},
 {'caption': 'a cartoon character runs around inside of a video game',
  'video_id': 'video2960',
  'sen_id': 1},
 {'caption': 'a character is running in the snow',
  'video_id': 'video2960',
  'sen_id': 2},
 {'caption': 'a person plays a video game centered around ice age the movie',
  'video_id': 'video2960',
  'sen_id': 3},
 {'caption': 'a person plays online and records themselves',
  'video_id': 'video2960',
  'sen_id': 4}]

In [44]:
df = pd.DataFrame(data["sentences"])
df

Unnamed: 0,caption,video_id,sen_id
0,a cartoon animals runs through an ice cave in ...,video2960,0
1,a cartoon character runs around inside of a vi...,video2960,1
2,a character is running in the snow,video2960,2
3,a person plays a video game centered around ic...,video2960,3
4,a person plays online and records themselves,video2960,4
...,...,...,...
140195,two soldiers speak to a camera outside,video140,140195
140196,two soldiers talking about another soldier,video140,140196
140197,two soldiers talking on duty,video140,140197
140198,two troops speak with one another,video140,140198


In [45]:
df.set_index('video_id', inplace=True)
df

Unnamed: 0_level_0,caption,sen_id
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1
video2960,a cartoon animals runs through an ice cave in ...,0
video2960,a cartoon character runs around inside of a vi...,1
video2960,a character is running in the snow,2
video2960,a person plays a video game centered around ic...,3
video2960,a person plays online and records themselves,4
...,...,...
video140,two soldiers speak to a camera outside,140195
video140,two soldiers talking about another soldier,140196
video140,two soldiers talking on duty,140197
video140,two troops speak with one another,140198


In [75]:
len(df.index.unique())

7010

In [69]:
df2 = pd.DataFrame(data["videos"])
df2

Unnamed: 0,category,url,video_id,start time,end time,split,id
0,9,https://www.youtube.com/watch?v=9lZi22qLlEo,video0,137.72,149.44,train,0
1,16,https://www.youtube.com/watch?v=w4JM08PDEng,video1,184.33,206.89,train,1
2,9,https://www.youtube.com/watch?v=QA7KVQq9vKA,video2,31.17,41.24,train,2
3,8,https://www.youtube.com/watch?v=QFmJZ0GU6yc,video3,48.26,58.51,train,3
4,14,https://www.youtube.com/watch?v=2q-dONPhzis,video4,268.58,278.83,train,4
...,...,...,...,...,...,...,...
7005,1,https://www.youtube.com/watch?v=cvzoU0yy73s,video7005,1028.68,1039.29,validate,7005
7006,7,https://www.youtube.com/watch?v=8vIzjd9ceDY,video7006,300.40,311.96,validate,7006
7007,14,https://www.youtube.com/watch?v=1Ov5f-B_pqg,video7007,1086.57,1098.48,validate,7007
7008,11,https://www.youtube.com/watch?v=wcZNNvMM9Jk,video7008,88.44,104.72,validate,7008


In [62]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7f5f0efcdb50>