In [1]:
print("Hello World!")

Hello World!


In [2]:
import json
import os

In [3]:

# Specify the path to your JSON file
json_file_path = "train_val_annotation/train_val_videodatainfo.json"

try:
    # Get the file size in bytes
    file_size_bytes = os.path.getsize(json_file_path)

    # Convert bytes to a more human-readable format (e.g., megabytes)
    file_size_megabytes = file_size_bytes / (1024 * 1024)  # 1 MB = 1024 KB, 1 KB = 1024 bytes

    print(f"The file size of '{json_file_path}' is approximately {file_size_megabytes:.2f} megabytes.")
except FileNotFoundError:
    print(f"The file {json_file_path} was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

The file size of 'train_val_annotation/train_val_videodatainfo.json' is approximately 15.40 megabytes.


# Main Code starts here

# Read the data

In [4]:
try:
    with open(json_file_path, "r", encoding="utf-8") as json_file:
        json_data = json.load(json_file)
except FileNotFoundError:
    print(f"The file {json_file_path} was not found.")
except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")
except Exception as e:
    print(f"An error occurred: {e}")

In [5]:
json_data.keys()

dict_keys(['info', 'videos', 'sentences'])

In [6]:
json_data["info"] # useless

{'contributor': 'Microsoft MSM group',
 'data_created': '2016-04-14 14:30:20',
 'version': '1.0',
 'description': 'This is 1.0 version of the 2016 MSR-VTT dataset.',
 'year': '2016'}

In [7]:
json_data["videos"][:5]

[{'category': 9,
  'url': 'https://www.youtube.com/watch?v=9lZi22qLlEo',
  'video_id': 'video0',
  'start time': 137.72,
  'end time': 149.44,
  'split': 'train',
  'id': 0},
 {'category': 16,
  'url': 'https://www.youtube.com/watch?v=w4JM08PDEng',
  'video_id': 'video1',
  'start time': 184.33,
  'end time': 206.89,
  'split': 'train',
  'id': 1},
 {'category': 9,
  'url': 'https://www.youtube.com/watch?v=QA7KVQq9vKA',
  'video_id': 'video2',
  'start time': 31.17,
  'end time': 41.24,
  'split': 'train',
  'id': 2},
 {'category': 8,
  'url': 'https://www.youtube.com/watch?v=QFmJZ0GU6yc',
  'video_id': 'video3',
  'start time': 48.26,
  'end time': 58.51,
  'split': 'train',
  'id': 3},
 {'category': 14,
  'url': 'https://www.youtube.com/watch?v=2q-dONPhzis',
  'video_id': 'video4',
  'start time': 268.58,
  'end time': 278.83,
  'split': 'train',
  'id': 4}]

In [8]:
json_data["sentences"][:5]

[{'caption': 'a cartoon animals runs through an ice cave in a video game',
  'video_id': 'video2960',
  'sen_id': 0},
 {'caption': 'a cartoon character runs around inside of a video game',
  'video_id': 'video2960',
  'sen_id': 1},
 {'caption': 'a character is running in the snow',
  'video_id': 'video2960',
  'sen_id': 2},
 {'caption': 'a person plays a video game centered around ice age the movie',
  'video_id': 'video2960',
  'sen_id': 3},
 {'caption': 'a person plays online and records themselves',
  'video_id': 'video2960',
  'sen_id': 4}]

In [9]:
len(json_data["sentences"]), len(json_data["videos"])

(140200, 7010)

In [10]:
int(140200/7010) # 20 captions per video

20

In [11]:
sample_json_data = {
    "info": json_data["info"],
    "videos": json_data["videos"][:5],
    "sentences": json_data["sentences"][:5]
}

In [12]:
sample_json_data

{'info': {'contributor': 'Microsoft MSM group',
  'data_created': '2016-04-14 14:30:20',
  'version': '1.0',
  'description': 'This is 1.0 version of the 2016 MSR-VTT dataset.',
  'year': '2016'},
 'videos': [{'category': 9,
   'url': 'https://www.youtube.com/watch?v=9lZi22qLlEo',
   'video_id': 'video0',
   'start time': 137.72,
   'end time': 149.44,
   'split': 'train',
   'id': 0},
  {'category': 16,
   'url': 'https://www.youtube.com/watch?v=w4JM08PDEng',
   'video_id': 'video1',
   'start time': 184.33,
   'end time': 206.89,
   'split': 'train',
   'id': 1},
  {'category': 9,
   'url': 'https://www.youtube.com/watch?v=QA7KVQq9vKA',
   'video_id': 'video2',
   'start time': 31.17,
   'end time': 41.24,
   'split': 'train',
   'id': 2},
  {'category': 8,
   'url': 'https://www.youtube.com/watch?v=QFmJZ0GU6yc',
   'video_id': 'video3',
   'start time': 48.26,
   'end time': 58.51,
   'split': 'train',
   'id': 3},
  {'category': 14,
   'url': 'https://www.youtube.com/watch?v=2q-dON

In [13]:
sample_json_data["videos"]

[{'category': 9,
  'url': 'https://www.youtube.com/watch?v=9lZi22qLlEo',
  'video_id': 'video0',
  'start time': 137.72,
  'end time': 149.44,
  'split': 'train',
  'id': 0},
 {'category': 16,
  'url': 'https://www.youtube.com/watch?v=w4JM08PDEng',
  'video_id': 'video1',
  'start time': 184.33,
  'end time': 206.89,
  'split': 'train',
  'id': 1},
 {'category': 9,
  'url': 'https://www.youtube.com/watch?v=QA7KVQq9vKA',
  'video_id': 'video2',
  'start time': 31.17,
  'end time': 41.24,
  'split': 'train',
  'id': 2},
 {'category': 8,
  'url': 'https://www.youtube.com/watch?v=QFmJZ0GU6yc',
  'video_id': 'video3',
  'start time': 48.26,
  'end time': 58.51,
  'split': 'train',
  'id': 3},
 {'category': 14,
  'url': 'https://www.youtube.com/watch?v=2q-dONPhzis',
  'video_id': 'video4',
  'start time': 268.58,
  'end time': 278.83,
  'split': 'train',
  'id': 4}]

In [14]:
sample_json_data["sentences"]

[{'caption': 'a cartoon animals runs through an ice cave in a video game',
  'video_id': 'video2960',
  'sen_id': 0},
 {'caption': 'a cartoon character runs around inside of a video game',
  'video_id': 'video2960',
  'sen_id': 1},
 {'caption': 'a character is running in the snow',
  'video_id': 'video2960',
  'sen_id': 2},
 {'caption': 'a person plays a video game centered around ice age the movie',
  'video_id': 'video2960',
  'sen_id': 3},
 {'caption': 'a person plays online and records themselves',
  'video_id': 'video2960',
  'sen_id': 4}]

In [15]:
os.listdir("TrainValVideo")[:5]

['video5351.mp4',
 'video1149.mp4',
 'video5207.mp4',
 'video20.mp4',
 'video12.mp4']

# Preprocessing Video Frames

In [16]:
!pip install --upgrade pip
!pip install opencv-python



In [17]:
import cv2
import torch

In [18]:
def preprocess_frame(frame):
    # Resize frame to a fixed size (e.g., 224x224)
    frame = cv2.resize(frame, (224, 224))
    # Normalize pixel values to [0, 1] and convert to PyTorch tensor
    frame = torch.tensor(frame / 255.0, dtype=torch.float32)
    return frame

# Load a video and preprocess its frames
cap = cv2.VideoCapture("TrainValVideo/video0.mp4")
frames = []

while True:
    ret, frame = cap.read()
    if not ret:
        break
    frame = preprocess_frame(frame)
    frames.append(frame)

cap.release()

Data Preparation:

Download the Dataset: First, download the MSR-VTT dataset, which includes video files and associated captions.

Preprocess Video Frames: Extract video frames from the video files. Depending on your chosen video feature extraction approach, you may need to sample frames at regular intervals, extract optical flow, or use other techniques to preprocess the videos.

Tokenize Captions: Tokenize the captions into subword tokens using a suitable tokenizer. BERT models typically work with subword tokens, and you can use libraries like Hugging Face Transformers for tokenization.

Create Data Splits: Divide the dataset into training, validation, and test sets.

Feature Extraction:

Video Feature Extraction: Extract video features from the preprocessed video frames using a Vision Transformer (ViT) model. You can use pre-trained ViT models for this purpose. Some libraries like Hugging Face Transformers may provide pre-trained ViT models that can be fine-tuned for feature extraction on your specific dataset.

Text Embeddings: Convert the tokenized captions into BERT embeddings. You can use a pre-trained BERT model to encode the captions into dense representations.

Model Architecture:

Combine Vision and Text: Create a model architecture that combines the extracted video features and BERT embeddings. This can be done through various fusion techniques, such as concatenation, attention mechanisms, or multimodal embeddings.

Caption Generation: Design a caption generation model (e.g., recurrent neural network or transformer-based) that takes the fused features as input and generates captions one token at a time.

Training:

Loss Function: Define an appropriate loss function for training the caption generation model. Common choices include cross-entropy loss or custom loss functions that encourage diversity and fluency in generated captions.

Training Procedure: Train the model using the training split of the dataset. Monitor performance on the validation set to prevent overfitting.

Inference:

Beam Search or Sampling: During inference, use the trained model to generate captions for test videos. You can use techniques like beam search or sampling to generate diverse captions for each video.
Evaluation:

Use standard evaluation metrics like BLEU, METEOR, ROUGE, and CIDEr to assess the quality of generated captions. Evaluate the model on the test set to measure its performance.
Fine-Tuning:

Depending on the results, you may consider fine-tuning the model's hyperparameters or architecture to improve caption quality.
Deployment:

Once you are satisfied with the model's performance, you can deploy it for generating captions for new videos.

# Batch Processing

In [19]:
frames[0].shape

torch.Size([224, 224, 3])

In [20]:
import torch

# Assume frames is a list of preprocessed video frames
frames_tensor = torch.stack(frames)  # Convert the list to a tensor

In [21]:
from transformers import ViTFeatureExtractor, ViTModel
import torch

# Load the ViT feature extractor and model
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
model.eval()  # Set the model to evaluation mode



ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ViTLayer(
        (attention): ViTAttention(
          (attention): ViTSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ViTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ViTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation(

# Extract video features

In [22]:
import GPUtil

def monitor_gpu():
    GPUs = GPUtil.getGPUs()
    for i, gpu in enumerate(GPUs):
        print(f"GPU {i} - GPU Name: {gpu.name}")
        print(f"  GPU Usage: {gpu.load * 100}%")
        print(f"  GPU Memory Usage: {gpu.memoryFree} MB out of {gpu.memoryTotal} MB")

monitor_gpu()


GPU 0 - GPU Name: Tesla T4
  GPU Usage: 0.0%
  GPU Memory Usage: 10463.0 MB out of 15360.0 MB


In [23]:
# Assuming frames_tensor is your tensor of preprocessed frames
with torch.no_grad():
    # Use the feature extractor to prepare the input
    inputs = feature_extractor(images=frames_tensor, return_tensors="pt")

    # Move the entire inputs dictionary to GPU
    inputs = {key: value.to("cuda") for key, value in inputs.items()}
    model = model.to("cuda")
    
    # Forward pass through the model to get the features
    outputs = model(**inputs)
    
    # Extract the features from the model's output
    features = outputs.last_hidden_state  # This contains the features

# BERT Embeddings

In [24]:
sample_json_data

{'info': {'contributor': 'Microsoft MSM group',
  'data_created': '2016-04-14 14:30:20',
  'version': '1.0',
  'description': 'This is 1.0 version of the 2016 MSR-VTT dataset.',
  'year': '2016'},
 'videos': [{'category': 9,
   'url': 'https://www.youtube.com/watch?v=9lZi22qLlEo',
   'video_id': 'video0',
   'start time': 137.72,
   'end time': 149.44,
   'split': 'train',
   'id': 0},
  {'category': 16,
   'url': 'https://www.youtube.com/watch?v=w4JM08PDEng',
   'video_id': 'video1',
   'start time': 184.33,
   'end time': 206.89,
   'split': 'train',
   'id': 1},
  {'category': 9,
   'url': 'https://www.youtube.com/watch?v=QA7KVQq9vKA',
   'video_id': 'video2',
   'start time': 31.17,
   'end time': 41.24,
   'split': 'train',
   'id': 2},
  {'category': 8,
   'url': 'https://www.youtube.com/watch?v=QFmJZ0GU6yc',
   'video_id': 'video3',
   'start time': 48.26,
   'end time': 58.51,
   'split': 'train',
   'id': 3},
  {'category': 14,
   'url': 'https://www.youtube.com/watch?v=2q-dON

In [25]:
# Assuming json_data is your JSON data
video_id_to_extract = 'video0'

# Find the dictionary for "video0" in the "videos" list
video_info = next((video for video in json_data['videos'] if video['video_id'] == video_id_to_extract), None)

if video_info:
    # Print or use the extracted information
    print("Video ID:", video_info['video_id'])
    print("Category:", video_info['category'])
    print("URL:", video_info['url'])
    print("Start Time:", video_info['start time'])
    print("End Time:", video_info['end time'])
    print("Split:", video_info['split'])
    print("ID:", video_info['id'])
else:
    print(f"Video with ID {video_id_to_extract} not found.")

Video ID: video0
Category: 9
URL: https://www.youtube.com/watch?v=9lZi22qLlEo
Start Time: 137.72
End Time: 149.44
Split: train
ID: 0


In [26]:
# Find all captions for "video0" in the "sentences" list
captions_for_video0 = [sentence['caption'] for sentence in json_data['sentences'] if sentence['video_id'] == video_id_to_extract]

# Print or use the extracted captions
if captions_for_video0:
    print("Captions for Video0:")
    for caption in captions_for_video0:
        print(caption)
else:
    print(f"No captions found for Video ID {video_id_to_extract}.")

Captions for Video0:
a car is shown
a group is dancing
a man drives a vehicle through the countryside
a man drives down the road in an audi
a man driving a car
a man is driving a car
a man is driving down a road
a man is driving in a car as part of a commercial
a man is driving
a man riding the car speedly in a narrow road
a man showing the various features of a car
a man silently narrates his experience driving an audi
a person is driving his car around curves in the road
a person telling about a car
guy driving a car down the road
man talking about a car while driving
the man drives the car
the man driving the audi as smooth as possible
a man is driving
guy driving a car down the road


In [27]:
from transformers import BertTokenizer, BertModel
import torch

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# TODO: Change this line later for being more generic
captions = captions_for_video0

# Use batch_encode_plus to tokenize and pad the captions as a batch
encoding = tokenizer.batch_encode_plus(
    captions,
    return_tensors="pt",
    padding=True,  # Pad to the maximum sequence length within the batch
    truncation=True,  # Truncate to the maximum sequence length if exceeded
    max_length=128,  # Set the maximum sequence length
    return_attention_mask=True,
)

# Forward pass through the BERT model to get the embeddings
with torch.no_grad():
    outputs = model(**encoding)

# Extract the embeddings from the BERT model's output
embeddings = outputs.last_hidden_state

In [28]:
embeddings.shape

torch.Size([20, 14, 768])

In [29]:
features.shape

torch.Size([300, 197, 768])

In [30]:
frames_tensor.shape

torch.Size([300, 224, 224, 3])

Encoding Modalities:

Encode the text (captions) using a language model like BERT to obtain text embeddings.
Extract video features using a Vision Transformer (ViT) or similar model to obtain video embeddings.
Attention Mechanism:

Use an attention mechanism (e.g., self-attention or cross-modal attention) to compute attention scores that capture the relationships between the text and video embeddings.
Combine the text and video embeddings based on these attention scores. This step typically involves weighted summation or concatenation.
Decoding and Caption Generation:

Feed the combined embeddings into a captioning model (e.g., an LSTM or transformer-based decoder) to generate captions.

For the MSR-VTT dataset specifically, you might consider starting with an architecture like "Show, Attend, and Tell (SAT)," which combines visual attention with text decoding. This architecture has been successful for video captioning tasks and uses an attention mechanism to focus on relevant video regions while generating captions.



# SAT Model

In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SATModel(nn.Module):
    def __init__(self, video_feature_dim, hidden_dim, num_words):
        super(SATModel, self).__init__()

        # Video Feature Module (e.g., Vision Transformer)
        self.video_feature_dim = video_feature_dim
        self.video_feature_extractor = nn.Sequential(
            nn.Linear(video_feature_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )

        # LSTM Decoder with Attention
        self.lstm = nn.LSTMCell(hidden_dim, hidden_dim)
        self.attention_linear = nn.Linear(hidden_dim, self.video_feature_dim)  # Automatically determine the video feature dimension
        self.context_linear = nn.Linear(self.video_feature_dim, hidden_dim)

        # Caption Generation Module
        self.caption_linear = nn.Linear(hidden_dim, num_words)

    def forward(self, video_features, captions):
        batch_size, max_seq_length, _ = captions.size()
        _, num_video_frames, _ = video_features.size()  # Automatically determine the number of video frames

        # Initialize hidden states
        h = torch.zeros(batch_size, self.lstm.hidden_size).to(captions.device)
        c = torch.zeros(batch_size, self.lstm.hidden_size).to(captions.device)

        # Lists to store generated captions
        generated_captions = []

        # Loop through time steps
        for t in range(max_seq_length):
            # LSTM input: context vector
            lstm_input = h

            # LSTM step
            h, c = self.lstm(lstm_input, (h, c))

            # Compute attention scores
            attention_scores = F.softmax(self.attention_linear(h), dim=1)

            # Calculate context vector
            context_vector = torch.bmm(attention_scores.unsqueeze(1), video_features)
            context_vector = self.context_linear(context_vector.squeeze(1))

            # Combine LSTM output and context vector
            combined_output = h + context_vector

            # Generate captions
            caption_scores = self.caption_linear(combined_output)
            generated_captions.append(caption_scores)

        # Stack caption scores into a tensor
        generated_captions = torch.stack(generated_captions, dim=1)

        return generated_captions

# Initialize the SAT model
video_feature_dim = features.size(0)  # Automatically determine the video feature dimension
hidden_dim = 512  # Adjust as needed
num_words = 10000  # Adjust based on your vocabulary size

model = SATModel(video_feature_dim, hidden_dim, num_words)

# Define your loss and optimizer for training
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Forward pass with your video features and embeddings
video_features = features  # Assuming you have the video features
captions = embeddings  # Assuming you have the embeddings for captions

outputs = model(video_features, captions)

# Compute loss and backpropagate
loss = criterion(outputs.view(-1, num_words), captions.view(-1))
optimizer.zero_grad()
loss.backward()
optimizer.step()


IndexError: index out of range in self

In [33]:
embeddings.size()

torch.Size([20, 14, 768])

In [34]:
features.size()

torch.Size([300, 197, 768])

In [40]:
embed = nn.Embedding(14, 768)

In [None]:
for t in range(max_seq_length):
            # LSTM input: previous word embedding and context vector
            word_embed = self.word_embeddings(captions[:, t, :])

In [44]:
embed(captions[0])

IndexError: index out of range in self

In [45]:
# Assuming you have BERT embeddings of shape [20, 14, 768]
bert_embeddings = torch.randn(20, 14, 768)

# Define the vocabulary size and embedding dimension
vocab_size = 1000  # Adjust this according to your use case
embedding_dim = 300  # Adjust this according to your use case

# Create an nn.Embedding layer
embedding_layer = nn.Embedding(vocab_size, embedding_dim)



In [46]:
# Assuming you want to "embed" the BERT embeddings
# Reshape the BERT embeddings to match the expected shape for nn.Embedding
bert_embeddings_flat = bert_embeddings.view(-1, 768)

bert_embeddings_flat

tensor([[ 0.2598, -0.4100, -2.0121,  ..., -0.1620,  0.3682,  1.7746],
        [ 1.0279, -0.7828, -0.5770,  ..., -0.2357,  0.6445,  0.4374],
        [-0.3376,  1.3353,  0.9249,  ...,  1.5208,  1.4703,  2.5637],
        ...,
        [-2.3290,  0.5772,  0.6292,  ...,  0.3657,  0.0304,  0.6979],
        [ 0.1277, -1.3416,  1.1254,  ...,  0.4654,  0.2355,  0.1187],
        [-0.4762, -1.2175,  2.0629,  ...,  0.9348, -1.2710, -0.1460]])

In [49]:
bert_embeddings_flat.long()

tensor([[ 0,  0, -2,  ...,  0,  0,  1],
        [ 1,  0,  0,  ...,  0,  0,  0],
        [ 0,  1,  0,  ...,  1,  1,  2],
        ...,
        [-2,  0,  0,  ...,  0,  0,  0],
        [ 0, -1,  1,  ...,  0,  0,  0],
        [ 0, -1,  2,  ...,  0, -1,  0]])

https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Image-Captioning/tree/master

In [50]:
# Pass the reshaped BERT embeddings through the nn.Embedding layer
embedded_result = embedding_layer(bert_embeddings_flat.long())

# Now, the embedded_result will be of shape [20 * 14, 768]
# You can reshape it back to [20, 14, 768] if needed
embedded_result = embedded_result.view(20, 14, 768)

IndexError: index out of range in self

>Caption Embedding at input is so useless

https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Image-Captioning/tree/master

In [1]:
bert_embeddings_flat.shape

NameError: name 'bert_embeddings_flat' is not defined

# Working Model Code here

In [50]:
pooled_embeddings = torch.mean(embeddings, dim=0)

In [51]:
pooled_embeddings.shape

torch.Size([14, 768])

In [52]:
pooled_features = torch.mean(features, dim=0)

In [53]:
pooled_features.shape

torch.Size([197, 768])

In [54]:
# Calculate the mean of 'pooled_embeddings' along axis 1 to match the dimension of 'pooled_features'
pooled_embeddings_reduced = torch.mean(pooled_embeddings, dim=-2, keepdim=True)  # Resulting shape: [1, 1, 768]

In [55]:
pooled_features_reduced = torch.mean(pooled_features, dim=-2, keepdim=True)

In [56]:
pooled_embeddings_reduced.shape, pooled_features_reduced.shape

(torch.Size([1, 768]), torch.Size([1, 768]))

In [57]:
import torch.nn.functional as F

In [61]:
device = "cuda"
pooled_features_reduced = pooled_features_reduced.to(device)
pooled_embeddings_reduced = pooled_embeddings_reduced.to(device)

In [62]:
cosine_sim = F.cosine_similarity(pooled_features_reduced, pooled_embeddings_reduced, dim=-1)
cosine_sim

tensor([-0.0116], device='cuda:0')

In [70]:
# Define temperature parameter
tau = 1  # You can adjust this value based on your problem

In [71]:
# Apply softmax with temperature to both sets of embeddings
softmax_pooled_features = F.softmax(pooled_features_reduced / tau, dim=1)
softmax_pooled_embeddings = F.softmax(pooled_embeddings_reduced / tau, dim=1)

In [72]:
# Calculate the cross-entropy loss
loss = -torch.sum(softmax_pooled_embeddings * torch.log(softmax_pooled_features + 1e-10)) / softmax_pooled_embeddings.size(0)

# Print or use the loss as needed
print("Cross-Entropy Loss:", loss.item())

Cross-Entropy Loss: 6.652477741241455


# Full training code from here

In [75]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

In [82]:
class VideoEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(VideoEncoder, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
    
    def forward(self, video_features):
        # video_features: (batch_size, seq_len, input_size)
        output, _ = self.rnn(video_features)
        return output


from transformers import ViTFeatureExtractor, ViTModel
import torch
import torch.nn as nn

class VideoEncoder(nn.Module):
    def __init__(self, pretrained_model_name, hidden_size):
        super(VideoEncoder, self).__init__()
        self.feature_extractor = ViTFeatureExtractor.from_pretrained(pretrained_model_name)
        self.vit_model = ViTModel.from_pretrained(pretrained_model_name)
        self.hidden_size = hidden_size
    
    def forward(self, video_frames):
        # video_frames: (batch_size, num_frames, channels, height, width)
        
        batch_size, num_frames, _, _, _ = video_frames.size()
        
        # Reshape video_frames to (batch_size * num_frames, channels, height, width)
        video_frames = video_frames.view(-1, *video_frames.shape[2:])
        
        # Extract features using ViT
        inputs = self.feature_extractor(images=video_frames, return_tensors="pt")
        inputs = {key: value.to(video_frames.device) for key, value in inputs.items()}
        
        with torch.no_grad():
            outputs = self.vit_model(**inputs)
        
        # Extract the features from the model's output
        features = outputs.last_hidden_state  # (batch_size * num_frames, seq_len, hidden_size)
        
        # Reshape features to (batch_size, num_frames, seq_len, hidden_size)
        features = features.view(batch_size, num_frames, *features.shape[1:])
        
        return features


In [83]:
class TextEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super(TextEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, batch_first=True)
    
    def forward(self, captions):
        # captions: (batch_size, seq_len)
        embedded = self.embedding(captions)
        output, _ = self.rnn(embedded)
        return output

In [79]:
class VideoCaptioningModel(nn.Module):
    def __init__(self, video_encoder, text_encoder):
        super(VideoCaptioningModel, self).__init__()
        self.video_encoder = video_encoder
        self.text_encoder = text_encoder

    def forward(self, video_features, captions):
        video_encoded = self.video_encoder(video_features)
        text_encoded = self.text_encoder(captions)
        similarity = sim_func(video_encoded, text_encoded)
        return similarity

    

In [80]:
import os
import json
import cv2
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

# Define a custom dataset class for video-caption pairs
class VideoCaptionDataset(Dataset):
    def __init__(self, json_path, video_folder, transform=None):
        self.video_folder = video_folder
        self.transform = transform
        self.data = self.load_json_data(json_path)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        video_info = self.data[idx]
        video_id = video_info['video_id']
        video_path = os.path.join(slef.video_folder, f'{video_id}.mp4')
        captions = video_info['captions']

        # Load video frames and apply transformations
        video_frames = self.load_video_frames(video_path)

        if self.transform:
            video_frames = [self.transform(frame) for frame in video_frames]

        return video_frames, captions

    def load_json_data(self, json_path):
        with open(json_path, 'r') as json_file:
            data = json.load(json_file)
        return data

    def load_video_frames(self, video_path):
        frames = []
        cap = cv2.VideoCapture(videopath)

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # Convert frame to PIL image
            frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.BGR2RGB))
            frames.append(frame_pil)

        cap.release()
        return frames

# Define transformations for video frames (you can customize these)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Define paths and create data loaders for training and validation
json_path = 'train_val_annotation/train_val_videodatainfo.json'  # Path to your JSON file
video_folder = 'TrainValVideo'  # Path to the folder containing video files

dataset = VideoCaptionDataset(json_path, video_folder, transform=transform)

# Split the dataset into training and validation
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create data loaders
batch_size = 32  # Adjust as needed
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [81]:
model = VideoCaptioningModel(video_encoder, text_encoder)
criterion = nn.MSELoss()  # You can use any suitable loss function
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

NameError: name 'video_encoder' is not defined

In [None]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in dataloader_train:
        video_features, captions = batch
        optimizer.zero_grad()

        similarity = model(video_features, captions)
        loss = criterion(similarity, your_target_similarity)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    average_loss = total_loss / len(dataloader_train)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}')

# Add validation loop and early stopping as needed


In [1]:
import os
import pandas as pd

In [7]:
df = pd.read_csv("2023-06-30_server_logs.csv")
df

Unnamed: 0,prompt,user_id,login_id,Response
0,"""\nI am trying to write parsers for juniper/sr...","""#user_id#""","""#login_id#""",Based on the code and the profiling results y...
1,"""\nThe bounty expires in 2 days. Answers to th...","""#user_id#""","""#login_id#""",This is likely due to the fact that the Djang...
2,"""I'm working with an API that streams real-tim...","""#user_id#""","""#login_id#""",It seems that your issue comes from the fact ...
3,"""I've been trying to run layout parser example...","""#user_id#""","""#login_id#""",It seems like you are trying to run a layout ...
4,"""I am trying to spawn a couple of process usin...","""#user_id#""","""#login_id#""",There are two issues with your code that I ca...
5,"""write a dfs function\n""","""#user_id#""","""#login_id#""","Sure, here's an example of a depth-first sear..."
6,"""sakm\nkmasla\n""","""#user_id#""","""#login_id#""","The text ""sakm\nkmasla\n"" appears to be just ..."


In [8]:
!pip install faker



In [9]:
from faker import Faker

In [10]:
# Create a Faker object to generate random IP addresses
fake = Faker()

In [12]:
num_rows = len(df)

In [13]:
ip_addresses = [fake.ipv4() for _ in range(num_rows)]

In [14]:
df["ips"] = ip_addresses

In [15]:
df

Unnamed: 0,prompt,user_id,login_id,Response,ips
0,"""\nI am trying to write parsers for juniper/sr...","""#user_id#""","""#login_id#""",Based on the code and the profiling results y...,85.158.138.88
1,"""\nThe bounty expires in 2 days. Answers to th...","""#user_id#""","""#login_id#""",This is likely due to the fact that the Djang...,53.114.88.50
2,"""I'm working with an API that streams real-tim...","""#user_id#""","""#login_id#""",It seems that your issue comes from the fact ...,149.66.248.241
3,"""I've been trying to run layout parser example...","""#user_id#""","""#login_id#""",It seems like you are trying to run a layout ...,187.199.92.192
4,"""I am trying to spawn a couple of process usin...","""#user_id#""","""#login_id#""",There are two issues with your code that I ca...,45.167.245.49
5,"""write a dfs function\n""","""#user_id#""","""#login_id#""","Sure, here's an example of a depth-first sear...",17.31.73.73
6,"""sakm\nkmasla\n""","""#user_id#""","""#login_id#""","The text ""sakm\nkmasla\n"" appears to be just ...",182.162.3.166


In [16]:
df.to_csv("sample.csv", index=False)