This notebook is part of an undergraduate project on multilingual video captioning. It demonstrates the system pipeline, model design, and evaluation logic. Results are discussed qualitatively due to the multilingual and generative nature of the task. The code is shared for transparency and educational purposes.

In [None]:
# Mount Google Drive at /content/drive; every data path used in this notebook
# lives under /content/drive/MyDrive/Capstone.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Make the project folder the working directory so that relative paths used
# later in the notebook (e.g. "saved_models/...") resolve inside it.
%cd "/content/drive/MyDrive/Capstone"

# **Data Preprocessing**

# **Part 1: Using Whisper for Transcription**

## Step 1: Install Whisper and FFmpeg

In [None]:
!pip install git+https://github.com/openai/whisper.git
!sudo apt update && sudo apt install ffmpeg

# New Section

## Step 2: Loading and Transcribing the Video

In [None]:
import whisper
import subprocess


# Extracting audio from video

video_path = "/content/drive/MyDrive/Capstone/videoplayback.mp4"
audio_path = "/content/drive/MyDrive/Capstone/videoplayback.mp3"

# Mono ("-ac 1"), 16 kHz ("-ar 16000") audio track only ("-vn").
# "-y" lets ffmpeg overwrite an audio file left by a previous run instead of
# stopping at its overwrite prompt, and check=True raises CalledProcessError
# when ffmpeg fails, so a stale or missing audio file is never transcribed.
subprocess.run(
  ["ffmpeg", "-y", "-i", video_path, "-vn", "-ar", "16000", "-ac", "1", audio_path],
  check=True,
)

# load whisper model
model = whisper.load_model("medium")

# transcribe audio
# NOTE(review): language="en" is hard-coded here; confirm this is intended for
# every input video of the multilingual pipeline.
result = model.transcribe(audio_path, language="en")

# print the result
print(result["text"])

# Step 3: Saving the Transcription to a text file

In [None]:
# Plain-text transcript. The encoding is fixed to UTF-8 so that non-ASCII
# characters are written the same way regardless of the platform's default
# locale encoding.
with open("/content/drive/MyDrive/Capstone/videoplayback.txt", "w", encoding="utf-8") as file:
  file.write(result["text"])

# Step 4: Save Transcription to a JSON File with Timestamps

In [None]:
import json

# Persist the complete transcription result (the object whose "text" and
# "segments" entries are used elsewhere in this notebook) as indented JSON.
json_path = "/content/drive/MyDrive/Capstone/videoplayback.json"
with open(json_path, "w") as json_file:
  json.dump(result, json_file, indent=4)

# Step 5: Saving the Transcription in a Subtitle Format

In [None]:
def _format_srt_timestamp(seconds):
  """Format a time offset given in seconds as an SRT timestamp ``HH:MM:SS,mmm``."""
  # Work in whole milliseconds so the fractional part is rounded once instead of
  # being truncated from a float remainder (which can lose a millisecond).
  total_ms = max(0, int(round(seconds * 1000)))
  hours, remainder = divmod(total_ms, 3_600_000)
  minutes, remainder = divmod(remainder, 60_000)
  secs, millis = divmod(remainder, 1000)
  # SRT separates seconds and milliseconds with a comma and no whitespace.
  return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


# One SRT cue per transcription segment: index line, time range line, text, blank line.
with open("/content/drive/MyDrive/Capstone/videoplayback.srt", "w", encoding="utf-8") as file:
  for i, segment in enumerate(result["segments"], start=1):
    start_srt = _format_srt_timestamp(segment["start"])
    end_srt = _format_srt_timestamp(segment["end"])
    # Drop surrounding whitespace so cue text does not start with a stray space.
    text = segment["text"].strip()

    # Writing to SRT file
    file.write(f"{i}\n")
    file.write(f"{start_srt} --> {end_srt}\n")
    file.write(f"{text}\n\n")


# **Model Building**

In [None]:
# OpenCV (imported as cv2) is used below to read frames from the video and to resize them.
!pip install opencv-python

# **Part 2: Integrating Captions with video frames**

# Step 1: Frame Extraction and Caption Alignment

In [None]:
import cv2
import json

# Loading the transcription data
with open("/content/drive/MyDrive/Capstone/videoplayback.json", "r") as file:
  transcription_data = json.load(file)

# Defining the frame extraction interval in seconds
frame_interval = 2

# Opening the video file
video_path = "/content/drive/MyDrive/Capstone/videoplayback.mp4"
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
  raise IOError(f"Could not open video file: {video_path}")

fps = cap.get(cv2.CAP_PROP_FPS)
if fps <= 0:
  # Without a usable frame rate neither the sampling step nor the timestamps can be computed.
  cap.release()
  raise ValueError(f"Video reports an invalid frame rate ({fps}): {video_path}")

# Number of decoded frames between two sampled frames; clamped to at least 1 so
# the modulo below can never divide by zero (e.g. when fps * frame_interval < 1).
frame_step = max(1, int(fps * frame_interval))

# Sampled frames with their caption and timestamp, plus a running frame counter
frames_data = []
frame_count = 0

# Processing the video and aligning frames with captions
try:
  while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
      break

    # Keeping one frame every "frame_interval" seconds
    if frame_count % frame_step == 0:
      timestamp = frame_count / fps

      # First segment whose [start, end) range contains the timestamp; "" when none does
      caption = next(
        (
          segment["text"]
          for segment in transcription_data["segments"]
          if segment["start"] <= timestamp < segment["end"]
        ),
        "",
      )

      # Saving frame and corresponding caption
      frames_data.append({"frame": frame, "caption": caption, "timestamp": timestamp})

    frame_count += 1
finally:
  # Releasing the video capture even if reading or alignment fails part-way
  cap.release()

# Example: Access first frame and its caption
if frames_data:
  print(frames_data[0]["caption"])
else:
  print("No frames were extracted from the video.")

# Next Steps: Preparing Frames and Captions for the Model

## Step 1: Frame Preprocessing

In [None]:
import numpy as np

# Setting frame dimensions for resizing
frame_height, frame_width = 224, 224

# Preprocess frames: resize every sampled frame to frame_width x frame_height and
# divide by 255.0 (maps 8-bit pixel values into [0, 1] — assumes the decoded
# frames are 8-bit images).
# The frames are converted to float32 first: dividing an integer image by 255.0
# directly would yield float64 and double the memory of every frame and of every
# frame sequence built from them later on.
processed_frames = []
for data in frames_data:
  frame_resized = cv2.resize(data["frame"], (frame_width, frame_height))
  processed_frames.append(frame_resized.astype(np.float32) / 255.0)

# Converting to numpy array for model compatibility
processed_frames = np.array(processed_frames, dtype=np.float32)
print("Processed Frames Shape:", processed_frames.shape)

## Step 2: Converting Captions to mBERT Embeddings

In [None]:
# TensorFlow provides the Keras API and the TF mBERT model used later in this notebook.
!pip install tensorflow

In [None]:
# Upgrades TensorFlow to the newest release pip resolves (no version pin).
# NOTE(review): a later cell pins tensorflow-text==2.19.0 — confirm the upgraded
# TensorFlow version stays compatible with that pin.
!pip install tensorflow --upgrade  # For CPU version

In [None]:
import tensorflow as tf
# Show which TensorFlow version is active in this runtime.
print(tf.__version__)

In [None]:
import tensorflow as tf

# Upgrade the Hugging Face packages; transformers supplies the mBERT tokenizer and model used below.
!pip install -U transformers huggingface_hub safetensors
# tensorflow-text is pinned to 2.19.0 so that it matches the installed TensorFlow version; tf-keras is installed alongside it.
!pip install tf-keras tensorflow-text==2.19.0 # Explicitly install compatible tensorflow-text

In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

# mBERT tokenizer and encoder, both from the "bert-base-multilingual-cased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# The TF model is built from the PyTorch weights (from_pt=True) with safetensors disabled
model = TFBertModel.from_pretrained(
  "bert-base-multilingual-cased",
  from_pt=True,
  use_safetensors=False,
)

# One embedding per sampled frame, in the same order as frames_data
per_caption_vectors = []
for frame_record in frames_data:
  # Pad / truncate every caption to exactly 20 tokens
  encoded = tokenizer(
    frame_record["caption"],
    return_tensors="tf",
    padding="max_length",
    truncation=True,
    max_length=20,
  )
  hidden_states = model(**encoded).last_hidden_state

  # Keep only the vector at token position 0 of the last hidden state
  # (presumably the [CLS] token — confirm against the tokenizer)
  per_caption_vectors.append(hidden_states[:, 0, :])

# Stack the per-caption vectors along the first axis into one tensor
embedded_captions = tf.concat(per_caption_vectors, axis=0)
print("Embedded Captions Shape:", embedded_captions.shape)

In [None]:
# Persist the tokenizer under saved_models/tokenizer (path relative to the working directory).
tokenizer.save_pretrained("saved_models/tokenizer")

# Day 5: Creating Sequences for the **Model**

## Step 1: Sequence Preparation

In [None]:
import numpy as np

# Number of consecutive sampled frames that make up one model input
sequence_length = 10

# Sliding window with stride 1 over the processed frames. The target of each
# window is the caption embedding of the window's LAST frame.
window_starts = range(len(processed_frames) - sequence_length + 1)
frame_sequences = [
  processed_frames[start:start + sequence_length] for start in window_starts
]
caption_sequences = [
  embedded_captions[start + sequence_length - 1] for start in window_starts
]

# Converting to numpy arrays for model compatibility
frame_sequences = np.array(frame_sequences)
caption_sequences = np.array(caption_sequences)
print("Frame Sequences Shape:", frame_sequences.shape)
print("Caption Sequences Shape:", caption_sequences.shape)

## Step 2: Defining and Compiling the LSTM-based Captioning Model

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Defining the model architecture
# One input sample is a sequence of `sequence_length` frames, each
# frame_height x frame_width with 3 colour channels.
input_shape = (sequence_length, frame_height, frame_width, 3)

model = models.Sequential([
  # Explicit Input layer instead of passing `input_shape=` to the first layer,
  # which recent Keras versions flag as deprecated for Sequential models.
  layers.Input(shape=input_shape),

  # Per-frame convolutional feature extractor, applied to every time step
  layers.TimeDistributed(layers.Conv2D(32, (3, 3), activation='relu')),
  layers.TimeDistributed(layers.MaxPooling2D((2, 2))),
  layers.TimeDistributed(layers.Conv2D(64, (3, 3), activation='relu')),
  layers.TimeDistributed(layers.MaxPooling2D((2, 2))),
  layers.TimeDistributed(layers.Flatten()),

  # LSTM over the per-frame feature vectors; only the final state is kept
  layers.LSTM(256, return_sequences=False),

  # Dense layer to output caption embedding
  layers.Dense(768, activation='linear'),  # 768 is the size of the mBERT embedding
])

# Compiling the model
# The 'mse' metric repeats the loss on purpose: the evaluation cell unpacks
# exactly two values (loss, metric) from model.evaluate.
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

# Model summary
model.summary()

## Step 3: Training the Model

In [None]:
import os
# Make sure the directory that will receive the saved models exists
os.makedirs("saved_models", exist_ok=True)

# Training hyperparameters
batch_size = 4  # kept small to mitigate out-of-memory errors
epochs = 20
validation_split = 0.2

# Options forwarded unchanged to model.fit
fit_options = dict(
  batch_size=batch_size,
  epochs=epochs,
  validation_split=validation_split,
  shuffle=True,
)

# Fit the model to predict each window's caption embedding from its frame sequence
history = model.fit(frame_sequences, caption_sequences, **fit_options)

In [None]:
model.save("saved_models/video_caption_model.h5")  # .h5 file; the next cell also writes a .keras copy of the same model

In [None]:
model.save("saved_models/video_caption_model.keras")  # .keras file; the previous cell writes an .h5 copy of the same model

### Plotting the Training and Validation Loss

In [None]:
import matplotlib.pyplot as plt

# Loss curves per epoch for the training data and the validation split,
# drawn on an explicit Axes object
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_title('Training and Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Mean Squared Error (MSE)')
ax.legend()
plt.show()

## Step 4: Model Evaluation
Let's evaluate the performance of the trained model.

In [None]:
# NOTE: this notebook has no separate test set. The call below scores the model
# on the very same frame_sequences / caption_sequences that were passed to
# model.fit, so the numbers describe the fit to already-seen data and must not
# be reported as test performance. Prepare independent 'test_frame_sequences'
# and 'test_caption_sequences' the same way to obtain a real test score.
# Set the batch size for evaluation, same as training to prevent OOM
batch_size_eval = 4
test_loss, test_mse = model.evaluate(frame_sequences, caption_sequences, batch_size=batch_size_eval)
# The label states what was actually measured instead of calling it "Test".
print(f"Training-data Loss (MSE): {test_loss}, Training-data MSE: {test_mse}")

## Step 5: Generating Captions from Predictions

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean

def get_top_k_captions(embedding, reference_captions, reference_embeddings, k=3):
    """Rank reference captions by cosine similarity to a single embedding.

    Args:
        embedding: Query embedding; a 1-D array is reshaped into a single row.
        reference_captions: Captions, indexed in the same order as the rows of
            ``reference_embeddings``.
        reference_embeddings: One embedding row per reference caption.
        k: Number of best matches to return.

    Returns:
        A list of ``(caption, similarity)`` tuples, most similar first.
    """
    query = embedding
    if query.ndim == 1:
        # cosine_similarity is given a 2-D (1, dim) query row
        query = query.reshape(1, -1)

    scores = cosine_similarity(query, reference_embeddings).flatten()

    # argsort is ascending: keep the last k indices and reverse them
    ranked = np.argsort(scores)[-k:][::-1]

    return [(reference_captions[idx], scores[idx]) for idx in ranked]

def beam_search_captions(predicted_embeddings, reference_captions, reference_embeddings, beam_width=3):
    """Pick one caption for every predicted embedding.

    For each predicted embedding the ``beam_width`` most similar reference
    captions are retrieved with ``get_top_k_captions`` and the best-scoring one
    is kept. Despite the name this is a nearest-neighbour retrieval over the
    reference captions, not a token-level beam search.

    The previous implementation returned only the top-k list of the single
    embedding with the highest mean similarity, i.e. ``beam_width`` captions in
    total regardless of how many embeddings were predicted.

    Args:
        predicted_embeddings: Iterable of predicted caption embeddings.
        reference_captions: Candidate captions.
        reference_embeddings: Embeddings of the candidates, in the same order.
        beam_width: Number of candidates retrieved per prediction.

    Returns:
        A list with exactly one caption string per predicted embedding, in the
        same order as ``predicted_embeddings`` (empty if there are none).
    """
    generated_captions = []

    for embedding in predicted_embeddings:
        candidates = get_top_k_captions(embedding, reference_captions, reference_embeddings, k=beam_width)
        # Candidates arrive sorted by descending similarity, so the first one is
        # the best match; fall back to "" to keep the output aligned with the input.
        best_caption = candidates[0][0] if candidates else ""
        generated_captions.append(best_caption)

    return generated_captions

# Run the trained model over every frame sequence
predicted_embeddings = model.predict(frame_sequences)
print("Predicted Embeddings Shape:", predicted_embeddings.shape)

# Retrieval pool: the caption of every sampled frame together with its mBERT embedding
reference_captions = [frame_record["caption"] for frame_record in frames_data]
reference_embeddings = embedded_captions

# Map the predicted embeddings back to captions
generated_captions_beam_search = beam_search_captions(
    predicted_embeddings,
    reference_captions,
    reference_embeddings,
)

# Preview at most the first 5 generated captions
for rank, text in enumerate(generated_captions_beam_search[:5], start=1):
    print(f"Generated Caption {rank}: {text}")


# **Evaluating Model-Generated Captions**

## Step 1.1: Importing libraries

In [None]:
!pip install nltk rouge-score
!pip install rouge-score

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

# Downloading the required NLTK data for METEOR
nltk.download('wordnet')
# Some NLTK releases also look up the 'omw-1.4' corpus when WordNet is first
# used and raise a LookupError if it is missing; fetching it is harmless otherwise.
nltk.download('omw-1.4')

## Step 1.2: Preparing the Reference

## Step 1.3: Calculating BLEU, METEOR, and ROUGE Scores

In [None]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Prediction j comes from the frame window j .. j + sequence_length - 1, and the
# training target of that window is the caption of its LAST frame. The matching
# reference for generated caption j is therefore reference_captions[j + sequence_length - 1];
# zipping the unshifted list would compare each prediction with the wrong frame's caption.
aligned_references = reference_captions[sequence_length - 1:]

# Storing scores
bleu_scores = []
meteor_scores = []
rouge_scores = []

for ref, gen in zip(aligned_references, generated_captions_beam_search):
  # Whitespace tokenisation, shared by BLEU and METEOR
  ref_tokens = ref.split()
  gen_tokens = gen.split()

  # Calculating BLEU score
  bleu_scores.append(sentence_bleu([ref_tokens], gen_tokens))

  # Calculating METEOR score
  meteor_scores.append(meteor_score([ref_tokens], gen_tokens))

  # Calculating ROUGE score (target first, prediction second)
  rouge_scores.append(scorer.score(ref, gen))


def _average(values):
  """Return the arithmetic mean of `values`, or 0.0 when there is nothing to average."""
  values = list(values)
  return sum(values) / len(values) if values else 0.0


# Displaying the average scores for an overall view
avg_bleu = _average(bleu_scores)
avg_meteor = _average(meteor_scores)
avg_rouge1 = _average(score['rouge1'].fmeasure for score in rouge_scores)
avg_rouge2 = _average(score['rouge2'].fmeasure for score in rouge_scores)
avg_rougeL = _average(score['rougeL'].fmeasure for score in rouge_scores)

# Printing the average
print("Average BLEU Score:", avg_bleu)
print("Average METEOR Score:", avg_meteor)
print("Average ROUGE-1 Score:", avg_rouge1)
print("Average ROUGE-2 Score:", avg_rouge2)
print("Average ROUGE-L Score:", avg_rougeL)

In [None]:
import matplotlib.pyplot as plt

# Average scores to chart, one bar per metric
metric = ['BLEU', 'METEOR', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L']
scores = [avg_bleu, avg_meteor, avg_rouge1, avg_rouge2, avg_rougeL]

# Bar chart on an explicit Axes object
fig, ax = plt.subplots(figsize=(10, 6))  # Adjust figure size as needed
ax.bar(metric, scores, color=['blue', 'green', 'red', 'orange', 'purple'])

# Labels and title
ax.set_xlabel('Metrics')
ax.set_ylabel('Scores')
ax.set_title('Average BLEU, METEOR, and ROUGE Scores')

# Print each value (rounded to two decimals) slightly above its bar
for position, value in enumerate(scores):
    ax.text(position, value + 0.01, str(round(value, 2)), ha='center', va='bottom')

# Display the chart
plt.show()