# Closed Captioning
Implements a feature using Langchain's image_captions.py and audio_speech_to_text.py to produce .srt files. This system will provide both subtitles and visual scene descriptions, essentially creating closed captioning.

## Imports

In [46]:
# imports for closed captioning
import cv2
import numpy as np
import os
import transformers
import numpy as np
import ffmpeg
transformers.logging.set_verbosity_error()

from langchain.document_loaders import AssemblyAIAudioTranscriptLoader
from langchain.document_loaders import ImageCaptionLoader
from langchain.document_loaders.assemblyai import TranscriptFormat
from langchain.schema import SystemMessage
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain

## Object model

In [10]:
class CaptionModel:
    def __init__(self, start_time, end_time, closed_caption):
        self.start_time = start_time
        self.end_time = end_time
        self.closed_caption = closed_caption

    def __str__(self):
        return f"start_time: {self.start_time}, end_time: {self.end_time}, closed_caption: {self.closed_caption}"

## Video model

In [5]:
class VideoModel:
    def __init__(self, start_time, end_time, image_description):
        self.start_time = start_time
        self.end_time = end_time
        self.image_description = image_description

    def __str__(self):
        return f"start_time: {self.start_time}, end_time: {self.end_time}, image_description: {self.image_description}"
    
    def get_start_time(self):
        return self.start_time
    
    def get_end_time(self):
        return self.end_time
    
    def get_image_description(self):
        return self.image_description
    
    def set_start_time(self, start_time):
        self.start_time = start_time

    def set_end_time(self, end_time):
        self.end_time = end_time
    
    def set_image_description(self, image_description):
        self.image_description = image_description

## Audio model

In [3]:
class AudioModel:
    def __init__(self, start_time, end_time, subtitle_text):
        self.start_time = start_time
        self.end_time = end_time
        self.subtitle_text = subtitle_text

    def __str__(self):
        return f"start_time: {self.start_time}, end_time: {self.end_time}, subtitle_text: {self.subtitle_text}"
    
    def get_start_time(self):
        return self.start_time
    
    def get_end_time(self):
        return self.end_time
    
    def get_subtitle_text(self):
        return self.subtitle_text
    
    def set_start_time(self, start_time):
        self.start_time = start_time

    def set_end_time(self, end_time):
        self.end_time = end_time
    
    def set_subtitle_text(self, subtitle_text):
        self.subtitle_text = subtitle_text

## Audio Speech to Text

In [None]:
audio_file = "test_data/test.mp3"

loader = AssemblyAIAudioTranscriptLoader(
    file_path=audio_file, 
    api_key="f50c08e20ecd4544b175953636f0b936", 
    transcript_format=TranscriptFormat.SUBTITLES_SRT
)

docs = loader.load()

def CreateTranscriptModel(doc):
    transcription = doc.strip().split("\n")
    times = transcription[1].split(" --> ")
    start_time = times[0].strip()
    end_time = times[1].strip()

    subtitle_text = ' '.join(transcription[2:]).strip()

    transcript_model = AudioModel(start_time, end_time, subtitle_text)

    return transcript_model

print(CreateTranscriptModel(docs[0].page_content))

## Audio Split Model

In [6]:
def convert_to_mp3(mp4_path):
    (
        ffmpeg
        .input(mp4_path)
        .output("test_data/audio.mp3", format="mp3")
        .run()
    )

def CreateTranscriptModels(doc):
    subtitles = doc.strip().split("\n\n")  # Splitting based on double newline, which separates SRT entries
    models = []

    for subtitle in subtitles:
        lines = subtitle.split("\n")
        if len(lines) >= 3:  # Checking if there are enough lines for an index, timestamp, and text
            times = lines[1].split(" --> ")
            start_time = times[0].strip()
            end_time = times[1].strip()

            subtitle_text = ' '.join(lines[2:]).strip()

            transcript_model = AudioModel(start_time, end_time, subtitle_text)
            models.append(transcript_model)

    return models

loader = AssemblyAIAudioTranscriptLoader(
    file_path="test_data/audio.mp3", 
    api_key="f50c08e20ecd4544b175953636f0b936", 
    transcript_format=TranscriptFormat.SUBTITLES_SRT
)

# Assuming loader.load() returns the full transcript in a single document
docs = loader.load()

# This will now create a list of lists of AudioModel instances
all_audio_models = [CreateTranscriptModels(doc.page_content) for doc in docs]

# Flatten the list if necessary
audio_models = [model for sublist in all_audio_models for model in sublist]

In [7]:
for audio_model in audio_models:
    print(audio_model)

start_time: 00:00:00,570, end_time: 00:00:03,966, subtitle_text: Let's make tea. Duet with me. You read red,
start_time: 00:00:04,068, end_time: 00:00:06,910, subtitle_text: I read green. Can I get you a drink?
start_time: 00:00:09,250, end_time: 00:00:10,750, subtitle_text: Tea or coffee?
start_time: 00:00:12,370, end_time: 00:00:13,920, subtitle_text: How do you take it?
start_time: 00:00:16,050, end_time: 00:00:16,800, subtitle_text: Here.
start_time: 00:00:19,810, end_time: 00:00:22,400, subtitle_text: Great work. Now let's try it again.


## Video Split to Frames

In [None]:
def frame_difference(prev_frame, curr_frame, threshold=30):
    # Compute the absolute difference between the current frame and the previous frame
    diff = cv2.absdiff(prev_frame, curr_frame)
    # Thresholding to get the binary image, where white represents significant difference
    _, thresh = cv2.threshold(diff, threshold, 255, cv2.THRESH_BINARY)
    # If there are any white pixels in thresh, the difference is significant
    return np.any(thresh)

# Initialize the video capture
capture = cv2.VideoCapture('test_data/eng_convo.mp4')
fps = capture.get(cv2.CAP_PROP_FPS)
frame_duration = 1000 / fps

video_models = []

frameNr = 0
ret, prev_frame = capture.read()
prev_frame_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY) if ret else None
prev_start_time = 0
start_time = 0

while ret:
    end_time = prev_start_time
    prev_start_time = capture.get(cv2.CAP_PROP_POS_MSEC)

    ret, frame = capture.read()
    if not ret:
        start_time = end_time + frame_duration
        break
    
    start_time = prev_start_time
    # Convert to grayscale for comparison
    frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Compare with the previous frame
    if frameNr == 0 or frame_difference(prev_frame_gray, frame_gray):
        end_time = capture.get(cv2.CAP_PROP_POS_MSEC)
        cv2.imwrite(f'test_data/output_frames/frame.jpg', frame)
        prev_frame_gray = frame_gray

        # Define the path to the "output_frames" folder
        folder_path = f'test_data/output_frames/'

        # List all .jpg files in the folder
        image_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(".jpg")]

        # Create an instance of the ImageCaptionLoader
        loader = ImageCaptionLoader(images=image_files)

        # Load captions for the images
        list_docs = loader.load()

        video_model = VideoModel(start_time, end_time, list_docs[len(list_docs) - 1].page_content.replace("[SEP]", "").strip())
        video_models.append(video_model)

        frameNr += 1

# Release the video capture object
capture.release()

In [32]:
for video_model in video_models:
    print(video_model)

start_time: 0.0, end_time: 40.0, image_description: an image of a kitchen area in a building
start_time: 480.0, end_time: 2240.0, image_description: a young girl in various settings
start_time: 2320.0, end_time: 9320.0, image_description: footage of a woman in a sweater appears repeatedly throughout this segment
start_time: 12360.0, end_time: 14200.0, image_description: the same image of a woman in a sweater is shown at multiple intervals
start_time: 15960.0, end_time: 22360.0, image_description: multiple instances of a woman holding a cup of coffee


## Audio and Video Combination

In [8]:
# Assuming the VideoModel class definition is already in scope

caption_data = """
start_time: 0.0, end_time: 40.0, image_description: an image of a kitchen area in a building
start_time: 480.0, end_time: 2240.0, image_description: a young girl in various settings, last seen wearing a gray sweater
start_time: 2320.0, end_time: 9320.0, image_description: footage of a woman in a sweater appears repeatedly throughout this segment
start_time: 12360.0, end_time: 14200.0, image_description: the same image of a woman in a sweater is shown at multiple intervals
start_time: 15960.0, end_time: 22360.0, image_description: multiple instances of a woman holding a cup of coffee
"""

# Parsing the raw caption data and instantiating VideoModel objects
video_models = []
for line in caption_data.strip().split("\n"):
    if not line.strip():
        continue  # Skip empty lines
    parts = line.split(",")
    start_time = float(parts[0].split(":")[1].strip())
    end_time = float(parts[1].split(":")[1].strip())
    image_description = parts[2].split(":")[1].strip()

    video_model = VideoModel(start_time, end_time, image_description)
    video_models.append(video_model)

# Display the created VideoModel objects
for vm in video_models:
    print(vm)

start_time: 0.0, end_time: 40.0, image_description: an image of a kitchen area in a building
start_time: 480.0, end_time: 2240.0, image_description: a young girl in various settings
start_time: 2320.0, end_time: 9320.0, image_description: footage of a woman in a sweater appears repeatedly throughout this segment
start_time: 12360.0, end_time: 14200.0, image_description: the same image of a woman in a sweater is shown at multiple intervals
start_time: 15960.0, end_time: 22360.0, image_description: multiple instances of a woman holding a cup of coffee


In [24]:
# Initialize ChatOpenAI with the appropriate model and API key
chat = ChatOpenAI(model="gpt-4", max_tokens=4000, openai_api_key="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")

data = f"I will provide you with a subtitle from an audio track and a description of an image from the same moment in a video. Your task is to analyze whether the subtitle and the image description logically align and make sense together. If they do, just give back the image description provided. If they don't align well, creatively adjust the image description to better fit the subtitle while retaining the main idea of the image. Your response should be a suitable closed caption that combines both elements coherently if need be. Here are the details: Subtitle: {audio_model.get_subtitle_text()}, Image Description: {video_model.get_image_description()}. Based on this, provide a suitable closed caption or adjust the image description to create one that makes sense with the subtitle. Ex. subtitle: great work, now lets try again and image: Woman holding a coffee cup. These two make sense because the woman could be talking so we would want to keep [Woman holding coffee cup] as our closed caption. So you would return the same image description. Another ex. subtitle: go fetch the ball and the image is: a stick flying. Then we can change the closed caption to: the ball is being thrown or something as our closed caption. So you return this new closed caption. IMPORTANT: the output should be 'Result:text' where text is the closed caption and keep the closed caption short and concise."

# Construct the prompt for OpenAI using ChatPromptTemplate
prompt = ChatPromptTemplate(
    messages=[
        SystemMessage(
            content="Analyzing the coherence between subtitle and image description."
        ),
        HumanMessagePromptTemplate.from_template(
            "{closed_caption}"
        )
    ]
)

conversation = LLMChain(
    llm=chat,
    prompt=prompt,
    verbose=True,
)

# Get response from OpenAI using LLMChain
response = conversation({"closed_caption": data})

print(response["text"])




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Analyzing the coherence between subtitle and image description.
Human: I will provide you with a subtitle from an audio track and a description of an image from the same moment in a video. Your task is to analyze whether the subtitle and the image description logically align and make sense together. If they do, just give back the image description provided. If they don't align well, creatively adjust the image description to better fit the subtitle while retaining the main idea of the image. Your response should be a suitable closed caption that combines both elements coherently if need be. Here are the details: Subtitle: Great work. Now let's try it again., Image Description: multiple instances of a woman holding a cup of coffee. Based on this, provide a suitable closed caption or adjust the image description to create one that makes sense with the subtitle. Ex. subtitle: great work, now lets try 

In [68]:
def parse_time(s):
    """Converts a time string into seconds."""
    h, m, s = map(float, s.replace(',', '.').split(':'))
    return h * 3600 + m * 60 + s

def milliseconds_to_srt_time(ms):    
    if isinstance(ms, str) and ',' in ms:        
        return ms

    """Converts milliseconds to SRT time format 'HH:MM:SS,mmm'."""
    hours = int(ms // 3600000)
    minutes = int((ms % 3600000) // 60000)
    seconds = int((ms % 60000) // 1000)
    milliseconds = int(ms % 1000)

    return f'{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}'


def find_overlapping_video_models(audio_model, video_models):
    overlapping_models = []
    audio_start = parse_time(str(audio_model.get_start_time()))
    audio_end = parse_time(str(audio_model.get_end_time()))

    for model in video_models:
        video_start = model.get_start_time()  # Assuming already in milliseconds
        video_end = model.get_end_time()  # Assuming already in milliseconds
        if max(audio_start, video_start) < min(audio_end, video_end):
            overlapping_models.append(model)


    return overlapping_models


def validate_and_adjust_description(audio_model, video_model):

    # Initialize ChatOpenAI with the appropriate model and API key
    chat = ChatOpenAI(model="gpt-4", max_tokens=4000, openai_api_key="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")

    data = f"I will provide you with a subtitle from an audio track and a description of an image from the same moment in a video. Your task is to analyze whether the subtitle and the image description logically align and make sense together. If they do, just give back the image description provided. If they don't align well, creatively adjust the image description to better fit the subtitle while retaining the main idea of the image. Your response should be a suitable closed caption that combines both elements coherently if need be. Here are the details: Subtitle: {audio_model.get_subtitle_text()}, Image Description: {video_model.get_image_description()}. Based on this, provide a suitable closed caption or adjust the image description to create one that makes sense with the subtitle. Ex. subtitle: great work, now lets try again and image: Woman holding a coffee cup. These two make sense because the woman could be talking so we would want to keep [Woman holding coffee cup] as our closed caption. So you would return the same image description. Another ex. subtitle: go fetch the ball and the image is: a stick flying. Then we can change the closed caption to: the ball is being thrown or something as our closed caption. So you return this new closed caption. IMPORTANT: the output should be 'Result:text' where text is the closed caption and keep the closed caption short and concise. Do not include the subtitle in the result."

    # Construct the prompt for OpenAI using ChatPromptTemplate
    prompt = ChatPromptTemplate(
        messages=[
            SystemMessage(
                content="Analyzing the coherence between subtitle and image description."
            ),
            HumanMessagePromptTemplate.from_template(
                "{closed_caption}"
            )
        ]
    )

    conversation = LLMChain(
        llm=chat,
        prompt=prompt,
        verbose=True,
    )

    # Get response from OpenAI using LLMChain
    response = conversation({"closed_caption": data})

    return response["text"]

def create_caption_models(video_models, audio_models):
    caption_models = []

    # Iterate through each audio model
    for audio_model in audio_models:
        overlapping_video_models = find_overlapping_video_models(audio_model, video_models)

        if overlapping_video_models:
            # Create separate captions for each overlapping video model
            for video_model in overlapping_video_models:
                caption_text = f"[{video_model.get_image_description()}] {audio_model.get_subtitle_text()}"
                caption_model = CaptionModel(milliseconds_to_srt_time(audio_model.get_start_time()), milliseconds_to_srt_time(audio_model.get_end_time()), caption_text)
                caption_models.append(caption_model)
        else:
            # No overlapping video, use audio model's subtitle
            caption_text = f"[No video description] {audio_model.get_subtitle_text()}"
            caption_model = CaptionModel(milliseconds_to_srt_time(audio_model.get_start_time()), milliseconds_to_srt_time(audio_model.get_end_time()), caption_text)
            caption_models.append(caption_model)

    return caption_models



# Create caption models
caption_models = create_caption_models(video_models, audio_models)

# Now, caption_models contains your finalized caption models

In [69]:
# print all caption models
for caption_model in caption_models:
    print(caption_model)

start_time: 00:00:00,000, end_time: 00:00:00,003, closed_caption: [an image of a kitchen area in a building] Let's make tea. Duet with me. You read red,
start_time: 00:00:00,004, end_time: 00:00:00,006, closed_caption: [an image of a kitchen area in a building] I read green. Can I get you a drink?
start_time: 00:00:00,009, end_time: 00:00:00,010, closed_caption: [an image of a kitchen area in a building] Tea or coffee?
start_time: 00:00:00,012, end_time: 00:00:00,013, closed_caption: [an image of a kitchen area in a building] How do you take it?
start_time: 00:00:00,016, end_time: 00:00:00,016, closed_caption: [an image of a kitchen area in a building] Here.
start_time: 00:00:00,019, end_time: 00:00:00,022, closed_caption: [an image of a kitchen area in a building] Great work. Now let's try it again.


## Create sRT file

In [67]:
def format_srt_entry(index, caption_model):
    """Formats a single caption model into an SRT entry."""
    start_time = caption_model.start_time
    end_time = caption_model.end_time
    text = caption_model.closed_caption

    return f"{index}\n{start_time} --> {end_time}\n{text}\n"


def generate_srt_content(caption_models):
    """Generates the full SRT content from a list of caption models."""
    srt_entries = []
    for index, model in enumerate(caption_models, start=1):
        srt_entry = format_srt_entry(index, model)
        srt_entries.append(srt_entry)

    return '\n'.join(srt_entries)


def write_srt_file(caption_models, file_name):
    """Writes the caption models to an SRT file."""
    srt_content = generate_srt_content(caption_models)
    with open(file_name, 'w') as file:
        file.write(srt_content)

# Usage
write_srt_file(caption_models, 'test_data/output.srt')
