<h1>Video Companion Guide Challenge</h1>
<h2>Submission by: Adam Łucek</h2>
<h4>Automating the creation and media population of markdown companion guides with ✨AI✨</h4>
<div>
<img src="tweet.png" width="500"/>
</div>
<hr/>


### Dependencies
---

In [35]:
from pytube import YouTube as pyt
from moviepy.editor import *
import os
from moviepy.editor import AudioFileClip
from openai import OpenAI
import time
import json
import re
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import JSONLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
import cv2

# Read the OpenAI API key from the environment (None when the variable is unset).
# NOTE(review): this name is not referenced again in the visible code; the
# OpenAI / LangChain clients presumably read the environment variable themselves.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# Machine-specific path to the ffmpeg binary (Homebrew location on macOS).
# NOTE(review): presumably consumed by imageio-ffmpeg / moviepy — confirm, and
# adjust the path when running on another machine.
os.environ["IMAGEIO_FFMPEG_EXE"] = "/opt/homebrew/bin/ffmpeg" 

### Time Keeping Function
---

In [22]:
def start_timer():
    """Record the current wall-clock time in the module-level ``start_time``."""
    global start_time
    start_time = time.time()


def show_current_runtime():
    """Return the seconds elapsed since ``start_timer()`` ran, rounded to 2 places.

    ``start_timer()`` must have been called first; otherwise ``start_time`` is
    undefined and a ``NameError`` is raised.
    """
    elapsed = time.time() - start_time
    return round(elapsed, 2)

### Downloading the Audio & Video
---

In [36]:
def download_video(url, path):
    """Download a YouTube video and a separate audio-only stream under *path*.

    The video is saved as ``<path>/original_files/video/video_file.mp4`` and the
    audio as ``<path>/original_files/audio/audio_file.mp4``.

    Args:
        url: URL of the YouTube video.
        path: Base directory that receives the ``original_files`` tree.
    """
    youtube = pyt(url)

    # Video: whatever stream pytube's get_highest_resolution() selects.
    video_stream = youtube.streams.get_highest_resolution()
    video_stream.download(
        output_path=f"{path}/original_files/video",
        filename="video_file.mp4",
    )

    # Audio: the first audio-only stream pytube lists.
    audio_only = youtube.streams.filter(only_audio=True)
    audio_only.first().download(
        output_path=f"{path}/original_files/audio",
        filename="audio_file.mp4",
    )

def get_title(url):
    """Return the title pytube reports for the video at *url*."""
    return pyt(url).title

### Checking Audio File Size, and Chunking it if Large
---

In [37]:
def split_audio(file_path, chunk_size_mb=12, output_folder="split_chunks"):
    """Split an audio file into several parts when it exceeds ``chunk_size_mb``.

    The chunk duration is derived from the size ratio
    ``chunk_size_mb / file_size_mb`` applied to the clip's total duration, and
    each chunk is written as ``<basename>_part<N>.mp4`` in *output_folder*.

    Side effects:
        * Creates *output_folder* if needed (even when no split is required).
        * Sets the module-level ``split_audio_return`` to ``True`` when the file
          is actually split; the global is left untouched otherwise.

    Args:
        file_path: Path of the audio file to inspect.
        chunk_size_mb: Size threshold in megabytes above which the file is split.
        output_folder: Directory that receives the chunk files.

    Returns:
        None in every case; the outcome is reported through ``split_audio_return``
        and printed messages.
    """
    global split_audio_return

    if not os.path.exists(file_path):
        print(f"File {file_path} not found.")
        return

    os.makedirs(output_folder, exist_ok=True)

    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
    print(f"File size: {file_size_mb:.2f} MB")

    if file_size_mb <= chunk_size_mb:
        print("File size is within the limit. No need to split.")
        return

    split_audio_return = True

    base_name = os.path.splitext(os.path.basename(file_path))[0]
    clip = AudioFileClip(file_path)
    try:
        total_duration = clip.duration
        chunk_duration = (chunk_size_mb / file_size_mb) * total_duration

        # Walk the clip in consecutive windows of chunk_duration seconds.
        start = 0
        part = 1
        while start < total_duration:
            end = min(start + chunk_duration, total_duration)
            chunk = clip.subclip(start, end)
            chunk_filename = os.path.join(output_folder, f"{base_name}_part{part}.mp4")
            chunk.write_audiofile(chunk_filename, bitrate="64k", codec="aac")

            print(f"Created chunk: {chunk_filename}")

            start = end
            part += 1
    finally:
        # Release the underlying reader even if writing a chunk fails.
        clip.close()

### Transcribing with Whisper-1 & Writing to JSON File(s)
---

In [38]:
def create_json(split_audio_return, input_folder, output_folder="transcript_json"):
    """Transcribe every ``.mp4`` audio file for this run and save JSON transcripts.

    When the audio was split, the chunks are read from
    ``<input_folder>/split_chunks``; otherwise the original download in
    ``<input_folder>/original_files/audio`` is used. Each file is handed to
    ``process_audio_file``, which writes one JSON file into *output_folder*.

    Args:
        split_audio_return: Truthy when the audio was split into chunks.
        input_folder: Base project directory containing the audio folders.
        output_folder: Directory that receives the transcript JSON files.

    Returns:
        None. Missing folders are reported with a printed message.
    """
    if not os.path.exists(input_folder):
        print(f"The folder {input_folder} does not exist.")
        return
    os.makedirs(output_folder, exist_ok=True)

    # Resolve the audio location from the input_folder argument rather than
    # from a module-level global, so the function works for any caller.
    if split_audio_return:
        audio_folder = f"{input_folder}/split_chunks"
        missing_message = f"The split audio folder {audio_folder} does not exist."
    else:
        audio_folder = f"{input_folder}/original_files/audio"
        missing_message = f"The original audio folder {audio_folder} does not exist."

    if not os.path.exists(audio_folder):
        print(missing_message)
        return

    client = OpenAI()
    for filename in os.listdir(audio_folder):
        if filename.endswith(".mp4"):
            process_audio_file(client, audio_folder, filename, output_folder)

def process_audio_file(client, folder_path, filename, output_folder):
    """Transcribe one audio file with Whisper and store its segments as JSON.

    Args:
        client: Object exposing ``audio.transcriptions.create`` (an OpenAI client).
        folder_path: Directory containing the audio file.
        filename: Name of the audio file inside *folder_path*.
        output_folder: Directory receiving ``<stem>_transcript.json``.
    """
    source_path = os.path.join(folder_path, filename)
    with open(source_path, "rb") as audio_file:
        # verbose_json + segment granularity gives per-segment timestamps.
        request = {
            "file": audio_file,
            "model": "whisper-1",
            "response_format": "verbose_json",
            "timestamp_granularities": ["segment"],
        }
        transcript = client.audio.transcriptions.create(**request)

        stem = os.path.splitext(filename)[0]
        output_path = os.path.join(output_folder, f"{stem}_transcript.json")

        with open(output_path, 'w') as sink:
            json.dump(transcript.segments, sink, indent=4)

        print(f"Transcript for {filename} saved to {output_path}")

### Cleaning & Concatenating the Transcription JSON files
---

In [39]:
def extract_part_number(filename):
    """Return the integer following ``part`` in *filename*, or 0 if there is none."""
    found = re.search(r'part(\d+)', filename)
    if found is None:
        return 0
    return int(found.group(1))

def process_file(filepath, max_id, last_end_time):
    """Load one transcript JSON file and renumber / re-time its segments.

    Segment ids continue from ``max_id + 1``. When ``last_end_time`` is non-zero,
    all timestamps are shifted so the first segment starts at ``last_end_time``;
    otherwise timestamps are kept as they are. Only the ``id``, ``start``,
    ``end`` and ``text`` keys are carried over.

    Args:
        filepath: Path of a JSON file holding a list of segment dicts.
        max_id: Highest id used so far (-1 when nothing has been processed).
        last_end_time: End time of the last segment processed so far.

    Returns:
        A tuple ``(segments, max_id, last_end_time)`` with the cleaned segments
        and the updated counters. For a file with no segments the counters are
        returned unchanged.
    """
    with open(filepath, 'r') as file:
        data = json.load(file)

    # An empty transcript has no first segment to anchor the time shift on;
    # pass the running counters through instead of indexing data[0].
    if not data:
        return [], max_id, last_end_time

    time_adjustment = last_end_time - float(data[0]['start']) if last_end_time else 0

    new_data = []
    for entry in data:
        max_id += 1
        new_data.append({
            'id': max_id,
            'start': float(entry['start']) + time_adjustment,
            'end': float(entry['end']) + time_adjustment,
            'text': entry['text'],
        })
    return new_data, max_id, new_data[-1]['end']

def process_transcripts(directory_path, split_audio_return):
    """Combine the transcript JSON files in a directory into ``combined_data.json``.

    In split mode, files starting with ``audio`` and ending in ``.json`` are
    processed in ascending part-number order. Otherwise every ``.json`` file
    without ``part`` in its name is used. Each file goes through
    ``process_file`` so ids and timestamps run continuously across files.

    Args:
        directory_path: Directory holding the per-file transcripts; the combined
            file is written here too.
        split_audio_return: Truthy when the audio was split into parts.
    """
    combined_name = 'combined_data.json'
    files = os.listdir(directory_path)
    combined_data = []
    max_id = -1
    last_end_time = 0.0

    if split_audio_return:
        sorted_files = sorted(
            [file for file in files if file.startswith('audio') and file.endswith('.json')],
            key=extract_part_number
        )
    else:
        # Skip the output of a previous run: it also ends in '.json' and has no
        # 'part' in its name, so it would otherwise be merged into itself.
        sorted_files = [
            file for file in files
            if file.endswith('.json') and 'part' not in file and file != combined_name
        ]

    for filename in sorted_files:
        full_path = os.path.join(directory_path, filename)
        processed_data, max_id, last_end_time = process_file(full_path, max_id, last_end_time)
        combined_data.extend(processed_data)

    # Output the combined data to a new JSON file
    output_path = os.path.join(directory_path, combined_name)
    with open(output_path, 'w') as outfile:
        json.dump(combined_data, outfile, indent=4)



### Also Grabbing the Full Transcript
---

In [40]:
def full_transcript(file_path):
    """Return the ``text`` of every segment in a transcript JSON file, concatenated.

    Args:
        file_path: Path of a JSON file holding a list of dicts with a ``text`` key.

    Returns:
        The segment texts joined in file order with no separator.
    """
    with open(file_path, 'r') as source:
        segments = json.load(source)
    return "".join(segment['text'] for segment in segments)

### Function to Combine Short Chunks of Transcription
---

In [41]:
def merge_short_documents(documents, min_length=2000):
    """Fold documents shorter than *min_length* into the document that follows.

    The list and its documents are modified in place: a short document's
    ``page_content`` is prepended to its successor and the short document is
    removed. The final document is never merged, so it may remain short.

    Args:
        documents: List of objects with a ``page_content`` string attribute.
        min_length: Minimum number of characters a document needs to be kept.

    Returns:
        The same list object that was passed in.
    """
    index = 0
    while index < len(documents) - 1:
        candidate = documents[index]
        if len(candidate.page_content) >= min_length:
            index += 1
            continue
        # Too short: push its text onto the front of the next document, then
        # re-check the same index, which now holds the merged successor.
        successor = documents[index + 1]
        successor.page_content = candidate.page_content + successor.page_content
        documents.pop(index)
    return documents

### Main Markdown File Prompting with GPT-4-Turbo & LangChain Setup
---

In [42]:
guide_prompt_template = """

Below is a script from a video that I am making into a companion guide blog post first. \
You are a helpful assistant made to assist in the creation I'm doing. \
This is a continuation of a guide so include chapters, key summaries, and incorporate visual aids and direct links to relevant parts of the video, \
however do not include any conclusion or overarching title. \
For visual aids, specific frames from the video will be identified where images can be inserted to enhance understanding. \
For direct links, portions of the text should be hyperlinked to their corresponding times in the video. \
To indicate that a sentence should be hyperlinked, insert the raw text of the transcript next to the word with the indicator <HYPERLINK: "corresponding transcript text">. \
To indicate a picture regarding the text, insert the indicator <PICTURE: "corresponding transcript text">. \
It is crucial to use the raw text from the transcript that will be used, as the additional tools that will be inserting the hyperlinks and pictures need this to know where in the video to look.

In this blog post, in addition to the paragraphs: \

Create titles or headings that encapsulate main points and ideas \

Format your response in markdown, ensuring distinction and clean styling between titles and paragraphs. \
Be sure to include the image placeholders, and hyperlinks with enough distinguishable text WITHOUT ANY QUOTATIONS, as the placeholders will be fed into a semantic search algorithm. \
This structured approach will be applied to the entire transcript. \
The example below only shows one style, but use multiple styles including different headings, bullet points, and other markdown elements when needed. \

Here are shortened example of the input and shortened expected output:

example input:

Hi everyone. So in this video I'd like us to cover the process of tokenization in large language models. Now you see here that I have a sad face and that's because tokenization is my least favorite part of working with large language models but unfortunately it is necessary to understand in some detail because it is fairly hairy, gnarly and there's a lot of hidden foot gums to be aware of and a lot of oddness with large language models typically traces back to tokenization. So what is tokenization? Now in my previous video Let's Build GPT from Scratch we actually already did tokenization but we did a very naive simple version of tokenization. So when you go to the Google Colab for that video you see here that we loaded our training set and our training set was this Shakespeare dataset. Now in the beginning the Shakespeare dataset is just a large string in Python it's just text and so the question is how do we plug text into large language models and in this case here we created a vocabulary of 65 possible characters that we saw occur in this string. These were the possible characters and we saw that there are 65 of them and then we created a lookup table for converting from every possible character a little string piece into a token an integer. So here for example we tokenized the string hi there and we received this sequence of tokens and here we took the first 1000 characters of our dataset and we encoded it into tokens and because this is character level we received 1000 tokens in a sequence so token 18, 47, etc. 
Now later we saw that the way we plug these tokens into the language model is by using an embedding table and so basically if we have 65 possible tokens then this embedding table is going to have 65 rows and roughly speaking we're taking the integer associated with every single token we're using that as a lookup into this table and we're plucking out the corresponding row and this row is trainable parameters that we're going to train using backpropagation and this is the vector that then feeds into the transformer and that's how the transformer sort of perceives every single token. So here we had a very naive tokenization process that was a character level tokenizer

example output:

Introduction to Tokenization
----------------------------

Welcome to our comprehensive guide on tokenization in large language models (LLMs). Tokenization is a critical yet complex aspect of working with LLMs, essential for understanding how these models process text data. Despite its challenges, tokenization is foundational, as it converts strings of text into sequences of tokens, small units of text that LLMs can manage more effectively.

<PICTURE: Now you see here that I have a sad face and that's because tokenization is my least favorite part of working with large language models but unfortunately it is necessary to understand in some detail because it is fairly hairy, gnarly and there's a lot of hidden foot gums>

Understanding the Basics of Tokenization
----------------------------------------

Tokenization involves creating a vocabulary from all unique characters or words in a dataset and converting each into a corresponding integer token. This process was briefly introduced in our "Let's Build GPT from Scratch" video, where we tokenized a Shakespeare dataset at a character level, creating a vocabulary of 65 possible characters.

<HYPERLINK: So what is tokenization? Now in my previous video Let's Build GPT from Scratch we actually already did tokenization but we did a very naive simple version of tokenization. So when you go to the Google Colab for that video you see here that we loaded>

The Role of Embedding Tables in Tokenization
--------------------------------------------

After tokenization, the next step involves using an embedding table, where each token's integer is used as a lookup to extract a row of trainable parameters. These parameters, once trained, feed into the transformer model, allowing it to perceive each token effectively.

<PICTURE: using backpropagation and this is the vector that then feeds into the transformer and that's how the transformer sort of perceives every single token. So here we had a very naive tokenization process that was a character level tokenizer>

end examples.

Here is the transcript:

{transcript}

"""

# Building blocks of the guide-generation chain.
# StrOutputParser: presumably reduces the chat model's reply to a plain string.
output_parser = StrOutputParser()
# temperature=0.0 asks the model for the least random output it can give.
llm = ChatOpenAI(temperature=0.0, model="gpt-4-turbo-preview")
# Prompt built from the template above; its only variable is {transcript}.
guide_prompt = ChatPromptTemplate.from_template(guide_prompt_template)

# Pipeline: the value given to guide_chain.invoke(...) is passed through
# unchanged as the "transcript" prompt variable, the filled prompt goes to the
# chat model, and the reply is parsed by output_parser.
guide_chain = (
    {"transcript": RunnablePassthrough()} 
    | guide_prompt
    | llm
    | output_parser
)

def generate_markdown(merged_docs, path, guide_chain):
    """Run every transcript chunk through the guide chain and save the outline.

    The per-chunk outputs are joined with a blank line and written to
    ``<path>/transcript_json/llm_outline.txt``.

    Args:
        merged_docs: Iterable of objects with a ``page_content`` string.
        path: Base project directory.
        guide_chain: Object whose ``invoke(text)`` returns the markdown for one
            chunk.
    """
    markdown_outputs = [guide_chain.invoke(doc.page_content) for doc in merged_docs]
    combined_output = '\n\n'.join(markdown_outputs)

    outline_dir = f'{path}/transcript_json'
    os.makedirs(outline_dir, exist_ok=True)
    # Write UTF-8 explicitly: the outline is read back as UTF-8 later, and the
    # model output may contain characters the platform default cannot encode.
    with open(f'{outline_dir}/llm_outline.txt', 'w', encoding='utf-8') as file:
        file.write(combined_output)

### Dealing with the Placeholders, Grabbing Pictures & Formatting Hyperlinks
---

In [43]:
def grab_frame(video, second):
    """Save the video frame at a given second as a JPEG and return its path.

    The frame index is ``int(second * fps)`` (truncated). The image is written
    to ``frames/frame_<second>.jpg`` relative to the current working directory.

    Args:
        video: Path of the video file to read.
        second: Position in the video, in seconds.

    Returns:
        The path of the written image, or None when the video cannot be opened,
        the position is past the end, the frame cannot be read, or the image
        cannot be written. Each failure also prints an error message.
    """
    frames_dir = 'frames'
    os.makedirs(frames_dir, exist_ok=True)

    cap = cv2.VideoCapture(video)
    try:
        if not cap.isOpened():
            print("Error: Could not open video.")
            return None

        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_number = int(second * fps)
        total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        if frame_number >= total_frames:
            print(f"Error: Frame number {frame_number} exceeds total frames in video.")
            return None

        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
        ret, frame = cap.read()
        if not ret:
            print("Error: Could not read frame.")
            return None

        frame_path = os.path.join(frames_dir, f'frame_{second}.jpg')
        # imwrite reports failure through its return value rather than raising;
        # do not hand back a path to a file that was never written.
        if not cv2.imwrite(frame_path, frame):
            print(f"Error: Could not write frame to {frame_path}.")
            return None

        return frame_path
    finally:
        # Single release point for every exit path above.
        cap.release()

def retrieve_time(segment):
    """Return the midpoint second of the transcript segment best matching *segment*.

    Uses the module-level ``retriever``; the first returned document's
    ``page_content`` is parsed as JSON and must contain ``start`` and ``end``.

    Args:
        segment: Text to search for.

    Returns:
        The midpoint of ``start`` and ``end``, rounded to an integer.
    """
    best_match = retriever.get_relevant_documents(segment)[0]
    bounds = json.loads(best_match.page_content)
    midpoint = (bounds["start"] + bounds["end"]) / 2
    return round(midpoint)

def create_hyperlink(segment, url):
    """Return *url* with a ``t=<seconds>s`` parameter pointing at *segment*.

    The timestamp comes from ``retrieve_time(segment)``.

    Args:
        segment: Transcript text identifying the moment to link to.
        url: Video URL, with or without an existing query string.

    Returns:
        The URL extended with the time parameter.
    """
    seconds = retrieve_time(segment)
    # Use '&' only when the URL already carries a query string; otherwise the
    # time parameter has to start the query with '?'.
    separator = "&" if "?" in url else "?"
    return f"{url}{separator}t={seconds}s"

def format_seconds_to_hms(seconds):
    """Format an integer number of seconds as zero-padded ``HH:MM:SS``.

    Hours are not wrapped at 24, so large values give more than two hour digits.
    """
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"

def process_placeholder(placeholder):
    """Turn one ``<PICTURE: ...>`` or ``<HYPERLINK: ...>`` tag into final markup.

    Relies on the module-level ``video_path`` and ``url`` plus the helpers
    ``retrieve_time``, ``grab_frame`` and ``create_hyperlink``.

    Args:
        placeholder: A complete tag including its angle brackets.

    Returns:
        * For a picture tag: an ``<img>`` element for the grabbed frame, or an
          empty string when no frame could be saved.
        * For a hyperlink tag: a markdown link labelled with the ``HH:MM:SS``
          position.
        * For anything else: the input unchanged.
    """
    import html  # local import keeps this block self-contained

    if placeholder.startswith("<PICTURE:"):
        description = placeholder[9:-1]
        seconds = retrieve_time(description)
        image_path = grab_frame(video_path, seconds)
        if image_path is None:
            # grab_frame already printed the reason; drop the tag instead of
            # emitting an <img src="None"> that can never load.
            return ""
        # Transcript text may contain quotes or '&', which would break the
        # attribute if inserted raw.
        alt_text = html.escape(description, quote=True)
        return f'<img src="{image_path}" alt="{alt_text}" width="450"/>'
    elif placeholder.startswith("<HYPERLINK:"):
        text = placeholder[11:-1]
        seconds = retrieve_time(text)
        formatted_time = format_seconds_to_hms(seconds)
        hyperlink = create_hyperlink(text, url)
        return f'[Jump to this part of the video: {formatted_time}]({hyperlink})'
    else:
        return placeholder

def replace_placeholders(content):
    placeholders = re.findall(r"<[^>]+>", content)
    for placeholder in placeholders:
        replacement = process_placeholder(placeholder)
        content = content.replace(placeholder, replacement, 1)
    return content

def read_file(file_path):
    """Return the full contents of *file_path*, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    return text

def convert_txt(path, title, db):
    """Resolve the placeholders in the LLM outline and write the companion guide.

    Reads ``<path>/transcript_json/llm_outline.txt``, replaces every picture
    and hyperlink placeholder, and writes ``<path>/companion_guide.txt``.

    Side effects: sets the module-level ``video_path`` and ``retriever``, which
    the placeholder helpers read.

    Args:
        path: Base project directory.
        title: Video title. Accepted for the caller's convenience; not used here.
        db: Vector store exposing ``as_retriever``; the single closest match is
            requested for each lookup.
    """
    txt_file_path = f'{path}/transcript_json/llm_outline.txt'
    output_file_path = f'{path}/companion_guide.txt'
    global video_path
    video_path = f'{path}/original_files/video/video_file.mp4'
    global retriever
    retriever = db.as_retriever(search_kwargs={"k": 1})

    content = read_file(txt_file_path)
    updated_content = replace_placeholders(content)

    # Write with the same encoding the outline was read with, so non-ASCII
    # characters survive regardless of the platform default.
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(updated_content)

    print(f"Updated markdown content has been written to {output_file_path}")

### Main Script
---

In [32]:
# main script
# Runs the whole pipeline from a YouTube URL to the finished companion guide.
# Every stage prints the elapsed runtime before it starts and after it ends.
start_timer()
# URL & Path of Interest
# NOTE: `url` is also read as a module-level global by process_placeholder.
url = 'https://www.youtube.com/watch?v=zduSFxRajkE'
path = '/Users/adamlucek/Documents/Jupyter/karpathy_guide_challenge'

print(f"Downloading Video & Audio, Runtime: {show_current_runtime()}")
# download video, audio, and details
download_video(url, path)
title = get_title(url)
print(f"Video & Audio Downloaded, Runtime: {show_current_runtime()}")

print(f"Checking File Size & Splitting if Necessary, Runtime: {show_current_runtime()}")
# Check filesize, split into multiple files if needed.
# The flag starts as False; split_audio() sets this global to True only when it
# actually splits the file.
split_audio_return = False
split_audio(f"{path}/original_files/audio/audio_file.mp4")
print(f"Audio Checked & Split, Runtime: {show_current_runtime()}")

print(f"Processing Audio File with Whisper-1, Runtime: {show_current_runtime()}")
# Process audio files with Whisper and create JSON files of output
create_json(split_audio_return, path)
print(f"Audio Processed with Whisper-1, Runtime: {show_current_runtime()}")

print(f"Cleaning Data, Runtime: {show_current_runtime()}")
# Combine if needed, clean extra data; result is written to combined_data.json
process_transcripts(f"{path}/transcript_json", split_audio_return)
print(f"Data Cleaned, Runtime: {show_current_runtime()}")

print(f"Pulling Full Transcript, Runtime: {show_current_runtime()}")
# Pull the full transcript as one string
video_transcript = full_transcript(f'{path}/transcript_json/combined_data.json')
print(f"Transcript Pulled, Runtime: {show_current_runtime()}")

print(f"Chunking & Splitting Transcript, Runtime: {show_current_runtime()}")
# Chunk the transcript with SemanticChunker, then fold chunks shorter than
# merge_short_documents' min_length into the chunk that follows them
text_splitter = SemanticChunker(OpenAIEmbeddings())
split_docs = text_splitter.create_documents([video_transcript])
merged_docs = merge_short_documents(split_docs)
print(f"Transcript Chunked, Runtime: {show_current_runtime()}")

print(f"Embedding Transcript, Runtime: {show_current_runtime()}")
# Embed documents: one document per element of combined_data.json (jq ".[]").
# retrieve_time later parses each document's page_content with json.loads to
# read the segment's start/end times.
json_loader = JSONLoader(f"{path}/transcript_json/combined_data.json", jq_schema=".[]", text_content=False)
json_texts = json_loader.load()
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(json_texts, embeddings)
print(f"Transcript Embedded, Runtime: {show_current_runtime()}")

print(f"Generating Markdown Outline with GPT-4-T, Runtime: {show_current_runtime()}")
# Generate markdown of file with GPT-4-T, one chain call per merged chunk
generate_markdown(merged_docs, path, guide_chain)
print(f"Markdown File Generated, Runtime: {show_current_runtime()}")

print(f"Replacing Placeholders With Pictures & Links, Runtime: {show_current_runtime()}")
# Replace placeholders with hyperlinks and pictures
# (title is passed along but convert_txt does not use it)
convert_txt(path, title, db)
print(f"Report Finished, Runtime: {show_current_runtime()}")

Downloading Video & Audio, Runtime: 0.0
Video & Audio Downloaded, Runtime: 11.26
Checking File Size & Splitting if Necessary, Runtime: 11.26
File size: 46.61 MB
MoviePy - Writing audio in split_chunks/audio_file_part1.mp4


                                                                                

MoviePy - Done.
Created chunk: split_chunks/audio_file_part1.mp4
MoviePy - Writing audio in split_chunks/audio_file_part2.mp4


                                                                                

MoviePy - Done.
Created chunk: split_chunks/audio_file_part2.mp4
MoviePy - Writing audio in split_chunks/audio_file_part3.mp4


                                                                                

MoviePy - Done.
Created chunk: split_chunks/audio_file_part3.mp4
MoviePy - Writing audio in split_chunks/audio_file_part4.mp4


                                                                                

MoviePy - Done.
Created chunk: split_chunks/audio_file_part4.mp4
Audio Checked & Split, Runtime: 110.94
Processing Audio File with Whisper-1, Runtime: 110.94
Transcript for audio_file_part4.mp4 saved to transcript_json/audio_file_part4_transcript.json
Transcript for audio_file_part3.mp4 saved to transcript_json/audio_file_part3_transcript.json
Transcript for audio_file_part2.mp4 saved to transcript_json/audio_file_part2_transcript.json
Transcript for audio_file_part1.mp4 saved to transcript_json/audio_file_part1_transcript.json
Audio Processed with Whisper-1, Runtime: 474.22
Cleaning Data, Runtime: 474.22
Data Cleaned, Runtime: 474.24
Pulling Full Transcript, Runtime: 474.24
Transcript Pulled, Runtime: 474.24
Chunking & Splitting Transcript, Runtime: 474.24
Transcript Chunked, Runtime: 477.68
Embedding Transcript, Runtime: 477.68
Transcript Embedded, Runtime: 482.44
Generating Markdown Outline with GPT-4-T, Runtime: 482.44
Markdown File Generated, Runtime: 1278.96
Replacing Placeholder

## **Notes**
#### **Cost:**  ~$1.91 for this pass

* \$1.10 from GPT\-4\-Turbo
  * 56,546 Context Tokens \& 17,710 Generated Tokens
* \$0.80 from Whisper\-1
  * 8,016 seconds transcribed
* \$0.02 from text\-embedding\-002\-v2
  * 172,561 Context Tokens
  
#### **Time from URL to Full Report:** 21 minutes, 56 Seconds

### **Limitations: (aka my todo list)** 
* Vector search can sometimes be inaccurate and point towards the wrong part of the video, out of order, with this method.
* Inconsistency of number of pictures and hyperlinks across the board, as it's done automatically
* Unable to input the entire script at once (believe me, I tried), so all the downsides that come with sequential processing
* Could be prettier 
* My own lack of programming knowledge
