In [105]:
import torch
import os
from  openai import OpenAI
from dotenv import load_dotenv
import re
from youtube_transcript_api import YouTubeTranscriptApi

In [4]:
# load env variables 

def load_env_vars():
    """
    Read the .env file and expose the OpenAI key as the module-level
    ``OPENAI_API_KEY`` variable.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is missing or empty in the environment.
    """
    global OPENAI_API_KEY

    load_dotenv()
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

    # Nothing more to do once a non-empty key has been found.
    if OPENAI_API_KEY:
        return

    raise ValueError("openai api key not found.")


# Load the .env file now; this raises ValueError if OPENAI_API_KEY is not set.
load_env_vars()

In [84]:
def get_user_input():
    """Prompt for a YouTube link and return the text exactly as it was entered."""
    return input("Enter URL of youtube link you wish to summarize:")

def clean_url(url):
    """
    Extract the 11-character YouTube video ID from a user-supplied link.

    The input is stripped of surrounding whitespace and searched against the
    supported URL layouts: ``watch?v=``, ``youtu.be/``, ``/embed/``, ``/v/``,
    ``/shorts/`` and ``/live/``. The first layout that matches wins.

    Args:
        url: The raw link text entered by the user.

    Returns:
        The video ID as a string, or None when the input is not a string,
        is empty/whitespace-only, or matches none of the supported layouts.
    """
    # handle empty / non-string input: there is nothing to search
    if not isinstance(url, str):
        return None
    url = url.strip()
    if not url:
        return None

    patterns = (
        r'[?&]v=([a-zA-Z0-9_-]{11})',       # youtube.com/watch?v=
        r'youtu\.be/([a-zA-Z0-9_-]{11})',   # youtu.be/
        r'/embed/([a-zA-Z0-9_-]{11})',      # youtube.com/embed/
        r'/v/([a-zA-Z0-9_-]{11})',          # youtube.com/v/
        r'/shorts/([a-zA-Z0-9_-]{11})',     # youtube.com/shorts/
        r'/live/([a-zA-Z0-9_-]{11})',       # youtube.com/live/
    )

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)

    # No supported layout matched; callers must check for None.
    return None

In [85]:
# Ask for a link and reduce it to the bare video ID.
url = get_user_input()
video_id = clean_url(url)

# clean_url returns None when no known YouTube URL pattern matches; stop here
# instead of carrying None forward into the cells that use video_id.
if video_id is None:
    raise ValueError(f"Could not extract a YouTube video ID from: {url!r}")

print(video_id)

PGUdWfB8nLg


Sample links

- https://www.youtube.com/watch?v=len22DXc8AQ
- https://www.youtube.com/watch?v=FTH4k8z7j84
- https://youtu.be/FTH4k8z7j84?si=DaKQmfE3O7nTQ8Dm&t=36
- https://youtu.be/FTH4k8z7j84?si=nP8Qey00he1MJ0Km

Start with the simplest form of link, where the video ID is the last 11 characters of the URL: direct and simple.

In [60]:
# test block: find the first "v=" in the URL and show the 11 characters that follow it

for i, (first, second) in enumerate(zip(url, url[1:])):
    cons_char = first + second
    if cons_char != 'v=':
        continue
    print(f"{cons_char} occurs at index {i}, {url[i+2:i+13]}")
    break


v= occurs at index 30, FTH4k8z7j84


In [97]:
# fetch script

def fetch_script(video_id):
    """
    Fetch a video's transcript and return it as a single string.

    Args:
        video_id: The YouTube video ID (not the full URL).

    Returns:
        str: The text of every transcript snippet, in order, joined with
        single spaces (no leading or trailing space).

    Raises:
        ValueError: If ``video_id`` is empty or None.
    """
    # Fail early with a clear message instead of sending an empty ID to the API.
    if not video_id:
        raise ValueError("video_id is required to fetch a transcript.")

    ytapi = YouTubeTranscriptApi()
    # The fetched object carries per-snippet metadata (e.g. timestamps);
    # only the text of each snippet is kept.
    info = ytapi.fetch(video_id=video_id)

    return " ".join(snippet.text for snippet in info.snippets)

In [None]:
# Exploratory: print the built-in help for YouTubeTranscriptApi; nothing is assigned here.
help(YouTubeTranscriptApi)

In [98]:
# Fetch the transcript text for the video ID extracted from the entered URL.
transcript = fetch_script(video_id)

In [99]:
# Inspect the fetched transcript: its text, its type, and its attribute names (one per line).
print(transcript, type(transcript), dir(transcript), sep="\n")

 when I was young my family lived overseas uh I lived in Indonesia for a few years and my mother she didn't have the money to send me where all the American kids went to school but she thought it was important for me to keep up with an American education so she decided to teach me extra lessons herself Monday through Friday but because she had to go to work the only time she could do it was at 4:30 in the morning but whenever I'd complain my mother would just give me one looks and she'd say this is no picnic for me either Buster so I know that some of you are still adjusting to being back at school but I'm here today because I have something important to discuss with you my father left my family when I was 2 years old and I was raised by a single mom who had to work and who struggled at times to pay the bills and wasn't always able to give us the things that other kids had there were times when I missed having a father in my life there were times when I was lonely and I felt like I did

The transcript is now ready. The next step is to wrap it in a prompt and pass it to the LLM.

In [None]:
def summarize_with_LLM(transcript_text: str, model: str = "gpt-4o-mini") -> str:
    """
    Summarizes a given YouTube video transcript using the OpenAI LLM.
    The output is a structured, readable summary that serves as a proxy for watching the video.

    Args:
        transcript_text (str): The full cleaned transcript of the YouTube video.
        model (str): Name of the OpenAI chat model to use. Defaults to "gpt-4o-mini".

    Returns:
        str: A formatted summary of the video content, stripped of surrounding whitespace.

    Raises:
        ValueError: If ``transcript_text`` is None, empty, or whitespace-only.
        RuntimeError: If the API response contains no text to return.
    """
    # Validate before creating a client so an empty transcript never costs an API call.
    if not transcript_text or not transcript_text.strip():
        raise ValueError("transcript_text is empty; nothing to summarize.")

    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    system_instructions = (
        "You are an assistant tasked with summarizing YouTube video transcripts.\n"
        "The listener will use your summary as a substitute for watching the full video, "
        "so extract key ideas, arguments, or narratives and present them in a clear, organized format.\n"
        "Use sections and bullet points. Format it for terminal display (no markdown).\n"
    )

    user_prompt = f"Transcript:\n{transcript_text.strip()}"

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_instructions},
            {"role": "user", "content": user_prompt},
        ]
    )
    # print(response) -- to see the response format with metadata and extract the relevant bits.

    # The message content can be absent; report that clearly instead of
    # failing on None.strip().
    content = response.choices[0].message.content if response.choices else None
    if content is None:
        raise RuntimeError("The model returned no text content for the summary.")

    return content.strip()


In [131]:
# Summarize the fetched transcript. As the cell's last expression, the returned
# string is displayed as its repr, so line breaks show up as literal \n.
summarize_with_LLM(transcript)

"Key Ideas from the Transcript:\n\n1. Personal Background:\n   - The speaker reflects on their youth living in Indonesia and the struggle of being raised by a single mother.\n   - They recall feeling lonely and unfocused in school without a father figure.\n\n2. Importance of Education:\n   - Emphasizes that education is crucial for success in various careers.\n   - States that dropping out of school equates to quitting not just on oneself, but also on the country.\n   - Education shapes the future of America by developing necessary skills to address national challenges.\n\n3. Overcoming Challenges:\n   - Acknowledges that many students face obstacles, such as lack of support, financial hardships, and unsafe environments.\n   - Encourages students to overcome these challenges, emphasizing that circumstances do not dictate destiny.\n\n4. Setting Goals for Education:\n   - Urges students to set personal educational goals and commit to meeting them.\n   - Suggests goals could range from do