In [21]:
from IPython.display import Markdown
import time
from typing import Union
# import librosa 
from yt_dlp import YoutubeDL
from youtube_transcript_api import YouTubeTranscriptApi
# import pytube
import re
import whisper



In [2]:
# video_has_transcript = "https://www.youtube.com/watch?v=rQU75JsSSxw&ab_channel=ToxicDrunker"

# part I: get transcript 

In [3]:

def get_transcript(url: str=None, path: str=None, output_timestamp: bool=False) -> str:
  
  if url:
    transcript = check_existing_transcript(url) 
    print(transcript)

    if not transcript:
      file = download_from_youtube(url)
    else:
      return transcript

  elif path: 
    file = load_from_local_file(path)
    
  transcript = convert_speech_to_text(file)
    
  return transcript 


def get_video_id(url: str) -> str:
    """The `get_video_id` function is extracting the unique video ID from a YouTube 
    video URL. It uses a regular expression to search for the video ID pattern in 
    the URL and returns the extracted video ID as a string.

    Args:
        url (str): _description_

    Returns:
        str: _description_
    """
    # [TODO]: try and raise error
    return re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11}).*", url).group(1)


def check_existing_transcript(url: str, language_code="en", manual_only=True, output_timestamp=False) -> Union[str, bool]:
  """
  `check_existing_transcript` is a function that checks if a transcript exists for 
  a given YouTube video URL. It first extracts the video ID from the URL and then 
  attempts to list the available transcripts for that video using the YouTubeTranscriptApi. 
  If the transcript is found and matches the specified language code and generation type 
  (manual or not), it retrieves the transcript text. The function also has the option to 
  output timestamps along with the transcript text. If any errors occur during the process, 
  it will return False.

  Args:
      url (str): the url of the YouTube video 
      language_code (str, optional): desired language code for the existing transcript. Defaults to "en".
      manual_only (bool, optional): if only download manual transcript. Defaults to False.
      output_timestamp (bool, optional): if output timestamp. Defaults to False.

  Returns:
      transcript: transcript in string format or False if the transcript does not exist
  """
  # [TODO] use logging to capture errors
  video_id = get_video_id(url)
  try:
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    transcript_list = {t.language_code: not t.is_generated for t in transcript_list}
    print(transcript_list)
  except Exception as e:
    print(e)
    return False
  
  if language_code in transcript_list and transcript_list[language_code] is manual_only:
    transcription_ = YouTubeTranscriptApi.get_transcript(video_id, languages=[language_code])
    if output_timestamp:
      transcript = ""
      for tr in transcription_:
        start = time.strftime("%H:%M:%S", time.gmtime(tr["start"]))
        end = time.strftime("%H:%M:%S", time.gmtime(tr["start"]))
        transcript += f"[{start}-{end}] {tr['text']}" + "\n"
    else:
      transcript = ""
      for tr in transcription_:
        transcript += tr["text"]+" "
  else:
    print(f"the specified language code {language_code} does not exist.")
    return False
    
  return transcript


def download_from_youtube(url, output_path=""):
  """_summary_

  Args:
      url (_type_): _description_
      output_path (str, optional): _description_. Defaults to "".
  """
  
  youtube_dl_opts = {
  "outtmpl": output_path+"%(title)s.%(ext)s",
  "format": "bestaudio/best",
  'postprocessors': [{
    'key': 'FFmpegExtractAudio', 
    'preferredcodec': 'mp3', 
    'preferredquality': '192'
  }]
}

  with YoutubeDL(youtube_dl_opts) as ydl:
    info_dict = ydl.extract_info(url, download=True)

  return ydl.prepare_filename(info_dict).replace("webm", "mp3")


def load_from_local_file(path: str) -> None:
  """_summary_

  Args:
      path (str): _description_

  Returns:
      _type_: _description_
  """
  # audio_file, _ = librosa.load(path)
  return path

def convert_speech_to_text(file, model_size="tiny", language="en", fp16=False, verbose=False):
  """The `convert_speech_to_text` function is responsible for converting 
  speech audio data into text. It takes the audio file as input along with 
  optional parameters such as the model size, language, whether to use 
  fp16 precision, and a verbosity flag.

  Args:
      file (_type_): _description_
      model_size (str, optional): _description_. Defaults to "tiny".
      language (str, optional): _description_. Defaults to "en".
      fp16 (bool, optional): _description_. Defaults to False.
      verbose (bool, optional): _description_. Defaults to False.

  Returns:
      _type_: _description_
  """
  model = whisper.load_model(model_size)
  result = model.transcribe(file, language=language, fp16=fp16, verbose=verbose)
  return result
  

In [6]:
# sanitary check 

# url = "https://www.youtube.com/watch?v=B3szaVzQx0o&ab_channel=MarquesBrownlee" # has manual transcript 
# url = "https://www.youtube.com/watch?v=4COqwI5-YFA&ab_channel=TheStraitsTimes" # singlish 
# url = "https://www.youtube.com/watch?v=b5XgNrkccxc&ab_channel=JaredHenderson"


# url = "https://www.youtube.com/watch?v=rZvhFcA-n5c&ab_channel=SNARLED"` is a commented-out line of code. This means that it is not currently being executed as part of the program. It is likely being used as a reference or placeholder for a YouTube video URL that may be used in the future.
url = "https://www.youtube.com/watch?v=rZvhFcA-n5c&ab_channel=SNARLED"

tr = get_transcript(url)


{'en': False}
the specified language code en does not exist.
False
[youtube] Extracting URL: https://www.youtube.com/watch?v=rZvhFcA-n5c&ab_channel=SNARLED
[youtube] rZvhFcA-n5c: Downloading webpage
[youtube] rZvhFcA-n5c: Downloading ios player API JSON
[youtube] rZvhFcA-n5c: Downloading android player API JSON




[youtube] rZvhFcA-n5c: Downloading m3u8 information
[info] rZvhFcA-n5c: Downloading 1 format(s): 251
[download] Hamilton Musical Explained ⧸⧸ 3 Minutes Or Less ｜ Snarled.webm has already been downloaded
[download] 100% of    2.73MiB
[ExtractAudio] Destination: Hamilton Musical Explained ⧸⧸ 3 Minutes Or Less ｜ Snarled.mp3
Deleting original file Hamilton Musical Explained ⧸⧸ 3 Minutes Or Less ｜ Snarled.webm (pass -k to keep)


100%|██████████| 16422/16422 [00:10<00:00, 1521.19frames/s]


In [8]:
tr["text"]

" Hi, I'm Sabrina and I love Hamilton and American musical, but it's three hours long so I'm gonna compress it into three minutes. Let's go. There's this bastard orphan son of a hornist Scott with a who lives a super depressing life, but he escapes poverty by educating himself and writing a letter in earning a scholarship to King's College. Fast forward to 1776, Hamilton graduates and is now stalking his idol Aaron Burr, who just tells him to shut up and smile, then John Lawrence, her Achilles Mulligan and the Marquis to left by Ed Walkin. In Hamilton is like, I have found my people. And they also are yelling about starting our evolution while Burr's in the corner judging them. But the Revolutionary War starts in general George Washington needs a right hand man. Burr walks in his like, mate, you should pick me and Washington's like, uh, I pick Hamilton. Then in 1780 when there's ball burrs, like, at least there's one thing Hamilton and I are both equal in picking up the ladies. You see

In [None]:
# Markdown(f["text"])

In [204]:
# # sanitary check 

# # url = "https://www.youtube.com/watch?v=B3szaVzQx0o&ab_channel=MarquesBrownlee" # has manual transcript 
# # url = "https://www.youtube.com/watch?v=4COqwI5-YFA&ab_channel=TheStraitsTimes" # singlish 
# path = "Lawrence Wong sworn in as Singapore’s fourth Prime Minister.mp3"

# f = load_audio_file(path=path)
# f

In [9]:
with open("transcript.txt", "w") as file:
  file.write(tr["text"])

# part II: RAG

In [10]:
import os
from dotenv import load_dotenv, find_dotenv
import openai

_ = load_dotenv(find_dotenv())
openai.api_key = os.getenv("OPENAI_API_KEY")

# from IPython.display import Markdown, display

In [11]:
from llama_index.core import SimpleDirectoryReader
# from llama_index.core import Document

# load the document using `SimpleDirectoryReader`
# which supports a range of document types based on 
# their extension
document = SimpleDirectoryReader(
  input_files=["transcript.txt"]
).load_data()[0]

In [12]:
from llama_index.core.node_parser import SentenceWindowNodeParser

node_parser = SentenceWindowNodeParser.from_defaults(
  window_size=2,
  window_metadata_key="window",
  original_text_metadata_key="original_text"
)

In [13]:
# # another alternative is to build base nodes 
# # using `SentenceSplitter`

# from llama_index.core.node_parser import SentenceSplitter

# # base node parser is a sentence splitter
# text_splitter = SentenceSplitter()

In [14]:
# building the index
from llama_index.llms.openai import OpenAI
from llama_index.core import ServiceContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

sentence_context = ServiceContext.from_defaults(
  llm=llm,
  embed_model=HuggingFaceEmbedding("BAAI/bge-small-en-v1.5"),
  node_parser=node_parser
)

  sentence_context = ServiceContext.from_defaults(


In [15]:
from llama_index.core import VectorStoreIndex

sentence_index = VectorStoreIndex.from_documents(
  [document],
  service_context=sentence_context
)


In [16]:
# building the postprocessor:
# use the `MetadataReplacementPostProcessor` to replace the sentence in each node 
# with its surrounding context

from llama_index.core.postprocessor import MetadataReplacementPostProcessor

query_engine = sentence_index.as_query_engine(
  similarity_top_k=2,
  node_postprocessors=[
    MetadataReplacementPostProcessor(target_metadata_key="window")
  ]
)

In [19]:
query = "summarize the video"

window_response = query_engine.query(
  query
)

print(window_response)

The video provides a glimpse into the conflicts and challenges faced by various characters during a historical period, including duels, appointments, and betrayals, ultimately leading to victories despite the obstacles encountered.


In [25]:
Markdown(window_response.response)

The video provides a glimpse into the conflicts and challenges faced by various characters during a historical period, including duels, appointments, and betrayals, ultimately leading to victories despite the obstacles encountered.

In [27]:
def query_sentence_window_rag(query: str, api_key: str, file_path: str) -> str:
  
  openai.api_key = api_key
  
  document = SimpleDirectoryReader(input_files=[file_path]).load_data()[0]
  
  node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=2,
    window_metadata_key="window",
    original_text_metadata_key="original_text"
  )
  
  llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

  sentence_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=HuggingFaceEmbedding("BAAI/bge-small-en-v1.5"),
    node_parser=node_parser
  )
  
  sentence_index = VectorStoreIndex.from_documents(
    [document],
    service_context=sentence_context
  )
  
  query_engine = sentence_index.as_query_engine(
    similarity_top_k=2,
    node_postprocessors=[
      MetadataReplacementPostProcessor(target_metadata_key="window")
    ]
  )
  
  window_response = query_engine.query(query)
  
  return window_response.response

# part III: putting it all together

In [28]:
url = "https://www.youtube.com/watch?v=rZvhFcA-n5c&ab_channel=SNARLED"

tr = get_transcript(url)

{'en': False}
the specified language code en does not exist.
False
[youtube] Extracting URL: https://www.youtube.com/watch?v=rZvhFcA-n5c&ab_channel=SNARLED
[youtube] rZvhFcA-n5c: Downloading webpage
[youtube] rZvhFcA-n5c: Downloading ios player API JSON
[youtube] rZvhFcA-n5c: Downloading android player API JSON




[youtube] rZvhFcA-n5c: Downloading m3u8 information
[info] rZvhFcA-n5c: Downloading 1 format(s): 251
[download] Destination: Hamilton Musical Explained ⧸⧸ 3 Minutes Or Less ｜ Snarled.webm
[download] 100% of    2.73MiB in 00:00:01 at 2.09MiB/s   


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[ExtractAudio] Destination: Hamilton Musical Explained ⧸⧸ 3 Minutes Or Less ｜ Snarled.mp3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Deleting original file Hamilton Musical Explained ⧸⧸ 3 Minutes Or Less ｜ Snarled.webm (pass -k to keep)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 16422/16422 [00:14<00:00, 1165.41frames/s]


In [29]:
with open("transcript.txt", "w") as file:
  file.write(tr["text"])

In [31]:
q = "summarize the video"
_ = load_dotenv(find_dotenv())
api_key = os.getenv("OPENAI_API_KEY")
path = "transcript.txt"

answer = query_sentence_window_rag(query=q, api_key=api_key, file_path=path)

  sentence_context = ServiceContext.from_defaults(


In [34]:
Markdown(answer)

The video discusses the challenges faced by Hamilton's son Philip, who engages in a duel to defend his father's honor. Additionally, it highlights the sacrifices made by Angelica for her sister Eliza, the struggles of General Charles Lee during the war, and the eventual victory achieved with the help of Hamilton and others.