# Notebook for interacting with Youtube Video Transcription API

In [1]:
from langchain_groq import ChatGroq
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from tqdm.notebook import tqdm
from dotenv import load_dotenv
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import CharacterTextSplitter
import faiss
from langchain.vectorstores import FAISS
from langchain_cohere import CohereEmbeddings
from youtube_transcript_api import YouTubeTranscriptApi

load_dotenv()

True

In [2]:
llm = ChatGroq(
    temperature= 0.5,
    model_name="llama3-70b-8192",
    groq_api_key= os.getenv("GROQ_API_KEY")
)

In [3]:
embeddings = CohereEmbeddings(
    model="embed-english-light-v3.0",
)

In [4]:
# Create transcript df
def create_transcript_df(yt_transcript: list, yt_id: str):
    return (
        pd.DataFrame(yt_transcript)
        .assign(start_dt=lambda x: pd.to_datetime(x["start"], unit="s"))
        .set_index("start_dt")
        .resample("3min")
        .agg({"text": " ".join})
        .reset_index()
        .assign(start_dt=lambda x: x["start_dt"].dt.minute * 60)
        .assign(
            source=lambda x: "https://youtu.be/"
            + yt_id
            + "&t="
            + x["start_dt"].astype("str")
        )
        .drop(columns=["start_dt"])
    )

In [5]:
yt_ids = [
    "OtD8wVaFm6E",  # XGBoost Part 1 (of 4): Regression
    "8b1JEDvenQU",  # XGBoost Part 2 (of 4): Classification
    "ZVFeW798-2I",  # XGBoost Part 3 (of 4): Mathematical Details
    "oRrKeUCEbq8",  # XGBoost Part 4 (of 4): Crazy Cool Optimizations
]
transcript_dfs = []
for yt_id in tqdm(yt_ids, desc="Fetching transcription"):
    yt_transcript = YouTubeTranscriptApi.get_transcript(yt_id)
    transcript_dfs.append(create_transcript_df(yt_transcript, yt_id))

transcripts_df = pd.concat(transcript_dfs).reset_index(drop=True)

Fetching transcription:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
text_splitter = CharacterTextSplitter(separator=" ", chunk_size=1200, chunk_overlap=150)

yt_docs, yt_meta = [], []

for index, row in tqdm(transcripts_df.iterrows(), total=len(transcripts_df)):
    splits = text_splitter.split_text(row["text"])
    yt_docs.extend(splits)
    yt_meta.extend([{"source": row["source"]}] * len(splits))
    print(f"Split {row['source']} into {len(splits)} chunks")

yt_store = FAISS.from_texts(yt_docs, embeddings, metadatas=yt_meta)

assert len(yt_docs) == len(yt_meta)

  0%|          | 0/37 [00:00<?, ?it/s]

Split https://youtu.be/OtD8wVaFm6E&t=0 into 2 chunks
Split https://youtu.be/OtD8wVaFm6E&t=180 into 2 chunks
Split https://youtu.be/OtD8wVaFm6E&t=360 into 2 chunks
Split https://youtu.be/OtD8wVaFm6E&t=540 into 2 chunks
Split https://youtu.be/OtD8wVaFm6E&t=720 into 2 chunks
Split https://youtu.be/OtD8wVaFm6E&t=900 into 2 chunks
Split https://youtu.be/OtD8wVaFm6E&t=1080 into 2 chunks
Split https://youtu.be/OtD8wVaFm6E&t=1260 into 2 chunks
Split https://youtu.be/OtD8wVaFm6E&t=1440 into 2 chunks
Split https://youtu.be/8b1JEDvenQU&t=0 into 2 chunks
Split https://youtu.be/8b1JEDvenQU&t=180 into 2 chunks
Split https://youtu.be/8b1JEDvenQU&t=360 into 2 chunks
Split https://youtu.be/8b1JEDvenQU&t=540 into 2 chunks
Split https://youtu.be/8b1JEDvenQU&t=720 into 2 chunks
Split https://youtu.be/8b1JEDvenQU&t=900 into 2 chunks
Split https://youtu.be/8b1JEDvenQU&t=1080 into 2 chunks
Split https://youtu.be/8b1JEDvenQU&t=1260 into 2 chunks
Split https://youtu.be/8b1JEDvenQU&t=1440 into 1 chunks
Split ht

In [7]:
from langchain.memory import ConversationBufferMemory
from langchain import PromptTemplate

memory = ConversationBufferMemory(
    memory_key="chat_history",
    input_key="question",
    output_key="answer",
    return_messages=True,
)

template = """You are a chatbot having a conversation with a human.
    Given the following extracted parts of a long document and a question,
    create a final answer.
    {context}
    {chat_history}
    Human: {question}
    Chatbot:"""

question_prompt = PromptTemplate(
    input_variables=["chat_history", "question", "context"], template=template
)

In [8]:
from langchain.chains import RetrievalQAWithSourcesChain

chain = RetrievalQAWithSourcesChain.from_llm(
    llm=llm,
    retriever=yt_store.as_retriever(k=3),
    memory=memory,
    question_prompt=question_prompt,
)

In [10]:
# Use here either yt_ts_store or ys_store depending if you like to use source with or without timestamps

result = chain(
    {
        "question": "What is the difference in building a tree for a regression case compared to a classification case?"
    },
    return_only_outputs=True,
)

result

{'answer': 'FINAL ANSWER: The difference in building a tree for a regression case compared to a classification case lies in the calculation of the quality score or similarity score, loss function, and the type of problem being solved. In regression, the similarity score is calculated as the sum of the residuals squared over the number of residuals plus lambda (a regularization parameter), whereas in classification, the quality score or similarity score would likely be calculated differently, possibly using metrics such as Gini impurity or information gain. Additionally, the denominator in the calculation differs between regression and classification cases, with classification involving previously predicted probabilities and regression involving previously predicted values or residuals.\n\n',
 'sources': 'https://youtu.be/OtD8wVaFm6E&t=180, https://youtu.be/8b1JEDvenQU&t=0, https://youtu.be/8b1JEDvenQU&t=1260, https://youtu.be/ZVFeW798-2I&t=180'}