In [64]:
!pip install -q youtube-transcript-api pytube

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [65]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from pytube import YouTube
from dotenv import load_dotenv
import os
load_dotenv()

True

### Step 1a - Indexing (Document Ingestion)

In [66]:

def get_youtube_id(url):
    return YouTube(url=url).video_id

In [68]:
url = input("Enter your URL: ")
video_id = get_youtube_id(url)
try:
    transcript_list =YouTubeTranscriptApi.get_transcript(video_id=video_id, languages=['en'])
    transcipt = " ".join(chunk["text"] for chunk in transcript_list)
    # print(transcipt)
except TranscriptsDisabled:
    print("No caption avaiable for this video :(")

In [69]:
len(transcipt.split(" "))

5137

### Step 1b - Indexing (Text Splitting)

In [70]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
    )  
chunks = splitter.create_documents([transcipt])

In [71]:
len(chunks)

37

In [72]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [73]:
vector_Store = FAISS.from_documents(chunks, embedding=embeddings)

In [74]:
vector_Store.index_to_docstore_id

{0: 'fe5b577f-e280-4351-a234-714229e29d72',
 1: 'f39fa7da-86b4-436b-b886-fa27a7c28205',
 2: 'a66c8856-cd9c-4b80-b601-3fad3f44bb9e',
 3: 'c7562881-04fb-46ed-ac4c-8bd298bf9673',
 4: '9eb9eabb-c705-464e-8644-0909673ed570',
 5: '32b20d78-7f66-4d2d-9833-3dcd299af1a7',
 6: '0ef1e805-1c0d-4124-ae58-1b4a762dd14c',
 7: '6881b89e-a88f-4373-a4de-2d951a90fde8',
 8: '8bdddf46-24b6-4aa0-94f8-92d40f05597c',
 9: '70ac44e0-117e-476f-a94b-53deda85961b',
 10: '9e6cf75e-c385-4400-a7df-448a73d7422f',
 11: '53e5f57b-cce5-4bf6-9f7d-8a14610b0414',
 12: '431a36bf-850a-4d5a-a053-a9e04192842b',
 13: '5fa6cf6b-5d80-4a8f-b82c-3eff89f5b75d',
 14: 'ea33c1d6-0b17-4498-b395-560b18baaf05',
 15: 'd91684a0-fb7c-4b80-8bc4-b468d44d4de2',
 16: '8c4e0127-8c9e-44c1-bb24-5dd3e791df18',
 17: '0c50072b-7db1-4d5a-9fce-571a12676d6a',
 18: '9692e447-da80-4b41-847b-1593df6fd22c',
 19: '10174699-f7e7-41f7-921a-2758dccf4fc5',
 20: 'ccea4e37-1257-4d88-b42f-f212fa15d95a',
 21: 'a0cc497b-738e-42f4-a383-e634f247e996',
 22: '4221e17e-7a21-

In [76]:
vector_Store.get_by_ids(['a4c72c17-bc4b-490e-be41-ad2d25119f50'])

[Document(id='a4c72c17-bc4b-490e-be41-ad2d25119f50', metadata={}, page_content="review by Patreon supporters. A final version should be up in public in a week or two, it usually depends on how much I end up changing based on that review. In the meantime, if you want to dive into attention, and if you want to help the channel out a little bit, it's there waiting.")]

### Step 2 - Retrieval

In [77]:
retriver = vector_Store.as_retriever(search_type="similarity", kwargs={"k":4})

In [78]:
retriver

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x17fbdb410>, search_kwargs={})

In [79]:
retriver.invoke("What is transformer architecture and how it change the world?")

[Document(id='5fa6cf6b-5d80-4a8f-b82c-3eff89f5b75d', metadata={}, page_content="as we go in is that in order for this training algorithm to work well at scale, these models have to follow a certain specific format. And if you know this format going in, it helps to explain many of the choices for how a transformer processes language, which otherwise run the risk of feeling kinda arbitrary. First, whatever kind of model you're making, the input has to be formatted as an array of real numbers. This could simply mean a list of numbers, it could be a two-dimensional array, or very often you deal with higher dimensional arrays, where the general term used is tensor. You often think of that input data as being progressively transformed into many distinct layers, where again, each layer is always structured as some kind of array of real numbers, until you get to a final layer which you consider the output. For example, the final layer in our text processing model is a list of numbers represent

### Step 3 - Augmentation

In [80]:
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash-8b",
    api_key=os.environ["GEMINI_API_KEY"]
)

In [81]:
prompt = PromptTemplate(
    template="""
    You are a helpful assistant.
    Answer ONLY from provide transcript context.
    If the context is insufficient, Just say don't know
    
    {context}
    Question: {question}
    """,
    input_variables=["context","question"]
)

In [87]:
question = "What actually pre-training mean in transformers?"
retrieve_docs = retriver.invoke(question)

In [88]:
context_text = "\n\n".join(doc.page_content for doc in retrieve_docs)

In [89]:
final_prompt = prompt.invoke({'context': context_text, "question": question})

### Step 5 - Generation

In [90]:
answer = llm.invoke(final_prompt)

In [94]:

print(answer.content)

Pretrained refers to how the model went through a process of learning from a massive amount of data, and the prefix insinuates that there's more room to fine-tune it on specific tasks with additional training.


In [97]:
from langchain_core.output_parsers import StrOutputParser
parser  = StrOutputParser()

In [98]:
chain = prompt | llm | parser 
result = chain.invoke({"context": context_text, "question": question})

In [101]:
from pprint import pprint
pprint(result)

('Pretrained refers to how the model went through a process of learning from a '
 "massive amount of data, and the prefix insinuates that there's more room to "
 'fine-tune it on specific tasks with additional training.')
