In [80]:
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from dotenv import load_dotenv
import os
load_dotenv()

llmm = HuggingFaceEndpoint(
    repo_id= "TinyLlama/TinyLlama-1.1B-Chat-v1.0", #"google/gemma-2-2b-it",
    task="text-generation"
)
model = ChatHuggingFace(llm = llmm)

# Install libraries

In [81]:
#!pip install -q youtube-transcript-api langchain-community langchain-openai \ faiss-cpu tiktoken python-detenv
#!pip install langchain-core

In [82]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS 
from langchain_core.prompts import PromptTemplate

### Step 1a - Indexing(Document Ingestion)

In [83]:
video_id = "LPZh9BOjkQs" #only the ID, not full URL
try:
    # If you don't care which language, this returns the "best" one
    transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])

    # Flatten it to plain text 
    transcript = " ".join(chunk["text"] for chunk in transcript_list)
    print(transcript)

except TranscriptsDisabled:
    print("No captions available for this video.")    

Imagine you happen across a short movie script that describes a scene between a person and their AI assistant. The script has what the person asks the AI, but the AI's response has been torn off. Suppose you also have this powerful magical machine that can take any text and provide a sensible prediction of what word comes next. You could then finish the script by feeding in what you have to the machine, seeing what it would predict to start the AI's answer, and then repeating this over and over with a growing script completing the dialogue. When you interact with a chatbot, this is exactly what's happening. A large language model is a sophisticated mathematical function that predicts what word comes next for any piece of text. Instead of predicting one word with certainty, though, what it does is assign a probability to all possible next words. To build a chatbot, you lay out some text that describes an interaction between a user and a hypothetical AI assistant, add on whatever the use

In [84]:
transcript_list

[{'text': 'Imagine you happen across a short movie script that',
  'start': 1.14,
  'duration': 2.836},
 {'text': 'describes a scene between a person and their AI assistant.',
  'start': 3.976,
  'duration': 3.164},
 {'text': "The script has what the person asks the AI, but the AI's response has been torn off.",
  'start': 7.48,
  'duration': 5.58},
 {'text': 'Suppose you also have this powerful magical machine that can take',
  'start': 13.06,
  'duration': 3.92},
 {'text': 'any text and provide a sensible prediction of what word comes next.',
  'start': 16.98,
  'duration': 3.98},
 {'text': 'You could then finish the script by feeding in what you have to the machine,',
  'start': 21.5,
  'duration': 4.006},
 {'text': "seeing what it would predict to start the AI's answer,",
  'start': 25.506,
  'duration': 2.862},
 {'text': 'and then repeating this over and over with a growing script completing the dialogue.',
  'start': 28.368,
  'duration': 4.372},
 {'text': "When you interact with

### Step 1b - Indexing (Text Splitting)

In [85]:
splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap=200)
chunks = splitter.create_documents([transcript])

In [86]:
len(chunks)

10

In [87]:
chunks[0]

Document(metadata={}, page_content="Imagine you happen across a short movie script that describes a scene between a person and their AI assistant. The script has what the person asks the AI, but the AI's response has been torn off. Suppose you also have this powerful magical machine that can take any text and provide a sensible prediction of what word comes next. You could then finish the script by feeding in what you have to the machine, seeing what it would predict to start the AI's answer, and then repeating this over and over with a growing script completing the dialogue. When you interact with a chatbot, this is exactly what's happening. A large language model is a sophisticated mathematical function that predicts what word comes next for any piece of text. Instead of predicting one word with certainty, though, what it does is assign a probability to all possible next words. To build a chatbot, you lay out some text that describes an interaction between a user and a hypothetical A

### Step 1c & 1d - Indexing (Embedding Generation and Storing in Vector Store)

In [88]:
from langchain_huggingface import HuggingFaceEmbeddings
 
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.from_documents(chunks, embeddings)

In [89]:
vector_store.index_to_docstore_id

{0: '2ed0042e-d136-43cc-95e3-edda41b98b6e',
 1: '39eba529-09d3-4877-9cf8-2e12f2610ed4',
 2: 'ee7649ec-57b2-4e82-8b3d-044df73aee41',
 3: 'd3eae279-b15d-4808-95dd-45be93524a9e',
 4: 'a7bf10b1-4b44-43a7-9d60-c09c9f2be434',
 5: 'c7cb9326-ba35-4b69-b385-4a043ce4e621',
 6: 'fa44abd1-facf-4e2a-b1f3-f71f08f779f7',
 7: '5fa42f39-1efd-4f22-91bb-b28bb323af09',
 8: '7f5f9f62-9d09-4779-87c2-1e9c2a93bf36',
 9: '502c6bb7-f2c7-4bfa-b6f4-50380d72daae'}

In [112]:
vector_store.get_by_ids(['502c6bb7-f2c7-4bfa-b6f4-50380d72daae'])

[Document(id='502c6bb7-f2c7-4bfa-b6f4-50380d72daae', metadata={}, page_content='the other steps in a transformer. Also, on my second channel I just posted a talk I gave a couple months ago about this topic for the company TNG in Munich. Sometimes I actually prefer the content I make as a casual talk rather than a produced video, but I leave it up to you which one of these feels like the better follow-on.')]

### Step 2 - Retrieval

In [91]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k":4})

In [92]:
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000002857D0F7D10>, search_kwargs={'k': 4})

In [93]:
retriever.invoke('What is deepmind?')

[Document(id='2ed0042e-d136-43cc-95e3-edda41b98b6e', metadata={}, page_content="Imagine you happen across a short movie script that describes a scene between a person and their AI assistant. The script has what the person asks the AI, but the AI's response has been torn off. Suppose you also have this powerful magical machine that can take any text and provide a sensible prediction of what word comes next. You could then finish the script by feeding in what you have to the machine, seeing what it would predict to start the AI's answer, and then repeating this over and over with a growing script completing the dialogue. When you interact with a chatbot, this is exactly what's happening. A large language model is a sophisticated mathematical function that predicts what word comes next for any piece of text. Instead of predicting one word with certainty, though, what it does is assign a probability to all possible next words. To build a chatbot, you lay out some text that describes an int

In [94]:
# till now we take query, pass to the retriever and perform semantic search,
# get the most relevant context and pass further.

### Step 3 - Augmentation

In [95]:
# prompt template
prompt = PromptTemplate(
    template = """
    You are a helpful assistant.
    Answer ONLY from the provided transcript context.
    If the context is insufficient, just say you don't know.

    {context}
    Question: {question}
    """,
    input_variables = ['context', 'question']
)

In [96]:
question = "is the topic of LLM in this video? If yes, then what was that?"
retrieved_docs = retriever.invoke(question)

In [97]:
retrieved_docs

[Document(id='502c6bb7-f2c7-4bfa-b6f4-50380d72daae', metadata={}, page_content='the other steps in a transformer. Also, on my second channel I just posted a talk I gave a couple months ago about this topic for the company TNG in Munich. Sometimes I actually prefer the content I make as a casual talk rather than a produced video, but I leave it up to you which one of these feels like the better follow-on.'),
 Document(id='5fa42f39-1efd-4f22-91bb-b28bb323af09', metadata={}, page_content="type of operation known as a feed-forward neural network, and this gives the model extra capacity to store more patterns about language learned during training. All of this data repeatedly flows through many different iterations of these two fundamental operations, and as it does so, the hope is that each list of numbers is enriched to encode whatever information might be needed to make an accurate prediction of what word follows in the passage. At the end, one final function is performed on the last vec

In [98]:
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

In [99]:
context_text

"the other steps in a transformer. Also, on my second channel I just posted a talk I gave a couple months ago about this topic for the company TNG in Munich. Sometimes I actually prefer the content I make as a casual talk rather than a produced video, but I leave it up to you which one of these feels like the better follow-on.\n\ntype of operation known as a feed-forward neural network, and this gives the model extra capacity to store more patterns about language learned during training. All of this data repeatedly flows through many different iterations of these two fundamental operations, and as it does so, the hope is that each list of numbers is enriched to encode whatever information might be needed to make an accurate prediction of what word follows in the passage. At the end, one final function is performed on the last vector in this sequence, which now has had a chance to be influenced by all the other context from the input text, as well as everything the model learned during 

In [100]:
final_prompt = prompt.invoke({"context":context_text, "question":question})

In [101]:
final_prompt

StringPromptValue(text="\n    You are a helpful assistant.\n    Answer ONLY from the provided transcript context.\n    If the context is insufficient, just say you don't know.\n\n    the other steps in a transformer. Also, on my second channel I just posted a talk I gave a couple months ago about this topic for the company TNG in Munich. Sometimes I actually prefer the content I make as a casual talk rather than a produced video, but I leave it up to you which one of these feels like the better follow-on.\n\ntype of operation known as a feed-forward neural network, and this gives the model extra capacity to store more patterns about language learned during training. All of this data repeatedly flows through many different iterations of these two fundamental operations, and as it does so, the hope is that each list of numbers is enriched to encode whatever information might be needed to make an accurate prediction of what word follows in the passage. At the end, one final function is pe

### Step 4 - Generation

In [102]:
from huggingface_hub import InferenceClient

client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta")   

# Call the model with the plain text from the StringPromptValue
response = client.text_generation(
    prompt= final_prompt.text,  # .text gives the raw string
    max_new_tokens=200
)

print(response)


    Answer: Yes, the topic of large language models (LLMs) is discussed in this video. The video explains how LLMs work by repeatedly flowing data through multiple iterations of two fundamental operations called self-attention and feed-forward neural networks. The video also highlights the emergent behavior of LLMs based on how their hundreds of billions of parameters are tuned during training, making it challenging to determine why the model makes specific predictions. The video suggests that LLMs can generate fluent, fascinating, and even useful predictions when used to autocomplete prompts. The video also recommends resources for viewers who want to learn more about LLMs and transformers, including a series on deep learning and a talk given by the author for a company called TNG in Munich.


### Building a Chain

In [103]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [104]:
def format_docs(retrieved_docs):
    context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
    return context_text

In [105]:
parallel_chain = RunnableParallel({
    'context': retriever | RunnableLambda(format_docs), # as soon as it gets a question it passes to the retriever and then pass to the format_docs and return the string
    'question': RunnablePassthrough()
})

In [106]:
#parallel_chain.invoke('Why llm needs to learn')

In [107]:
hf_client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta")   

from langchain_core.runnables import Runnable
from typing import Any, Dict

class HuggingFaceTextGenRunnable(Runnable):
    def __init__(self, client: InferenceClient):
        self.client = client

    def invoke(self, input: str, config: Dict[str, Any] = None) -> str:
        response = self.client.text_generation(
            prompt=input.text, 
            max_new_tokens=200
        )
        return response
        
client_runnable = HuggingFaceTextGenRunnable(hf_client)

In [108]:
parser = StrOutputParser()

In [109]:
main_chain = parallel_chain | prompt | client_runnable | parser


In [110]:
main_chain.invoke('Why do we need to learn llm?')

'\n    Answer: Large Language Models (LLMs) are a type of artificial intelligence (AI) technology that can understand and generate human-like text. They are trained on vast amounts of text data, allowing them to make predictions about the next word in a sentence or the meaning of a phrase. LLMs have a wide range of applications, from improving search engine results and generating personalized content to assisting with language learning and providing customer support. As the amount of text data available continues to grow, LLMs are becoming increasingly important in many industries, including healthcare, finance, and education. Therefore, learning LLMs is essential for anyone interested in staying up-to-date with the latest developments in AI and understanding its potential impact on various fields.'

In [111]:
main_chain.invoke('Can you summarize the video?')

"\n    Answer: The video explains how large language models, specifically transformers, predict the next word in a text. It highlights the two fundamental operations in a transformer, self-attention and feed-forward neural networks, and how they repeatedly flow through the model to enrich the list of numbers representing the input text. The video also mentions the emergent phenomenon of the model's behavior based on how the hundreds of billions of parameters are tuned during training. The video suggests watching a series on deep learning to visualize and motivate the details of attention and all the other steps in a transformer. The video also mentions a talk given by the speaker for the company TNG in Munich, which can be found on their second channel. The video concludes by explaining how a large language model predicts what word comes next for any piece of text, assigning a probability to all possible next words, and how this is used to build chatbots by completing"