In [3]:
from langchain_huggingface import ChatHuggingFace,HuggingFaceEndpoint

In [4]:
from dotenv import load_dotenv

In [5]:
load_dotenv()

True

In [6]:
import os

In [8]:
# Hugging Face token, read from the process environment (may be None if unset).
api_key = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

# Endpoint handle for the hosted gpt-oss-20b model, used in text-generation mode.
llm = HuggingFaceEndpoint(
    repo_id="openai/gpt-oss-20b",
    task="text-generation",
    huggingfacehub_api_token=api_key,
)

In [9]:
model = ChatHuggingFace(llm=llm)

In [10]:
import yt_dlp
import requests

In [11]:
video_url = 'https://www.youtube.com/watch?v=eMlx5fFNoYc'

def get_text_url(url, lang='en'):
    """Fetch a video's subtitle track and return it as one plain-text string.

    Uploaded subtitles are preferred; when the video has none in ``lang`` the
    auto-generated captions are used instead.

    Args:
        url: Video page URL handed to yt-dlp.
        lang: Subtitle language code to look up (default ``'en'``).

    Returns:
        The text of every subtitle segment, joined with single spaces.

    Raises:
        ValueError: If no subtitle track exists for ``lang``, or the track is
            not offered in the ``json3`` format (the only layout parsed here).
        requests.HTTPError: If the subtitle download returns an error status.
    """
    ydl_opts = {
        "skip_download": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        "subtitleslangs": [lang],
        "quiet": True,
    }

    # Only metadata is needed; nothing is written to disk.
    with yt_dlp.YoutubeDL(ydl_opts) as yts:
        info = yts.extract_info(url, download=False) or {}

    # Check each source for the requested language separately, so a video that
    # has manual subtitles in *other* languages still falls back to captions.
    tracks = None
    for source in ("subtitles", "automatic_captions"):
        tracks = (info.get(source) or {}).get(lang)
        if tracks:
            break
    if not tracks:
        raise ValueError(f"no sub found on lang {lang}")

    # Each track lists several formats; the parsing below reads the
    # events/segs/utf8 JSON layout, so pick that format explicitly instead of
    # trusting list order.
    sub_url = next(
        (track["url"] for track in tracks if track.get("ext") == "json3"),
        None,
    )
    if sub_url is None:
        raise ValueError(f"no json3 sub found on lang {lang}")

    # A timeout keeps a stalled download from hanging the notebook cell.
    res = requests.get(sub_url, timeout=30)
    res.raise_for_status()
    data = res.json()

    # Events without "segs" carry no text and are skipped.
    text_segments = [
        seg.get("utf8", "")
        for event in data.get("events", [])
        for seg in (event.get("segs") or [])
    ]
    return " ".join(text_segments)

In [12]:
text= get_text_url(video_url)



In [13]:
len(text)

27820

In [15]:
text[:500]

"In the last chapter, you and I started to step through the internal workings of a transformer. This is one of the key pieces of technology inside large language models, and a lot of other tools in the modern wave of AI. It first hit the scene in a now-famous 2017 paper called Attention is All You Need, and in this chapter you and I will dig into what this attention mechanism is, visualizing how it processes data. As a quick recap, here's the important context I want you to have in mind. The goal"

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [19]:
# Split into chunks of at most 1000 characters, with 200 characters shared
# between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [21]:
chunks = splitter.create_documents([text])

In [22]:
len(chunks)

35

In [28]:
# Show the final chunk; a negative index keeps working when the chunk count changes.
chunks[-1]

Document(metadata={}, page_content="let you do this. If you want to learn more about this stuff, I've left lots of links in the description. In particular, anything produced by Andrej Karpathy or Chris Ola tend to be pure gold. In this video, I wanted to just jump into attention in its current form, but if you're curious about more of the history for how we got here and how you might reinvent this idea for yourself, my friend Vivek just put up a couple videos giving a lot more of that motivation. Also, Britt Cruz from the channel The Art of the Problem has a really nice video about the history of large language models. Thank you.")

In [37]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

In [38]:
model_emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [39]:
vec_store = FAISS.from_documents(chunks, model_emb)

In [44]:
vec_store.index_to_docstore_id

{0: '644997cc-73e4-41bf-82de-994ee9f553dd',
 1: '23530458-c281-4409-85e1-682b1c43c52e',
 2: 'd07c461b-3f57-4e71-9724-2228af866b88',
 3: '36ac8408-9c9b-421f-9f2c-692c1f42e6b7',
 4: 'e94665d2-e7fa-4f3f-9dd8-fcbf7312ac8e',
 5: 'cd9943ed-0a0c-4fef-a4b7-5ba36b53f130',
 6: '51630656-98b6-4f3f-a2f0-4c28681ff059',
 7: 'd4608948-15e4-4507-9fc0-8d55135fb67a',
 8: 'a6a26522-f602-43e1-bb4b-7ab783754322',
 9: '0f99bd93-1a7c-4d25-8391-b426987ac15a',
 10: 'ae4268a3-1fb3-47d8-a78a-6eb20e10647a',
 11: 'd1e53817-2167-471a-bd38-7443325635f3',
 12: 'f84c60de-2fa0-4619-9ee6-5c252ab5836c',
 13: '61f0b5cd-3207-4c3c-bde5-1b2a934f02c9',
 14: '22343457-1008-476f-8ef0-18ccbbdbe081',
 15: '8baaca5f-727e-4634-9774-f69cf3bc4b00',
 16: 'a41c8ad7-42c4-4231-bc4b-9a57ac831507',
 17: 'c1c2a4e1-c9e8-4f8a-b267-9355cfbff942',
 18: '25fdd3c3-f49b-4fea-b058-4c9afa3378b2',
 19: '3ee76f6b-6957-453f-80df-17041da0d9d2',
 20: 'f433862d-2e16-4d43-9142-550af6258d6f',
 21: '0ce22b81-fa58-43eb-bbe6-a2b0a9a8891e',
 22: '92efe8ce-f4d2-

In [45]:
# Resolve the id from the store's own index map rather than pasting a UUID:
# the ids are regenerated whenever the store is rebuilt, so a hard-coded value
# only matches the run it was copied from.
last_doc_id = vec_store.index_to_docstore_id[len(chunks) - 1]
vec_store.get_by_ids([last_doc_id])

[Document(id='08c1c2bb-abca-48ff-a395-8d42e5c82d24', metadata={}, page_content="let you do this. If you want to learn more about this stuff, I've left lots of links in the description. In particular, anything produced by Andrej Karpathy or Chris Ola tend to be pure gold. In this video, I wanted to just jump into attention in its current form, but if you're curious about more of the history for how we got here and how you might reinvent this idea for yourself, my friend Vivek just put up a couple videos giving a lot more of that motivation. Also, Britt Cruz from the channel The Art of the Problem has a really nice video about the history of large language models. Thank you.")]

In [46]:
# Retriever over the vector store: similarity search, 4 results per query.
ret = vec_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

In [49]:
(ret.invoke('what is transformer?'))

[Document(id='61f0b5cd-3207-4c3c-bde5-1b2a934f02c9', metadata={}, page_content="be between 0 and 1, and for each column to add up to 1, as if they were a probability distribution. If you're coming in from the last chapter, you know what we need to do then. We compute a softmax along each one of these columns to normalize the values. In our picture, after you apply softmax to all of the columns, we'll fill in the grid with these normalized values. At this point you're safe to think about each column as giving weights according to how relevant the word on the left is to the corresponding value at the top. We call this grid an attention pattern. Now if you look at the original transformer paper, there's a really compact way that they write this all down. Here the variables q and k represent the full arrays of query and key vectors respectively, those little vectors you get by multiplying the embeddings by the query and the key matrices. This expression up in the numerator is a really comp

In [66]:
# Second endpoint handle for the same model, this time with temperature 0.2;
# this is the one the answer step and the chain use.
f_llm = HuggingFaceEndpoint(
    repo_id="openai/gpt-oss-20b",
    task="text-generation",
    huggingfacehub_api_token=api_key,
    temperature=0.2,
)

In [67]:
model2 = ChatHuggingFace(llm=f_llm)

In [54]:
from langchain_core.prompts import PromptTemplate

In [55]:
# Prompt for the answer step. It has two placeholders: {context} (the retrieved
# transcript text) and {question} (the user's query).
# NOTE(review): everything inside the triple-quoted string is sent to the model
# verbatim, including the leading indentation on each line and the comma that
# follows {context} -- confirm both are intended.
template = PromptTemplate(
    template = '''
        answer only the provided transcript context.
        If the context is insufficient, just say don't know 
        
        {context},
        Question: {question}
        ''',
    input_variables=['context','question']

    
)

In [56]:
# Question used for the manual retrieve -> prompt -> answer walkthrough below.
query = 'what is attention?'
retrieved_docs = ret.invoke(query)

In [57]:
retrieved_docs

[Document(id='0ce22b81-fa58-43eb-bbe6-a2b0a9a8891e', metadata={}, page_content="changes to the corresponding embeddings, produces a full sequence of more refined embeddings popping out of the attention block. Zooming out, this whole process is what you would describe as a single head of attention. As I've described things so far, this process is parameterized by three distinct matrices, all filled with tunable parameters, the key, the query, and the value. I want to take a moment to continue what we started in the last chapter, with the scorekeeping where we count up the total number of model parameters using the numbers from GPT-3. These key and query matrices each have 12,288 columns, matching the embedding dimension, and 128 rows, matching the dimension of that smaller key query space. This gives us an additional 1.5 million or so parameters for each one. If you look at that value matrix by contrast, the way I've described things so far would suggest that it's a square matrix that h

In [58]:
context_text = '\n\n'.join(doc.page_content for doc in retrieved_docs)

In [60]:
final_prompt = template.invoke({'context':context_text,'question':query})

In [61]:
final_prompt

StringPromptValue(text="\n        answer only the provided transcript context.\n        If the context is insufficient, just say don't know \n\n        changes to the corresponding embeddings, produces a full sequence of more refined embeddings popping out of the attention block. Zooming out, this whole process is what you would describe as a single head of attention. As I've described things so far, this process is parameterized by three distinct matrices, all filled with tunable parameters, the key, the query, and the value. I want to take a moment to continue what we started in the last chapter, with the scorekeeping where we count up the total number of model parameters using the numbers from GPT-3. These key and query matrices each have 12,288 columns, matching the embedding dimension, and 128 rows, matching the dimension of that smaller key query space. This gives us an additional 1.5 million or so parameters for each one. If you look at that value matrix by contrast, the way I'v

In [68]:
ans = model2.invoke(final_prompt)
ans

AIMessage(content='According to the transcript, **attention** is the process by which embeddings are repeatedly updated to incorporate information from all other embeddings in the sequence.  \n- Each *attention head* receives the current embeddings (vectors that encode word identity and position), and uses three learned matrices – **key**, **query**, and **value** – to compute a new, context‑refined embedding for each token.  \n- The key and query matrices project the embeddings into a smaller “key/query space” (128‑dimensional in the example), while the value matrix maps them back into the full embedding dimension (12,288‑dimensional).  \n- Multiple heads run in parallel, and the whole block of heads is repeated across many layers (e.g., 96 layers in GPT‑3), yielding a highly expressive, context‑aware representation of the input.', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 281, 'prompt_tokens': 903, 'total_tokens': 1184}, 'model_name': 'openai/gpt-o

In [69]:
ans.content

'According to the transcript, **attention** is the process by which embeddings are repeatedly updated to incorporate information from all other embeddings in the sequence.  \n- Each *attention head* receives the current embeddings (vectors that encode word identity and position), and uses three learned matrices – **key**, **query**, and **value** – to compute a new, context‑refined embedding for each token.  \n- The key and query matrices project the embeddings into a smaller “key/query space” (128‑dimensional in the example), while the value matrix maps them back into the full embedding dimension (12,288‑dimensional).  \n- Multiple heads run in parallel, and the whole block of heads is repeated across many layers (e.g., 96 layers in GPT‑3), yielding a highly expressive, context‑aware representation of the input.'

## Using chains (the easy way)

In [70]:
from langchain_core.runnables import RunnableLambda,RunnableParallel,RunnablePassthrough

In [71]:
from langchain_core.output_parsers import StrOutputParser

In [72]:
parser = StrOutputParser()

In [73]:
def merged_doc(retrieved_docs):
    """Combine the text of the retrieved documents into a single context string.

    Args:
        retrieved_docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        Every ``page_content`` value, in order, separated by one blank line.
    """
    passages = [document.page_content for document in retrieved_docs]
    separator = '\n\n'
    return separator.join(passages)

In [74]:
# Fan the incoming question out into the two prompt inputs: one branch
# retrieves documents and merges them into a context string, the other
# forwards the question unchanged.
parallel_chain = RunnableParallel(
    context=ret | RunnableLambda(merged_doc),
    question=RunnablePassthrough(),
)

In [75]:
chain = parallel_chain | template | model2 | parser

In [76]:
# Ask the same question as the manual walkthrough so the two answers are comparable.
res = chain.invoke(query)

In [79]:
res

'Attention is the mechanism inside a transformer that lets each token “look at” the other tokens in the sequence.  \nIt works by projecting every input embedding into three matrices—**key**, **query** and **value**—and computing similarity scores between queries and keys. Those scores are turned into weights that mix the value vectors, producing a new, context‑aware representation for each position. In a single attention head this is parametrized by three learnable matrices; in a full transformer the block is a collection of many such heads running in parallel. The result is that embeddings become progressively richer, capturing relationships, sentiment, tone, etc., rather than just isolated word meaning.'