In [112]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

# Indexing

In [113]:
# extract the transcript
V_id="zIGQ9p6mxUg"
try :
  trans_list=YouTubeTranscriptApi.get_transcript(V_id,languages=["en"])
  transcript=[i["text"] for i in trans_list]
  transcript_f=" ".join(transcript)
  print(transcript_f)

except TranscriptsDisabled:
    print("no caption availiable")




In my last video, we talked about why podcasts and transcripts are such a powerful combination. In this video, we are going to talk about how. We'll answer questions like, should you listen first or should you read first? Should you listen and read at the same time? How many times should you do every activity? and what is the most effective way to actually make this a part of your English study routine? So, if you've ever asked yourself any of those questions, don't worry. In this video, I will give you a stepbystep guide on how to use podcasts and transcripts to improve your English fluency, and I'll talk you through a 90-minute study routine that is super helpful for English learning. Now, the first step is, of course, to choose a podcast. As I've said many times before, you need to choose one that talks about something that's actually interesting and relevant to you. Look for podcasts that have clear speech, are about a topic that you enjoy, and are at a level that is appropriate fo

In [114]:
# divided to sub document
splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
'''
every document will have 1000 character > max
next document will have the last 200 character from the previous document >> to keep track the context
'''
sub=splitter.create_documents([transcript_f])

len(sub)

sub[0]

Document(metadata={}, page_content="In my last video, we talked about why podcasts and transcripts are such a powerful combination. In this video, we are going to talk about how. We'll answer questions like, should you listen first or should you read first? Should you listen and read at the same time? How many times should you do every activity? and what is the most effective way to actually make this a part of your English study routine? So, if you've ever asked yourself any of those questions, don't worry. In this video, I will give you a stepbystep guide on how to use podcasts and transcripts to improve your English fluency, and I'll talk you through a 90-minute study routine that is super helpful for English learning. Now, the first step is, of course, to choose a podcast. As I've said many times before, you need to choose one that talks about something that's actually interesting and relevant to you. Look for podcasts that have clear speech, are about a topic that you enjoy, and a

In [115]:
from langchain.embeddings import HuggingFaceEmbeddings
# converted to numerical representation using embedding

embd = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

#text-embedding-3-small >> suatiable for semantic search
vector=FAISS.from_documents(sub,embd)
# convert it to vector assign an index for each vector 

vector

<langchain_community.vectorstores.faiss.FAISS at 0x21c7541bda0>

In [116]:
'''
docstore > فيه عنوان النص نفسه

index>      
هو رقم الـ 
vector 
داخل 
FAISS index 
ده موقعه داخل الفهرس اللي بيتخزن فيه الـ 
embeddings
 
'''
vector.index_to_docstore_id


{0: '9e98c1b5-6f33-4824-a531-63ef3043e885',
 1: '0600d6f9-f7a4-4693-adb1-0d8b2b22a359',
 2: '610d68f9-f810-4084-b92b-f765172b2ff0',
 3: 'c8cfc2ce-9065-4567-b196-72f21b7cf7c9',
 4: 'a421e55f-1151-4ecf-a632-341c668eb4a5',
 5: 'cc49ae14-40d4-4add-a535-80192045bb49',
 6: 'bf6e716d-558c-4ebd-a69b-b3ff831d9e7e',
 7: 'f4b8df01-1874-4bf1-9642-292787e06e9d',
 8: '17366ec6-3b85-4d0d-862d-40bbc6b735b5',
 9: '741dc8a7-73bf-4b94-b18a-6de83263202a',
 10: 'e121e6c3-9f48-4f98-86da-91a6e47680ae',
 11: '4684182c-2117-43a1-b6e7-086c228678a9',
 12: '6b4fcaaa-5d9c-4ce9-a633-d3bc5edb8723',
 13: 'e4e7860a-22d5-4d7c-bbc5-7c5bf07e2a2c',
 14: 'f41e6911-9711-4006-b103-aa9eb2a3dea9'}

In [117]:
vector.get_by_ids(['c6c605da-f29f-4978-9902-b6c5ba74e75f'])

[]

# Retrieval 

In [137]:
# Query > retriver > semmantic search > vector> most relevent document

ret=vector.as_retriever(search_type="similarity",search_kwargs={"k":4})
ret

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000021C7541BDA0>, search_kwargs={'k': 4})

In [119]:
ret.invoke("listen first or read first ?")
# need to join it to put in relevent context

[Document(id='c8cfc2ce-9065-4567-b196-72f21b7cf7c9', metadata={}, page_content="the transcript for the first listen. So just press play, close your eyes so there are no distractions and just listen. Try to listen for overall meaning and try to understand what is being said. It doesn't matter if you don't understand everything. That is completely to be expected. That is real life. The objective of this first bit is to train your ears and your brain to understand real life English. So, first just listen, forget about the transcript. The second step is to go back and listen again, this time with the transcript next to you. And this is where the magic really happens. As you're listening and you're reading the transcript at the same time, you'll find that you'll notice, you'll pick up all the little details that you might have missed the first time round. So the words that you didn't catch, the expressions that you didn't know, how words are written, how the English that you have heard appe

# Augmentation

In [120]:
!pip install -U langchain langchain-community




In [122]:
Q="listen or read first ?"
related_answer=ret.invoke(Q)
related_answer

[Document(id='c8cfc2ce-9065-4567-b196-72f21b7cf7c9', metadata={}, page_content="the transcript for the first listen. So just press play, close your eyes so there are no distractions and just listen. Try to listen for overall meaning and try to understand what is being said. It doesn't matter if you don't understand everything. That is completely to be expected. That is real life. The objective of this first bit is to train your ears and your brain to understand real life English. So, first just listen, forget about the transcript. The second step is to go back and listen again, this time with the transcript next to you. And this is where the magic really happens. As you're listening and you're reading the transcript at the same time, you'll find that you'll notice, you'll pick up all the little details that you might have missed the first time round. So the words that you didn't catch, the expressions that you didn't know, how words are written, how the English that you have heard appe

In [123]:
full_context="\n".join(i.page_content for i in related_answer)
print(full_context)
    

the transcript for the first listen. So just press play, close your eyes so there are no distractions and just listen. Try to listen for overall meaning and try to understand what is being said. It doesn't matter if you don't understand everything. That is completely to be expected. That is real life. The objective of this first bit is to train your ears and your brain to understand real life English. So, first just listen, forget about the transcript. The second step is to go back and listen again, this time with the transcript next to you. And this is where the magic really happens. As you're listening and you're reading the transcript at the same time, you'll find that you'll notice, you'll pick up all the little details that you might have missed the first time round. So the words that you didn't catch, the expressions that you didn't know, how words are written, how the English that you have heard appears on the page, how it is written down. So here you should be using the
that yo

# Generation

In [None]:
from huggingface_hub import InferenceClient

client = InferenceClient(
    model="HuggingFaceH4/zephyr-7b-beta",
    token="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"
)

# بنبعت البرومبت على شكل محادثة
messages = [
    {"role": "system", "content": "You are a helpful assistant.Answer ONLY from the provided transcript context.If the context is insufficient, just say you don't know."},
    {"role": "user", "content": f"question: {Q}\n\ncontext:\n{full_context}:"}
]

response = client.chat_completion(messages=messages, max_tokens=1000)
print(response)


ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='stop', index=0, message=ChatCompletionOutputMessage(role='assistant', content='The context suggests to listen first as the initial step without the transcript to train your ears and brain to understand real life English, and then read along with the transcript to further comprehend the podcast content with the objective to notice the unfamiliar words and expressions, check them in the transcript, and collect them for vocabulary learning. The video will later discuss how to make the most of podcasts and transcripts for English fluency with suggestions for choosing relevant and appropriate podcasts, simultaneously listening and reading, and a 90-minute study routine. Therefore, the answer is to listen first.', tool_call_id=None, tool_calls=None), logprobs=None)], created=1751508659341, id='BperqW', model='HuggingFaceH4/zephyr-7b-beta', system_fingerprint='', usage=ChatCompletionOutputUsage(completion_tokens=111, pr

In [None]:
    token="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"


# Building a chain

In [125]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser


In [126]:
def formating_docs(related_answer):
  context_text = "\n\n".join(doc.page_content for doc in related_answer)
  return context_text

In [127]:
parallel_chain = RunnableParallel({
    'context': ret | RunnableLambda(formating_docs),
    'question': RunnablePassthrough()
})


In [128]:
parallel_chain.invoke('listen or read first ?')

{'context': "the transcript for the first listen. So just press play, close your eyes so there are no distractions and just listen. Try to listen for overall meaning and try to understand what is being said. It doesn't matter if you don't understand everything. That is completely to be expected. That is real life. The objective of this first bit is to train your ears and your brain to understand real life English. So, first just listen, forget about the transcript. The second step is to go back and listen again, this time with the transcript next to you. And this is where the magic really happens. As you're listening and you're reading the transcript at the same time, you'll find that you'll notice, you'll pick up all the little details that you might have missed the first time round. So the words that you didn't catch, the expressions that you didn't know, how words are written, how the English that you have heard appears on the page, how it is written down. So here you should be usin

In [None]:

def build_messages(inputs):
    return [
        {"role": "system", "content": "You are a helpful assistant. Answer ONLY from the provided context. If the context is insufficient, say 'I don't know'."},
        {"role": "user", "content": f"question: {inputs['question']}\n\ncontext:\n{inputs['context']}"}
    ]


In [139]:
zephyr_chat_client = RunnableLambda(
    lambda messages: client.chat_completion(messages=messages, max_tokens=512)
)


main_chain = (
    parallel_chain
    | RunnableLambda(build_messages)
    | zephyr_chat_client
    | RunnableLambda(lambda x: x['content'] if isinstance(x, dict) and 'content' in x else x)
)


query = "Can you summarize the video"
answer = main_chain.invoke(query)
print(answer)



ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='stop', index=0, message=ChatCompletionOutputMessage(role='assistant', content="In summary, the final activity suggested is to either summarize the podcast in 60 seconds through speaking or writing, with constraints to challenge yourself to be concise and avoid pauses. Speaking allows for the identification of common mistakes and vocabulary practice, while writing provides deeper engagement with the language. Shadowing involves repeating a sentence from the podcast to imitate the speaker's pronunciation, stress, and intonation. This technique may be challenging for beginners. By listening, pausing, and then repeating exactly as they spoke it. Overall, these exercises help in interacting more intimately with the language after listening to the episode once or twice and using transcripts for assistance in summarizing or flashcards.", tool_call_id=None, tool_calls=None), logprobs=None)], created=1751508690814, id='zk