In [8]:
!pip install langchain
!pip install langchain-community
!python -m pip install python-dotenv

!pip install --upgrade pytube
!pip install yt-dlp
!pip install openai-whisper



In [9]:
import os
from dotenv import load_dotenv

# Load the variables defined in /content/.env into the process environment.
load_dotenv('/content/.env')

# API key handed to the chat model when it is constructed.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# The YouTube video that gets transcribed and queried in this notebook.
YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=ISZCSikwSlI&t=51s"

In [10]:
!pip install langchain-openai
!pip install whisper
!pip install pytube



In [11]:
# Create the chat model that the chains in this notebook send their prompts to.
from langchain_openai.chat_models import ChatOpenAI

model = ChatOpenAI(
    model="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
)

In [12]:
# Quick check of the model: the call returns an AIMessage instance holding the answer.
model.invoke(input="Which player have scored the most career points in NBA?")

AIMessage(content='Kareem Abdul-Jabbar holds the record for the most career points scored in NBA history with a total of 38,387 points.', response_metadata={'token_usage': {'completion_tokens': 29, 'prompt_tokens': 18, 'total_tokens': 47}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-6eab0064-0b0f-4869-959d-5de5c5229dca-0', usage_metadata={'input_tokens': 18, 'output_tokens': 29, 'total_tokens': 47})

In [13]:
# Pipe the model into a StrOutputParser so that invoking the chain yields a
# plain string instead of an AIMessage.
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()
chain = model | parser

chain.invoke(input="Which player have scored the most career points in NBA?")

'Kareem Abdul-Jabbar is the player who has scored the most career points in NBA history, with a total of 38,387 points.'

In [14]:
# Give the model some context together with the question.
# See https://python.langchain.com/v0.1/docs/modules/model_io/prompts/quick_start/
from langchain.prompts import ChatPromptTemplate

# {context} and {question} are filled in when the prompt is formatted or invoked.
template = """
Answer the question based on the context below. If you can't
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Render the prompt with sample values to inspect the final text.
prompt.format(question="Who is Mary's sister?", context="Mary's sister is Susana")

'Human: \nAnswer the question based on the context below. If you can\'t\nanswer the question, reply "I don\'t know".\n\nContext: Mary\'s sister is Susana\n\nQuestion: Who is Mary\'s sister?\n'

In [15]:
# Chain the prompt with the model and the output parser.
# See https://dashboard.cohere.com/playground/embed to see it more clearly.
chain = prompt | model | parser

# The input keys must match the placeholders of the prompt template.
chain.invoke(
    dict(
        context="Mary's sister is Susana",
        question="Who is Mary's sister?",
    )
)

'Susana'

In [16]:
# Combining chains: different chains can be composed into more complex workflows.
# The second chain built from this prompt translates the answer of the first
# chain into another language.
translation_prompt = ChatPromptTemplate.from_template("Translate {answer} to {language}")

In [17]:
# Build the translation chain: "answer" is produced by running the first chain
# on the input, "language" is picked out of the same input, and both are fed
# to the translation prompt.
from operator import itemgetter

translation_chain = (
    {"answer": chain, "language": itemgetter("language")}
    | translation_prompt
    | model
    | parser
)

translation_chain.invoke(
    dict(
        context="Mary's sister is Susana. She doesn't have any more siblings.",
        question="How many sisters does Mary have?",
        language="Spanish",
    )
)

'María tiene una hermana.'

In [19]:
#Transcribing the video
import os
import tempfile
import whisper
from pytube import YouTube
from pytube.exceptions import RegexMatchError, VideoUnavailable, PytubeError
import yt_dlp

# Function to clean the YouTube URL (remove timestamp and other parameters)
def clean_youtube_url(url):
    """Return ``url`` without its timestamp and other extra parameters.

    When the query string carries a ``v`` parameter (the video id), the URL is
    rebuilt with ``v`` as its only query parameter, wherever ``v`` appeared in
    the original query, and any fragment is dropped. Cutting at the first
    ``&`` would lose the video id for URLs such as ``watch?t=51s&v=<id>``.

    URLs without a ``v`` parameter keep the previous behaviour: everything
    from the first ``&`` onwards is removed.

    Args:
        url: The YouTube URL as a string.

    Returns:
        The cleaned URL string.
    """
    # Local import so the function does not rely on the cell's import list.
    from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

    parts = urlsplit(url)
    video_ids = parse_qs(parts.query).get("v")
    if video_ids:
        query = urlencode({"v": video_ids[0]})
        return urlunsplit((parts.scheme, parts.netloc, parts.path, query, ""))

    # No video id in the query: cut at the first "&" (a no-op when there is none).
    return url.split("&")[0]

# Clean the YouTube URL
cleaned_url = clean_youtube_url(YOUTUBE_VIDEO)
print(f"Cleaned URL: {cleaned_url}")

# The transcription is cached on disk: when this file already exists, the
# download and the Whisper transcription below are skipped.
transcription_file_path = "/content/transcription.txt"

if not os.path.exists(transcription_file_path):
    try:
        print("Attempting to download audio using yt-dlp...")

        # yt-dlp options: fetch the best audio stream, name the file after the
        # video id and let the FFmpegExtractAudio post-processor convert it to mp3.
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': '%(id)s.%(ext)s',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
            'quiet': True
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(cleaned_url, download=True)
            # prepare_filename() carries the extension of the downloaded
            # stream, which is not always ".webm" (e.g. ".m4a" or ".opus"),
            # while the post-processor is configured to produce mp3. Replace
            # whatever extension is present instead of only ".webm".
            downloaded_name = ydl.prepare_filename(info_dict)
            audio_file = os.path.splitext(downloaded_name)[0] + '.mp3'

        print(f"Downloaded audio file path: {audio_file}")

        if not os.path.exists(audio_file):
            raise Exception("Failed to download the audio file.")

        # Load the base model
        print("Loading Whisper model...")
        whisper_model = whisper.load_model("base")

        print("Transcribing audio...")
        transcription = whisper_model.transcribe(audio_file, fp16=False)["text"].strip()

        with open(transcription_file_path, "w") as file:
            file.write(transcription)
        print("Transcription completed and saved to transcription.txt")
    except Exception as e:
        print(f"An error occurred: {e}")
        # Re-raise so the cell fails here: the following cells need the
        # transcription file and would otherwise stop later with an unrelated
        # "file not found" error that hides the real cause.
        raise
else:
    print("Transcription file already exists.")


Cleaned URL: https://www.youtube.com/watch?v=ISZCSikwSlI
Attempting to download audio using yt-dlp...
Downloaded audio file path: ISZCSikwSlI.mp3
Loading Whisper model...


100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 118MiB/s]


Transcribing audio...
Transcription completed and saved to transcription.txt


In [40]:
# Read the transcription back from the path the transcription cell uses
# (transcription_file_path). The bare relative name "transcription.txt" only
# resolves to that file when the working directory happens to be /content.
with open(transcription_file_path) as file:
    transcription = file.read()

# Preview the first 200 characters.
transcription[:200]

"This is the Peter Pan story, roughly speaking. Peter Pan is this magical boy. Pan means the God of everything, roughly speaking, right? And so it's not an accident that he has the name Pan. And he's t"

In [21]:
# If we try to invoke the chain using the whole transcription as context, the model
# may return an error because the context is too long: LLMs support limited context sizes.
# The example below is wrapped in a string literal, so it is not executed; the
# cell only displays that string.
"""
try:
    chain.invoke({
        "context": transcription,
        "question": "Is reading papers a good idea?"
    })
except Exception as e:
    print(e)

"""

'\ntry:\n    chain.invoke({\n        "context": transcription,\n        "question": "Is reading papers a good idea?"\n    })\nexcept Exception as e:\n    print(e)\n\n'

In [22]:
# Plan: split the transcription into smaller chunks, so that the model can be
# invoked with only the chunks relevant to a particular question.

# First step: load the transcription into memory as documents.
from langchain_community.document_loaders import TextLoader

loader = TextLoader(file_path="transcription.txt")
text_documents = loader.load()

text_documents

[Document(metadata={'source': 'transcription.txt'}, page_content="This is the Peter Pan story, roughly speaking. Peter Pan is this magical boy. Pan means the God of everything, roughly speaking, right? And so it's not an accident that he has the name Pan. And he's the boy that won't grow up. And he's magical. Well that's because children are magical. They can be anything. They're nothing but potential. And Peter Pan doesn't want to give that up. Why? He's got some adults around him, but the main adult is Captain Hook. Well, who the hell wants to grow up to be Captain Hook? First of all, you've got a hook. Second, you're a tyrant. And third, you're chased by the dragon of chaos with a clock in its stomach, right? The crocodile. It's already got a piece of you. Well, that's what happens when you get older. Time has already got a piece of you. And eventually, it's got a taste for you. And eventually, it's going to eat you. And so Hook is so traumatized by that that he can't help but be a 

In [23]:
# Demo split: chunks of 100 characters with an overlap of 20 characters;
# only the first five chunks are displayed.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=100)
text_splitter.split_documents(documents=text_documents)[:5]

[Document(metadata={'source': 'transcription.txt'}, page_content='This is the Peter Pan story, roughly speaking. Peter Pan is this magical boy. Pan means the God of'),
 Document(metadata={'source': 'transcription.txt'}, page_content="means the God of everything, roughly speaking, right? And so it's not an accident that he has the"),
 Document(metadata={'source': 'transcription.txt'}, page_content="that he has the name Pan. And he's the boy that won't grow up. And he's magical. Well that's"),
 Document(metadata={'source': 'transcription.txt'}, page_content="Well that's because children are magical. They can be anything. They're nothing but potential. And"),
 Document(metadata={'source': 'transcription.txt'}, page_content="but potential. And Peter Pan doesn't want to give that up. Why? He's got some adults around him,")]

In [24]:
# Split again, this time with a chunk size of 1000 (overlap still 20).
# `documents` holds the resulting chunks that are loaded into the vector stores.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=1000)
documents = text_splitter.split_documents(documents=text_documents)

In [25]:
"""
For a particular question, we need to find the relevant chunks from the transcription to send to the model.
Here is where the idea of embeddings comes into play
An embedding is a mathematical representation of the semantic meaning of a word, sentence, or document.
It's a projection of a concept in a high-dimensional space.
Embeddings have a simple characteristic: The projection of related concepts will be close to each other,
while concepts with different meanings will lie far away. --> cohere playground

To provide the model with the most relevant chunks, we can use the embeddings of the question and of the transcription chunks to compute the similarity between them.
We can then select the chunks with the highest similarity to the question and use them as the context for the model.
"""
# Generate an embedding for a sample query and inspect it.
from langchain_openai.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
embedded_query = embeddings.embed_query(text="Who is Mary's sister?")

# Show the length of the embedding and its first ten components.
print(
    f"Embedding length: {len(embedded_query)}",
    embedded_query[:10],
    sep="\n",
)

Embedding length: 1536
[-0.001359404530376196, -0.03437049686908722, -0.0114255640655756, 0.001291395165026188, -0.02616560459136963, 0.009161713533103466, -0.015621816739439964, 0.0018229621928185225, -0.011800787411630154, -0.03324482589960098]


In [26]:
# Embed two different sentences so they can be compared against the query embedding.
sentence1, sentence2 = (
    embeddings.embed_query(sentence)
    for sentence in ("Mary's sister is Susana", "Pedro's mother is a teacher")
)

In [27]:
# Compare the query with each of the two sentences. The closer two embeddings
# are, the more similar the sentences they represent.
# Cosine similarity is used as the similarity measure.
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity returns a 1x1 matrix here; take its single entry.
query_sentence1_similarity = cosine_similarity([embedded_query], [sentence1])[0, 0]
query_sentence2_similarity = cosine_similarity([embedded_query], [sentence2])[0, 0]

(query_sentence1_similarity, query_sentence2_similarity)

(0.9172681467301314, 0.7680251090423714)

In [28]:
!pip install docarray

Collecting docarray
  Downloading docarray-0.40.0-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.2/270.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting types-requests>=2.28.11.6 (from docarray)
  Downloading types_requests-2.32.0.20240712-py3-none-any.whl (15 kB)
Installing collected packages: types-requests, docarray
Successfully installed docarray-0.40.0 types-requests-2.32.0.20240712


In [29]:
# Set up a vector store: it keeps document chunks together with their
# embeddings and performs similarity searches over them at scale.
from langchain_community.vectorstores import DocArrayInMemorySearch

# Small toy corpus; each text is embedded with `embeddings`.
vectorstore1 = DocArrayInMemorySearch.from_texts(
    texts=[
        "Mary's sister is Susana",
        "John and Tommy are brothers",
        "Patricia likes white cars",
        "Pedro's mother is a teacher",
        "Lucia drives an Audi",
        "Mary has two siblings",
    ],
    embedding=embeddings,
)



In [30]:
# Query the vector store for the three entries most similar to the question,
# together with their similarity scores.
vectorstore1.similarity_search_with_score("Who is Mary's sister?", k=3)

[(Document(page_content="Mary's sister is Susana"), 0.9172681550033172),
 (Document(page_content='Mary has two siblings'), 0.9045628481161785),
 (Document(page_content='John and Tommy are brothers'), 0.8015500435454905)]

In [31]:
# Connect the vector store to the chain, so that it can supply the most
# relevant chunks to send to the model.
# The retriever runs a similarity search in the vector store and hands the most
# similar documents to the next step of the chain.
# See https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/
# A retriever can be obtained directly from the vector store created before.
retriever1 = vectorstore1.as_retriever()

retriever1.invoke(input="Who is Mary's sister?")

[Document(page_content="Mary's sister is Susana"),
 Document(page_content='Mary has two siblings'),
 Document(page_content='John and Tommy are brothers'),
 Document(page_content="Pedro's mother is a teacher")]

In [32]:
"""create a map with the two inputs by using the RunnableParallel and RunnablePassthrough classes to pass the context and question
to the prompt as a map with the keys "context" and "question."
The retriever finds the chunks that are used as the context for answering the question.
- RunnableParallel --> https://python.langchain.com/v0.1/docs/expression_language/primitives/parallel/
- RunnablePassthrough --> https://python.langchain.com/v0.1/docs/expression_language/primitives/passthrough/
"""
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# "context" is filled by the retriever; "question" is the input passed through as is.
setup = RunnableParallel(
    {
        "context": retriever1,
        "question": RunnablePassthrough(),
    }
)

setup.invoke(input="What color is Patricia's car?")

{'context': [Document(page_content='Patricia likes white cars'),
  Document(page_content='Lucia drives an Audi'),
  Document(page_content="Pedro's mother is a teacher"),
  Document(page_content="Mary's sister is Susana")],
 'question': "What color is Patricia's car?"}

In [33]:
# Put the setup map in front of the prompt, so a bare question string is
# enough as input to the chain.
chain = setup | prompt | model | parser

chain.invoke(input="What color is Patricia's car?")

'White'

In [34]:
# Invoke the chain with another example.
chain.invoke(input="What car does Lucia drive?")

'Lucia drives an Audi.'

In [35]:
# Load the transcription into a vector store: a new in-memory store is created
# from the chunks of the video transcription.
vectorstore2 = DocArrayInMemorySearch.from_documents(
    documents=documents,
    embedding=embeddings,
)

In [36]:
# New chain on top of the transcription vector store. The RunnableParallel
# part is written here as a plain mapping, an equivalent alternative syntax.
chain = (
    dict(context=vectorstore2.as_retriever(), question=RunnablePassthrough())
    | prompt
    | model
    | parser
)

chain.invoke(input="what is an infant?")

'An infant is a young child in the early stages of life, typically under the age of one year.'

In [37]:
!pip install langchain_pinecone

Collecting langchain_pinecone
  Downloading langchain_pinecone-0.1.2-py3-none-any.whl (8.5 kB)
Collecting pinecone-client<5,>=3.2.2 (from langchain_pinecone)
  Downloading pinecone_client-4.1.2-py3-none-any.whl (216 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.4/216.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client<5,>=3.2.2->langchain_pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-client, langchain_pinecone
Successfully installed langchain_pinecone-0.1.2 pinecone-client-4.1.2 pinecone-plugin-interface-0.0.7


In [38]:
"""
So far we've used an in-memory vector store
Here we set up Pinecone as a vector store that can handle large amounts of data and perform similarity searches at scale.
Prerequisites: create a Pinecone account, set up an index, get an API key, and set it as the environment variable PINECONE_API_KEY (in .env).
"""
# Load the transcription chunks into the Pinecone index named below.
# NOTE(review): this cell uploads `documents` every time it runs; presumably
# re-running it adds the same chunks to the index again — confirm.
from langchain_pinecone import PineconeVectorStore

index_name = "rag-on-video-index"

pinecone = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=embeddings,
    index_name=index_name,
)

In [39]:
# Test the similarity search on Pinecone; keep the first three hits.
pinecone.similarity_search(query="What is the problem with being a child?")[:3]

[Document(metadata={'source': 'transcription.txt'}, page_content="22. You're an old infant, right? And that's an ugly thing, an old infant. So part of the reason you choose your damn sacrifice because the sacrifice is inevitable. But at least you get to choose it. And then there's something that's even more complex than that. In some sense, is that the problem with being a child is that all you are is potential. And it's really low resolution. You could be anything, but you're not anything. So then you go and you adopt an apprenticeship, roughly speaking. And then you become at least you become something. And when you're something, that makes the world open up to you again. Like if you're a really good plumber, then you end up in far more than a plumber, right? You end up being a good employer. If you're a really good plumber, well, then you have some employees. You run a business. You train some other people. You enlarge their lives. You're kind of a pillar of the community. You have 

In [41]:
# Final chain: same prompt, model and parser as before, with Pinecone as the
# vector store behind the retriever.
chain = (
    dict(context=pinecone.as_retriever(), question=RunnablePassthrough())
    | prompt
    | model
    | parser
)

chain.invoke(input="What is the problem with being a child?")

"The problem with being a child is that all you are is potential, and it's low resolution. You could be anything, but you're not anything until you adopt an apprenticeship and become something."