In [64]:
!pip install langchain
!pip install langchain-community
!python -m pip install python-dotenv

!pip install --upgrade pytube
!pip install yt-dlp
!pip install openai-whisper
!pip install faiss-cpu
!pip install openai




In [65]:
import langchain
import langchain_community
from dotenv import load_dotenv
import pytube
import yt_dlp
import whisper
import tempfile
from pytube import YouTube
from pytube.exceptions import RegexMatchError, VideoUnavailable, PytubeError
from langchain.embeddings import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
from langchain.vectorstores import FAISS
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, Runnable
from langchain.chat_models import ChatOpenAI
from langchain.schema import AIMessage
from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
import faiss

import os
import openai

In [66]:
# Check if NVIDIA GPU is enabled
#!nvidia-smi

In [67]:
class SimpleOutputParser(Runnable):
    """Runnable that turns a chat-model reply into a stripped plain string.

    Intended as the last step of a chain such as ``prompt | model | parser``.

    Composition with ``|`` is deliberately NOT overridden here. The inherited
    ``Runnable.__or__`` already builds a proper sequence from ``self`` and the
    right-hand operand. The previous override returned ``Runnable(combined)``,
    which raises ``TypeError`` because ``Runnable`` is an abstract base class
    and cannot be instantiated.
    """

    def invoke(self, input, config=None, **kwargs):
        """Return the text of ``input`` with surrounding whitespace removed.

        Args:
            input: An ``AIMessage`` (its ``content`` is used) or any object
                exposing ``strip()``, typically a ``str``.
            config: Accepted for compatibility with the ``Runnable``
                interface; not used by this parser.
            **kwargs: Accepted for compatibility with the ``Runnable``
                interface; not used by this parser.

        Returns:
            The stripped text.
        """
        text = input.content if isinstance(input, AIMessage) else input
        return text.strip()


In [68]:
# Read environment variables from the .env file stored under /content, then
# pick up the OpenAI key from the process environment (None if it is not set).
load_dotenv('/content/.env')
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [69]:
# Chat model shared by every chain in this notebook.
model = ChatOpenAI(
    model="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
)

In [70]:
# Source video for the transcription. The URL still carries its "&t=51s"
# timestamp parameter; it is cleaned before the download step.
YOUTUBE_VIDEO = (
    "https://www.youtube.com/watch?v=ISZCSikwSlI&t=51s"
)

In [71]:
# Sanity check: query the bare chat model (no prompt template, no parser) and
# print the message object it returns.
response = model.invoke(
    "Which player has scored the most career points in NBA?"
)
print(response)


content='Kareem Abdul-Jabbar is the NBA player with the most career points, scoring a total of 38,387 points during his career.' response_metadata={'token_usage': {'completion_tokens': 30, 'prompt_tokens': 18, 'total_tokens': 48}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-291659f9-1a64-4e51-9a5f-3c89fbb0ebe2-0'


In [72]:
# Pipe the model into the parser so the chain returns plain text instead of a
# message object.
parser = SimpleOutputParser()
chain = model | parser

response = chain.invoke(
    "Which player has scored the most career points in NBA?"
)
print(response)


Kareem Abdul-Jabbar holds the record for the most career points scored in NBA history with 38,387 points.


In [73]:
# Prompt with two placeholders, {context} and {question}. The model is told to
# answer from the supplied context only and to reply "I don't know" otherwise.
template = """
Answer the question based on the context below. If you can't
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

# Wrap the raw text in a chat prompt template so it can be piped into a chain.
prompt = ChatPromptTemplate.from_template(template)


In [74]:
# prompt -> model -> parser, answered from a hand-written context.
chain = prompt | model | parser

response = chain.invoke(
    dict(
        context="Mary's sister is Susana",
        question="Who is Mary's sister?",
    )
)
print(response)


Susana


In [75]:
from operator import itemgetter

# Second prompt: translate an already-produced answer into a target language.
translation_prompt = ChatPromptTemplate.from_template("Translate {answer} to {language}")

# The first step is a mapping: "answer" is produced by running the existing
# question-answering chain on the input, while "language" is copied straight
# from the input dict. The result feeds the translation prompt.
translation_chain = (
    {"answer": chain, "language": itemgetter("language")}
    | translation_prompt
    | model
    | parser
)

response = translation_chain.invoke(
    dict(
        context="Mary's sister is Susana. She doesn't have any more siblings.",
        question="How many sisters does Mary have?",
        language="Spanish",
    )
)
print(response)


María tiene una hermana.


In [76]:
# Function to clean the YouTube URL (remove timestamp and other parameters)
def clean_youtube_url(url):
    """Return ``url`` reduced to the part that identifies the video.

    When the query string contains a ``v`` parameter, the URL is rebuilt with
    ``v`` as its only query parameter, regardless of where ``v`` appears
    (``watch?t=51s&v=ID`` works as well as ``watch?v=ID&t=51s``). Any fragment
    is dropped.

    When there is no ``v`` parameter (e.g. ``youtu.be/ID`` short links), the
    URL is cut at the first ``&`` if there is one and otherwise returned
    unchanged.

    Args:
        url: The video URL as a string.

    Returns:
        The cleaned URL string.
    """
    # Imported locally so the helper is self-contained and does not rely on
    # the notebook's import cell.
    from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

    parts = urlsplit(url)
    video_ids = parse_qs(parts.query).get("v")
    if video_ids:
        # Keep only the video id; timestamps, playlist ids etc. are discarded.
        query = urlencode({"v": video_ids[0]})
        return urlunsplit((parts.scheme, parts.netloc, parts.path, query, ""))

    # No `v` parameter to anchor on: cut at the first "&" (a no-op when the
    # URL contains none).
    return url.split("&")[0]

# Clean the YouTube URL
cleaned_url = clean_youtube_url(YOUTUBE_VIDEO)
print(f"Cleaned URL: {cleaned_url}")

# The transcription is cached on disk: download + Whisper only run when the
# file is missing, so re-running the notebook is cheap.
transcription_file_path = "transcription.txt"

if not os.path.exists(transcription_file_path):
    try:
        print("Attempting to download audio using yt-dlp...")

        # yt-dlp options: fetch the best audio stream, name the file after the
        # video id, and have the FFmpegExtractAudio postprocessor convert it
        # to a 192 kbps mp3.
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': '%(id)s.%(ext)s',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
            'quiet': True
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(cleaned_url, download=True)
            # prepare_filename() gives the name of the stream as downloaded
            # (.webm, .m4a, .opus, ...), while the postprocessor leaves an
            # .mp3 behind. Swap the extension whatever it was, instead of
            # assuming the download was .webm.
            audio_file = os.path.splitext(ydl.prepare_filename(info_dict))[0] + '.mp3'

        print(f"Downloaded audio file path: {audio_file}")

        if not os.path.exists(audio_file):
            raise Exception("Failed to download the audio file.")

        # Load the base model
        print("Loading Whisper model...")
        whisper_model = whisper.load_model("base")

        print("Transcribing audio...")
        # fp16=False is passed through to Whisper exactly as before.
        transcription = whisper_model.transcribe(audio_file, fp16=False)["text"].strip()

        with open(transcription_file_path, "w") as file:
            file.write(transcription)
        print("Transcription completed and saved to transcription.txt")
    except Exception as e:
        print(f"An error occurred: {e}")
        # Re-raise: later cells read transcription.txt, so swallowing the
        # error here would only resurface as an unrelated FileNotFoundError.
        raise
else:
    print("Transcription file already exists.")


Cleaned URL: https://www.youtube.com/watch?v=ISZCSikwSlI
Transcription file already exists.


In [77]:
# Load the cached transcription back from disk.
with open("transcription.txt") as file:
    transcription = file.read()

# Show only the first 200 characters as a preview.
preview_end = 200
print(transcription[:preview_end])


This is the Peter Pan story, roughly speaking. Peter Pan is this magical boy. Pan means the God of everything, roughly speaking, right? And so it's not an accident that he has the name Pan. And he's t


In [78]:
# Splitter configuration: chunks of up to 1000 characters, 20 characters of
# overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=1000,
)

# Load the transcription file as documents, then cut it into chunks.
loader = TextLoader("transcription.txt")
text_documents = loader.load()
documents = text_splitter.split_documents(text_documents)


In [79]:
# Embedding model used for every vector store below.
embeddings = OpenAIEmbeddings()

# Embed a sample question and inspect the vector: its length and first values.
embedded_query = embeddings.embed_query("Who is Mary's sister?")
print("Embedding length: {}".format(len(embedded_query)))
print(embedded_query[:10])


Embedding length: 1536
[-0.0013594045786472944, -0.03437049808954927, -0.01142556447128598, 0.0012913952108823416, -0.02616560552048414, 0.009161713858426773, -0.015621817294155089, 0.001822962257550091, -0.01180078783066434, -0.03324482708009158]


In [80]:
# Embed two candidate sentences and compare each against the query embedding
# with cosine similarity.
sentence1 = embeddings.embed_query("Mary's sister is Susana")
sentence2 = embeddings.embed_query("Pedro's mother is a teacher")

# cosine_similarity returns a 1x1 matrix for one row against one row; take
# its single entry.
query_sentence1_similarity = cosine_similarity([embedded_query], [sentence1])[0, 0]
query_sentence2_similarity = cosine_similarity([embedded_query], [sentence2])[0, 0]

print(query_sentence1_similarity, query_sentence2_similarity)


0.9172681467301319 0.768025109042372


In [81]:
# Small FAISS store built from six hand-written facts, to illustrate retrieval
# before moving on to the real transcription.
vectorstore = FAISS.from_texts(
    texts=[
        "Mary's sister is Susana",
        "John and Tommy are brothers",
        "Patricia likes white cars",
        "Pedro's mother is a teacher",
        "Lucia drives an Audi",
        "Mary has two siblings",
    ],
    embedding=embeddings,
)

# Top-3 matches with their scores, displayed as the cell's output.
vectorstore.similarity_search_with_score("Who is Mary's sister?", k=3)


[(Document(page_content="Mary's sister is Susana"), 0.16546372),
 (Document(page_content='Mary has two siblings'), 0.19087431),
 (Document(page_content='John and Tommy are brothers'), 0.39689997)]

In [82]:
# Expose the toy store as a retriever so it can be plugged into a chain, and
# check what it returns for a sample question.
retriever = vectorstore.as_retriever()

response = retriever.invoke("Who is Mary's sister?")
print(response)


[Document(page_content="Mary's sister is Susana"), Document(page_content='Mary has two siblings'), Document(page_content='John and Tommy are brothers'), Document(page_content="Pedro's mother is a teacher")]


In [83]:
# Build the prompt inputs from a single question: "context" comes from the
# retriever, "question" is the input passed through unchanged.
setup = RunnableParallel(context=retriever, question=RunnablePassthrough())
response = setup.invoke("What color is Patricia's car?")
print(response)

# Full retrieval chain; each question is answered and printed in turn.
chain = setup | prompt | model | parser
for response in map(
    chain.invoke,
    ("What color is Patricia's car?", "What car does Lucia drive?"),
):
    print(response)


{'context': [Document(page_content='Patricia likes white cars'), Document(page_content='Lucia drives an Audi'), Document(page_content="Pedro's mother is a teacher"), Document(page_content="Mary's sister is Susana")], 'question': "What color is Patricia's car?"}
White
Lucia drives an Audi.


In [84]:
# Index the transcription chunks in a second FAISS store.
vectorstore2 = FAISS.from_documents(documents, embeddings)

# Same retrieval chain as before, now backed by the transcription store.
chain = (
    RunnableParallel(
        context=vectorstore2.as_retriever(),
        question=RunnablePassthrough(),
    )
    | prompt
    | model
    | parser
)

response = chain.invoke("What is an infant?")
print(response)


An infant is a very young child, typically under the age of one year.


In [85]:
# `vectorstore2` was already built from the transcription chunks with
# FAISS.from_documents(documents, embeddings) in the previous cell. It is
# reused here rather than rebuilt: rebuilding would send every chunk to the
# embeddings API a second time and produce an equivalent index.

# Query the FAISS index directly (no LLM involved) and print the 5 chunks
# closest to the question.
results = vectorstore2.similarity_search("What is the problem with being a child?", k=5)
print(results)



[Document(metadata={'source': 'transcription.txt'}, page_content="22. You're an old infant, right? And that's an ugly thing, an old infant. So part of the reason you choose your damn sacrifice because the sacrifice is inevitable. But at least you get to choose it. And then there's something that's even more complex than that. In some sense, is that the problem with being a child is that all you are is potential. And it's really low resolution. You could be anything, but you're not anything. So then you go and you adopt an apprenticeship, roughly speaking. And then you become at least you become something. And when you're something, that makes the world open up to you again. Like if you're a really good plumber, then you end up in far more than a plumber, right? You end up being a good employer. If you're a really good plumber, well, then you have some employees. You run a business. You train some other people. You enlarge their lives. You're kind of a pillar of the community. You have 

In [86]:
# Retrieval chain over the transcription: the FAISS retriever supplies the
# context, the question is passed through as-is, and the parser returns text.
chain = (
    dict(
        context=vectorstore2.as_retriever(),
        question=RunnablePassthrough(),
    )
    | prompt
    | model
    | parser
)

response = chain.invoke(
    "What is the problem with being a child?"
)
print(response)

The problem with being a child is that all you are is potential, which is low resolution. You could be anything, but you're not anything until you adopt an apprenticeship and become something.
