In [36]:
# Print a fixed greeting.
# NOTE(review): this cell carries execution count 36 while the next one is 1,
# so it was presumably run out of order as a scratch check — consider removing.
print("hello")

hello


In [1]:
import os  
from dotenv import load_dotenv

# Notebook configuration.
# load_dotenv() (python-dotenv) pulls variables from a .env file into the
# process environment; the OpenAI API key is then read from that environment.
# The two constants below hold the URLs of the YouTube videos used here.
load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=1oj3uLVTH4o"
YOUTUBE_VIDEO1 = "https://youtu.be/46gAXft4TMA?si=JoMbLtJELdDqCjVq"


#### Setting up the model



In [5]:
from langchain_openai.chat_models import ChatOpenAI

# Chat model shared by the rest of the notebook: a ChatOpenAI client bound to
# the "gpt-3.5-turbo" model and authenticated with OPENAI_API_KEY.
model = ChatOpenAI(
    model="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
)


In [6]:
# Sanity check: send the model a plain-text question and display the message
# object it returns.
model.invoke(
    "What MLB team won the World Series during the COVID-19 pandemic?"
)

AIMessage(content='The Los Angeles Dodgers won the 2020 World Series during the COVID-19 pandemic.', response_metadata={'finish_reason': 'stop', 'logprobs': None})

In [7]:
from langchain_core.output_parsers import StrOutputParser

# Pipe the chat model into a StrOutputParser so that invoking the chain yields
# the answer as a plain string instead of a message object, then ask the same
# World Series question through the chain.
parser = StrOutputParser()
chain = model | parser

chain.invoke(
    "What MLB team won the World Series during the COVID-19 pandemic?"
)


'The Los Angeles Dodgers won the World Series during the COVID-19 pandemic in 2020. They defeated the Tampa Bay Rays in six games to win their first championship since 1988.'

### Introducing prompt templates
We want to provide the model with some context and the question. Prompt templates are a simple way to define and reuse prompts.


In [8]:
from langchain.prompts import ChatPromptTemplate

# Prompt text with two placeholders, {context} and {question}. It is spelled
# out line by line so the significant whitespace (including the trailing space
# after "can't") is visible and cannot be stripped by an editor.
template = (
    "\n"
    "Answer the question based on the context below. If you can't \n"
    'answer the question, reply "I don\'t know".\n'
    "\n"
    "Context: {context}\n"
    "\n"
    "Question: {question}\n"
    "\n"
)

# Build a reusable chat prompt from the template and preview how it renders
# for a sample context/question pair.
prompt = ChatPromptTemplate.from_template(template)
prompt.format(
    context="Mary's sister is Susana",
    question="Who is Mary's sister?",
)

'Human: \nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I don\'t know".\n\nContext: Mary\'s sister is Susana\n\nQuestion: Who is Mary\'s sister?\n\n'

In [9]:
# Question-answering pipeline: fill the prompt, call the model, parse the reply
# to a string. Note that this rebinds `chain`.
chain = prompt | model | parser

chain.invoke(
    dict(
        context="Mary's sister is Susana",
        question="Who is Mary's sister?",
    )
)

'Susana'

### Combining chains
We can combine different chains to create more complex workflows. For example, let's create a second chain that translates the answer from the first chain into a different language.

In [10]:
# Prompt for the second chain: asks for {answer} to be translated to {language}.
translation_prompt = ChatPromptTemplate.from_template("Translate {answer} to {language}")

In [11]:
from operator import itemgetter

# Compose two chains. The leading dict maps the incoming input to the
# translation prompt's variables: "answer" is produced by running the current
# `chain` on the input, "language" is read from the input's "language" key.
translation_chain = (
    {"answer": chain, "language": itemgetter("language")}
    | translation_prompt
    | model
    | parser
)

# Run the combined pipeline on a sample context, question and target language.
translation_chain.invoke(
    dict(
        context="Mary's sister is Susana. She doesn't have any more siblings.",
        question="How many sisters does Mary have?",
        language="English",
    )
)

'Mary has one sister, Susana.'

### Transcribing the YouTube Video
The context we want to send the model comes from a YouTube video. Let's download the video and transcribe it using OpenAI's Whisper.

In [12]:
import tempfile
import whisper
from pytube import YouTube

In [13]:
import os
import tempfile
import whisper
from pytube import YouTube

# Transcribe the audio of YOUTUBE_VIDEO with Whisper and cache the text in
# "transcription.txt".
#
# Steps (only when the cache file does not exist yet):
#   1. pick the first audio-only stream of the video via pytube,
#   2. download it into a temporary directory,
#   3. transcribe it with Whisper's "base" model,
#   4. write the stripped transcript to "transcription.txt".
#
# Raises:
#   RuntimeError: if the video exposes no audio-only stream.
if not os.path.exists("transcription.txt"):
    youtube = YouTube(YOUTUBE_VIDEO)
    audio = youtube.streams.filter(only_audio=True).first()
    # `.first()` gives None when nothing matches the filter; fail with a clear
    # message instead of an AttributeError on `audio.download` further down.
    if audio is None:
        raise RuntimeError(f"No audio-only stream found for {YOUTUBE_VIDEO}")

    # Let's load the base model. This is not the most accurate
    # model but it's fast.
    whisper_model = whisper.load_model("base")

    # The downloaded audio is only needed while transcribing, so it lives in a
    # temporary directory that is removed when the block exits.
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path = audio.download(output_path=tmpdir)
        # NOTE(review): fp16=False presumably forces full precision so the
        # model runs on CPU without the FP16 warning — confirm.
        transcription = whisper_model.transcribe(audio_path, fp16=False)["text"].strip()

    # Distinct names for the audio path and the output handle (the original
    # reused `file` for both).
    with open("transcription.txt", "w") as transcript_file:
        transcript_file.write(transcription)


In [14]:
# Load the cached transcript from "transcription.txt" into `transcription`
# and preview its first 100 characters as the cell output.
with open("transcription.txt") as transcript_file:
    transcription = transcript_file.read()

transcription[:100]


'Hello, my name is Krish Nayak and welcome to my YouTube channel. So guys from the past one and a hal'

In [15]:
# Ask a question using the *entire* transcript as the context.
#
# NOTE(review): presumably this is here to show what happens when the context
# is too large for the model — confirm. Any failure raised by the chain is
# caught and reported so the notebook keeps running; on success the answer is
# printed (previously the result was discarded, so a successful run showed
# nothing at all).
try:
    answer = chain.invoke({
        "context": transcription,
        "question": "Is reading papers a good idea?"
    })
except Exception as e:
    # Include the exception type: some exceptions carry an empty message.
    print(f"{type(e).__name__}: {e}")
else:
    print(answer)


In [16]:
from langchain_community.document_loaders import TextLoader

# Read "transcription.txt" through LangChain's TextLoader. `load()` returns a
# list of Document objects, kept in `text_documents` and shown as the cell
# output.
loader = TextLoader("transcription.txt")
text_documents = loader.load()

text_documents


[Document(page_content="Hello, my name is Krish Nayak and welcome to my YouTube channel. So guys from the past one and a half years, I've seen a lot of development specifically happening in the field of AI. If we consider Gen.D.V.A.I, if we consider the LLM models, if we consider LLM models, if we consider AI tools that are coming up in the market. And recently, I hope everybody has seen about Devon, which is the first AI software engineer. And obviously, applications like chat, GPT and the other competitors who are building amazing AI tools for images, for text and many more. Now, currently, you can actually see that in our life, I think everybody right now is probably using AI. So that is the reason why I've kept this title of this particular video that it's time to adopt AI in our lives. Now, when I say with respect to adopt, it is not like you just need to learn AI for getting a job or working in this specific field. Yes, if you are interested, you can also go in that specific path

In [19]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded transcript into smaller chunks with
# RecursiveCharacterTextSplitter. The splitter is configured for chunks of
# 1000 characters with a 20-character overlap between consecutive chunks
# (the earlier description said 100, which did not match the code or the
# recorded output). Only the first five chunks are displayed to illustrate
# the result.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
text_splitter.split_documents(text_documents)[:5]


[Document(page_content="Hello, my name is Krish Nayak and welcome to my YouTube channel. So guys from the past one and a half years, I've seen a lot of development specifically happening in the field of AI. If we consider Gen.D.V.A.I, if we consider the LLM models, if we consider LLM models, if we consider AI tools that are coming up in the market. And recently, I hope everybody has seen about Devon, which is the first AI software engineer. And obviously, applications like chat, GPT and the other competitors who are building amazing AI tools for images, for text and many more. Now, currently, you can actually see that in our life, I think everybody right now is probably using AI. So that is the reason why I've kept this title of this particular video that it's time to adopt AI in our lives. Now, when I say with respect to adopt, it is not like you just need to learn AI for getting a job or working in this specific field. Yes, if you are interested, you can also go in that specific path

In [21]:
# Chunk the transcript for our application: 1000-character chunks with a
# 20-character overlap, kept in `documents` for use by later cells.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
documents = text_splitter.split_documents(text_documents)

### Finding the relevant chunks

Let's generate embeddings for an arbitrary query:


In [23]:

from langchain_openai.embeddings import OpenAIEmbeddings

# Create the OpenAI embeddings client and embed a sample query, then report
# the length of the resulting vector and its first ten components.
embeddings = OpenAIEmbeddings()
embedded_query = embeddings.embed_query("Who is Mary's sister?")

print(
    f"Embedding length: {len(embedded_query)}",
    embedded_query[:10],
    sep="\n",
)

Embedding length: 1536
[-0.001371190081765891, -0.03434698236453119, -0.011476094990116788, 0.0012773800454156574, -0.026166747008526288, 0.009230907949392044, -0.015660022937300136, 0.0017948988196774898, -0.011851335135517721, -0.03324627818637449]


In [24]:
# To illustrate how embeddings work, embed two different sentences (in this
# order) so they can be compared against the query embedding.
sentence1, sentence2 = (
    embeddings.embed_query(text)
    for text in ("Mary's sister is Susana", "Pedro's mother is a teacher")
)

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the query embedding and each sentence embedding.
# cosine_similarity returns a 1x1 matrix for a single pair, hence the [0][0].
query_sentence1_similarity, query_sentence2_similarity = (
    cosine_similarity([embedded_query], [sentence])[0][0]
    for sentence in (sentence1, sentence2)
)

query_sentence1_similarity, query_sentence2_similarity

(0.91745489543827, 0.7680495517171415)

### Setting up a Vector Store

In [27]:
from langchain_community.vectorstores import DocArrayInMemorySearch

# Toy in-memory vector store: six short facts, each embedded with the
# `embeddings` client so they can be searched by semantic similarity.
vectorstore1 = DocArrayInMemorySearch.from_texts(
    [
        "Mary's sister is Susana", "John and Tommy are brothers",
        "Patricia likes white cars", "Pedro's mother is a teacher",
        "Lucia drives an Audi", "Mary has two siblings",
    ],
    embedding=embeddings,
)


In [22]:
# Top three stored texts for the query, each paired with its similarity score.
vectorstore1.similarity_search_with_score(
    query="Who is Mary's sister?",
    k=3,
)

[(Document(page_content="Mary's sister is Susana"), 0.9174549036927803),
 (Document(page_content='Mary has two siblings'), 0.9045440036524318),
 (Document(page_content='John and Tommy are brothers'), 0.8015357441152158)]

### Connecting the vector store to the chain

In [23]:
# Wrap the toy vector store as a retriever so it can be used as a chain step,
# then fetch the documents it returns for a sample question.
retriever1 = vectorstore1.as_retriever()

retriever1.invoke("Who is Mary's sister?")

[Document(page_content="Mary's sister is Susana"),
 Document(page_content='Mary has two siblings'),
 Document(page_content='John and Tommy are brothers'),
 Document(page_content="Pedro's mother is a teacher")]

In [24]:

from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# Build the prompt inputs from a single question string: "context" is filled
# by the retriever, while "question" passes the input through unchanged.
setup = RunnableParallel(
    context=retriever1,
    question=RunnablePassthrough(),
)

setup.invoke("What color is Patricia's car?")

{'context': [Document(page_content='Patricia likes white cars'),
  Document(page_content='Lucia drives an Audi'),
  Document(page_content="Pedro's mother is a teacher"),
  Document(page_content="Mary's sister is Susana")],
 'question': "What color is Patricia's car?"}

In [25]:
# Retrieval-augmented chain over the toy store: gather context + question,
# fill the prompt, call the model, parse to a string. Rebinds `chain`.
chain = (
    setup
    | prompt
    | model
    | parser
)

chain.invoke("What color is Patricia's car?")

'White'

In [26]:
# Ask the current `chain` a second question about the toy facts.
chain.invoke("What car does Lucia drive?")

'Lucia drives an Audi.'

### Loading transcription into the vector store

In [27]:
# Index the transcript chunks (`documents`) in a second in-memory vector
# store, embedding them with the same `embeddings` client.
vectorstore2 = DocArrayInMemorySearch.from_documents(
    documents,
    embeddings,
)

vectorstore2

<langchain_community.vectorstores.docarray.in_memory.DocArrayInMemorySearch at 0x1d9de630750>

In [28]:
# Retrieval-augmented chain over the transcript: the leading dict fills
# "context" from the transcript store's retriever and forwards the raw
# question unchanged. Rebinds `chain`.
chain = (
    {
        "context": vectorstore2.as_retriever(),
        "question": RunnablePassthrough(),
    }
    | prompt
    | model
    | parser
)

chain.invoke("what is adopt ai")

'Adopting AI refers to integrating artificial intelligence technology into various aspects of our daily lives and activities in order to utilize it efficiently and stay ahead of the technological advancements.'

### Setting up Pinecone

In [28]:
import os
from langchain_pinecone import PineconeVectorStore

# Pinecone configuration is read from the environment (for example from the
# .env file loaded by load_dotenv() at the top of the notebook); credentials
# must never be written into the notebook itself.
# NOTE(review): a literal API key was previously assigned on this line —
# treat that key as leaked and rotate it.
if not os.environ.get("PINECONE_API_KEY"):
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or environment."
    )

# Fall back to 'gcp-starter' only when no environment name is configured, so
# a value supplied by the user is not overwritten.
os.environ.setdefault("PINECONE_API_ENV", "gcp-starter")

index_name = "youtube-index"

# Embed the transcript chunks and add them to the named Pinecone index.
# Relies on `documents` and `embeddings` defined in earlier cells.
# NOTE(review): presumably the index must already exist with a dimension
# matching the embeddings — confirm.
pinecone_vector_store = PineconeVectorStore.from_documents(
    documents, embeddings, index_name=index_name
)


  from tqdm.autonotebook import tqdm


In [31]:
# Search the Pinecone-backed store for the query and keep the first three hits.
pinecone_vector_store.similarity_search("what are llm models")[:3]

[Document(page_content="going outside, if you are traveling somewhere, some or the other way we are going to use some of the AI applications. And still, when I probably see in the job, job industries and all, there are a lot of openings with respect to AI. I've seen startups who are building some amazing ideas, they wanted to probably use these LLN models, solve some amazing use cases and all. Not only that, and this actually happens in each and every domain, I'm not just talking about one simple text domain or some other domain itself. Every domain and probably if you have seen in Shark Tank, any companies that are building this startup, one or some of the AI functionalities are definitely included. Now, even though in your software engineering team also you'll be seeing now, whatever software product companies are actually building, there will be a separate AI team to include some of the AI modules in those kind of software products. So this really shows the kind of development that 

In [32]:
# Same search against the Pinecone-backed store with a different query;
# again only the first three hits are kept.
pinecone_vector_store.similarity_search("what are adopt ai")[:3]

[Document(page_content="Hello, my name is Krish Nayak and welcome to my YouTube channel. So guys from the past one and a half years, I've seen a lot of development specifically happening in the field of AI. If we consider Gen.D.V.A.I, if we consider the LLM models, if we consider LLM models, if we consider AI tools that are coming up in the market. And recently, I hope everybody has seen about Devon, which is the first AI software engineer. And obviously, applications like chat, GPT and the other competitors who are building amazing AI tools for images, for text and many more. Now, currently, you can actually see that in our life, I think everybody right now is probably using AI. So that is the reason why I've kept this title of this particular video that it's time to adopt AI in our lives. Now, when I say with respect to adopt, it is not like you just need to learn AI for getting a job or working in this specific field. Yes, if you are interested, you can also go in that specific path