In [4]:
!pip install langchain
!pip install langchain-openai
!pip install pypdf
!pip install chromadb
!pip install langchainhub
!pip install pandas

In [5]:
import os

# Set OPENAI API Key

os.environ["OPENAI_API_KEY"] = "your openai key"

# OR (load from .env file)

# from dotenv import load_dotenv
# load_dotenv("./.env")

A vector database is a way to store these embeddings, these numerical representations that we just discussed.

The pipeline is:
- In coming document
- Create chunks of text from that document
- Embed each chunk
- Store these embeddings

![](2023-07-30-19-32-13.png)

[LangChain for LLM Application Development by Deeplearning.ai](https://learn.deeplearning.ai/langchain/lesson/1/introduction)

In [1]:
import pandas as pd

file_path="./assets-resources/superheroes.csv"
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,Superhero Name,Superpower,Power Level,Catchphrase
0,Captain Thunder,Bolt Manipulation,90,Feel the power of the storm!
1,Silver Falcon,Flight and Agility,85,"Soar high, fearlessly!"
2,Mystic Shadow,Invisibility and Illusions,78,Disappear into the darkness!
3,Blaze Runner,Pyrokinesis,88,Burn bright and fierce!
4,Electra-Wave,Electric Manipulation,82,Unleash the electric waves!


In [2]:
from langchain.document_loaders.csv_loader import CSVLoader

In [3]:
loader = CSVLoader(file_path)
data = loader.load()
data[:5]

[Document(page_content='Superhero Name: Captain Thunder\nSuperpower: Bolt Manipulation\nPower Level: 90\nCatchphrase: Feel the power of the storm!', metadata={'source': './assets-resources/superheroes.csv', 'row': 0}),
 Document(page_content='Superhero Name: Silver Falcon\nSuperpower: Flight and Agility\nPower Level: 85\nCatchphrase: Soar high, fearlessly!', metadata={'source': './assets-resources/superheroes.csv', 'row': 1}),
 Document(page_content='Superhero Name: Mystic Shadow\nSuperpower: Invisibility and Illusions\nPower Level: 78\nCatchphrase: Disappear into the darkness!', metadata={'source': './assets-resources/superheroes.csv', 'row': 2}),
 Document(page_content='Superhero Name: Blaze Runner\nSuperpower: Pyrokinesis\nPower Level: 88\nCatchphrase: Burn bright and fierce!', metadata={'source': './assets-resources/superheroes.csv', 'row': 3}),
 Document(page_content='Superhero Name: Electra-Wave\nSuperpower: Electric Manipulation\nPower Level: 82\nCatchphrase: Unleash the electri

In [4]:
from langchain.chat_models import ChatOllama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

llm = ChatOllama(
    model="llama3",
    verbose=True,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
)

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
all_splits = text_splitter.split_documents(data)

In [7]:
# Embed and store
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

vectordb = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

In [8]:
question = "What is the name of the thunder super hero?"
docs = vectordb.similarity_search(question)
len(docs)

4

In [10]:
docs

[Document(page_content='Superhero Name: Thunderstrike\nSuperpower: Lightning Control\nPower Level: 91\nCatchphrase: Electrify the battlefield!', metadata={'row': 30, 'source': './assets-resources/superheroes.csv'}),
 Document(page_content='Superhero Name: Captain Thunder\nSuperpower: Bolt Manipulation\nPower Level: 90\nCatchphrase: Feel the power of the storm!', metadata={'row': 0, 'source': './assets-resources/superheroes.csv'}),
 Document(page_content="Superhero Name: Stormbringer\nSuperpower: Weather Manipulation\nPower Level: 93\nCatchphrase: Unleash the storm's fury!", metadata={'row': 20, 'source': './assets-resources/superheroes.csv'}),
 Document(page_content='Superhero Name: Crimson Cyclone\nSuperpower: Super Speed\nPower Level: 91\nCatchphrase: Blazing fast and unstoppable!', metadata={'row': 5, 'source': './assets-resources/superheroes.csv'})]

In [11]:
# QA chain
from langchain.chains import RetrievalQA

# RAG prompt
from langchain import hub

In [13]:
QA_CHAIN_PROMPT = hub.pull("rlm/rag-prompt-llama")

In [14]:
QA_CHAIN_PROMPT

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt-llama', 'lc_hub_commit_hash': '693a2db5447e3b58c060a6ac02758dc7f1aaaaa4ee6214d127bf70b443158630'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="[INST]<<SYS>> You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.<</SYS>> \nQuestion: {question} \nContext: {context} \nAnswer: [/INST]"))])

In [15]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o", temperature=0.0)

In [16]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

In [20]:
question = "What is the catch phrase for the super hero with the power of thunder?"
result = qa_chain.invoke({"query": question}, return_sources=True)

In [21]:
result

{'query': 'What is the catch phrase for the super hero with the power of thunder?',
 'result': 'The catchphrase for the superhero with the power of thunder is "Feel the power of the storm!"'}