# Advanced Retrieval With LangChain

## SETUP

In [2]:
import os
from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file into the process environment;
# the return value is assigned to `_` because it is not needed.
_ = load_dotenv(find_dotenv())

# Read the OpenAI key from the environment (None when the variable is unset).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

## LOAD DATA

In [3]:
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [4]:
# Load every .txt file found (recursively) under the essays directory,
# showing a progress bar while loading.
loader = DirectoryLoader(
    "../data/PaulGrahamEssaysLarge",
    show_progress=True,
    glob="**/*.txt",
)
docs = loader.load()

# Display how many documents were loaded.
len(docs)

  0%|          | 0/49 [00:00<?, ?it/s]

100%|██████████| 49/49 [00:07<00:00,  6.59it/s]


49

In [5]:
# Split the loaded documents into non-overlapping chunks of up to 1500 characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=1500)
splits = text_splitter.split_documents(docs)

# Display the first chunk as a sanity check.
splits[0]

Document(page_content="July 2006I've discovered a handy test for figuring out what you're addicted\n\nto. Imagine you were going to spend the weekend at a friend's house\n\non a little island off the coast of Maine. There are no shops on\n\nthe island and you won't be able to leave while you're there. Also,\n\nyou've never been to this house before, so you can't assume it will\n\nhave more than any house might.What, besides clothes and toiletries, do you make a point of packing?\n\nThat's what you're addicted to. For example, if you find yourself\n\npacking a bottle of vodka (just in case), you may want to stop and\n\nthink about that.For me the list is four things: books, earplugs, a notebook, and a\n\npen.There are other things I might bring if I thought of it, like music,\n\nor tea, but I can live without them. I'm not so addicted to caffeine\n\nthat I wouldn't risk the house not having any tea, just for a\n\nweekend.Quiet is another matter. I realize it seems a bit eccentric to\n\n

In [6]:
# Delete any existing collection so the vector store starts clean.
# A later cell in this notebook rebinds `vectordb` to the result of
# `as_retriever(...)`, so on a re-run the name may exist without pointing at a
# Chroma store. Only call `delete_collection` when the current object has it.
if hasattr(globals().get("vectordb"), "delete_collection"):
    vectordb.delete_collection()

# Embed every chunk and index it in a fresh Chroma vector store.
embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
vectordb = Chroma.from_documents(documents=splits, embedding=embedding)

  warn_deprecated(


## MULTI QUERY

In [7]:
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.prompts import PromptTemplate

In [8]:
# Configure logging so the multi-query retriever reports the queries it generates.
import logging

logging.basicConfig()

# Raise only the multi-query retriever's logger to INFO.
logging.getLogger(
    "langchain.retrievers.multi_query"
).setLevel(logging.INFO)

In [9]:
# Chat model used to rewrite the original question into several query variants.
llm = ChatOpenAI(temperature=0, api_key=OPENAI_API_KEY)

# Initialise the multi-query retriever on top of the vector store retriever.
retriever_from_llm = MultiQueryRetriever.from_llm(
    llm=llm,
    retriever=vectordb.as_retriever(),
)

# Run the original question through the retriever and collect the documents
# it returns.
question = "What is the authors view on the early stages of a startup?"
unique_docs = retriever_from_llm.get_relevant_documents(query=question)

# Display how many documents came back.
len(unique_docs)

  warn_deprecated(
INFO:langchain.retrievers.multi_query:Generated queries: ['1. How does the author perceive the initial phases of a startup?', "2. What are the author's thoughts on the early development of a startup?", '3. What insights does the author offer regarding the beginning stages of a startup?']


8

In [10]:
# Prompt template: the retrieved context is inserted at {context} and the user
# question at {question}. The trailing "Answer:" cue (with the colon) matches
# the other prompt templates defined later in this notebook.
prompt_template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer:"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [11]:
# Invoke the llm to get the answer to the original question.
# Join the raw text of the retrieved chunks so the prompt contains readable
# context instead of the repr of a list of Document objects.
context_text = "\n\n".join(doc.page_content for doc in unique_docs)
response = llm.invoke(
    input=PROMPT.format_prompt(context=context_text, question=question)
)

# Display the text of the model's reply.
response.content

'The author believes that releasing a minimal version of a startup early is important, as it allows for quick improvements based on user feedback. The author also emphasizes the importance of getting version 1 done fast and states that many startups fail because they are too slow to release their product.'

## CONTEXTUAL COMPRESSION

- Uses an LLM to extract only the parts of each retrieved chunk that are relevant to the query

In [12]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [13]:
# Rebind `llm` to a GPT-4 chat model; it drives the extractor below and is the
# `llm` seen by any later cell until it is reassigned.
llm = ChatOpenAI(model="gpt-4", temperature=0, api_key=OPENAI_API_KEY)

# The compressor post-processes documents; the compression retriever applies
# it to whatever the base vector store retriever returns.
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectordb.as_retriever(),
    base_compressor=compressor,
)

In [14]:
# Inspect which fields the first chunk exposes when converted to a dict.
splits[0].dict().keys()

dict_keys(['page_content', 'metadata', 'type'])

In [15]:
# Display the raw text of the first chunk (the input to the compressor demo below).
splits[0].page_content

"July 2006I've discovered a handy test for figuring out what you're addicted\n\nto. Imagine you were going to spend the weekend at a friend's house\n\non a little island off the coast of Maine. There are no shops on\n\nthe island and you won't be able to leave while you're there. Also,\n\nyou've never been to this house before, so you can't assume it will\n\nhave more than any house might.What, besides clothes and toiletries, do you make a point of packing?\n\nThat's what you're addicted to. For example, if you find yourself\n\npacking a bottle of vodka (just in case), you may want to stop and\n\nthink about that.For me the list is four things: books, earplugs, a notebook, and a\n\npen.There are other things I might bring if I thought of it, like music,\n\nor tea, but I can live without them. I'm not so addicted to caffeine\n\nthat I wouldn't risk the house not having any tea, just for a\n\nweekend.Quiet is another matter. I realize it seems a bit eccentric to\n\ntake earplugs on a tri

In [16]:
# Run the compressor directly on the first chunk with a narrow query to see
# which part of the chunk it returns.
compressor.compress_documents(
    documents=[splits[0]], query="what about basketball")



[Document(page_content='What if there was a kid playing basketball? (Thump, thump, thump... thump.) Why risk it? Earplugs are small.', metadata={'source': '../data/PaulGrahamEssaysLarge/island.txt'})]

## PARENT DOCUMENT RETRIEVER (IMP)

In [17]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

In [18]:
# Splitter used to create the child docs: chunks of at most 400 characters,
# with every other splitter setting left at its default.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

In [19]:
# Vector store that will hold the child chunks, kept in its own named
# collection.
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(api_key=OPENAI_API_KEY),
    collection_name="return_full_documents",
)

In [20]:
# In-memory docstore for the parent documents.
store = InMemoryStore()

# Retriever wired to both stores: child chunks go to the vector store and
# parent documents to the docstore. No parent splitter is given here.
retriever = ParentDocumentRetriever(
    child_splitter=child_splitter,
    docstore=store,  # parent docs
    vectorstore=vectorstore,  # child docs
)

In [21]:
# Index the loaded essays through the retriever; no explicit ids are supplied.
retriever.add_documents(docs)

In [22]:
# Query the vector store directly: this searches the small child chunks.
sub_docs = vectorstore.similarity_search(
    "what is some investing advice?"
)

# Display the matching child chunks.
sub_docs

[Document(page_content="people there are rich, or expect to be when their options vest.\n\nOrdinary employees find it very hard to recommend an acquisition;\n\nit's just too annoying to see a bunch of twenty year olds get rich\n\nwhen you're still working for salary. Even if it's the right thing\n\nfor your company to do.The Solution(s)Bad as things look now, there is a way for VCs to save themselves.", metadata={'doc_id': '0653ca14-e81b-4a73-abf9-bd7998a275c0', 'source': '../data/PaulGrahamEssaysLarge/vcsqueeze.txt'}),
 Document(page_content="the product is expensive to develop or sell, or simply because\n\nthey're wasteful.If you're paying attention, you'll be asking at this point not just\n\nhow to avoid the fatal pinch, but how to avoid being default dead.\n\nThat one is easy: don't hire too fast. Hiring too fast is by far\n\nthe biggest killer of startups that raise money.", metadata={'doc_id': 'e5e4072b-9f03-4cab-a0cd-b6e91abfa9a1', 'source': '../data/PaulGrahamEssaysLarge/aord.t

In [23]:
# Ask the combined retriever the same question; it returns parent documents.
retrieved_docs = retriever.get_relevant_documents(
    "what is some investing advice?"
)

# Preview the first 1000 characters of the top parent document.
retrieved_docs[0].page_content[:1000]

"November 2005In the next few years, venture capital funds will find themselves\n\nsqueezed from four directions. They're already stuck with a seller's\n\nmarket, because of the huge amounts they raised at the end of the\n\nBubble and still haven't invested. This by itself is not the end\n\nof the world. In fact, it's just a more extreme version of the\n\nnorm\n\nin the VC business: too much money chasing too few deals.Unfortunately, those few deals now want less and less money, because\n\nit's getting so cheap to start a startup. The four causes: open\n\nsource, which makes software free; Moore's law, which makes hardware\n\ngeometrically closer to free; the Web, which makes promotion free\n\nif you're good; and better languages, which make development a lot\n\ncheaper.When we started our startup in 1995, the first three were our biggest\n\nexpenses. We had to pay $5000 for the Netscape Commerce Server,\n\nthe only software that then supported secure http connections. We\n\npaid $3000

In [24]:
# Fresh docstore and vector store for the split-parent variant.
store = InMemoryStore()
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(api_key=OPENAI_API_KEY),
    collection_name="return_split_parent_documents",
)

# Two splitter sizes: 400-character child chunks and 2000-character parent chunks.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)

In [25]:
# Combined retriever that uses both splitters: a parent splitter for the
# docstore side and a child splitter for the vector store side.
retriever = ParentDocumentRetriever(
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
    docstore=store,
    vectorstore=vectorstore,
)

In [26]:
# Index the loaded essays through the split-parent retriever.
retriever.add_documents(docs)

In [27]:
# Count the keys now held in the parent docstore.
sum(1 for _key in store.yield_keys())

385

In [28]:
# Searching the vector store directly matches against the child chunks.
sub_docs = vectorstore.similarity_search(
    "what is some investing advice?"
)

# Display the matching child chunks.
sub_docs

[Document(page_content="people there are rich, or expect to be when their options vest.\n\nOrdinary employees find it very hard to recommend an acquisition;\n\nit's just too annoying to see a bunch of twenty year olds get rich\n\nwhen you're still working for salary. Even if it's the right thing\n\nfor your company to do.The Solution(s)Bad as things look now, there is a way for VCs to save themselves.", metadata={'doc_id': 'a62bea98-3506-4e26-aa7c-68e8b09bcc41', 'source': '../data/PaulGrahamEssaysLarge/vcsqueeze.txt'}),
 Document(page_content="the product is expensive to develop or sell, or simply because\n\nthey're wasteful.If you're paying attention, you'll be asking at this point not just\n\nhow to avoid the fatal pinch, but how to avoid being default dead.\n\nThat one is easy: don't hire too fast. Hiring too fast is by far\n\nthe biggest killer of startups that raise money.", metadata={'doc_id': 'a7a06dc2-d468-4841-829b-1a2d2a26783f', 'source': '../data/PaulGrahamEssaysLarge/aord.t

In [29]:
# The retriever searches the child chunks and then returns the parent chunks
# they belong to.
larger_chunk_relevant_docs = retriever.get_relevant_documents("what is some investing advice?")

# Display the first returned parent chunk.
larger_chunk_relevant_docs[0]

Document(page_content='all practical purposes, succeeding now equals getting bought. Which\n\nmeans VCs are now in the business of finding promising little 2-3\n\nman startups and pumping them up into companies that cost $100\n\nmillion to acquire. They didn\'t mean to be in this business; it\'s\n\njust what their business has evolved into.Hence the fourth problem: the acquirers have begun to realize they\n\ncan buy wholesale. Why should they wait for VCs to make the startups\n\nthey want more expensive? Most of what the VCs add, acquirers don\'t\n\nwant anyway. The acquirers already have brand recognition and HR\n\ndepartments. What they really want is the software and the developers,\n\nand that\'s what the startup is in the early phase: concentrated\n\nsoftware and developers.Google, typically, seems to have been the first to figure this out.\n\n"Bring us your startups early," said Google\'s speaker at the Startup School. They\'re quite\n\nexplicit about it: they like to acquire sta

In [30]:
# Question to answer from the retrieved parent chunks.
question = "what is some investing advice?"

# Prompt template with {context} and {question} placeholders.
prompt_template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer:"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [31]:
# Answer the question from the parent chunks. The chunks' text is joined into
# one string so the prompt holds readable context rather than the repr of a
# list of Document objects.
llm.invoke(
    input=PROMPT.format_prompt(
        context="\n\n".join(
            doc.page_content for doc in larger_chunk_relevant_docs
        ),
        question=question,
    )
).content

"One piece of investing advice is for venture capitalists to not overinflate the value of startups they invest in. This is because acquirers have begun to realize they can buy wholesale and do not need to wait for VCs to make the startups they want more expensive. Another piece of advice is for startups to not hire too fast as this is often the biggest killer of startups that raise money. They should focus on making their product more appealing to boost growth. Lastly, startups should always have a plan B in case they can't raise more money."

## ENSEMBLE RETRIEVER

In [32]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [33]:
# Initialize the BM25 (keyword-matching) retriever over the same chunks used
# for the vector store.
bm25_retriever = BM25Retriever.from_documents(splits)
# Presumably caps the number of documents returned per query at 2 — confirm
# against the BM25Retriever documentation.
bm25_retriever.k = 2

In [34]:
# Main (embedding-based) document retriever.
# NOTE: after this cell `vectordb` names a retriever, not the Chroma store
# itself; the ensemble cell below relies on that.
embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
vectordb = Chroma.from_documents(splits, embedding).as_retriever(
    search_kwargs={"k": 2}
)

In [35]:
# Combine the keyword and embedding retrievers with equal weights.
ensemble_retriever = EnsembleRetriever(
    weights=[0.5, 0.5],
    retrievers=[bm25_retriever, vectordb],
)

# Retrieve documents for the question through the ensemble.
ensemble_docs = ensemble_retriever.get_relevant_documents("what is some investing advice?")

# Display how many documents the ensemble returned.
len(ensemble_docs)

3

In [36]:
# Prompt template with {context} and {question} placeholders.
prompt_template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

question = "what is some investing advice?"

# Join the text of the ensemble results so the prompt holds readable context
# rather than the repr of a list of Document objects.
llm.invoke(
    input=PROMPT.format_prompt(
        context="\n\n".join(doc.page_content for doc in ensemble_docs),
        question=question,
    )
).content

"One piece of investing advice is to make a larger number of smaller investments instead of a few large ones. It's also suggested to fund younger, more technical founders instead of MBAs and to let the founders remain as CEOs. Another advice is that the best sources of seed funding are successful startup founders because they can also provide advice. However, it's important to be aware that high valuations startups are getting may not last forever and that there's a risk in having VCs in an angel round."

## SELF QUERYING

In [37]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Embedding model and chat model for the self-querying retriever.
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
# Pass the API key explicitly, as every other ChatOpenAI construction in this
# notebook does, rather than relying on the environment implicitly.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, temperature=0, model="gpt-3.5-turbo")

In [38]:
# Drop the collection behind any existing `vectorstore` so we start fresh.
if "vectorstore" in globals():
    vectorstore.delete_collection()

# Rebuild the vector store from the chunked essays.
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

In [39]:
# Describe the metadata fields the self-query retriever may filter on.
# Only `source` (the essay's file path) is exposed here.
metadata_field_info = [
    AttributeInfo(
        type="string or list[string]",
        description="The filename of the essay",
        name="source",
    )
]

In [43]:
# Short description of what the indexed documents contain.
document_content_description = "Essays from Paul Graham"

# Build the self-query retriever from the llm, the vector store, the content
# description and the metadata schema.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    enable_limit=True,
    verbose=True,
)

In [45]:
# Natural-language query that asks for one result and names a specific source
# path, exercising the `source` metadata field declared above.
retriever.get_relevant_documents(
    "Return only 1 essay. What is one thing you can do to figure out what you like to do from source '../data/PaulGrahamEssaysLarge/island.txt'"
)

[Document(page_content="July 2006I've discovered a handy test for figuring out what you're addicted\n\nto. Imagine you were going to spend the weekend at a friend's house\n\non a little island off the coast of Maine. There are no shops on\n\nthe island and you won't be able to leave while you're there. Also,\n\nyou've never been to this house before, so you can't assume it will\n\nhave more than any house might.What, besides clothes and toiletries, do you make a point of packing?\n\nThat's what you're addicted to. For example, if you find yourself\n\npacking a bottle of vodka (just in case), you may want to stop and\n\nthink about that.For me the list is four things: books, earplugs, a notebook, and a\n\npen.There are other things I might bring if I thought of it, like music,\n\nor tea, but I can live without them. I'm not so addicted to caffeine\n\nthat I wouldn't risk the house not having any tea, just for a\n\nweekend.Quiet is another matter. I realize it seems a bit eccentric to\n\

In [46]:
# Display the `source` metadata of the first chunk to see the path format.
splits[0].metadata["source"]

'../data/PaulGrahamEssaysLarge/island.txt'

In [53]:
# Derive a short essay name from each chunk's source path (the file name
# without directory or extension) and store it under the `essay` metadata key.
import re

for split in splits:
    source_path = split.metadata["source"]
    name_match = re.search(r"[^/]+(?=\.\w+$)", source_path)
    # Fall back to the last path component when the source has no
    # "<name>.<ext>" tail, instead of failing on a None match.
    split.metadata["essay"] = (
        name_match.group() if name_match else source_path.rsplit("/", 1)[-1]
    )
    

In [54]:
# Display the metadata of the last chunk to confirm the `essay` key was added.
# Index `splits` directly rather than relying on a loop variable leaked from
# another cell.
splits[-1].metadata

{'source': '../data/PaulGrahamEssaysLarge/gh.txt', 'essay': 'gh'}

In [55]:
# Metadata schema for the self-query retriever: expose the `essay` field.
metadata_field_info = [
    AttributeInfo(
        type="string or list[string]",
        description="The name of the essay",
        name="essay",
    ),
]

In [56]:
# Drop the collection behind any existing `vectorstore` so we start fresh.
if "vectorstore" in globals():
    vectorstore.delete_collection()

# Re-index the chunks, whose metadata now includes the `essay` key.
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

In [57]:
# Short description of what the indexed documents contain.
document_content_description = "Essays from Paul Graham"

# Rebuild the self-query retriever against the current vector store and
# metadata schema.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    enable_limit=True,
    verbose=True,
)

In [58]:
# Natural-language query that names an essay ('worked') and asks for a single
# result, exercising the `essay` metadata field declared above.
retriever.get_relevant_documents(
    "Tell me about investment advice the 'worked' essay? return only 1"
)

[Document(page_content='should make a larger number of smaller investments instead of a\n\nhandful of giant ones, they should be funding younger, more technical\n\nfounders instead of MBAs, they should let the founders remain as\n\nCEO, and so on.One of my tricks for writing essays had always been to give talks.\n\nThe prospect of having to stand up in front of a group of people\n\nand tell them something that won\'t waste their time is a great\n\nspur to the imagination. When the Harvard Computer Society, the\n\nundergrad computer club, asked me to give a talk, I decided I would\n\ntell them how to start a startup. Maybe they\'d be able to avoid the\n\nworst of the mistakes we\'d made.So I gave this talk, in the course of which I told them that the\n\nbest sources of seed funding were successful startup founders,\n\nbecause then they\'d be sources of advice too. Whereupon it seemed\n\nthey were all looking expectantly at me. Horrified at the prospect\n\nof having my inbox flooded by b