In [1]:
import os
import vertexai

from langchain_google_community import VertexAISearchRetriever
from langchain_google_community import GCSDirectoryLoader
from langchain_google_vertexai import VertexAI
from langchain_google_vertexai import VertexAIEmbeddings
from langchain.chat_models import ChatVertexAI

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.prompts import PromptTemplate

from langchain_community.vectorstores import Chroma

### Using Vertex AI Search

In [2]:
# Google Cloud project and region used to initialise the Vertex AI SDK.
PROJECT_ID = "TODO"
REGION = "us-central1"

# Location and ID of the Vertex AI Search data store queried by the retriever.
DATA_STORE_LOCATION = "global"
DATA_STORE_ID = "TODO"

In [3]:
# Initialise the Vertex AI SDK with the project and region configured above.
vertexai.init(
    project=PROJECT_ID,
    location=REGION,
)

In [4]:
# Prompt text for the answer-generation step. It is wrapped in a PromptTemplate
# elsewhere in this notebook, which fills the {context} and {question} placeholders.
# The leading spaces on the continuation lines are inside the string literal and
# are therefore sent to the model as part of the prompt.
prompt_template = """Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know and explain what piece of information is missing to answer the question.
    Only answer in plaintext, do not answer in markdown

    {context}

    Question: {question}
    Answer:"""

In [5]:
# Retriever backed by the Vertex AI Search data store configured above.
retriever = VertexAISearchRetriever(
    project_id=PROJECT_ID,
    location_id=DATA_STORE_LOCATION,
    data_store_id=DATA_STORE_ID,
    # NOTE(review): the retriever's documented field appears to be
    # `engine_data_type`; confirm `engine_type` is honoured and not silently ignored.
    engine_type=1,
)

# `get_relevant_documents` is deprecated and emits a deprecation warning;
# `invoke` is the supported entry point and returns the same list of documents.
result = retriever.invoke("How much were Google's R&D expenses in 2004?")
for doc in result:
    print(doc)

  warn_deprecated(


page_content='Research and development expenses increased by $134.4 million to <b>$225.6 million</b> (or 7.1% of revenues) in 2004, from $91.2 million (or 6.2% of revenues) in 2003.' metadata={'id': '3cbf4b88a6126487272f015eccf2754f', 'source': 'gs://cloud-samples-data/gen-app-builder/search/alphabet-investor-pdfs/2004_google_annual_report.pdf:39'}
page_content='Cost of revenues increased by $831.8 million to $1457.7 million (or 45.7% of revenues) in 2004, from $625.9 million (or 42.7% of revenues) in 2003.' metadata={'id': '3cbf4b88a6126487272f015eccf2754f', 'source': 'gs://cloud-samples-data/gen-app-builder/search/alphabet-investor-pdfs/2004_google_annual_report.pdf:39'}
page_content='Research and development expenses increased by $258.4 million to $484.0 million (or 7.9% of revenues) in 2005, from <b>$225.6 million</b> (or 7.1% of revenues) in 2004.' metadata={'id': 'adc537ccece262a5bd00f26d0dd10a7d', 'source': 'gs://cloud-samples-data/gen-app-builder/search/alphabet-investor-pdfs/2

In [6]:
# Template that merges the retrieved context and the user question into one prompt.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Gemini model used to generate the answers (temperature 0, verbose logging on).
llm = VertexAI(
    model_name="gemini-1.5-pro-001",
    verbose=True,
    temperature=0,
)

In [7]:
# `predict` is deprecated and emits a deprecation warning; `invoke` is the
# supported call and returns the answer text for the `VertexAI` model created above.
response = llm.invoke(
    prompt.format(
        context=result,
        question="How much were Google's R&D expenses in 2004?",
    )
)
print(response)

  warn_deprecated(


$225.6 million 



In [8]:
# Keep the question in one place so retrieval and answer generation cannot drift apart.
question = "How much were Google's R&D expenses between 2008 and 2010??"

# `invoke` replaces the deprecated `get_relevant_documents` / `predict` calls.
result = retriever.invoke(question)

response = llm.invoke(prompt.format(context=result, question=question))
print(response)

I can't answer this question. The provided documents only give R&D expenses for the nine months ending in September 2007 and 2008 and then for the six months ending June 2008-2010. To answer the question, I would need the R&D expenses for the remaining months between October 2007-December 2010. 




### Create your own RAG

In [9]:
# Load the documents stored in the Cloud Storage bucket.
# Replace both "TODO" placeholders with your own project and bucket names.
loader = GCSDirectoryLoader(
    project_name="TODO",
    bucket="TODO",
)
docs = loader.load()

The PDF <_io.BufferedReader name='/var/tmp/tmpszhwuz1w/2004Q4_earnings_google.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/var/tmp/tmp6cepl8pp/2004_google_annual_report.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/var/tmp/tmpm0ylk3f7/20051231_10-K.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/var/tmp/tmp4r64nkcx/2005Q1_earnings_google.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use

In [10]:
# Split the loaded documents into chunks of at most 1500 characters,
# with 20 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=1500,
)
splits = text_splitter.split_documents(docs)

print(f"Your {len(docs)} documents have been split into {len(splits)} chunks")

Your 176 documents have been split into 24312 chunks


In [11]:
# When this cell is re-run, a `vectordb` from the previous run still exists in the
# kernel; delete its collection first so the store is rebuilt fresh.
if 'vectordb' in globals(): # If you've already made your vectordb this will delete it so you start fresh
    vectordb.delete_collection()

# Build the Chroma vector store from the chunks, using the Vertex AI embedding model.
embedding = VertexAIEmbeddings(model_name="textembedding-gecko@latest")
vectordb = Chroma.from_documents(documents=splits, embedding=embedding)

In [12]:
def create_answer(llm, context, question):
    """Answer ``question`` with ``llm``, using ``context`` as supporting material.

    The notebook-level ``prompt_template`` is filled with ``context`` and
    ``question`` and the resulting prompt text is sent to the model.

    Args:
        llm: A LangChain model exposing ``invoke``; the callers in this
            notebook pass a ``ChatVertexAI`` instance.
        context: Retrieved material substituted for ``{context}`` in the prompt.
        question: The user question substituted for ``{question}`` in the prompt.

    Returns:
        The model's answer text.
    """
    prompt = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )
    prompt_text = prompt.format(context=context, question=question)

    # `predict` is deprecated in LangChain; `invoke` is the supported entry point.
    reply = llm.invoke(prompt_text)

    # Chat models return a message object whose text lives in `.content`, while
    # plain LLMs return the string itself; unwrap so callers still receive text.
    return getattr(reply, "content", reply)

In [13]:
question = "How much were Google's R&D expenses in 2004?"

# Chat model used to answer from the locally built vector store.
llm = ChatVertexAI(
    model_name="gemini-1.5-pro-001",
    temperature=0,
    max_output_tokens=2048,
)

# Fetch only the single best-matching chunk (k=1) from the Chroma store.
retriever = vectordb.as_retriever(search_kwargs={"k": 1})
context = retriever.invoke(question)
print(context)

# Last expression of the cell, so the answer is shown as the cell output.
create_answer(llm=llm, context=context, question=question)

  warn_deprecated(
  warn_deprecated(


[Document(page_content='Research and development expenses increased by $16.4 million to $95.8 million (or 6.9% of revenues) in the three months\n\nended June 30, 2005, from $79.4 million (or 6.3% of revenues) in the three months ended March 31, 2005. This increase was primarily due to an increase in labor and facilities related costs of $11.8 million as a result of a 27% increase in\n\n20\n\nGOOGLE INC. 06/30/2005 FORM 10-\n\nRR Donnelley ProFile\n\nPALFBU-2KP-PF01 9.0.16\n\nPAL vaugm0pa PAL\n\nˆ1T3LQP4X4WM2L5BfŠ 2* 0C\n\n1T3LQP4X4WM2L5B\n\n13-Aug-2005 03:45 EST\n\n92647 TX 21 HTM ESS Page 1 of 1\n\nresearch and development headcount. In addition, depreciation and related expenses increased by $3.5 million primarily as a result of additional information technology assets purchased over the six months ended June 30, 2005.\n\nResearch and development expenses increased by $50.0 million to $95.8 million (or 6.9% of revenues) in the three months ended June 30, 2005, from $45.8 million (or 

'$45.8 million \n'

## MultiQueryRetriever

A single input query often does not capture the semantics of the data well, so relevant documents can be missed.

1. The `MultiQueryRetriever` overcomes this by using an LLM to generate multiple queries, from different perspectives, for one input query.
2. It then retrieves the relevant documents for each generated query, producing a larger set of potentially relevant documents.
3. The retrieved documents are deduplicated and then passed as context to the LLM to generate an answer.


![title](multiquery.png)

In [14]:
import logging

# Emit INFO-level records from the multi-query retriever module so the
# queries it generates are shown in the cell output.
logging.basicConfig()
multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
multi_query_logger.setLevel(logging.INFO)

In [15]:
def showMultiQueryRetriever(question):
    """Answer ``question`` using documents gathered by a ``MultiQueryRetriever``.

    A ``ChatVertexAI`` model is created and used twice: by the
    ``MultiQueryRetriever`` wrapped around the notebook-level ``vectordb``
    retriever, and by ``create_answer`` to produce the final answer from the
    retrieved documents.

    Args:
        question: The user question to answer.

    Returns:
        Whatever ``create_answer`` returns for the retrieved context.
    """
    chat_model = ChatVertexAI(
        model_name="gemini-1.5-pro-001",
        temperature=0.4,
        max_output_tokens=2048,
    )

    multi_query_retriever = MultiQueryRetriever.from_llm(
        llm=chat_model,
        retriever=vectordb.as_retriever(),
    )
    retrieved_docs = multi_query_retriever.invoke(question)

    return create_answer(llm=chat_model, context=retrieved_docs, question=question)

In [16]:
# Ask the multi-year question through the multi-query pipeline; the returned
# answer is displayed as the cell output.
showMultiQueryRetriever("How much were Google R&D expenses between 2008 and 2010?")

INFO:langchain.retrievers.multi_query:Generated queries: ["1. What was Google's total research and development spending from 2008 to 2010? ", '2. How much did Google invest in R&D annually during the period of 2008-2010?', "3. What were the yearly figures for Google's research and development expenditures in the years 2008, 2009, and 2010?"]


'$8.4 billion \n'