In [1]:
!pip install langchain --upgrade
!pip install -q google-generativeai
!pip install google-cloud-aiplatform

In [2]:
import os

import google.generativeai as palm
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Assemble the constants

In [3]:
# Credentials and resource identifiers.
# Each value is read from the environment variable of the same name so that
# secrets do not have to be saved inside the notebook. The placeholder is only a
# fallback used when the variable is unset; export the variable or replace it.
PALM_API_KEY = os.environ.get("PALM_API_KEY", "<INSERT_YOUR_GOOGLE_PALM_API_KEY>")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "<INSERT_YOUR_PINECONE_API_KEY>")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", "<INSERT_YOUR_PINECONE_ENVIRONMENT>")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME", "<INSERT_YOUR_PINECONE_INDEX>")
PROJECT_ID = os.environ.get("PROJECT_ID", "<INSERT_YOUR_GOOGLE_PROJECT_ID>")

## Configurations

In [4]:
# Register the API key with the google.generativeai client (imported as `palm`).
palm.configure(api_key=PALM_API_KEY)

In [5]:
# Keep only the models that list 'generateText' among their supported generation methods.
models = list(
    filter(
        lambda candidate: 'generateText' in candidate.supported_generation_methods,
        palm.list_models(),
    )
)

In [2]:
# Use the first model that supports text generation. Fail with an explicit
# message instead of a bare IndexError when the filtered list is empty.
if not models:
    raise RuntimeError("No model supporting 'generateText' is available for this API key.")
model = models[0].name
print(model)

In [7]:
## Load the source PDF into `data`
pdf_path = "./data/FinancialML.pdf"
loader = PyPDFLoader(pdf_path)
data = loader.load()

In [3]:
# `data` is a sequence of loaded documents, so the character count is summed
# over every entry rather than read from data[0] alone (which under-reports
# when there is more than one entry and raises IndexError when there is none).
total_characters = sum(len(doc.page_content) for doc in data)
print (f'You have {len(data)} document(s) in your data')
print (f'There are {total_characters} characters in your document')

## Chunk your data up into smaller documents

In [9]:
# Split the loaded documents into smaller pieces (chunk size 2000, no overlap).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [4]:
# Report how many chunks the splitter produced.
chunk_count = len(texts)
print(f'Now you have {chunk_count} documents')

## Create embeddings of your documents to get ready for semantic search

In [6]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings import VertexAIEmbeddings
import pinecone
from google.cloud import aiplatform

In [12]:
# Initialise the google.cloud.aiplatform SDK for the project named in PROJECT_ID.
aiplatform.init(project=PROJECT_ID)

In [13]:
# Embedding model wrapper from langchain, constructed with its default arguments.
embeddings = VertexAIEmbeddings()

In [14]:
# Connect the Pinecone client with the credentials from the constants cell,
# then remember which index the notebook writes to and searches.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
index_name = PINECONE_INDEX_NAME

In [15]:
# Embed the raw text of every chunk and store the vectors in the Pinecone index.
chunk_texts = [chunk.page_content for chunk in texts]
docsearch = Pinecone.from_texts(chunk_texts, embeddings, index_name=index_name)

## Implement search over the document

In [16]:
# Question to answer; the vector store is searched for the chunks most similar to it.
query = "What is a Penalized Linear Model?"
docs = docsearch.similarity_search(query)

## Generate a formatted answer with the PaLM text model

### Utility Functions

In [17]:
def generate_prompt(input_query, context):
    """Build the question-answering prompt sent to the text model.

    Args:
        input_query: The user's question, placed after ``Query:``.
        context: Supporting text inserted between the instruction and the query.

    Returns:
        The prompt string: one instruction line followed by the context, the
        query and a ``Helpful Answer:`` cue, each on a line indented by eight
        spaces, and ending with a newline plus eight spaces.
    """
    indent = " " * 8
    instruction = (
        "Use the following pieces of context to answer the query at the end. "
        "If you don't know the answer, just say that you don't know, "
        "don't try to make up an answer."
    )
    prompt_lines = [
        instruction,
        f"{indent}{context}",
        f"{indent}Query: {input_query}",
        f"{indent}Helpful Answer:",
        indent,  # the original template ends with an indented, otherwise empty line
    ]
    return "\n".join(prompt_lines)

def generate_text_with_vertex_ai(prompt, model='models/text-bison-001', temperature=0, max_output_tokens=800):
    """Generate a completion for ``prompt`` via ``palm.generate_text``.

    Despite the function name, the request is made through the
    ``google.generativeai`` client that the notebook imports as ``palm``.

    Args:
        prompt: Full prompt text sent to the model.
        model: Name of the text model to call. Defaults to
            ``'models/text-bison-001'``; pass another name (for example one
            taken from ``palm.list_models()``) to use a different model.
        temperature: Sampling temperature forwarded to the API (default 0).
        max_output_tokens: Limit on generated tokens forwarded to the API
            (default 800).

    Returns:
        The ``result`` attribute of the completion returned by the API.
        NOTE(review): presumably ``None`` when the API yields no candidate
        (e.g. filtered output) - confirm and handle at the call site.
    """
    completion = palm.generate_text(
        model=model,
        prompt=prompt,
        temperature=temperature,
        max_output_tokens=max_output_tokens,
    )
    return completion.result

def create_context_using_document(docs):
    """Concatenate the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        Every ``page_content`` in order, each followed by a blank line
        (``"\\n\\n"``); an empty string when ``docs`` is empty.
    """
    return "".join(doc.page_content + "\n\n" for doc in docs)

In [18]:
# Join the text of the retrieved documents into a single context string.
context_string = create_context_using_document(docs)

In [19]:
# Build the final prompt from the query and the retrieved context.
formatted_prompt = generate_prompt(query, context_string)

In [20]:
# Send the prompt to the text model and keep the returned answer.
llm_answer = generate_text_with_vertex_ai(formatted_prompt)

In [21]:
print("Bot aswer: ", llm_answer)

Bot aswer:  A penalized linear model is a linear regression model that has been penalized to reduce overfitting.
