Required Python packages

In [None]:
%pip install -q google-generativeai ipywidgets

Import Environment variables

In [None]:
import os
import getpass
import dotenv
from dotenv import load_dotenv
load_dotenv()

Import app packages

In [None]:
import google.generativeai as palm

In [None]:
# Configure the Generative AI client with the key read from the GOOGLE_API_KEY
# environment variable (os.getenv returns None when the variable is not set).
palm.configure(api_key=os.getenv("GOOGLE_API_KEY"))

In [None]:
# Keep only the models whose supported generation methods include "generateText".
models = list(
    filter(
        lambda candidate: "generateText" in candidate.supported_generation_methods,
        palm.list_models(),
    )
)

# Print the name of every matching model, one per line.
for m in models:
    print(f"Model Name: {m.name}")

PART I

Embeddings and Vector Stores

In [None]:
%pip install pypdf
%pip install faiss-cpu
%pip install -U langchain-google-genai

In [None]:
import langchain
import langchain_experimental
import google.generativeai
import pypdf

In [None]:
from langchain.llms import google_palm
from langchain.vectorstores import FAISS
from langchain.embeddings import GooglePalmEmbeddings
from langchain.llms import GooglePalm
from langchain.document_loaders import (PyPDFLoader, DataFrameLoader)
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

Split a single pdf into pieces

In [None]:
# Point a PyPDFLoader at the RFP PDF; the path is relative to the notebook's
# working directory.
loader = PyPDFLoader("./data/City of Costa Mesa RFP Final- Posted.pdf")
# Load the PDF and split it into documents held in memory as `pages`
# (the in-memory FAISS test and the inspection cells below read this list).
pages = loader.load_and_split()

In [None]:
# Count the number of documents produced by load_and_split()
len(pages)

In [None]:
# Display the document at index 7 (the eighth entry of `pages`).
# Raises IndexError if the PDF produced fewer than eight documents.
pages[7]

Initialize Local Vectordb Loader

In [None]:
# Define the embeddings source: the "models/embedding-gecko-001" model, with the
# API key read from the GOOGLE_API_KEY environment variable. This `embeddings`
# object is passed to the index-building and index-loading cells below.
embeddings=GooglePalmEmbeddings(model_name="models/embedding-gecko-001",google_api_key=os.getenv("GOOGLE_API_KEY"))

In [None]:
# Create the local vector db, or merge into it when it already exists
def embed_index(doc_list, embed_fn, index_store):
  """Embed `doc_list` and persist the result in the FAISS index at `index_store`.

  Args:
    doc_list: Non-empty collection of chunks to embed. When every item is a
      `str` the index is built with `FAISS.from_texts`; otherwise the items
      are handed to `FAISS.from_documents`.
    embed_fn: Embeddings object used both to build the new index and to load
      the existing one.
    index_store: Folder path of the on-disk index.

  Behaviour:
    * If `index_store` already exists it is loaded, the new embeddings are
      merged into it and the merged index is saved back to the same folder.
    * Otherwise the new index is saved to `index_store`.

  Raises:
    ValueError: if `doc_list` is empty.

  Note:
    Every call merges `doc_list` into the store again, so calling this twice
    with the same chunks on an existing store adds them a second time.
  """
  doc_list = list(doc_list)
  if not doc_list:
    # Fail early with a clear message instead of an error from inside FAISS.
    raise ValueError("doc_list is empty; nothing to embed")

  # Choose the builder from the input type. A catch-all try/except around
  # from_documents would also swallow unrelated failures (network, auth,
  # quota) and silently retry the whole embedding call as from_texts.
  if all(isinstance(item, str) for item in doc_list):
    faiss_db = FAISS.from_texts(doc_list, embed_fn)
  else:
    faiss_db = FAISS.from_documents(doc_list, embed_fn)

  if os.path.exists(index_store):
    local_db = FAISS.load_local(index_store, embed_fn)
    # merging the new embedding with the existing index store
    local_db.merge_from(faiss_db)
    print("Merge completed")
    local_db.save_local(index_store)
    print("Updated index saved")
  else:
    faiss_db.save_local(folder_path=index_store)
    print("New store created...")

In [None]:
# Sanity-check helper for the local vector db
def get_docs_length(index_path, embed_fn):
  """Load the FAISS index saved at `index_path` (using `embed_fn` as its
  embeddings) and return the number of entries held in its docstore."""
  loaded_index = FAISS.load_local(index_path, embeddings=embed_fn)
  return len(loaded_index.docstore._dict)

Prepare Document Chunk Splitters

In [None]:
def get_pdf_splits(pdf_file):
  """Load the PDF at `pdf_file` and return its text as a flat list of chunks.

  Every document returned by `PyPDFLoader.load_and_split()` has its
  `page_content` split again with a RecursiveCharacterTextSplitter
  (chunk_size=1000, chunk_overlap=200, length measured with `len`).
  Only `page_content` is passed to the splitter, so the returned chunks
  carry no document metadata.
  """
  loaded_pages = PyPDFLoader(pdf_file).load_and_split()

  chunker = RecursiveCharacterTextSplitter(
      chunk_size=1000,
      chunk_overlap=200,
      length_function=len,
  )

  # Flatten the per-page chunk lists into one list, keeping page order.
  return [
      chunk
      for page in loaded_pages
      for chunk in chunker.split_text(page.page_content)
  ]

Load split document chunks into local Vectordb

In [None]:
# Split the RFP PDF into chunks with get_pdf_splits().
pdf_docs = get_pdf_splits("./data/City of Costa Mesa RFP Final- Posted.pdf")

# Embed the chunks and create (or merge into) the on-disk index.
# NOTE: embed_index merges into an existing store, so re-running this cell
# adds the same chunks to './vectorstore/CMRFP_index' again.
embed_index(doc_list=pdf_docs,
            embed_fn=embeddings,
            index_store='./vectorstore/CMRFP_index')

In [None]:
# Print the full list of chunks returned by get_pdf_splits() (the output
# contains every chunk, so it can be long).
print(pdf_docs)

Check chunk split page content and metadata

In [None]:
# Display the first document from `pages` (the load_and_split() result, not
# the chunks in `pdf_docs`).
pages[0]

Check Local Vectordb size

In [None]:
# Reload the saved index from disk and report how many entries its docstore holds.
get_docs_length(index_path="./vectorstore/CMRFP_index",
                embed_fn=embeddings)

Test in-memory FAISS db

In [None]:
# Build an in-memory FAISS index over `pages`, reusing the shared `embeddings`
# client defined above instead of constructing a second, identically
# configured GooglePalmEmbeddings instance.
faiss_index = FAISS.from_documents(pages, embeddings)

In [None]:
# Fetch the 5 closest documents for a sample question from the in-memory index
# and print each as "<page>: <first 300 characters>".
docs = faiss_index.similarity_search("what are the duties of the program manager", k=5)
for doc in docs:
    print(f"{doc.metadata['page']!s}:", doc.page_content[:300])

Define FAISS db Local Index

In [None]:
# Load the index saved under ./vectorstore/CMRFP_index, using the same
# `embeddings` object it was built with.
# NOTE(review): newer LangChain releases may require
# allow_dangerous_deserialization=True here - confirm against the installed version.
faiss_localindex = FAISS.load_local("./vectorstore/CMRFP_index",embeddings)

Test Local FAISS db

In [None]:
# Query the local index for the 5 best matches together with their relevance scores.
docs = faiss_localindex.similarity_search_with_relevance_scores("what does phase 2 deployment entail?", k=5)

In [None]:
# Print the raw result of the scored search from the previous cell.
print(docs)

In [None]:
# Run the same question through plain similarity search on the local index and
# print each hit as "<page>: <first 300 characters>".
# The local index is built from plain text chunks (get_pdf_splits returns
# strings), so its entries may have no "page" metadata; fall back to "?"
# instead of raising KeyError.
docs = faiss_localindex.similarity_search("what does phase 2 deployment entail?", k=5)
for doc in docs:
    print(str(doc.metadata.get("page", "?")) + ":", doc.page_content[:300])

PART III

Implement RAG Fusion

In [None]:
%pip install -U langsmith langchainhub
%pip install tiktoken

In [None]:
import openai
import langchain_experimental
import google.generativeai as palm
from dotenv import load_dotenv
from openai import OpenAI
load_dotenv()

In [None]:
from langchain.llms import GooglePalm
from langchain.llms import openai
from langchain.vectorstores import FAISS
from langchain.embeddings import GooglePalmEmbeddings
from langchain.llms import GooglePalm
from langchain.document_loaders import PyPDFLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
from langchain.callbacks.manager import tracing_v2_enabled
from langsmith import Client

# LangSmith / OpenAI settings.
# Secrets are read from the environment (for example the .env file loaded by
# load_dotenv()) and must never be written into the notebook. A LangSmith key
# that was previously committed here should be revoked and replaced.
LANGCHAIN_TRACING_V2 = "true"
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")
LANGCHAIN_PROJECT = "palm-ragfusion-shadow-99"  # if not specified, defaults to "default"
LANGCHAIN_ENDPOINT = "https://api.smith.langchain.com"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# LangSmith reads its configuration from the process environment; plain Python
# variables have no effect. Export the non-secret settings only when an API key
# is available, and use setdefault so values already present in the
# environment (or .env) are not overridden.
if LANGCHAIN_API_KEY:
    os.environ.setdefault("LANGCHAIN_TRACING_V2", LANGCHAIN_TRACING_V2)
    os.environ.setdefault("LANGCHAIN_PROJECT", LANGCHAIN_PROJECT)
    os.environ.setdefault("LANGCHAIN_ENDPOINT", LANGCHAIN_ENDPOINT)

# Create the client after the environment is prepared so it sees the settings.
client = Client()

In [None]:
# Retriever over the local index using MMR search: 10 documents per query
# (k=10) with lambda_mult=0.25 (per the LangChain docs, values nearer 0 favour
# diversity over similarity). Used by vector_search() below.
retriever = faiss_localindex.as_retriever(search_type="mmr", search_kwargs={'k': 10, 'lambda_mult': 0.25})

In [None]:
# `OpenAI` is the OpenAI SDK client class (imported with `from openai import OpenAI`);
# the functions below call it through `llm.chat.completions.create(...)`.
llm = OpenAI(api_key=OPENAI_API_KEY)

Prepare RAG Fusion Chatbot

In [None]:
#source: https://github.com/shivanshkaushikk/rag-fusion/blob/main/RAG-fusion.ipynb

def generate_queries_chatgpt(original_query):
    """Ask the chat model for alternative search queries for `original_query`.

    Sends a fixed system prompt plus the user's question to the module-level
    `llm` client and parses the reply one query per line.

    Args:
        original_query: The user's question, as a string.

    Returns:
        A list of query strings in the order the model produced them. Each
        line is stripped of surrounding whitespace, and blank lines are
        dropped so that no empty query reaches the vector search. The list is
        empty if the model returned no text.
    """
    response = llm.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "system", "content": "Your task is to always think step by step to generate multiple different document search queries that aim to answer the user question from multiple perspectives. Each query MUST tackle the question from a different viewpoint, dissect the sentence clauses to infer question intent, we want to get a variety of RELEVANT search results. Each query MUST be in one line and one line only. You SHOULD NOT include any preamble or explanations, and you SHOULD NOT answer the questions or add anything else, just geenrate the queries."},
            {"role": "user", "content": f"Generate multiple search queries related to: {original_query}"},
            {"role": "user", "content": "OUTPUT (5 queries):"}
        ]
    )

    # The message content can be None; treat that as an empty reply.
    raw_reply = response.choices[0].message.content or ""
    # splitlines() also copes with "\r\n"; the filter removes the blank lines
    # models often put between queries.
    generated_queries = [line.strip() for line in raw_reply.splitlines() if line.strip()]
    return generated_queries

def vector_search(query):
    """Retrieve documents for `query` and return them as `{page_content: score}`.

    Uses the module-level `retriever`. Every hit receives the same score of 1;
    the dict's insertion order follows the order in which the retriever
    returned the documents, and repeated page contents collapse into one key.

    Args:
        query: Search string passed to the retriever.

    Returns:
        dict mapping each retrieved document's `page_content` to 1.
    """
    search_results = {}
    for doc in retriever.get_relevant_documents(query):
        # Only record the score. The earlier chained assignment
        # (`... = doc.metadata = 1`) also replaced each document's metadata
        # with the integer 1.
        search_results[doc.page_content] = 1
    return search_results

def reciprocal_rank_fusion(search_results_dict, k=60):
    """Fuse several per-query result sets into one ranking.

    Args:
        search_results_dict: Mapping of query -> {document: score}.
        k: Constant added to the zero-based rank in the denominator.

    For each query the documents are ordered by score (highest first) and each
    document receives `1 / (rank + k)`, where `rank` starts at 0. The
    contributions are summed per document across all queries.

    Returns:
        dict of document -> fused score, ordered from highest to lowest score.
        Progress and the final ranking are printed as a side effect.
    """
    fused_scores = {}

    for query, doc_scores in search_results_dict.items():
        ranked_docs = sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)
        for rank, (doc, _score) in enumerate(ranked_docs):
            previous_score = fused_scores.get(doc, 0)
            fused_scores[doc] = previous_score + 1 / (rank + k)
            print(f"Updating score for {doc} from {previous_score} to {fused_scores[doc]} based on rank {rank} in query '{query}'")

    ordered_pairs = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    reranked_results = dict(ordered_pairs)
    print("Final reranked results:", reranked_results)
    return reranked_results

def generate_output(original_query, reranked_results):
    """Answer `original_query` from the fused search results.

    The keys of `reranked_results` (the document texts, in their ranked order)
    are joined with newlines to form the context, which is sent together with
    the question to the module-level `llm` client.

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    # Iterating a dict yields its keys, i.e. the document texts.
    context = '\n'.join(reranked_results)

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant that answers user's questions based on the context provided. You always format answers in paragraphs, outlines, or tables when appropriate.\nDo not make up an answer if you do not know it, stay within the bounds of the context provided, if you don't know the answer, say that you don't have enough information on the topic!"},
        {"role": "user", "content": f"CONTEXT: {context}\nQUERY: {original_query}"},
        {"role": "user", "content": "ANSWER:"},
    ]
    completion = llm.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=chat_messages,
    )

    return completion.choices[0].message.content.strip()

In [None]:
# End-to-end RAG Fusion run for one question.
# Step 1: ask the chat model for alternative search queries.
original_query = "why does the City of costa mesa need a Enterprise Resource Planning System, are they having any issues or pain points with their current syste? Do they currently have an Enterprise Resource Planning System?"
generated_queries = generate_queries_chatgpt(original_query)

# Step 2: run every generated query against the retriever, keyed by query text.
all_results = {}
for query in generated_queries:
    search_results = vector_search(query)
    all_results[query] = search_results

# Step 3: fuse the per-query rankings, then answer the original question from
# the fused context.
reranked_result = reciprocal_rank_fusion(all_results)
final_output = generate_output(original_query, reranked_result)

In [None]:
# Show the search queries the model generated for the original question.
print(generated_queries)

In [None]:
# Display the answer returned by generate_output().
final_output

PART IV

CSV Custom Agent

In [None]:
import pandas as pd
from langchain.agents import AgentExecutor, OpenAIFunctionsAgent
from langchain.agents.agent_toolkits.conversational_retrieval.tool import (
    create_retriever_tool,
)
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.smith import RunEvalConfig, run_on_dataset
from langchain.tools import PythonAstREPLTool
from langchain.vectorstores import FAISS
from langsmith import Client
from pydantic import BaseModel, Field

In [None]:
# Limit pandas display output to at most 20 rows and 20 columns.
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 20)

In [None]:
# Define Embeddings source
# This rebinds `embeddings` with the same model name and API-key lookup as the
# definition in Part I.
embeddings=GooglePalmEmbeddings(model_name="models/embedding-gecko-001",google_api_key=os.getenv("GOOGLE_API_KEY"))