In [None]:
import hashlib
import os
import re

import chromadb
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
load_dotenv()

![Diagram](./images/Basic_Rag.png)

# Data Extraction 

The first step in any RAG pipeline is to get the data you want to work with. In our case, we'll be extracting the text content from the Anthropic news website.

This code block defines a function bs4_extractor that uses BeautifulSoup to parse the HTML content of a webpage and extract the text. The RecursiveUrlLoader then uses this function to load the content from the specified URL. We set max_depth=2 to limit how deep the scraper will go into the website's links.

In [None]:
def bs4_extractor(html: str) -> str:
    """Return the text content of an HTML page.

    The markup is parsed with BeautifulSoup's ``lxml`` parser, every run of
    two or more newlines is squeezed down to exactly two, and leading and
    trailing whitespace is stripped.
    """
    page_text = BeautifulSoup(html, "lxml").text
    squeezed = re.sub(r"\n\n+", "\n\n", page_text)
    return squeezed.strip()

# ref: https://python.langchain.com/docs/integrations/document_loaders/
# Crawl the Anthropic news page; max_depth=2 caps how far links are followed,
# and bs4_extractor turns each fetched page into plain text.
loader = RecursiveUrlLoader(
    "https://www.anthropic.com/news",
    max_depth=2,
    extractor=bs4_extractor,
)

docs = loader.load()

In [None]:
len(docs) # Number of documents loaded from the Anthropic News site

In [None]:
# Let's inspect the metadata of one of the loaded documents. 
# The metadata provides useful information about the source of the document, such as the URL, title, and description.

# docs[1].page_content
docs[10].metadata   

In [None]:
# To get a better sense of the content we've loaded, 
# this code block defines a helper function wrap_text to format the text and then prints the content of each document.

def wrap_text(text, width=80):
    """Break *text* into rows of at most *width* characters joined by newlines.

    The text is cut every ``width`` characters, regardless of word boundaries
    or line breaks already present in it.
    """
    row_starts = range(0, len(text), width)
    rows = (text[start:start + width] for start in row_starts)
    return "\n".join(rows)

# Print every loaded page, each followed by a 100-dash divider line.
for doc in docs:
    print(wrap_text(doc.page_content), "-" * 100, sep="\n")

# Chunk the data

Now that we have our documents, the next step is to split them into smaller chunks. This is important for a few reasons:

__Vector search efficiency__: Smaller chunks are easier to search and retrieve.

__Context window limitations__: LLMs have a limited context window, so we need to make sure the retrieved information fits within that window.

__Relevance__: Smaller chunks are more likely to be focused on a specific topic, which improves the relevance of the retrieved information.

We'll use the `RecursiveCharacterTextSplitter` to split our documents. This splitter tries to split text on a series of characters (like newlines, spaces, etc.) in a recursive manner.

- `chunk_size=1000`: This sets the maximum size of each chunk to 1000 characters.

- `chunk_overlap=200`: This creates an overlap of 200 characters between consecutive chunks. This helps to ensure that we don't lose any important context at the boundaries of our chunks.

- `add_start_index=True`: This will add the starting index of the chunk in the original document to the metadata.

In [None]:
# Split every loaded document into overlapping character chunks.
# ref: https://python.langchain.com/docs/concepts/text_splitters/
splitter_options = {
    "chunk_size": 1000,  # maximum chunk length, in characters
    "chunk_overlap": 200,  # characters shared by consecutive chunks
    "add_start_index": True,  # store each chunk's start index in its metadata
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

splits = text_splitter.split_documents(docs)

In [None]:
# Print every chunk, each followed by a 100-dash divider line.
for split in splits:
    print(wrap_text(split.page_content), "-" * 100, sep="\n")

In [None]:
splits[2].metadata

# Indexing

Now that we have our document chunks, we need to create an index that we can search. We'll use a vector store for this, which allows us to perform semantic search on our documents.

In this block, we're setting up our vector store using ChromaDB and Google's Generative AI embeddings.

- `embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")`: This initializes the embedding model that will be used to convert our document chunks into numerical vectors.

- `vector_store = Chroma(...)`: This creates a ChromaDB vector store.

- `collection_name`: A name for our collection of documents.

- `embedding_function`: The embedding model to use.

- `persist_directory`: The directory where the vector store data will be saved locally.

In [None]:
# Indexing

# One on-disk location, shared by the Chroma client settings and the vector store.
chroma_dir = "./chroma_db"

# Embedding model used to turn document chunks into vectors.
# ref: https://python.langchain.com/docs/integrations/text_embedding/
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Chroma client settings with anonymized telemetry switched off.
client_settings = chromadb.config.Settings(
    anonymized_telemetry=False,
    persist_directory=chroma_dir,
)

# Vector store that saves its data locally under chroma_dir.
# ref: https://python.langchain.com/docs/concepts/vectorstores/
vector_store = Chroma(
    client_settings=client_settings,
    persist_directory=chroma_dir,
    embedding_function=embeddings,
    collection_name="1_basic_rag_collection",
)

# vector_store.delete_collection()

In [None]:
# Add the document chunks to the vector store: each chunk is converted into a
# vector and stored in the collection.
#
# The store is persisted to disk, so letting it auto-generate IDs would insert
# a fresh copy of every chunk each time this cell is re-run. Deriving the ID
# from the chunk's source, start index and text means a re-run reuses the same
# IDs instead of piling up duplicates.
def _chunk_id(chunk):
    """Return a deterministic ID built from a chunk's origin and content."""
    key = "\x1f".join(
        (
            str(chunk.metadata.get("source", "")),
            str(chunk.metadata.get("start_index", "")),
            chunk.page_content,
        )
    )
    return hashlib.sha256(key.encode("utf-8")).hexdigest()


# Keyed by ID so that byte-identical chunks collapse into a single entry and
# no ID is submitted twice in the same batch.
unique_chunks = {_chunk_id(chunk): chunk for chunk in splits}

document_ids = vector_store.add_documents(
    documents=list(unique_chunks.values()),
    ids=list(unique_chunks),
)
document_ids

In [None]:
# Let's retrieve a document from the vector store by its ID to confirm that it has been indexed correctly.
vector_store.get_by_ids([document_ids[1]])

# Retrieval

With our documents indexed, we can now perform retrieval. The goal of this step is to find the most relevant document chunks for a given user question.

First, we'll set up our LLM and a prompt template.

- `llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")`: We'll use the Gemini 1.5 Flash model for generation.

- `template = ...`: This is the prompt template that we'll use to combine the retrieved context with the user's question. The {context} and {question} are placeholders that will be filled in later.

In [None]:
# Prompt that combines the retrieved context with the user's question.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.

{context}

Question: {question}

Helpful Answer:"""

rag_prompt_template = PromptTemplate.from_template(template)

# Chat model used for generation (web search is disabled by default).
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")


In [None]:
# Let's define a sample user question.
user_question = "What is anthropic?"

In [None]:
# Now, we'll use the vector store's similarity_search method to find the top 5 most similar documents to the user's question.
retrieved_docs = vector_store.similarity_search(user_question, k=5)

In [None]:
# Review the retrieved docs to judge how relevant they are.
for doc in retrieved_docs:
    print(doc.page_content, "-" * 100, sep="\n")

In [None]:
#inspect metadata
retrieved_docs[0].metadata

In [None]:
# The similarity_search_with_score method returns the documents along with their similarity scores.
vector_store.similarity_search_with_score(user_question, k=5)

# Generation

The final step is to use the retrieved documents to generate an answer to the user's question.

We'll combine the content of the retrieved documents and use our prompt template to create a final prompt for the LLM.


In [None]:
# Generate the answer: join the retrieved chunks into one context string,
# fill in the prompt template, and send the result to the LLM.
docs_content = "\n\n".join([chunk.page_content for chunk in retrieved_docs])
prompt = rag_prompt_template.invoke(
    {"context": docs_content, "question": user_question}
)
response = llm.invoke(prompt)

In [None]:
#generated response
response.content

In [None]:
# We can also include citations in our response by extracting the source from
# the metadata of the retrieved documents. Several chunks can come from the
# same page, so repeated sources are collapsed; dict.fromkeys keeps the
# first-seen (retrieval) order.
sources = list(dict.fromkeys(doc.metadata["source"] for doc in retrieved_docs))

print(f"Sources: {sources}\n\n")
print(f'Answer: {response.content}')

### LangChain Retriever

LangChain provides a `Retriever` interface, which is a more general way to retrieve documents. A vector store can be used as the backbone of a retriever, but there are other types of retrievers as well.

Here, we're creating a retriever from our vector store. We can also specify search arguments like k (the number of documents to retrieve) and search_type.

In [None]:
# Ref: https://python.langchain.com/docs/concepts/retrievers/
# Wrap the vector store in a retriever that runs a similarity search with k=100.
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 100})

retrieved_docs = retriever.invoke(user_question)

# Print each retrieved chunk followed by a 100-dash divider line.
for doc in retrieved_docs:
    print(doc.page_content)
    print("-"*100)


In [None]:
# Now, let's put it all together in a single function.
def generate_answer(user_question, k=100):
    """Answer a question using retrieval-augmented generation.

    Args:
        user_question: The question to answer.
        k: Number of chunks to request from the vector store. Defaults to
            100, the value this function used before it was configurable.

    Returns:
        The text content of the LLM's response.
    """
    # Retrieve the chunks most similar to the question.
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": k})
    retrieved_docs = retriever.invoke(user_question)

    # Fill the prompt template with the retrieved context and ask the LLM.
    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
    prompt = rag_prompt_template.invoke({"question": user_question, "context": docs_content})
    response = llm.invoke(prompt)

    return response.content

# Run the end-to-end pipeline on a sample question.
user_question = "What is Anthropic?"
generate_answer(user_question)