# RAG 

#### A low cost agent using RAD (Retrieval Augmented Generation) to show how question/answering assistant can be done with high accracy

In [1]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr

# imports for langchain

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [2]:
# Picking a low cost model

MODEL = "gpt-4o-mini"
db_name = "vector_db"

In [3]:
# Load environment variables in a file called .env

load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')

In [4]:
# This script loads all Markdown (.md) files from subfolders within the "knowledge-base" directory.
# Each subfolder represents a different document type (extracted from the folder name).
# It uses LangChain's DirectoryLoader to read the files, attaches metadata indicating the document type,
# and stores all loaded documents in the 'documents' list for further processing.

folders = glob.glob("knowledge-base/*")
text_loader_kwargs = {'encoding': 'utf-8'}

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [5]:
# Splitting the loaded documents into smaller chunks to make them suitable for processing (e.g., embedding or indexing).
# It uses a CharacterTextSplitter to divide the text into chunks of up to 1000 characters,
# with an overlap of 200 characters between consecutive chunks to preserve context across splits.

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

Created a chunk of size 1088, which is longer than the specified 1000


In [6]:
len(chunks)

123

In [7]:
# Extracts all unique document types from the metadata of the text chunks.
# It then prints a summary of the different types of documents that were processed,
# which helps in understanding the composition of the dataset.
doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
print(f"Document types found: {', '.join(doc_types)}")

Document types found: company, contracts, employees, products


## Embeddings and Auto-Encoding LLMs

So, here’s what I’m doing: I’m taking each chunk of text and converting it into a vector that captures its meaning—this process is called embedding.

To handle this, I’m using an embedding model provided by OpenAI. I’ll access it through their API, using some LangChain code to keep things simple.

This particular model falls under the category of "Auto-Encoding LLMs." These models work by processing the entire input at once to produce an output. That’s a bit different from the "Auto-Regressive LLMs" we’ve mostly been talking about—those generate new text by predicting the next word based only on the previous ones.


In [8]:
# Create an embedding function using OpenAI's model.
# This will convert text chunks into vector representations.
embeddings = OpenAIEmbeddings()

# If a Chroma vectorstore already exists at the specified directory,
# we delete the existing collection to avoid duplication or conflicts.
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create a new Chroma vectorstore from the previously split document chunks.
# The embeddings are used to convert text into vectors, which are then stored persistently.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)

# Print out the number of documents that were embedded and stored in the vectorstore.
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

  Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()


Vectorstore created with 123 documents


In [9]:
# Inspect a sample embedding from the Chroma vectorstore to determine the dimensionality of the vectors.

collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 1,536 dimensions


## Exploring the Vector Store

Before moving on, let’s take a moment to examine the documents and their embedding vectors to get a better sense of how they’re structured.

In [10]:
# Retrieve embeddings, original documents, and metadata from the vector store.
# Convert embeddings to a NumPy array and assign a color to each document based on its type.

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]
colors = [['blue', 'green', 'red', 'orange'][['products', 'employees', 'contracts', 'company'].index(t)] for t in doc_types]

In [11]:
# Use t-SNE to reduce high-dimensional embeddings to 2D for visualization.
# Plot the result with Plotly, coloring points by document type and adding text previews for context.

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [12]:
# Reduce embeddings to 3D using t-SNE for a more detailed visualization.
# Plot the 3D scatter using Plotly, coloring points by document type and showing text previews on hover.

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

# Bringing It All Together with LangChain

Now that we’ve processed and visualized our documents, it’s time to connect everything using LangChain. 
We’ll tie together embeddings, vector storage, and retrieval to enable powerful question-answering over our dataset.


In [13]:
# Set up a conversational retrieval chain using LangChain.
# This connects the LLM with a retriever (based on our vector store) and a memory buffer to maintain chat history.

llm = ChatOpenAI(temperature=0.7, model_name=MODEL)
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
retriever = vectorstore.as_retriever()
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)


Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/



In [14]:
# Send a user query through the conversational retrieval chain.
# The LLM uses retrieved context from the vector store and conversation history to generate a response.

query = "Can you describe Insurellm in a few sentences"
result = conversation_chain.invoke({"question":query})
print(result["answer"])

Insurellm is an innovative insurance tech startup founded by Avery Lancaster in 2015, aimed at disrupting the insurance industry with cutting-edge products. With a workforce of 200 employees and 12 offices across the US, Insurellm offers four main software products: Carllm for auto insurance, Homellm for home insurance, Rellm for the reinsurance sector, and Marketllm, a marketplace connecting consumers with insurance providers. The company has rapidly expanded, serving over 300 clients worldwide and focusing on delivering reliable and transformative solutions in the insurance landscape.


In [15]:
# Reinitialize the conversation chain with a fresh memory buffer.
# This clears previous interactions, starting a new conversational session with the LLM and retriever.

memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

## Launching a Chat Interface with Gradio

We’ll now spin up a simple Gradio chat interface to interact with our LLM-powered retrieval system.  
It’s a fast and intuitive way to prototype and test conversational experiences.

In [16]:
# Define a chat function for Gradio that passes user messages to the conversation chain.
# Launch a simple Gradio chat interface in the browser to interact with the LLM in real time.

def chat(message, history):
    result = conversation_chain.invoke({"question": message})
    return result["answer"]

view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


## Swapping Chroma for FAISS

Now we’ll switch our vector store backend from Chroma to FAISS—Facebook’s AI Similarity Search library.  
FAISS is optimized for efficient similarity search and clustering, especially with large-scale vector datasets.

In [17]:
from langchain.vectorstores import FAISS

In [18]:
# Initialize FAISS vector store with document chunks and OpenAI embeddings.
# Retrieve and print the total number of vectors and their dimensionality from the FAISS index.

embeddings = OpenAIEmbeddings()

vectorstore = FAISS.from_documents(chunks, embedding=embeddings)

total_vectors = vectorstore.index.ntotal
dimensions = vectorstore.index.d

print(f"There are {total_vectors} vectors with {dimensions:,} dimensions in the vector store")

There are 123 vectors with 1,536 dimensions in the vector store


In [19]:
# Reconstruct each embedding vector from the FAISS index and retrieve its associated document.
# Collect the document text, metadata (doc_type), and assign a color for visualization.

vectors = []
documents = []
doc_types = []
colors = []
color_map = {'products':'blue', 'employees':'green', 'contracts':'red', 'company':'orange'}

for i in range(total_vectors):
    vectors.append(vectorstore.index.reconstruct(i))
    doc_id = vectorstore.index_to_docstore_id[i]
    document = vectorstore.docstore.search(doc_id)
    documents.append(document.page_content)
    doc_type = document.metadata['doc_type']
    doc_types.append(doc_type)
    colors.append(color_map[doc_type])
    
vectors = np.array(vectors)

## Exploring the Vector Space

Let’s take a moment to visualize the documents alongside their embedding vectors to better understand the structure.

(Just a quick aside: what you’re seeing here is the spatial distribution of vectors produced by OpenAI’s embedding model. Since these vectors are the same regardless of whether they’re stored in FAISS or Chroma, the visual output remains essentially unchanged.)


In [20]:
# Use t-SNE to reduce FAISS vectors to 2D and visualize them with Plotly.
# Points are colored by document type and include a preview of text content on hover.

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D FAISS Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [21]:
# Apply t-SNE to reduce FAISS vectors to 3D and create an interactive scatter plot with Plotly.
# Each point represents a document, colored by type and annotated with a text snippet.

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D FAISS Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

## Time to use LangChain

In [22]:
# Set up a conversational retrieval chain using FAISS as the retriever backend.
# Initialize the LLM, conversation memory, and link everything into a LangChain pipeline.
# Send a query to the chain and print the LLM's response, using relevant documents retrieved from FAISS.

llm = ChatOpenAI(temperature=0.7, model_name=MODEL)
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
retriever = vectorstore.as_retriever()
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

query = "Can you describe Insurellm in a few sentences"
result = conversation_chain.invoke({"question":query})
print(result["answer"])

Insurellm is an innovative insurance tech startup founded by Avery Lancaster in 2015, designed to disrupt the insurance industry with cutting-edge products. The company has grown to 200 employees and operates 12 offices across the US, offering four main software products: Carllm for auto insurance, Homellm for home insurance, Rellm for the reinsurance sector, and Marketllm, a marketplace connecting consumers with insurance providers. With over 300 clients worldwide, Insurellm is committed to transforming the insurance landscape through innovation and reliability.


In [23]:
# Reset the conversation chain with a fresh memory buffer to start a new dialogue session.

memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

## Launching a Gradio Chat Interface

Next, we'll use Gradio to quickly set up a chat interface for our LLM.

In [24]:
def chat(message, history):
    result = conversation_chain.invoke({"question": message})
    return result["answer"]

view = gr.ChatInterface(chat).launch()


The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.



* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.
