# Expert Knowledge Assistant

### A question-answering system designed to serve as a highly capable knowledge worker. Built for employees at Insurellm, an Insurance Technology company, it prioritizes accuracy while keeping the implementation cost-effective.

This project leverages Retrieval-Augmented Generation (RAG) to deliver precise answers and reliable performance.


In [2]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [7]:
# imports for LangChain, Chroma, NumPy, scikit-learn (t-SNE) and Plotly

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [None]:
# Configuration constants
# MODEL: name of the low-cost chat model (not referenced in the cells shown here;
#        presumably consumed by a later chat/LLM cell)
# db_name: directory in which Chroma persists the vector store

MODEL = "gpt-4o-mini"
db_name = "vector_db"

In [9]:
# Load environment variables from a local .env file; override=True lets values
# in .env replace any that are already set in the process environment.

load_dotenv(override=True)

# Fail early with a readable message: writing None into os.environ raises an
# opaque TypeError, and the embedding client below is built without an explicit key.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; add it to your .env file or environment")
os.environ['OPENAI_API_KEY'] = openai_api_key

In [10]:
# Load all markdown files from subfolders in "knowledge-base" and tag each with its folder name

# Keep directories only (DirectoryLoader rejects a plain-file path such as a stray
# README at the top level) and sort them so the document order is reproducible.
folders = sorted(path for path in glob.glob("knowledge-base/*") if os.path.isdir(path))
text_loader_kwargs = {'encoding': 'utf-8'}
# If some files are not valid UTF-8, use {'autodetect_encoding': True} instead.

documents = []
for folder in folders:
    # The sub-folder name doubles as the document category
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    for doc in loader.load():
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [11]:
# Split documents into chunks targeting 1000 characters with a 200-character overlap.
# chunk_size is not a hard limit: the run output below reports a 1088-character chunk.

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

Created a chunk of size 1088, which is longer than the specified 1000


In [12]:
# Number of chunks produced by the splitter
len(chunks)

123

In [13]:
# Report the distinct doc_type tags carried by the chunks

doc_types = {chunk.metadata['doc_type'] for chunk in chunks}
type_listing = ', '.join(doc_types)
print(f"Document types found: {type_listing}")

Document types found: contracts, company, products, employees


## Embeddings and "Auto-Encoding LLMs"

In this step, I'll convert each chunk of text into a vector that captures its meaning, known as an embedding.

To do this, I'm using OpenAI's embedding model via their API, integrated through LangChain. This model falls under the category of "Auto-Encoding LLMs": rather than generating text one token at a time, it processes the entire input and produces its output (here, a single vector) all at once.


In [14]:
# Initialize OpenAI's embedding model
# No arguments are passed, so the library's default model is used and the API key
# is presumably picked up from the OPENAI_API_KEY environment variable.

embeddings = OpenAIEmbeddings()

In [15]:
# Start clean: when a persisted database directory is already present,
# open it and drop its collection before the store is rebuilt

db_already_present = os.path.exists(db_name)
if db_already_present:
    stale_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
    stale_store.delete_collection()

In [16]:
# Embed the chunks into a Chroma vectorstore persisted under db_name,
# then report how many entries the underlying collection holds

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
stored_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_count} documents")

Vectorstore created with 123 documents


In [17]:
# Inspect one stored embedding to find out how many dimensions the vectors have

collection = vectorstore._collection
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 1,536 dimensions


## Exploring the Vector Store

Taking a moment to inspect the documents and their embeddings to understand how they're represented.

In [18]:
# Retrieve embeddings, documents, and metadata from the collection, then extract doc types and assign colors for visualization

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: this rebinds `documents` (earlier the list of loaded documents) to the
# texts stored in the collection; the plotting cells slice these for hover previews.
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]

# One color per known document type. A dict lookup with a fallback keeps the
# plot working when the knowledge base gains a folder that is not listed here
# (a list.index() lookup would raise ValueError for such a type).
color_by_type = {
    'products': 'blue',
    'employees': 'green',
    'contracts': 'red',
    'company': 'orange',
}
colors = [color_by_type.get(t, 'gray') for t in doc_types]

In [26]:
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

# scikit-learn rejects a perplexity that is not smaller than the number of
# samples, so clamp the usual value of 30 for small collections.
perplexity = max(1, min(30, len(vectors) - 1))
tsne = TSNE(n_components=2, perplexity=perplexity, max_iter=1000, random_state=100)
reduced_vectors = tsne.fit_transform(vectors)

# Create a richer 2D scatter plot with document info and type-based coloring
# (`go` is plotly.graph_objects, imported with the other libraries at the top)

# Hover text: the document type plus the first 200 characters of the chunk
hover_text = [f"<b>Type:</b> {t}<br><b>Preview:</b> {d[:200]}..." for t, d in zip(doc_types, documents)]

fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(
        size=7,
        color=colors,
        opacity=0.85,
        line=dict(width=0.5, color='black')
    ),
    text=hover_text,
    hoverinfo='text'
)])

fig.update_layout(
    title='t-SNE Projection of Chroma Vector Store',
    xaxis=dict(title='t-SNE X', showgrid=True),
    yaxis=dict(title='t-SNE Y', showgrid=True),
    width=900,
    height=700,
    margin=dict(l=50, r=50, t=70, b=50),
    plot_bgcolor='rgba(245, 245, 245, 1)'
)

fig.show()

In [27]:
# Same visualization in 3D: project the vectors onto three t-SNE components

# scikit-learn rejects a perplexity that is not smaller than the number of
# samples, so clamp the usual value of 30 for small collections.
perplexity = max(1, min(30, len(vectors) - 1))
tsne = TSNE(n_components=3, perplexity=perplexity, max_iter=1000, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Build the 3D scatter plot using Plotly
# (`go` is plotly.graph_objects, imported with the other libraries at the top)

# Hover text: the document type plus the first 200 characters of the chunk
hover_text = [f"<b>Type:</b> {t}<br><b>Preview:</b> {d[:200]}..." for t, d in zip(doc_types, documents)]

fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(
        size=6,
        color=colors,
        opacity=0.85,
        line=dict(width=0.5, color='black')
    ),
    text=hover_text,
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Visualization of Chroma Vector Embeddings',
    scene=dict(
        xaxis=dict(title='t-SNE X'),
        yaxis=dict(title='t-SNE Y'),
        zaxis=dict(title='t-SNE Z'),
        bgcolor='rgba(255,255,255,0.95)'
    ),
    width=950,
    height=750,
    margin=dict(l=40, r=40, b=40, t=80)
)

fig.show()