This notebook implements a RAG pipeline using LangChain 1.0 (v1.0.5).

I'll also use the `all-MiniLM-L6-v2` embedding model.

In [2]:
# Import all libraries

import glob
import os

import numpy as np
import plotly.graph_objects as go
import tiktoken
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sklearn.manifold import TSNE

In [3]:
# Configuration. API cost matters for this project, hence the 'gpt-4.1-nano' model.

MODEL = "gpt-4.1-nano"
db_name = "vector_db"

# Load variables from a .env file (overriding any already set), then report
# whether the OpenAI key is available without printing more than its prefix.
load_dotenv(override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')
key_status = (
    f"OpenAI API Key exists and begins {openai_api_key[:8]}"
    if openai_api_key
    else "OpenAI API Key not set"
)
print(key_status)

OpenAI API Key exists and begins sk-proj-


### Analyze Documents

In [4]:
# Measure the size of the knowledge base in characters.

knowledge_base_path = "../knowledge_base/**/*.md"
files = glob.glob(knowledge_base_path, recursive=True)
print(f"Found {len(files)} files in the knowledge base")

# Read every markdown file and follow each one with a blank-line separator,
# then stitch the pieces together into a single string.
file_texts = []
for file_path in files:
    with open(file_path, 'r', encoding='utf-8') as f:
        file_texts.append(f.read() + "\n\n")

entire_knowledge_base = "".join(file_texts)

print(f"Total characters in knowledge base: {len(entire_knowledge_base):,}")

Found 31 files in the knowledge base
Total characters in knowledge base: 63,146


In [5]:
# How many tokens in all the documents

try:
    encoding = tiktoken.encoding_for_model(MODEL)
except KeyError:
    # tiktoken raises KeyError for model names it does not know, which happens
    # on releases that predate MODEL. Fall back to o200k_base, the encoding
    # tiktoken maps the GPT-4o / GPT-4.1 family to, so the count still works.
    encoding = tiktoken.get_encoding("o200k_base")

tokens = encoding.encode(entire_knowledge_base)
token_count = len(tokens)
print(f"Total tokens for {MODEL}: {token_count:,}")

Total tokens for gpt-4.1-nano: 13,397


In [6]:
# Load in everything in the knowledge base using Langchain's loaders.
# Each immediate sub-folder of the knowledge base is treated as one document
# type, and its name is stored in every document's metadata as "doc_type".

# Keep directories only: the "*" pattern also matches plain files sitting at
# the top level, and DirectoryLoader cannot be pointed at a file.
# Sort so the folder order (and with it positional look-ups such as
# documents[12]) does not depend on the file system's listing order.
folders = sorted(
    path for path in glob.glob("../knowledge_base/*") if os.path.isdir(path)
)

documents = []

for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs={"encoding": "utf-8"}
    )
    folder_docs = loader.load()
    # Tag every document with the folder it came from before collecting it.
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
    documents.extend(folder_docs)

print(f"Loaded {len(documents)} documents from the knowledge base")

Loaded 31 documents from the knowledge base


In [7]:
# Spot-check one loaded document (index 12 looks like an arbitrary sample).
documents[12]

Document(metadata={'source': '..\\knowledge_base\\contracts\\OEM Partnership Agreement with Quantum Leap AI for CoreLLM API.md', 'doc_type': 'contracts'}, page_content='# OEM Partnership and Licensing Agreement with Quantum Leap AI\n\n**Agreement ID:** OEM-QLAI-2024-002\n**Parties:**\n- **Licensor:** Innovatech Solutions Inc.\n- **Licensee:** Quantum Leap AI Inc.\n**Effective Date:** March 1, 2024\n**Term:** Five (5) years.\n\n## 1. Background\nQuantum Leap AI wishes to embed Licensor\'s advanced AI capabilities into its flagship "Cognitive Suite" platform. This agreement grants Quantum Leap AI the right to license and integrate Innovatech\'s **CoreLLM** API as a "Powered by Innovatech" feature.\n\n## 2. Royalties and Financials\n**2.1. Royalty Fee:** Licensee will pay Licensor a royalty fee equal to **twenty-five percent (25%)** of the Net Revenue received by Licensee from sales of the specific SKU of Cognitive Suite that includes the integrated CoreLLM functionality.\n**2.2. Minimum 

In [8]:
# Metadata of the sample document, including the "doc_type" tag set while loading.
documents[12].metadata

{'source': '..\\knowledge_base\\contracts\\OEM Partnership Agreement with Quantum Leap AI for CoreLLM API.md',
 'doc_type': 'contracts'}

In [9]:
# Raw text content of the sample document.
documents[12].page_content

'# OEM Partnership and Licensing Agreement with Quantum Leap AI\n\n**Agreement ID:** OEM-QLAI-2024-002\n**Parties:**\n- **Licensor:** Innovatech Solutions Inc.\n- **Licensee:** Quantum Leap AI Inc.\n**Effective Date:** March 1, 2024\n**Term:** Five (5) years.\n\n## 1. Background\nQuantum Leap AI wishes to embed Licensor\'s advanced AI capabilities into its flagship "Cognitive Suite" platform. This agreement grants Quantum Leap AI the right to license and integrate Innovatech\'s **CoreLLM** API as a "Powered by Innovatech" feature.\n\n## 2. Royalties and Financials\n**2.1. Royalty Fee:** Licensee will pay Licensor a royalty fee equal to **twenty-five percent (25%)** of the Net Revenue received by Licensee from sales of the specific SKU of Cognitive Suite that includes the integrated CoreLLM functionality.\n**2.2. Minimum Guarantee:** Licensee commits to a minimum annual royalty payment of **Two Hundred Fifty Thousand US Dollars ($250,000.00)** for each year of the term, payable quarterl

In [10]:
# Split the loaded documents into overlapping chunks with
# RecursiveCharacterTextSplitter (chunk_size=1000, chunk_overlap=200).

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

num_documents, num_chunks = len(documents), len(chunks)
print(f"Split {num_documents} documents into {num_chunks} chunks")

# Show the first chunk as a sanity check.
first_chunk = chunks[0]
print(f"First chunks:\n\n{first_chunk}")

Split 31 documents into 90 chunks
First chunks:

page_content='# About Innovatech Solutions: Our Story and Mission

## Our Story: From Garage to Global SaaS Leader

Innovatech Solutions was born in a Palo Alto garage in 2015 from a simple but powerful observation by our founders, Jane Doe and John Smith. While working at large tech companies, they saw brilliant teams bogged down by repetitive manual tasks, disjointed software, and data that was impossible to access without a team of analysts. They envisioned a future where intelligent software could act as a central nervous system for a business, automating workflows and making data-driven insights accessible to everyone.' metadata={'source': '..\\knowledge_base\\company\\about.md', 'doc_type': 'company'}


In [11]:
# Spot-check a single chunk produced by the splitter (index 16 looks like an arbitrary pick).
chunks[16]

Document(metadata={'source': '..\\knowledge_base\\company\\overview.md', 'doc_type': 'company'}, page_content='## 4. Corporate Information\n- **Legal Name:** Innovatech Solutions Inc.\n- **Type:** Delaware C-Corporation\n- **Key Investors:** Sequoia Capital, Andreessen Horowitz, Insight Partners\n- **Primary Auditor:** Deloitte\n- **Primary Legal Counsel:** Wilson Sonsini Goodrich & Rosati')

### Make vectors and store them in a vector database (I'm going to use Chroma)

In [12]:
# Embed the chunks with a HuggingFace model and persist them in Chroma.

embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# If a persisted database directory already exists, delete its collection
# first (presumably so a re-run does not stack chunks on top of an earlier run).
if os.path.exists(db_name):
    Chroma(
        persist_directory=db_name,
        embedding_function=embedding_model,
    ).delete_collection()

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory=db_name,
)

stored_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_count} documents")

Vectorstore created with 90 documents


In [13]:
# Inspect the stored vectors: how many there are and how long each one is.

collection = vectorstore._collection
count = collection.count()

# Fetch a single stored embedding; its length is the vector dimensionality.
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)

print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector database")

There are 90 vectors with 384 dimensions in the vector database


## Visualize

In [14]:
# Prework: pull embeddings, texts and metadata out of the Chroma collection
# and derive one marker colour per stored chunk for the plots.

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: this rebinds `documents` from the loaded Document objects to the plain
# chunk texts returned by Chroma. The name is kept because the plotting cells
# read it, but the loader cell must be re-run before re-running the splitter.
documents = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['doc_type'] for metadata in metadatas]

# One colour per known doc_type. A dict lookup with a fallback replaces the
# list `.index()` lookup, which raised ValueError for any doc_type outside
# these four (for example a newly added knowledge-base folder).
doc_type_colors = {
    'products': 'blue',
    'employees': 'green',
    'contracts': 'red',
    'company': 'orange',
}
colors = [doc_type_colors.get(t, 'gray') for t in doc_types]

In [17]:
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding).
# random_state is fixed so repeated runs use the same seed.

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot: one marker per chunk, coloured by doc_type,
# with the first 100 characters of the chunk text shown on hover.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# Axis titles of a 2D (cartesian) figure are set at the layout level.
# `scene` only configures 3D plots, so titles passed through it are not
# applied to a go.Scatter trace.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [16]:
# 3D visualization: project the vectors to three dimensions with t-SNE
# and draw them as an interactive scatter plot.

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Hover label per point: the doc type plus the first 100 characters of the text.
hover_labels = [
    f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)
]

# Build the 3D scatter trace, one marker per chunk, coloured by doc_type.
point_cloud = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker={'size': 5, 'color': colors, 'opacity': 0.8},
    text=hover_labels,
    hoverinfo='text',
)
fig = go.Figure(data=[point_cloud])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=900,
    height=700,
    margin={'r': 10, 'b': 10, 'l': 10, 't': 40},
)

fig.show()