## Embeddings Testing Notebook

### Part 1: SETUP

In [2]:
import os
import glob
import tiktoken
import numpy as np
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [None]:
# Model whose tokenizer is used for the token counts further down
MODEL = "gpt-4.1-nano"

# Name of the vector database
db_name = "vector_db"

# Load the .env file (override=True lets its values replace variables that are
# already set) and look up the API key dedicated to embeddings
load_dotenv(override=True)
openai_api_key = os.environ.get('OPENAI_API_KEY_EMBEDDINGS')

# Report only the first 8 characters so the full key never lands in the saved output
if not openai_api_key:
    print("OpenAI (Embeddings) API Key not set")
else:
    key_prefix = openai_api_key[:8]
    print(f"OpenAI (Embeddings) API Key exists and begins {key_prefix}")

OpenAI (Embeddings) API Key exists and begins sk-proj-


### Part 2: Divide the documents into chunks

Check how many characters there are in all the documents

In [31]:
# Define the folder and its structure to extract data from
folder_and_structure = "knowledge-base/**/*.md"

knowledge_base_path = f"../../data/{folder_and_structure}"


def read_knowledge_base(pattern):
    """Read every file matching a recursive glob pattern into one string.

    Args:
        pattern: glob pattern; ``**`` matches nested directories.

    Returns:
        A ``(paths, text)`` tuple. ``paths`` is the sorted list of matching
        file paths and ``text`` is the content of those files, in that order,
        each followed by a blank line. No match yields ``([], "")``.
    """
    # glob.glob returns matches in arbitrary order; sorting keeps the
    # concatenated text (and any token count derived from it) reproducible.
    paths = sorted(glob.glob(pattern, recursive=True))

    contents = []
    for path in paths:
        with open(path, 'r', encoding='utf-8') as f:
            contents.append(f.read())

    # Build the string once instead of growing it with += inside the loop.
    text = "".join(content + "\n\n" for content in contents)
    return paths, text


files, entire_knowledge_base = read_knowledge_base(knowledge_base_path)

print(f"Found {len(files)} files in the knowledge base")
print(f"Total characters in knowledge base: {len(entire_knowledge_base):,}")

Found 76 files in the knowledge base
Total characters in knowledge base: 304,434


Check how many tokens there are in all the documents. (NOTE: The count depends on the model, because each model family uses its own tokenizer.)

In [32]:
# Pick the tokenizer that matches MODEL. tiktoken raises KeyError for model
# names it does not know (for example a model newer than the installed
# tiktoken), so fall back to a general-purpose encoding instead of aborting.
# In that case the count is only an approximation for MODEL.
try:
    encoding = tiktoken.encoding_for_model(MODEL)
except KeyError:
    encoding = tiktoken.get_encoding("o200k_base")
    print(f"No tokenizer registered for {MODEL}; falling back to {encoding.name}")

# disallowed_special=() makes special-token text that happens to appear in a
# document (e.g. "<|endoftext|>") count as ordinary text instead of raising
# ValueError.
tokens = encoding.encode(entire_knowledge_base, disallowed_special=())
token_count = len(tokens)
print(f"Total tokens for {MODEL}: {token_count:,}")

Total tokens for gpt-4.1-nano: 63,555


# NOT FINISHED

Load everything in the knowledge base using LangChain's document loaders.

TIP: Try different document loaders to see which one produces the best embedding quality.

In [None]:
# Root of the knowledge base: the same location the character and token
# counts earlier in this notebook were computed from.
knowledge_base_dir = "../../data/knowledge-base"

# Each sub-folder is treated as one document type. Plain files sitting directly
# in the root are skipped because DirectoryLoader expects a directory
# (NOTE(review): such root-level .md files are counted by the recursive glob
# earlier in the notebook but are not loaded here; confirm that is intended).
# Sorting keeps the document order reproducible, since glob order is arbitrary.
folders = sorted(
    path for path in glob.glob(f"{knowledge_base_dir}/*") if os.path.isdir(path)
)

documents = []
for folder in folders:
    # The folder name becomes the "doc_type" metadata of every document in it
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs={'encoding': 'utf-8'})
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
    documents.extend(folder_docs)

print(f"Loaded {len(documents)} documents")