# Information
1. This files create a dense vectors (embeddings) from a group of documents
2. Stores them in a vector database
3. Executes simple test to see if able to be used in conjuction with a LLM

Defaults:
1. Filetype: .pdf
2. Models: text-embedding-3-small & gpt-4-mini

# Config

In [2]:
#config
import os
# FILE_PATH_TO_EMBED = os.path.join("pension-martijn-files", "Kader Datakwaliteit - wet toekomst pensioenen.pdf.pdf")
DIRECTORY_TO_EMBED = os.path.join("pension-martijn-files")

COLLECTION_NAME = "DATA_QUALITY_PENSION_TEST-UV" #name of the collection
PERSIST_DIRECTORY = os.path.join("vector_stores", "pension-martijn-embeddings")

#os.path.join( "solvency-II-files", "solvency II - level 1 - v2.pdf")

# Embedding Model

In [3]:
from langchain_openai import AzureOpenAIEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings


# Set your Azure OpenAI credentials
embedding_api_key = os.getenv("AZURE_OPENAI_API_KEY")
# print(api_key)

# Create an AzureOpenAIEmbeddings object
# embedding_model = AzureOpenAIEmbeddings(
#     model="text-embedding-3-small",
#     azure_endpoint="https://openai-playground-bjorn.openai.azure.com/",
#     api_version="2023-05-15",
#     api_key=embedding_api_key
# )

# model = "gemini-embedding-exp-03-07"
model = "text-embedding-004"

# from functools import partial

embedding_model = GoogleGenerativeAIEmbeddings(
        model=f"models/{model}", 
        google_api_key=os.environ["GOOGLE_API_KEY"],
        request_options={"initial": 70, "multiplier": 2}
    )   

#overwrite default batch_size of 100 for rate limit (google)
# embedding_model.embed_documents = partial(embedding_model.embed_documents, batch_size=5)

# test out the embedding:

# Create embeddings for some text
text = "I love learning about computers!"
text_embeddings = embedding_model.embed_query(text)

# Print the embeddings
print(text_embeddings)
texts = ["Hello, world!", "How are you?"]
document_embeddings = embedding_model.embed_documents(texts)
print(document_embeddings)

[-0.03552018478512764, -0.03240171819925308, -0.043218884617090225, 0.03457774221897125, 0.00269303354434669, -0.023269498720765114, 0.03638819605112076, 0.03506842255592346, 0.0008185613551177084, 0.026347391307353973, -0.01686306856572628, -0.007054108660668135, 0.047175709158182144, 0.007422426249831915, -0.013716100715100765, -0.04712850973010063, 0.03405393660068512, 0.08210980147123337, -0.11261429637670517, 0.01874205842614174, -0.002721605822443962, -0.06915194541215897, -0.0588352233171463, 0.015029589645564556, -0.0047036223113536835, -0.03829340264201164, 0.0046663908287882805, -0.06859877705574036, 0.03403034061193466, 0.0016564340330660343, 0.010664466768503189, 0.07998798042535782, -0.031778618693351746, -0.04906725510954857, 0.038223713636398315, 0.031131505966186523, -0.05103333666920662, -0.03974412754178047, 0.023270228877663612, -0.014968273229897022, -0.01651586964726448, -0.0017583590233698487, -0.046285223215818405, 0.016849584877490997, 0.02202308177947998, -0.00

In [4]:
import warnings

# load pdf
# from langchain_community.document_loaders import PyPDFLoader
# loader = PyPDFLoader(FILE_PATH_TO_EMBED)

# load directory of PDFs
from langchain_community.document_loaders import PyPDFDirectoryLoader
loader = PyPDFDirectoryLoader(DIRECTORY_TO_EMBED)

pages = []
async for page in loader.alazy_load():
    pages.append(page)


print(f"LOADED DOCUMENT WITH {len(pages)} PAGES")
if len(pages) > 50:
    warnings.warn("DOCUMENT PAGES LOADED IN IS LARGE THAN 100, MAY INCUR SIGNIFICANT COSTS")


for page in pages:
    if page.page_content == None or page.page_content == "":
        warnings.warn("FOUND PAGES IN DOCUMENT WITHOUT PAGE_CONTENT")

ModuleNotFoundError: No module named 'langchain_community'

In [5]:
from IPython.display import Markdown
Markdown(pages[1].page_content)

NameError: name 'pages' is not defined

In [7]:
pages[10].__dict__

{'id': None,
 'metadata': {'producer': 'Microsoft® Word voor Microsoft 365',
  'creator': 'Microsoft® Word voor Microsoft 365',
  'creationdate': '2022-10-23T18:44:56+02:00',
  'author': 'Otto Hulst',
  'moddate': '2022-10-23T18:44:56+02:00',
  'source': 'pension-martijn-files\\Kader Datakwaliteit - wet toekomst pensioenen.pdf.pdf',
  'total_pages': 61,
  'page': 10,
  'page_label': '11'},
 'page_content': '11 \n \n \n \nIn onderstaand schema wordt iedere stap per fase kort toegelicht om een beeld te \ngeven van de reeks van werkzaamheden ten behoeve van het aantoonbaar toetsen van \ndatakwaliteit vóór het invaren. Vanaf hoofdstuk 3 zullen de stappen per fase één voor \néén uitgebreider worden toegelicht door in te gaan op de benodigdheden voor de stap \nen de uit te voeren activiteiten. Hierbij wordt door de pensioenuitvoerder elke stap \ndoorlopen en de uitvoering en uitkomsten daarvan worden gedocumenteerd. Op basis \nvan het principe ‘comply or explain’ kan de pensioenuitvoerder on

In [8]:
#for brevity we will drop a few pages from solvency II to test embedding cost for cheap model
from langchain_text_splitters import RecursiveCharacterTextSplitter

print(f"DOCUMENT WITH {len(pages)} PAGES")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,   # Each chunk will be 1000 characters
    chunk_overlap=100  # Overlap of 100 characters between chunks
)

split_pages = text_splitter.transform_documents(pages)

print(f"NUMBER OF CHUNKS: {len(split_pages)}")

DOCUMENT WITH 130 PAGES
NUMBER OF CHUNKS: 372


In [6]:
from langchain_chroma import Chroma
vectorstore = Chroma(COLLECTION_NAME, embedding_model, persist_directory=PERSIST_DIRECTORY)
print(vectorstore._collection.count())
# test_vectorstore = Chroma.from_documents(split_pages[:3], embedding_model, collection_name=COLLECTION_NAME, persist_directory=PERSIST_DIRECTORY)
# print(test_vectorstore._collection.count())

0


In [7]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_chroma import Chroma

# In memory vector store for testing only
# vectorstore = InMemoryVectorStore(embedding_model)
# create a database with vector embedding and documents.

# note: behavior of from_documents is to add if vectorstore already exists
delete_vector_store_check = input("Do you want to delete the vector_store before adding these documents?")
if delete_vector_store_check == "Y":
    Chroma.delete_collection(Chroma(COLLECTION_NAME, embedding_model, persist_directory=PERSIST_DIRECTORY))

# Save a new vector store based on config at top of this file (create_vector_store.ipynb)
check = input("Are you sure you want to run this command? Creating a vectorstore will append to the existing vectorstore and may incur high costs (Y/n)")

if check == "Y":
    print("CREATING A NEW VECTOR STORE AT SPECIFIED LOCATION")
    vectorstore = Chroma.from_documents(split_pages, embedding_model, collection_name=COLLECTION_NAME, persist_directory=PERSIST_DIRECTORY)
    print(vectorstore._collection.count())

# LLM
Test Cases for embedding

In [17]:
import os
from langchain_openai import AzureChatOpenAI

# setup the llm
llm_api_key = os.getenv("GPT_4O_MINI_API_KEY")
# print(llm_api_key)
llm = AzureChatOpenAI(
    model="gpt-4o-mini",
    azure_endpoint="https://openai-playground-bjorn.openai.azure.com/",
    api_version="2025-01-01-preview",
    api_key=llm_api_key
)

# load the vectorstore
from langchain_chroma import Chroma
vectorstore = Chroma(COLLECTION_NAME, embedding_model, persist_directory=PERSIST_DIRECTORY)
print(vectorstore._collection.count())

654


In [18]:
query = "kan je mij een introductie geven tot fase 4: rapportage en beoordeling"
matched_documents = vectorstore.similarity_search(query=query,k=1)

print(f"First documents of all matched documents: {matched_documents[0].__dict__}")

prompt = ""
document_source = []
for idx, document in enumerate(matched_documents):
    document_source.append(document.metadata["source"])
    prompt += f"""
    source {idx}:
    {document.page_content}

    """

print(document_source)
# print(prompt)

prompt += f"""
Based the above mentioned sources, can you provide an answer on the following question?
Question: {query}
Answer: 
"""

print(prompt)

First documents of all matched documents: {'id': '684ca3a6-d6bd-41c5-ba08-89d8b23ea302', 'metadata': {'author': 'Otto Hulst', 'creationdate': '2022-10-23T18:44:56+02:00', 'creator': 'Microsoft® Word voor Microsoft 365', 'moddate': '2022-10-23T18:44:56+02:00', 'page': 32, 'page_label': '33', 'producer': 'Microsoft® Word voor Microsoft 365', 'source': 'pension-martijn-files\\Kader Datakwaliteit - wet toekomst pensioenen.pdf.pdf', 'total_pages': 61}, 'page_content': '33 \n \n \n \n6. Fase 4: Rapportage en beoordeling \n \n6.1 Inleiding \nOp basis van de resultaten uit de vorige fasen wordt een rapportage opgesteld door \nde pensioenuitvoerder conform de daarvoor binnen de pensioenuitvoerder \nvastgestelde governance, waarbij sleutelhouders expliciet worden betrokken. De \nrapportage wordt vervol gens beoordeeld door het bestuur. Het kader onderscheidt \nvoor deze fase de volgende vier stappen:  \n \n1. Rapportage en evaluatie van fase 2 en 3.  \n2. Beoordeling en besluitvorming door het b

In [19]:
response = llm.invoke(prompt)

In [8]:
from IPython.display import Markdown
Markdown(response.content)

NameError: name 'response' is not defined