In [5]:
import argparse
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain_chroma import Chroma
from embedding_function import get_embedding_function
from langchain_huggingface import HuggingFaceEmbeddings

In [6]:
# Directory that load_document() scans for PDF files.
DATA_PATH = "Data"
# Directory handed to Chroma as persist_directory for the vector store.
CHROMA_PATH = "chroma"

# Create the persistence directory up front; exist_ok=True makes re-running
# this cell harmless when the directory is already there.
os.makedirs(CHROMA_PATH, exist_ok=True)


In [7]:
def main():
    """Entry point: run the ingestion pipeline by calling create_data_store()."""
    create_data_store()


#def create_data_store():
 #   documents = load_document()
  #  chunks = splittext(documents)
   # save_to_chroma(chunks)
    
def load_document():
    """Load the PDF files found under ``DATA_PATH``.

    Returns:
        The result of ``PyPDFDirectoryLoader(DATA_PATH).load()``.
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

def splittext(
    documents: list[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 80,
):
    """Split documents into overlapping text chunks.

    Args:
        documents: Documents to split (e.g. the output of ``load_document``).
        chunk_size: Maximum chunk length, measured with ``len``. Defaults to
            500, the value that used to be hard-coded here.
        chunk_overlap: Maximum overlap between consecutive chunks, in the
            same units. Defaults to 80, the value that used to be hard-coded.

    Returns:
        A new list containing only the ``Document`` instances produced by
        the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    chunks = text_splitter.split_documents(documents)
    # Defensive filter: keep only Document instances. This does not flatten
    # nested lists; anything that is not a Document is simply dropped.
    return [chunk for chunk in chunks if isinstance(chunk, Document)]

# Helper: give every chunk a positional id.
def calculate_chunk_ids(chunks: list[Document]):
    """Tag each chunk with a positional id and return the chunks with their ids.

    The id of the chunk at index ``i`` is ``"chunk_<i>"``; it is also stored
    in that chunk's ``metadata["chunk_id"]`` (the chunks are modified in
    place). Ids depend only on the position within ``chunks``, so two
    separate calls produce the same ids.

    Args:
        chunks: Chunks to label.

    Returns:
        A ``(chunks, ids)`` tuple: the same list that was passed in, and the
        list of id strings in matching order.
    """
    ids = []
    for position, chunk in enumerate(chunks):
        chunk_id = f"chunk_{position}"
        chunk.metadata["chunk_id"] = chunk_id
        ids.append(chunk_id)
    return chunks, ids


# Load the existing database and calculate the chunk ids.
def add_chunks_to_db(chunks: list[Document]):
    """Add ``chunks`` to the Chroma store at ``CHROMA_PATH`` under explicit ids.

    The ids come from ``calculate_chunk_ids``, which also writes them into
    each chunk's metadata. After the insert, the total number of entries in
    the underlying collection is printed.

    Args:
        chunks: Chunks to store.
    """
    store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )
    tagged_chunks, chunk_ids = calculate_chunk_ids(chunks)
    store.add_documents(tagged_chunks, ids=chunk_ids)

    # _collection is a private attribute of the wrapper; it is used here only
    # to report the collection size.
    total = store._collection.count()
    print(f"Database updated. Total documents in the database: {total}")

    # Author's note (translated): example of a load-document and
    # split-document function.



# Open the existing database and add the chunks without creating duplicates.
def add_to_chroma(chunks: list[Document]):
    """Add ``chunks`` to the Chroma store at ``CHROMA_PATH`` idempotently.

    Each chunk is stored under a deterministic id of the form
    ``"<source>:<page>:<index>"``, where ``source`` and ``page`` are read
    from the chunk's metadata (``None`` if absent) and ``index`` counts the
    chunks seen so far for that source/page pair. Because the ids are stable,
    running the pipeline again over the same input overwrites the existing
    entries instead of inserting a second copy of every chunk, which is what
    happened when no ids were supplied.

    Limitations:
        * Entries written by earlier runs under other ids are not removed.
        * The ids are only stable if the chunks arrive in the same order on
          every run.

    Args:
        chunks: Chunks to store. An empty list is accepted and adds nothing.
    """
    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
    )

    # Build one id per chunk; the per-page counter keeps ids unique within
    # the batch even when several chunks come from the same page.
    seen_per_page: dict[str, int] = {}
    chunk_ids = []
    for chunk in chunks:
        page_key = f"{chunk.metadata.get('source')}:{chunk.metadata.get('page')}"
        index = seen_per_page.get(page_key, 0)
        seen_per_page[page_key] = index + 1
        chunk_ids.append(f"{page_key}:{index}")

    # Skip the write for an empty batch so the vector store is never asked to
    # insert zero records.
    if chunks:
        db.add_documents(chunks, ids=chunk_ids)

    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
    print("Documents in DB:", db._collection.count())

def create_data_store():
    """Run the ingestion pipeline: load the PDFs, split them, store the chunks.

    After the chunks have been handed to ``add_to_chroma``, the first 200
    characters of every chunk are printed as a quick visual check.
    """
    chunks = splittext(load_document())
    add_to_chroma(chunks)

    # Preview the chunks (not the unsplit documents).
    for piece in chunks:
        preview = piece.page_content[:200]
        print(f"Content: {preview}...")


def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at ``CHROMA_PATH`` from scratch.

    Any existing directory at ``CHROMA_PATH`` is deleted, then a new store is
    created from ``chunks``. The store is created whether or not an old
    directory existed; previously the creation step was nested under the
    existence check and was skipped when the directory was missing.

    Args:
        chunks: Chunks to embed and store.
    """
    # Remove the previous store, if there is one.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new database.
    db = Chroma.from_documents(
        chunks, get_embedding_function(), persist_directory=CHROMA_PATH
    )

    # Call persist() only when the wrapper provides it, so the function works
    # with Chroma wrappers that expose the method and with those that do not.
    persist = getattr(db, "persist", None)
    if callable(persist):
        persist()

    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")


In [8]:
# Run the ingestion pipeline through the main() entry point, which simply
# calls create_data_store().
if __name__ == "__main__":
    main()

Saved 1978 chunks to chroma.
Documents in DB: 3956
Content: Annual Report  
2024
A.P . Møller - Mærsk A/S 
Esplanaden 50, DK-1263 Copenhagen K | Registration no. 22756214 ALL THE WAY...
Content: A.P . Moller - Maersk (Maersk) has prepared an integrated Annual Report in 2024  
and no longer publishes separate sustainability and financial reports. 
About this report
The 2024 Integrated Annual R...
Content: the mandatory European Sustainability Reporting Standards (ESRS). 
As a result, the new sustainability statement has been included in 
the Management Review. The sustainability statement is prepared i...
Content: not included in the sustainability statement and are instead integrated 
into other parts of the Annual Report and the Remuneration Report. 
The Annual Report’s consolidated financial statements have ...
Content: company’s performance, strategy, corporate governance, sustainability 
statement and financial results, including the Q4 results. 
Remuneration Report and policy
The 