In [1]:
# from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
# from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
import openai 
from dotenv import load_dotenv
import os
import shutil

In [2]:
# Load environment variables. Assumes that project contains .env file with API keys
load_dotenv()
#---- Set OpenAI API key 
# Change environment variable name from "OPENAI_API_KEY" to the name given in 
# your .env file.
openai.api_key = os.environ['OPENAI_API_KEY']

CHROMA_PATH = "chroma"
DATA_PATH = "data/books"

In [5]:
def main():
    generate_data_store()

def generate_data_store():
    documents = load_documents()
    print(documents)
    chunks = split_text(documents)
    print(len(chunks))
    save_to_chroma(chunks)

def load_documents():
    file_extensions = ["*.md", "*.txt"]

    if not os.path.exists(DATA_PATH) or not os.listdir(DATA_PATH):
        print(f"⚠️ Aucun fichier trouvé dans {DATA_PATH}")
        return []  # Retourne une liste vide pour éviter l'erreur

    loader = DirectoryLoader(DATA_PATH, glob="{*.md,*.txt}")
    documents = []
    for ext in file_extensions:
        loader = DirectoryLoader(DATA_PATH, glob=ext)
        documents.extend(loader.load())
    print(f"✅ {len(documents)} documents chargés.")
    return documents



def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=500,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    if len(chunks) > 10:
        print(f"📌 Nombre total de chunks générés : {len(chunks)}")

        document = chunks[10]
        print(document.page_content)
        print(document.metadata)
    else:
        print(f"❌ Erreur : la liste chunks contient seulement {len(chunks)} éléments, impossible d'accéder à index 10.")

    print(document.page_content)
    print(document.metadata)

    return chunks


def save_to_chroma(chunks: list[Document]):
    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [6]:
if __name__ == "__main__":
    main()

libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.


✅ 7 documents chargés.
Split 7 documents into 2525 chunks.
📌 Nombre total de chunks générés : 2525
He closed his eyes tightly and tried to remember what Voldemort had looked like, but it was impossible...All Harry knew was that at the moment when Voldemort's chair had swung around, and he, Harry, had seen what was sitting in it, he had felt a spasm of horror, which had awoken him...or had that been the pain in his scar!

And who had the old man been? For there had definitely been an old man; Harry had watched him fall to the ground. It was all becoming confused. Harry put his face into his hands, blocking out his bedroom, trying to hold on to the picture of that dimly lit room, but it was like trying to keep water in his cupped hands; the details were now trickling away as fast as he tried to hold on to them...Voldemort and Wormtail had been talking about someone they had killed, though Harry could not remember the name...and they had been plotting to kill someone else...him!

Harry to

  db.persist()
