In [1]:
import os
import asyncio
from langchain_aws import BedrockEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader

In [26]:
# Configuration des embeddings Bedrock
embeddings = BedrockEmbeddings(
    credentials_profile_name="default",
    region_name="us-west-2",
    model_id='amazon.titan-embed-text-v2:0',
)

pdfs_directory = './documents'

# Fonction pour charger des documents à partir de fichiers PDF
def load_documents_from_pdfs(file_path):
    loader = PyPDFLoader(file_path)
    return  loader.load() # Added return statement

# Fonction d'embedding des documents
async def embed_documents(documents):
    # Embedding de tous les documents
    document_embeddings = await embeddings.aembed_documents([doc.page_content for doc in documents])
    return document_embeddings

In [27]:
print("Chargement des documents depuis les fichiers PDF...")
documents = []

# Charger tous les fichiers PDF dans le répertoire spécifié
for filename in os.listdir(pdfs_directory):
    if filename.endswith('.pdf'):
        file_path = os.path.join(pdfs_directory, filename)
        docs = load_documents_from_pdfs(file_path)  # Call to load documents


Chargement des documents depuis les fichiers PDF...


In [28]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)


docs = text_splitter.split_documents(docs)
docs

[Document(metadata={'source': './documents\\OSHA3885.pdf', 'page': 0}, page_content='Recommended Practices for  \nSafety  and Health  \nProgramsWorker Participation'),
 Document(metadata={'source': './documents\\OSHA3885.pdf', 'page': 0}, page_content='Find and Fix HazardsManagement Leadership\nOccupational Safety  \nand Health Administration'),
 Document(metadata={'source': './documents\\OSHA3885.pdf', 'page': 0}, page_content='osha.gov/safetymanagement\n      OSHA 3885 October 2016'),
 Document(metadata={'source': './documents\\OSHA3885.pdf', 'page': 1}, page_content='DISCLAIMER\nThese practices for safety and health programs are'),
 Document(metadata={'source': './documents\\OSHA3885.pdf', 'page': 1}, page_content='recommendations only.  Employers are not required to'),
 Document(metadata={'source': './documents\\OSHA3885.pdf', 'page': 1}, page_content='have a safety and health program that complies with'),
 Document(metadata={'source': './documents\\OSHA3885.pdf', 'page': 1}, page_

In [None]:
db = Chroma.from_documents(docs, embeddings, persist_directory="my_embeddings")

In [6]:
!pip install -qU "langchain-chroma>=0.1.2"