In [48]:
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import tqdm
import faiss
import numpy as np

Load

In [133]:
file_dir = 'pdfs'
file_name = 'Fisica II.pdf'
embedding_model = OllamaEmbeddings(model='granite-embedding')

documents = PyPDFLoader(os.path.join(file_dir, file_name)).load()
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500, chunk_overlap=100
)
splitted_documents = splitter.split_documents(documents)


Generate Embeddings

In [134]:
all_embeddings = []
for chunk in tqdm.tqdm(splitted_documents, desc = 'Chunks', ascii = True, colour = 'green'):
    emb = embedding_model.embed_query(chunk.page_content)
    all_embeddings.append(emb)

Chunks: 100%|[32m##########[0m| 1920/1920 [01:36<00:00, 19.86it/s]


Generate FAISS indexer

In [135]:
dim = len(all_embeddings[0])
index = faiss.IndexFlatL2(dim)
index.add(np.array(all_embeddings).astype('float32'))

vectorstore = FAISS(embedding_function=embedding_model.embed_query, index = index, docstore=None, index_to_docstore_id=None)


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


Save

In [136]:
save_path = os.path.join('embed_text', file_name.rstrip('.pdf').replace(' ', '_').replace('(', '').replace(')', '')[:31])
vectorstore.save_local(save_path)

Pipeline for a multiple pdf files folder

In [None]:
file_dir = 'pdfs'
files = [f for f in os.listdir(file_dir) if os.path.isfile(f) and f.endswith('.pdf')]
docs_list = [item for sublist in docs for item in sublist]

embedding_model = OllamaEmbeddings(model='granite-embedding')

documents = PyPDFLoader(documents = ).load()
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500, chunk_overlap=100
)
splitted_documents = splitter.split_documents(documents)


In [106]:
import os
import tqdm
import numpy as np
import faiss

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.docstore.in_memory import InMemoryDocstore

# CONFIG
file_dir = 'pdfs'
save_dir = 'embed_text'
embedding_function = embedding_model.embed_query  # modelo previamente definido

# CRIA A PASTA SE NÃO EXISTIR
os.makedirs(save_dir, exist_ok=True)

# LISTA DE PDFs
files = [
    f for f in os.listdir(file_dir)
    if os.path.isfile(os.path.join(file_dir, f)) and f.endswith('.pdf')
]

# PIPELINE
for file_name in tqdm.tqdm(files, desc="Processando PDFs"):
    path = os.path.join(file_dir, file_name)
    docs = PyPDFLoader(path).load()

    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=500, chunk_overlap=100
    )
    splitted_documents = splitter.split_documents(docs)

    if not splitted_documents:
        continue

    all_embeddings = []
    for chunk in tqdm.tqdm(splitted_documents, desc="Chunks", leave=False, colour="#00ff00"):
        emb = embedding_function(chunk.page_content)
        all_embeddings.append(emb)

    if not all_embeddings:
        continue

    dim = len(all_embeddings[0])
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(all_embeddings).astype('float32'))

    docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(splitted_documents)})
    index_to_docstore_id = {i: str(i) for i in range(len(splitted_documents))}

    vectorstore = FAISS(
        embedding_function=embedding_function,
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id
    )

    base_name = os.path.splitext(file_name)[0].replace(' ', '_').replace('(', '').replace(')', '')
    save_path = os.path.join(save_dir, base_name)
    os.makedirs(save_dir, exist_ok=True)  # só cria embed_text

    vectorstore.save_local(save_path)


Processando PDFs:   0%|          | 0/4 [00:00<?, ?it/s]`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.
Processando PDFs:  25%|██▌       | 1/4 [00:06<00:19,  6.51s/it]`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.
Processando PDFs:  25%|██▌       | 1/4 [03:21<10:04, 201.60s/it]


RuntimeError: Error in __cdecl faiss::FileIOWriter::FileIOWriter(const char *) at D:\a\faiss-wheels\faiss-wheels\faiss\faiss\impl\io.cpp:102: Error: 'f' failed: could not open embed_text\Fundamentos_Circuitos_Elétricos_-_Sadiku_-_5ed_2\index.faiss for writing: No such file or directory