In [None]:
import glob

import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import LlamaCppEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

## Load models

In [None]:
# Embedding model backed by a local GGUF file, loaded through LlamaCppEmbeddings.
# n_gpu_layers=-1 is the GPU setting used here; set it to 0 to run on CPU only.
embeddings_model = LlamaCppEmbeddings(
    n_gpu_layers=-1,
    model_path="models/mxbai-embed-large-v1.Q8_0.gguf",
)

## Load test pdf files

In [None]:
pdf_paths = glob.glob("test-data/*.pdf")

pages = []

for path in pdf_paths:
    loader = PyPDFLoader(path)
    async for page in loader.alazy_load():
        pages.append(page)

## Load test txt files

In [None]:
# Load every *.txt file in test-data/ as a document.
loader = DirectoryLoader(path="test-data", glob="*.txt", loader_cls=TextLoader)
txt_pages = loader.load()

# Make the cell safe to re-run: remove documents that an earlier run of this
# cell already appended (matched by their 'source' metadata) before adding the
# freshly loaded ones, so the text files never end up in `pages` twice.
txt_sources = {doc.metadata.get("source") for doc in txt_pages}
pages = [doc for doc in pages if doc.metadata.get("source") not in txt_sources] + txt_pages

## Split data into chunks

In [None]:
# Break every loaded document into chunks of at most 1000 units (as measured by
# the splitter's length function) with no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
splits = text_splitter.split_documents(documents=pages)

## Generate embeddings and save them in a vector store

In [None]:
# Embed a throwaway query once; its length is the dimensionality the index needs.
probe_vector = embeddings_model.embed_query("hello world")
index = faiss.IndexFlatL2(len(probe_vector))

# Empty FAISS-backed store: documents are kept in memory and the
# index-position -> docstore-id mapping starts out empty.
vectorstore = FAISS(
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    embedding_function=embeddings_model,
)

# Embed all chunks and add them to the store (kept as the cell's last
# expression, so its return value is still shown as the cell output).
vectorstore.add_documents(splits)

## Ähnlichkeitssuche mit Query

Score: Distanz zwischen den Embeddings – je näher an 0, desto besser.

In [None]:
# Look up the four chunks whose embeddings are closest to the query text.
result = vectorstore.similarity_search_with_score("Datenhaltung", k=4)

for doc, score in result:
    # PDF pages carry a 'page' entry in their metadata, but documents loaded
    # from .txt files normally do not; use a placeholder instead of raising
    # KeyError when a text-file chunk is among the hits.
    page_label = doc.metadata.get('page', '-')
    print(f"Name: {doc.metadata['source']} - Seite: {page_label} - Score: {score}")


## Ähnlichkeitssuche mit Dokument

Score: Distanz zwischen den Embeddings – je näher an 0, desto besser.

In [None]:
embedding = await embeddings_model.aembed_documents([splits[0].page_content]) #Generate embedding for first page in first document

result = vectorstore.similarity_search_with_score_by_vector(embedding[0], k=4)

for [doc, score] in result:
    print(f"Name: {doc.metadata['source']} - Seite: {doc.metadata['page']} - Score: {score}")