In [1]:
# Dev helper (disabled): uncomment to re-import db_utils after editing it,
# without restarting the kernel.
# import importlib
# import db_utils
# importlib.reload(db_utils)

In [None]:
# Install the notebook's dependencies into the kernel's environment.
# NOTE(review): versions are not pinned, so a fresh install may pull different releases over time.
%pip install langchain-elasticsearch langchain-community langchain_experimental langchain_openai tqdm pypdf streamlit langchain-ollama --use-pep517

In [3]:
import pickle

from langchain_community.embeddings import FastEmbedEmbeddings
from langchain_community.llms.ollama import Ollama
from langchain_ollama import OllamaEmbeddings

from db_utils import add_documents_to_db, remove_index, remove_document, fetch_all
from files_utils import chunk_documents
from model_utils import rag_prompt
from config import MODEL_NAME, ES_PORT, ES_INDEX_NAME, ES_DISTANCE_STRATEGY, CONTEXT_CHAR_THRESHOLD, CHUNKER_TYPE

Prepare chunks

In [None]:
# Directory handed to the chunker as the location of the input PDFs.
pdf_directory = "./data/college"

# Chunk the documents under pdf_directory; CHUNKER_TYPE (from config) selects
# which chunker chunk_documents uses.
document_chunks = chunk_documents(
    chunker_type=CHUNKER_TYPE,
    pdf_directory=pdf_directory,
)

In [None]:
# Show the chunks as the cell output.
# NOTE(review): this renders the whole collection; consider displaying only a
# small sample if the output gets large — confirm the container supports slicing.
document_chunks

In [None]:
# Number of chunks returned by chunk_documents.
len(document_chunks)

In [7]:
# Persist the chunks to a pickle file in the current working directory.
# Reminder: only unpickle this file from a trusted location — pickle.load can
# execute arbitrary code.
chunks_pickle_path = "document_chunks_FastEmbedEmbeddings_college.pkl"
with open(chunks_pickle_path, "wb") as chunk_file:
    pickle.dump(document_chunks, chunk_file)

In [8]:
# Alternative install line (disabled): same package list as the install cell,
# but with fastembed in place of langchain-ollama.
# %pip install langchain-elasticsearch langchain-community langchain_experimental langchain_openai tqdm pypdf streamlit fastembed --use-pep517

Prepare database

In [13]:
# Embedding model used for the vector store; OllamaEmbeddings is imported in
# the notebook's top-level import cell so all dependencies are visible in one place.
embedding = OllamaEmbeddings(
    model="Lexi-Llama-3-8B-Uncensored_Q8_0.gguf",
)

# Settings passed to the db_utils helpers: the embedding model, the
# Elasticsearch URL, the target index name and the vector distance strategy.
# NOTE(review): ES_PORT, ES_INDEX_NAME and ES_DISTANCE_STRATEGY are imported
# from config but the values below are hard-coded — confirm whether the config
# constants should be used here instead.
db_kwargs = {
    "embedding": embedding,
    "es_url": "http://localhost:9200",
    "index_name": "rag",
    "distance_strategy": "COSINE",
}

In [None]:
# Clear the target index before uploading.
# NOTE: presumably destructive — whatever is stored under this index name is
# removed; verify against db_utils.remove_index before running.
remove_index(
    db_config={"hosts": db_kwargs["es_url"]},
    index_name=db_kwargs["index_name"],
)

In [None]:
# Sanity check: print the types of document_chunks and db_kwargs.
print(*(type(obj) for obj in (document_chunks, db_kwargs)))

In [None]:
# Upload the chunks using the settings in db_kwargs. No existing store is
# passed in (db=None); the returned handle is kept in `db` for querying.
db = add_documents_to_db(
    db=None,
    db_kwargs=db_kwargs,
    document_chunks=document_chunks,
    bulk_upload=True,
)

In [None]:
# Show what fetch_all returns for the index configured in db_kwargs.
es_connection = {"hosts": db_kwargs["es_url"]}
fetch_all(index_name=db_kwargs["index_name"], db_config=es_connection)

Query the RAG pipeline

In [None]:
# LLM handed to rag_prompt; the model name comes from config.
model = Ollama(model=MODEL_NAME)

# The query is Polish for: "I would like to fill in the PCC-3 declaration."
user_query = "Chciałbym wypełnić deklarację PCC-3."

# Run the query through rag_prompt with the populated store and keep the
# result in `response`.
# NOTE(review): `response` is not displayed in this cell — confirm whether
# rag_prompt prints the answer itself.
response = rag_prompt(
    query=user_query,
    db=db,
    model=model,
    chunker_type=CHUNKER_TYPE,
    context_char_threshold=CONTEXT_CHAR_THRESHOLD,
)

# Optional cleanup (disabled): fill in source_file and uncomment to call
# remove_document for that file.
# remove_document(
#     index_name=db_kwargs["index_name"],
#     source_file="",
#     db_config={"hosts": db_kwargs["es_url"]},
#     )
