In [1]:
import os
import warnings

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader

from dotenv import load_dotenv

# Ignore all warnings (not recommended in production code)
warnings.filterwarnings("ignore")

In [2]:
load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")
huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

### Load Documents:

In [4]:
pdf_loader = DirectoryLoader('./documents/', glob="./*.pdf", loader_cls=PyPDFLoader)
pdf_documents = pdf_loader.load()

In [5]:
if verbose:
    for document in pdf_documents:
        print(document.metadata["source"])

### Create Chunks:

In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
pdf_chunks = text_splitter.split_documents(pdf_documents)

### Setup ChromaDB:

In [7]:
## here we are using OpenAI embeddings but in future we will swap out to something else
embedding = OpenAIEmbeddings()
persist_directory = 'db'

# Batch chunk embedding + insert to DB
vectordb = Chroma.from_documents(documents=pdf_chunks, 
                                 embedding=embedding,
                                 persist_directory=persist_directory)

### Load VectorDB from disk:

In [8]:
# load from disk
vectordb = None # Pretent we want to load db

vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

### Insert new Document Chunks to DB:

In [9]:
txt_loader = DirectoryLoader('./documents/', glob="./*.txt", loader_cls=TextLoader) # ***
txt_documents = txt_loader.load()

# *** Hint: The shitty thing with the DirectoryLoader is that you cannot enforce utf-8 encoding
# *         this needs to be done with  TextLoader that works on file-level
# *         workaround for future: OwnDirectoryLoader 

txt_chunks = text_splitter.split_documents(txt_documents)

In [10]:

vectordb.add_documents(txt_chunks)

['109d4819-5280-11ee-ac93-b06088bbcb77',
 '109d481a-5280-11ee-b7ad-b06088bbcb77',
 '109d481b-5280-11ee-82b9-b06088bbcb77',
 '109d481c-5280-11ee-bdf1-b06088bbcb77',
 '109d481d-5280-11ee-b46c-b06088bbcb77']

### Query Documents similar to Query

In [11]:
docs = vectordb.similarity_search("Dear Fellow Scholars, this is Two Minute Papers with Dr", k = 3)
docs

[Document(page_content="Dear Fellow Scholars, this is Two Minute Papers with Dr. KÃ¡roly Zsolnai-FehÃ©r. Today, we are going to have a look at an image creation AI called Perfusion that is completely out of this world. Imagine a text-to-image AI like Stable Diffusion or Meat Journey as a chef that can cook any dish. However, sometimes, we like the dish we get, but we would like to introduce our own ingredients to it. And normally, if we write a new recipe, a new text prompt, we get a completely new image, a new dish. However, that is not what we want. What we want is the previous dish with a new ingredient. Let's have a look at an example. We have photos of our favorite teddy bear, and we would like it to create an image of it playing with a ball in the water. A previous technique doesâ€¦uh-oh. I have already looked at this image, and I have bad news. It's not going to be pretty. Are you ready? Okay, here you go! Whoa! It seems to have fallen apart, and there is no ball anywhere to be 