## Retrieval from Vector Database

In [1]:
# NOTE(review): `!` hands this line to a separate shell process, so the
# activation presumably does not carry over to the running kernel — confirm
# the kernel itself was started from rag_venv (e.g. via a registered ipykernel).
!source ../rag_venv/bin/activate

In [2]:
# Imports 
import sys
sys.path.append('../')

from VectorDatabase import VectorDatabase
from Document       import Document
from os             import listdir
from os.path        import isfile, join

In [3]:
# Set up the vector store: non-persistent client plus one named collection
COLLECTION_NAME = "Bible_John"

v_db = VectorDatabase()
v_db.setClient(persistent=False)
v_db.createCollection(COLLECTION_NAME)

In [4]:
# Add documents into vector database
dataset_folder = "documents"

def check_is_txt(file, folder=None):
    """Tell whether `file` is a regular `.txt` file inside the dataset folder.

    Parameters
    ----------
    file : str
        File name relative to the folder (e.g. an entry from `listdir`).
    folder : str, optional
        Directory to look in. Defaults to the module-level `dataset_folder`.

    Returns
    -------
    bool
        True for an existing regular file whose name ends in '.txt',
        False otherwise (directories and other extensions included).
    """
    if folder is None:
        # Resolved at call time so a later change to dataset_folder is honoured.
        folder = dataset_folder
    # Return the boolean expression itself so non-matches yield False, not None.
    return file.endswith('.txt') and isfile(join(folder, file))

# One metadata record per .txt file; file names are split on '_' into
# book (first piece) and chapter (second piece, up to its first '.').
metadatas = [
    {
        "filepath": join(dataset_folder, name),
        "book": pieces[0],
        "chapter": pieces[1].split('.')[0],
    }
    for name, pieces in (
        (entry, entry.split('_'))
        for entry in listdir(dataset_folder)
        if check_is_txt(entry)
    )
]

In [5]:
# Display the metadata records collected from the dataset folder
metadatas

[{'filepath': 'documents/john_3.txt', 'book': 'john', 'chapter': '3'},
 {'filepath': 'documents/john_2.txt', 'book': 'john', 'chapter': '2'},
 {'filepath': 'documents/john_1.txt', 'book': 'john', 'chapter': '1'}]

In [6]:
# Three parallel accumulators for the collection: texts, metadata records, embeddings
doc_list, doc_metadata, embeddings = [], [], []

In [7]:
# Read in each dataset file.
# The accumulators are reset first so that re-running this cell rebuilds the
# lists instead of appending every document a second time.
doc_list, doc_metadata, embeddings = [], [], []

for item in metadatas:
    path = item["filepath"]

    # Document is the project class imported at the top of the notebook; this
    # cell only relies on its `fulltext` and `embedding` attributes.
    doc = Document(path)

    doc_list.append(doc.fulltext)
    doc_metadata.append(item)
    # Only the first entry of doc.embedding is stored — presumably the
    # document-level vector; TODO confirm against Document.
    embeddings.append(doc.embedding[0])

In [8]:
# Sanity check: the three lists are passed to the collection in parallel,
# so all of them (embeddings included) must have the same length.
lengths_match = len(doc_list) == len(doc_metadata) == len(embeddings)
assert lengths_match, "doc_list, doc_metadata and embeddings must have the same length"
print(lengths_match)

True


In [9]:
# Push texts, embeddings and metadata into the collection under ids "id0", "id1", ...
doc_ids = [f"id{position}" for position, _ in enumerate(doc_list)]

v_db.addToCollection(
    docs=doc_list,
    embeds=embeddings,
    meta=doc_metadata,
    ids=doc_ids,
)

In [10]:
# Query the database and retrieve the most relevant documents
query       = "Kingdom"
num_results = 1

results = v_db.queryDatabase(query, num_results)

# Loop over the metadata actually returned for the query (capped at
# num_results) rather than over range(num_results), so a response with fewer
# hits than requested does not raise IndexError.
# NOTE(review): results["documents"][0] presumably holds the matching texts in
# the same order — confirm before printing them alongside the metadata.
print("Results: ")
for hit_metadata in results["metadatas"][0][:num_results]:
    print(hit_metadata)
Results: 
{'book': 'john', 'chapter': '3', 'filepath': 'documents/john_3.txt'}
