In [2]:
# First vector database
# pip install chromadb
import chromadb
from pprint import pprint

client = chromadb.Client()

small_collection = client.create_collection("small-documents")
medium_collection = client.create_collection("medium-documents")
large_collection = client.create_collection("large-documents")

In [3]:
# Second vector database
# pip install pinecone-client
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="0c5c24cc-5d14-4349-8e1e-51065f843e89")

if "small-index" not in pc.list_indexes().names():
    pc.create_index(
        name="small-index",
        dimension=10,
        metric="euclidean",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    ) 

# I had to change it to 500 because otherwise it was too much data for pinecone
if "medium-index" not in pc.list_indexes().names():
    pc.create_index(
        name="medium-index",
        dimension=500,
        metric="euclidean",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    )

if "large-index" not in pc.list_indexes().names():
    pc.create_index(
        name="large-index",
        dimension=100,
        metric="euclidean",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    )

In [4]:
small_index = pc.Index("small-index")
medium_index = pc.Index("medium-index")
large_index = pc.Index("large-index")

All the datasets and the code for importing and vectorizing them was done by chatgpt

# Small dataset

In [5]:
# Chromadb
# Small dataset documents
small_docs = [
    "Climate change refers to long-term shifts in temperatures and weather patterns.",
    "Human activities, such as burning fossil fuels, are the biggest contributor to climate change.",
    "Reducing carbon emissions is crucial to mitigating the impacts of climate change."
]
small_metadatas = [
    {"dim1": 1, "dim2": 2, "dim3": 3},
    {"dim1": 2, "dim2": 3, "dim3": 4},
    {"dim1": 3, "dim2": 4, "dim3": 5},
]
small_ids = ["small_1", "small_2", "small_3"]

# Add small dataset to collection
small_collection.add(documents=small_docs, metadatas=small_metadatas, ids=small_ids)

In [6]:
# Pinecone
# Small dataset
# Vectorize small 
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10)

small_vectors = vectorizer.fit_transform(small_docs).toarray()

# Add small dataset to Pinecone
small_index.upsert(
    vectors = [
        {"id": f"small_{i}", "values": small_vectors[i].tolist()} for i in range(len(small_docs))
    ]
)

{'upserted_count': 3}

# Medium dataset

In [7]:
from sklearn.datasets import fetch_20newsgroups
import json
newsgroups = fetch_20newsgroups(subset='all')

In [8]:
# Chromadb
# Medium dataset
# Changed the dataset dimension to 500 because 1000 was too much for pinecone
documents = newsgroups.data[:100]
vectorizer = TfidfVectorizer(max_features=500)
X = vectorizer.fit_transform(documents)
X_dense = X.toarray()

medium_docs = documents
medium_metadatas = [{"vector": json.dumps(X_dense[i].tolist())} for i in range(len(documents))]
medium_ids = [f"medium_{i}" for i in range(len(documents))]

# Add medium dataset to collection
medium_collection.add(documents=medium_docs, metadatas=medium_metadatas, ids=medium_ids)

In [9]:
# Pinecone
medium_index.upsert(
    vectors = [
        {"id": f"medium_{i}", "values": X_dense[i].tolist()} for i in range(len(documents))
    ]
)

{'upserted_count': 100}

# Large dataset

In [10]:
# Chromadb
# Large dataset
documents = newsgroups.data[:1000]
vectorizer = TfidfVectorizer(max_features=100)
X = vectorizer.fit_transform(documents)
X_dense = X.toarray()

large_docs = documents
large_metadatas = [{"vector": json.dumps(X_dense[i].tolist())} for i in range(len(documents))]
large_ids = [f"large_{i}" for i in range(len(documents))]

# Add large dataset to collection
large_collection.add(documents=large_docs, metadatas=large_metadatas, ids=large_ids)

In [11]:
# Pinecone
large_index.upsert(
    vectors = [
        {"id": f"large_{i}", "values": X_dense[i].tolist()} for i in range(len(documents))
    ]
)

{'upserted_count': 1000}

# Queries

In [12]:
small_query_sentence = "What are the main causes of climate change?"

In [13]:
import time

In [14]:
# Chromadb small dataset
start_time = time.time()

small_chromadb_results = small_collection.query(
    query_texts=[small_query_sentence],
    n_results=1
)

elapsed_time = time.time() - start_time

print("Time: ", elapsed_time)
print(small_chromadb_results)

Time:  0.017027854919433594
{'ids': [['small_2']], 'distances': [[0.6976820826530457]], 'metadatas': [[{'dim1': 2, 'dim2': 3, 'dim3': 4}]], 'embeddings': None, 'documents': [['Human activities, such as burning fossil fuels, are the biggest contributor to climate change.']], 'uris': None, 'data': None}


In [15]:
# Pinecone small dataset
vectorizer = TfidfVectorizer(max_features=10)
vectorizer.fit(small_docs)

small_query_vector = vectorizer.transform([small_query_sentence]).toarray()[0]

small_query_vector_list = small_query_vector.tolist()

start_time = time.time()

small_result = small_index.query(
    vector=small_query_vector_list,
    top_k=10,
    include_values=True,
    include_metadata=True
)

elapsed_time = time.time() - start_time

print("Time: ", elapsed_time)
print(small_result)

Time:  0.43926143646240234
{'matches': [{'id': 'small_2',
              'score': 0.22763288,
              'values': [0.0,
                         0.463334262,
                         0.463334262,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.596627235,
                         0.463334262]},
             {'id': 'small_1',
              'score': 0.813363552,
              'values': [0.525234282,
                         0.310211837,
                         0.310211837,
                         0.0,
                         0.0,
                         0.525234282,
                         0.0,
                         0.0,
                         0.399454236,
                         0.310211837]},
             {'id': 'small_0',
              'score': 1.45015335,
              'values': [0.0,
                         0.262912303,
                      

In [16]:
medium_query_sentence = "What are the common troubleshooting steps for Windows 10?"

In [17]:
# Chromadb medium dataset

start_time = time.time()

medium_chromadb_results = medium_collection.query(
    query_texts=[medium_query_sentence],
    n_results=1
)

elapsed_time = time.time() - start_time

print("Time: ", elapsed_time)
print(medium_chromadb_results)

Time:  0.02098560333251953
{'ids': [['medium_96']], 'distances': [[1.3571739196777344]], 'metadatas': [[{'vector': '[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11010648114151518, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11473226904137698, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05233192456834755, 0.0, 0.0, 0.0, 0.0, 0.16568455051810232, 0.0, 0.06357580897110107, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.10256498185844204, 0.0, 0.05051268742747719, 0.08348008943475531, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.09153021483510353, 0.0, 0.0, 0.0, 0.0, 0.14639239508046234, 0.0, 0.0570620889995523, 0.0744091777719821, 0.0, 0.0, 0.05296369995608704, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.10256498185844204, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.17447202575638934, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [18]:
# Pinecone medium dataset
vectorizer = TfidfVectorizer(max_features=500)
vectorizer.fit(medium_docs)

medium_query_vector = vectorizer.transform([medium_query_sentence]).toarray()[0]

medium_query_vector_list = medium_query_vector.tolist()

start_time = time.time()

medium_result = medium_index.query(
    vector=medium_query_vector_list,
    top_k=10,
    include_values=True,
    include_metadata=True
)

elapsed_time = time.time() - start_time

print("Time: ", elapsed_time)
print(medium_result)

Time:  0.7175731658935547
{'matches': [{'id': 'medium_239',
              'score': 1.06917262,
              'values': [0.825805724,
                         0.0772861093,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.04732389,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
           

In [19]:
large_query_sentence = "What are the effects of gun control laws on crime rates?"

In [20]:
# Chromadb large dataset

start_time = time.time()

large_chromadb_results = large_collection.query(
    query_texts=[large_query_sentence],
    n_results=1
)

elapsed_time = time.time() - start_time

print("Time: ", elapsed_time)
print(large_chromadb_results)

Time:  0.019973039627075195
{'ids': [['large_765']], 'distances': [[0.9361464977264404]], 'metadatas': [[{'vector': '[0.04442810602320783, 0.044665998808998734, 0.0, 0.0, 0.028406574304360676, 0.11286860432793275, 0.07734180460094031, 0.2538185176971121, 0.01312001639228013, 0.05470447163077677, 0.041963953218388965, 0.1316368207580525, 0.04162417330413572, 0.13297775975114942, 0.04988282171585737, 0.07345980876147554, 0.0, 0.12172395862843235, 0.0, 0.01986929472738949, 0.0, 0.07565840021611339, 0.0, 0.09977447711878704, 0.021420855342937285, 0.019456868252103538, 0.07983275754205772, 0.015106053143255279, 0.0176484075304142, 0.05761702671743494, 0.04719330413791479, 0.1487729638713895, 0.0, 0.0220421650647987, 0.0, 0.01709482457004275, 0.153189396651558, 0.12266064732494401, 0.20126140422055278, 0.15349502771055018, 0.06447877493569833, 0.01665421440679261, 0.04981208399724657, 0.007568132629806248, 0.0615374817496962, 0.02072526929884091, 0.11122087427180745, 0.051607028334286646, 0.

In [21]:
# Pinecone large dataset
vectorizer = TfidfVectorizer(max_features=100)
vectorizer.fit(large_docs)

large_query_vector = vectorizer.transform([large_query_sentence]).toarray()[0]

large_query_vector_list = large_query_vector.tolist()

start_time = time.time()

large_result = large_index.query(
    vector=large_query_vector_list,
    top_k=10,
    include_values=True,
    include_metadata=True
)

elapsed_time = time.time() - start_time

print("Time: ", elapsed_time)
print(large_result)

Time:  0.5058565139770508
{'matches': [{'id': 'large_70',
              'score': 0.880530834,
              'values': [0.0,
                         0.0640703887,
                         0.0807309523,
                         0.0,
                         0.061121,
                         0.186810553,
                         0.0665649921,
                         0.436902851,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.107330367,
                         0.0632239133,
                         0.0,
                         0.0,
                         0.063814126,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0837287307,
                         0.300601244,
      

Pinecone was faster on 2 out of 3 queries

In [22]:
# Loading pdf
import pdfplumber
import re

pdf_file = "./Data/Nordic_nutrition_recommendations_2023.pdf"

pdf_text = ""

with pdfplumber.open(pdf_file) as pdf:
    # Excluding first 6 since its just the title page and table of contents
    for i in range(6, len(pdf.pages)):
        page = pdf.pages[i]
        pdf_text += page.extract_text()

pdf_text = re.sub(r'\s+', ' ', pdf_text)

In [23]:
pdf_texts = []
chunks = [pdf_text[i:i+500] for i in range(0, len(pdf_text), 500)]
pdf_texts.extend(chunks)

In [24]:
# Vectorize pdf document
vectorizer = TfidfVectorizer(max_features=500)
pdf_vector = vectorizer.fit_transform(pdf_texts)
pdf_vectors = pdf_vector.toarray()
pdf_vector_list = pdf_vectors.tolist()

In [25]:
if "pdf-index" not in pc.list_indexes().names():
    pc.create_index(
        name="pdf-index",
        dimension=500,
        metric="euclidean",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    )

In [26]:
pdf_index = pc.Index("pdf-index")

In [27]:
# Was too large so have to split it up
def split_into_batches(data, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

batch_size = 100

batches = list(split_into_batches(pdf_vector_list, batch_size))

def upsert_to_pinecone(batch, start_index):
    vectors = [{"id": f"pdf_{start_index + i}", "values": vector} for i, vector in enumerate(batch)]
    pdf_index.upsert(vectors=vectors)

for batch_num, batch in enumerate(batches):
    start_index = batch_num * batch_size
    upsert_to_pinecone(batch, start_index)

In [28]:
from neo4j import GraphDatabase

uri = "bolt://localhost:7687"
username = "neo4j"
password = "12345678"
driver = GraphDatabase.driver(uri, auth=(username, password))

def create_document(tx, document_id, document_vector):
    tx.run("CREATE (d:Document {id: $document_id, vector: $document_vector})", 
           document_id=document_id, document_vector=document_vector)

for i in range(len(pdf_vector_list)):
    document_id = f"pdf_{i}"
    document_vector = pdf_vector_list[i]
    
    with driver.session() as session:
        session.execute_write(create_document, document_id, document_vector)

driver.close()

In [29]:
pdf_query_sentence = "Is physical activity important for your health?"

vectorizer = TfidfVectorizer(max_features=500)
vectorizer.fit(pdf_texts)

pdf_query_vector = vectorizer.transform([pdf_query_sentence]).toarray()[0]

pdf_query_vector_list = pdf_query_vector.tolist()

start_time = time.time()

pdf_result = pdf_index.query(
    vector=pdf_query_vector_list,
    top_k=10,
    include_values=True,
    include_metadata=True
)

elapsed_time = time.time() - start_time

print("Time: ", elapsed_time)
print(pdf_result)

Time:  0.5634458065032959
{'matches': [{'id': 'pdf_101',
              'score': 0.989710212,
              'values': [0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
                         0.0,
       