In [11]:
# Chroma is imported after this shim, so swap the stdlib `sqlite3` module entry
# for the bundled `pysqlite3` build first. If `pysqlite3` is not installed we
# keep the system sqlite3 and only warn — this defeats the purpose for Chroma.
try:
    __import__('pysqlite3')
except ImportError:
    print("pysqlite3-binary not found, using system sqlite3. THIS MAY CAUSE ISSUES.")
else:
    import sys
    # Re-register the already-imported pysqlite3 module under the name `sqlite3`
    # so that any later `import sqlite3` resolves to it.
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
    print("Successfully switched to pysqlite3-binary.")


import chromadb 
import torch
from sentence_transformers import SentenceTransformer


# Run the embedding model on the GPU when CUDA is available, otherwise on CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Configuration
# Relative path handed to Chroma's persistent client (resolved against the
# notebook's working directory).
VECTOR_DB_PATH = "../chroma_db_store"
COLLECTION_NAME = "research_papers_v1" # For versioning
# NOTE(review): queries are embedded client-side with this model, so it
# presumably has to match the model used when the collection was populated —
# confirm against the ingestion code.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

# Initialize Chroma DB
chroma_client = chromadb.PersistentClient(path=VECTOR_DB_PATH)

embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=device)

try:
    collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME)
    print(f"Using/Created ChromaDB collection: {COLLECTION_NAME}")
except Exception as e:
    print(f"Error with ChromaDB collection: {e}")
    # Re-raise rather than calling exit(): inside a notebook exit() asks the
    # kernel to shut down (losing the traceback and all state), and in a plain
    # script it would terminate with a success status. Raising stops this cell
    # with a full traceback and leaves `collection` undefined so later cells
    # fail loudly.
    raise

Failed to send telemetry event ClientStartEvent: module 'chromadb' has no attribute 'get_settings'


Successfully switched to pysqlite3-binary.


Failed to send telemetry event ClientCreateCollectionEvent: module 'chromadb' has no attribute 'get_settings'


Using/Created ChromaDB collection: research_papers_v1


In [12]:
sample_query = "What is the main focus of transformer models?"

# Embed the query text ourselves and hand Chroma the raw vector (as a plain list).
query_embedding = embedding_model.encode(sample_query).tolist()

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5,
    include=['documents', 'metadatas', 'distances'],
)

print(f"Query: '{sample_query}'")
if not results or not results['ids'][0]:
    print("No results found for the sample query.")
else:
    # Only one query embedding was submitted, so every result field is read
    # at index 0 (the hits for that single query).
    hit_ids = results['ids'][0]
    hit_distances = results['distances'][0]
    hit_metadatas = results['metadatas'][0]
    hit_documents = results['documents'][0]

    hits = zip(hit_ids, hit_distances, hit_metadatas, hit_documents)
    for rank, (hit_id, distance, metadata, document) in enumerate(hits, start=1):
        print(f"  Result {rank} (ID: {hit_id}, Distance: {distance:.4f}):")
        print(f"    Source: {metadata['source_pdf']}")
        # Show only the first 300 characters of each retrieved chunk.
        print(f"    Text: {document[:300]}...")

Query: 'What is the main focus of transformer models?'
  Result 1 (ID: 2005.11401v4.pdf_chunk_43, Distance: 1.2025):
    Source: 2005.11401v4.pdf
    Text: ComputationalLinguistics. doi: 10.18653/v1/P19-1346. URLhttps://www.aclweb.org/
anthology/P19-1346.
[13] Angela Fan, Claire Gardent, Chloe Braud, and Antoine Bordes. Augmenting transformers
withKNN-basedcompositememory, 2020. URLhttps://openreview.net/forum?id=
H1gx1CNKPH.
[14] ThibaultFévry,LivioBa...
  Result 2 (ID: 2302.04761v1.pdf_chunk_64, Distance: 1.2068):
    Source: 2302.04761v1.pdf
    Text: Zhou, Chung-Ching Chang, Igor Krivokon, Will
Rusch, Marc Pickett, Pranesh Srinivasan, Laichee
Man, Kathleen Meier-Hellstern, Meredith Ringel
Morris, Tulsee Doshi, Renelito Delos Santos, Toju
Duke, Johnny Soraker, Ben Zevenbergen, Vinod-
kumar Prabhakaran, Mark Diaz, Ben Hutchinson,
Kristen Olson, Al...
  Result 3 (ID: 2002.08909v1.pdf_chunk_31, Distance: 1.2447):
    Source: 2002.08909v1.pdf
    Text: DrQA(Chenetal.,2017) SparseRetr.+D