## Testing that the created ChromaDB vector store can be loaded and queried

In [None]:
# imports
import chromadb
from openai import OpenAI
from dotenv import load_dotenv

In [None]:
# loading env vars
# Pull variables from a local .env file into the process environment. This has
# to run before the client is built: OpenAI() is given no explicit credentials
# here, so it presumably picks up its API key from the environment -- confirm.
load_dotenv()
# Shared OpenAI client for the rest of the notebook.
openai_client = OpenAI()

In [28]:
# Where the persisted Chroma store lives (relative path) and which collection
# inside it to open.
CHROMA_DIR = "../persistence/chroma"
COLLECTION_NAME = "phone_transcripts_v0.1"

# Open the on-disk database, then look up the collection by name.
chroma_client = chromadb.PersistentClient(path=CHROMA_DIR)
collection = chroma_client.get_collection(COLLECTION_NAME)

<h4 style="color:#3f79ffff; margin:0; font-weight:600;">Defining a query about a refund for a broken blender and embedding it, so it can be used for a similarity search in the ChromaDB collection</h4>

In [29]:
# Free-text question to look up in the transcript collection.
query = "refund request for a broken blender product"

# Request an embedding for the query from OpenAI, then pull the vector out of
# the first (and only) item in the response.
embedding_response = openai_client.embeddings.create(
    input=query,
    model="text-embedding-3-large",
)
query_emb = embedding_response.data[0].embedding

# Sanity check: vector size and its first few components.
print(f"Query embedding length: {len(query_emb)}")
print(f"Query embedding sample: {query_emb[:5]}...")

Query embedding length: 3072
Query embedding sample: [-0.054896049201488495, 0.007448470685631037, -0.0025120845530182123, 0.01097472757101059, -0.006850800011307001]...


<h4 style="color:#3f79ffff; margin:0; font-weight:600;">Searching the ChromaDB collection for the chunks most similar to the query</h4>


In [None]:
# Similarity search for the blender-refund query, without any metadata filter.
results = collection.query(query_embeddings=[query_emb], n_results=5)

# Raw response, for inspection.
print(results)

# A single query embedding was sent, so each field holds one result list at
# index 0.
ids = results["ids"][0]
metas = results["metadatas"][0]
docs = results["documents"][0]

# One line per hit: call and chunk identifiers, speaker, date, and the first
# 150 characters of the chunk text.
for chunk_id, meta, text in zip(ids, metas, docs):
    header = f"[Call {meta['call_id']} | Chunk {chunk_id} | {meta['speaker']} | {meta['date']}]"
    print(f"{header} {text[:150]}...\n")




{'ids': [['1_2', '1_1', '78_4', '32_4', '7_3']], 'embeddings': None, 'documents': [["Agent: I'm so sorry to hear about that issue with your blender. Could you please provide me with the order number so I can look into this for you?", "Caller: Hi there, I recently purchased a blender from your site, but it's making this awful grinding noise after just a couple of uses. I'm really disappointed because I was looking forward to making smoothies every morning.", 'Agent: Must be defective—refund?', "Agent: Fill out the online form, and we'll resend or refund once approved.", "Caller: Order number 11223. I'd prefer a full refund rather than a replacement, if possible."]], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[{'call_id': '1', 'speaker': 'Agent', 'duration': '23520', 'date': '2025-01-15'}, {'date': '2025-01-15', 'duration': '23520', 'speaker': 'Caller', 'call_id': '1'}, {'date': '2025-06-22', 'duration': '31500', 'speaker': 'Agent', 'ca

<h4 style="color:#3f79ffff; margin:0; font-weight:600;">Filtering on metadata so that only "Caller" transcript chunks are returned</h4>


In [32]:
# Same similarity search, restricted to chunks whose "speaker" metadata equals
# "Caller"; distances are requested explicitly so they can be printed per hit.
caller_only = {"speaker": "Caller"}
results = collection.query(
    query_embeddings=[query_emb],
    n_results=5,
    where=caller_only,
    include=["distances", "documents", "metadatas"],
)

# Raw response, for inspection.
print(results)

# A single query embedding was sent, so each field holds one result list at
# index 0.
ids = results["ids"][0]
metas = results["metadatas"][0]
docs = results["documents"][0]
dists = results["distances"][0]

# One line per hit: identifiers, speaker, date, distance to the query, and the
# first 100 characters of the chunk text.
for chunk_id, meta, text, distance in zip(ids, metas, docs, dists):
    header = f"[Call {meta['call_id']} | Chunk {chunk_id} | {meta['speaker']} | {meta['date']} | distance={distance:.4f}]"
    print(f"{header} {text[:100]}...\n")

{'ids': [['1_1', '7_3', '7_5', '13_3', '32_3']], 'embeddings': None, 'documents': [["Caller: Hi there, I recently purchased a blender from your site, but it's making this awful grinding noise after just a couple of uses. I'm really disappointed because I was looking forward to making smoothies every morning.", "Caller: Order number 11223. I'd prefer a full refund rather than a replacement, if possible.", "Caller: I'd like to return it—please send a label. How long will the refund take once you receive it?", "Caller: Yes, but I'd like a refund on the shipping fee as compensation.", 'Caller: How does that process work? Will I get a replacement?']], 'uris': None, 'included': ['distances', 'documents', 'metadatas'], 'data': None, 'metadatas': [[{'call_id': '1', 'date': '2025-01-15', 'speaker': 'Caller', 'duration': '23520'}, {'date': '2025-07-09', 'speaker': 'Caller', 'call_id': '7', 'duration': '24000'}, {'date': '2025-07-09', 'duration': '24000', 'call_id': '7', 'speaker': 'Caller'}, {'s