In [1]:
import os
import pinecone
from dotenv import load_dotenv

load_dotenv(dotenv_path="../.env")

# Get the api key from app.pinecone.io and store it in ../.env as
# PINECONE_API_KEY=<key>. The value written in the file is preferred; the
# process environment is the fallback.
api_key = None  # stays None when the file or the entry is missing
if os.path.exists("../.env"):  # load_dotenv tolerates a missing file, so do the same here
    with open("../.env") as env_file:  # context manager closes the handle
        for line in env_file.read().splitlines():
            # split on the first "=" only, so keys containing "=" stay intact
            name, separator, value = line.partition("=")
            if separator and name.strip() == "PINECONE_API_KEY":
                api_key = value.strip().strip("'\"")
                break
api_key = api_key or os.environ.get("PINECONE_API_KEY")
# find your environment next to the api key in pinecone console
env = os.environ.get("PINECONE_ENVIRONMENT") or "gcp-starter"

pinecone.init(api_key=api_key, environment=env)

  from tqdm.autonotebook import tqdm


In [2]:
# list all indexes in the current Pinecone project/environment;
# `indexes` is reused by the clean-up cell below, and the bare expression
# on the last line displays the listing as the cell output
indexes = pinecone.list_indexes()
indexes

[]

In [3]:
# delete every index returned by pinecone.list_indexes()
for existing_index in indexes:
    # Entries may be plain name strings (no `.name` attribute) or description
    # objects exposing `.name`, depending on the client version; accept both.
    index_name = getattr(existing_index, "name", existing_index)
    pinecone.delete_index(index_name)

In [4]:
# provision the "test-1" index; its dimension (100) has to match the
# length of the vectors that are upserted into it
pinecone.create_index(name="test-1", dimension=100)

In [5]:
# bind `index` to the "test-1" index; the following cells upsert, fetch and
# query through this object
index = pinecone.Index("test-1")

In [6]:
import numpy as np
from tqdm import tqdm

batch_size = 1000
dim = 100
# seeded generator so Restart & Run All produces the same vectors and metadata
rng = np.random.default_rng(42)
vectors = [
    {
        "id": str(i),
        "values": rng.random(dim).tolist(),
        # four independent coin flips: each of "a".."d" is "X" or "Y"
        "metadata": {key: "X" if rng.random() > 0.5 else "Y" for key in "abcd"},
    }
    for i in range(20_000)
]
print(f"Created {len(vectors)} vectors")
successful_count = 0
# upsert the random vectors in batches of `batch_size`
for i in tqdm(range(0, len(vectors), batch_size)):
    batch_vectors = vectors[i : i + batch_size]
    upsert_response = index.upsert(batch_vectors)
    # accumulate what the service reports as written for this batch
    successful_count += upsert_response.upserted_count

# fail loudly if some vectors were not accepted instead of continuing silently
assert successful_count == len(vectors), (
    f"only {successful_count} of {len(vectors)} vectors were upserted"
)

Created 20000 vectors


100%|██████████| 20/20 [00:26<00:00,  1.33s/it]


In [7]:
# draw a random probe vector and request its 10_000 nearest neighbours;
# the cell output is the number of matches that came back
query_vector = np.random.rand(dim).tolist()
results = index.query(top_k=10_000, vector=query_vector)
len(results["matches"])

10000

In [8]:
# mark all results as "e": "X"
# Each batch of matched ids is fetched in full, gets the extra metadata field
# and is upserted again under the same id.
for j in tqdm(range(0, len(results["matches"]), batch_size)):
    batch_ids = results["matches"][j : j + batch_size]
    # fetch all results
    data = index.fetch(ids=[match["id"] for match in batch_ids])
    batch_vectors = data["vectors"]
    upsert_data = []
    # `vector_id` rather than `id`: a top-level loop variable named `id` would
    # shadow the builtin for every later cell in the kernel
    for vector_id, vector_data in batch_vectors.items():
        metadata = vector_data.get("metadata") or {}
        metadata["e"] = "X"
        cur_vec = pinecone.Vector(
            id=vector_id,
            values=vector_data["values"],
            metadata=metadata,
        )
        # NOTE(review): the response model appears to expose sparse values as
        # `sparse_values` (wire name `sparseValues`); check both so they are
        # not dropped on re-upsert. Confirm against the installed client.
        sparse_values = vector_data.get("sparse_values") or vector_data.get("sparseValues")
        if sparse_values:
            cur_vec.sparse_values = sparse_values
        upsert_data.append(cur_vec)
    if upsert_data:  # nothing fetched for this batch -> skip the empty upsert
        index.upsert(upsert_data)

100%|██████████| 10/10 [00:19<00:00,  1.96s/it]


In [9]:
# query the 10_000 nearest neighbours of a fresh random vector, restricted by
# a metadata filter to vectors whose "e" field equals "X" ($eq);
# the cell output is the number of matches that came back
query_vector = np.random.rand(dim).tolist()
metadata_filter = {"e": {"$eq": "X"}}
results2 = index.query(vector=query_vector, top_k=10_000, filter=metadata_filter)
len(results2["matches"])

7000