In [1]:
import os
import pinecone
from dotenv import load_dotenv
load_dotenv(dotenv_path='../.env')

# Get the API key (from app.pinecone.io) out of the .env file.
# A value found in the file takes precedence over one already present in the
# process environment; the environment is only used as a fallback.
api_key = None
try:
    # `with` guarantees the file handle is closed.
    with open('../.env') as env_file:
        for line in env_file.read().splitlines():
            if line.startswith('PINECONE_API_KEY'):
                # partition() keeps any '=' characters that are part of the
                # value and yields '' (instead of raising) when no '=' exists.
                api_key = line.partition('=')[2].strip()
                break
except FileNotFoundError:
    # No .env file: rely on the environment variable below.
    pass
api_key = api_key or os.environ.get('PINECONE_API_KEY')
# find your environment next to the api key in pinecone console
env = os.environ.get('PINECONE_ENVIRONMENT') or 'gcp-starter'

pinecone.init(
    api_key=api_key,
    environment=env
)

  from tqdm.autonotebook import tqdm


In [4]:
import csv
from tqdm.notebook import tqdm
import numpy as np

# Name of the Pinecone index that try_random_search_for_dim deletes and recreates on each call.
index_name = "test-1"
# Value passed as top_k to every query in try_random_search_for_dim
# (presumably the largest top_k Pinecone accepts -- TODO confirm against current limits).
PINECONE_MAX_K = 10_000

def try_random_search_for_dim(dim, tot_count=100_000, batch_size=200):
    """Measure how many upserted vectors random top-k queries can recover.

    Recreates the Pinecone index ``index_name`` with dimension ``dim``,
    upserts ``tot_count`` random vectors in batches of ``batch_size``, then
    issues up to 100 random queries with ``top_k=PINECONE_MAX_K`` and counts
    the distinct ids returned across all of them.

    Args:
        dim: Dimension of the index and of every generated vector.
        tot_count: Number of random vectors to generate and upsert.
        batch_size: Number of vectors sent per ``index.upsert`` call.

    Returns:
        None. A summary is printed and one row
        ``[dim, unique_ids_found, vec_count, tries, successful_count]`` is
        appended to ``./results2.csv``.
    """
    max_tries = 100

    # Start from a clean index. Only delete when it exists, so the very first
    # run (no index yet) does not fail on the delete call.
    if index_name in pinecone.list_indexes():
        pinecone.delete_index(index_name)
    # only create index if it doesn't exist
    if index_name not in pinecone.list_indexes():
        pinecone.create_index(name=index_name, dimension=dim, metric="cosine")
        print(f"Created index {index_name}")

    # now connect to the index
    index = pinecone.Index(index_name)

    def random_label():
        # Fair coin flip between the two metadata values.
        return "X" if np.random.rand() > 0.5 else "Y"

    # Random vectors with components in [0, 1) and random metadata for the
    # keys a, b, c, d, each set to "X" or "Y".
    vectors = [
        {
            "id": str(i),
            "values": np.random.rand(dim).tolist(),
            "metadata": {key: random_label() for key in ("a", "b", "c", "d")},
        }
        for i in range(tot_count)
    ]
    print(f"Created {len(vectors)} vectors")

    # Upsert in batches; stop at the first failing batch and continue the
    # experiment with whatever was accepted so far.
    successful_count = 0
    for start in tqdm(range(0, len(vectors), batch_size)):
        batch_vectors = vectors[start : start + batch_size]
        try:
            index.upsert(batch_vectors)
            successful_count += len(batch_vectors)
        except Exception as e:
            print(f"Error upserting vectors at index {start}")
            print(e)
            break
    print(f"Successfully upserted {successful_count} vectors")

    # NOTE: the count reported here can lag behind the upserts that just
    # succeeded (earlier runs printed 0 right after upserting 1000 vectors),
    # so it is recorded as reported, not treated as ground truth.
    vec_count = index.describe_index_stats()["total_vector_count"]
    print(f"Vector count(according to Pinecone): {vec_count}")

    # Query with random vectors and accumulate the distinct ids returned.
    # The progress bar advances by the number of newly discovered ids.
    all_ids = set()
    tries = 0
    with tqdm(total=successful_count) as pbar:
        for tries in range(1, max_tries + 1):
            query_vector = np.random.rand(dim).tolist()
            results = index.query(
                vector=query_vector, include_values=False, top_k=PINECONE_MAX_K
            )
            ids = set(result["id"] for result in results["matches"])
            prev = len(all_ids)
            all_ids.update(ids)
            pbar.update(len(all_ids) - prev)
            # Stop early once every successfully upserted id has been seen.
            if len(all_ids) == successful_count:
                break
    print(
        f"dim={dim}, {len(all_ids)} unique ids found, out of {vec_count} total vectors in {tries} tries"
    )

    # Append the summary row; newline="" is what the csv module expects so
    # that no extra blank lines are written on platforms that translate "\n".
    with open("./results2.csv", "a", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([dim, len(all_ids), vec_count, tries, successful_count])


# print(len(all_ids))

In [5]:
# Sweep over index sizes and embedding dimensions, running the random-search
# experiment for each combination. Every run appends one row to the results
# CSV with the columns:
#   dim, unique_ids_found, total_vectors, iterations, successful_count
RESULTS_PATH = "./results2.csv"
RESULTS_HEADER = [
    "dim",
    "unique_ids_found",
    "total_vectors",
    "iterations",
    "successful_count",
]

# Write the header row once, only when the results file is missing or empty,
# so a fresh run produces a self-describing CSV without uncommenting code and
# re-running never duplicates the header.
if not os.path.exists(RESULTS_PATH) or os.path.getsize(RESULTS_PATH) == 0:
    with open(RESULTS_PATH, "a", newline="") as f:
        csv.writer(f).writerow(RESULTS_HEADER)

for tot_count in range(1_000, 100_000, 10_000):
    for dim in [768, 1536]:
        print(f"Trying dim={dim}, tot_count={tot_count}")
        try_random_search_for_dim(dim, tot_count=tot_count)

Trying dim=768, tot_count=1000
Created index test-1
Created 1000 vectors


  0%|          | 0/5 [00:00<?, ?it/s]

Successfully upserted 1000 vectors
Vector count(according to Pinecone): 0


  0%|          | 0/1000 [00:00<?, ?it/s]

dim=768, 1000 unique ids found, out of 0 total vectors in 11 tries
Trying dim=1536, tot_count=1000
Created index test-1
Created 1000 vectors


  0%|          | 0/5 [00:00<?, ?it/s]

Successfully upserted 1000 vectors
Vector count(according to Pinecone): 0


  0%|          | 0/1000 [00:00<?, ?it/s]

dim=1536, 1000 unique ids found, out of 0 total vectors in 33 tries
Trying dim=768, tot_count=11000
Created index test-1
Created 11000 vectors


  0%|          | 0/55 [00:00<?, ?it/s]

Successfully upserted 11000 vectors
Vector count(according to Pinecone): 9400


  0%|          | 0/11000 [00:00<?, ?it/s]

dim=768, 11000 unique ids found, out of 9400 total vectors in 10 tries
Trying dim=1536, tot_count=11000
Created index test-1
Created 11000 vectors


  0%|          | 0/55 [00:00<?, ?it/s]

Successfully upserted 11000 vectors
Vector count(according to Pinecone): 10400


  0%|          | 0/11000 [00:00<?, ?it/s]

dim=1536, 11000 unique ids found, out of 10400 total vectors in 10 tries
Trying dim=768, tot_count=21000
Created index test-1
Created 21000 vectors


  0%|          | 0/105 [00:00<?, ?it/s]

Successfully upserted 21000 vectors
Vector count(according to Pinecone): 20600


  0%|          | 0/21000 [00:00<?, ?it/s]

dim=768, 20998 unique ids found, out of 20600 total vectors in 100 tries
Trying dim=1536, tot_count=21000
Created index test-1
Created 21000 vectors


  0%|          | 0/105 [00:00<?, ?it/s]

Successfully upserted 21000 vectors
Vector count(according to Pinecone): 19000


  0%|          | 0/21000 [00:00<?, ?it/s]

dim=1536, 21000 unique ids found, out of 19000 total vectors in 97 tries
Trying dim=768, tot_count=31000
Created index test-1
Created 31000 vectors


  0%|          | 0/155 [00:00<?, ?it/s]

Successfully upserted 31000 vectors
Vector count(according to Pinecone): 30200


  0%|          | 0/31000 [00:00<?, ?it/s]

dim=768, 30972 unique ids found, out of 30200 total vectors in 100 tries
Trying dim=1536, tot_count=31000
Created index test-1
Created 31000 vectors


  0%|          | 0/155 [00:00<?, ?it/s]

Successfully upserted 31000 vectors
Vector count(according to Pinecone): 28600


  0%|          | 0/31000 [00:00<?, ?it/s]

dim=1536, 30975 unique ids found, out of 28600 total vectors in 100 tries
Trying dim=768, tot_count=41000
Created index test-1
Created 41000 vectors


  0%|          | 0/205 [00:00<?, ?it/s]

Successfully upserted 41000 vectors
Vector count(according to Pinecone): 40200


  0%|          | 0/41000 [00:00<?, ?it/s]

dim=768, 40858 unique ids found, out of 40200 total vectors in 100 tries
Trying dim=1536, tot_count=41000
Created index test-1
Created 41000 vectors


  0%|          | 0/205 [00:00<?, ?it/s]

Successfully upserted 41000 vectors
Vector count(according to Pinecone): 40200


  0%|          | 0/41000 [00:00<?, ?it/s]

dim=1536, 40837 unique ids found, out of 40200 total vectors in 100 tries
Trying dim=768, tot_count=51000
Created index test-1
Created 51000 vectors


  0%|          | 0/255 [00:00<?, ?it/s]

Successfully upserted 51000 vectors
Vector count(according to Pinecone): 50200


  0%|          | 0/51000 [00:00<?, ?it/s]

dim=768, 50565 unique ids found, out of 50200 total vectors in 100 tries
Trying dim=1536, tot_count=51000
Created index test-1
Created 51000 vectors


  0%|          | 0/255 [00:00<?, ?it/s]

Error upserting vectors at index 3800
(500)
Reason: Internal Server Error
HTTP response headers: HTTPHeaderDict({'content-type': 'application/json', 'Content-Length': '150', 'x-pinecone-request-latency-ms': '478', 'date': 'Fri, 22 Dec 2023 08:00:40 GMT', 'x-envoy-upstream-service-time': '44', 'server': 'envoy', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"code":13,"message":"We were unable to process your request. If the problem persists, please contact us at https://support.pinecone.io","details":[]}

Successfully upserted 3800 vectors
Vector count(according to Pinecone): 3000


  0%|          | 0/3800 [00:00<?, ?it/s]

dim=1536, 3800 unique ids found, out of 3000 total vectors in 2 tries
Trying dim=768, tot_count=61000
Created index test-1
Created 61000 vectors


  0%|          | 0/305 [00:00<?, ?it/s]

Error upserting vectors at index 0
(500)
Reason: Internal Server Error
HTTP response headers: HTTPHeaderDict({'content-type': 'application/json', 'Content-Length': '150', 'x-pinecone-request-latency-ms': '1390', 'date': 'Fri, 22 Dec 2023 08:00:57 GMT', 'x-envoy-upstream-service-time': '72', 'server': 'envoy', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"code":13,"message":"We were unable to process your request. If the problem persists, please contact us at https://support.pinecone.io","details":[]}

Successfully upserted 0 vectors
Vector count(according to Pinecone): 0


0it [00:00, ?it/s]

dim=768, 0 unique ids found, out of 0 total vectors in 1 tries
Trying dim=1536, tot_count=61000
Created index test-1
Created 61000 vectors


  0%|          | 0/305 [00:00<?, ?it/s]

Successfully upserted 61000 vectors
Vector count(according to Pinecone): 60200


  0%|          | 0/61000 [00:00<?, ?it/s]

dim=1536, 60050 unique ids found, out of 60200 total vectors in 100 tries
Trying dim=768, tot_count=71000
Created index test-1
Created 71000 vectors


  0%|          | 0/355 [00:00<?, ?it/s]

Successfully upserted 71000 vectors
Vector count(according to Pinecone): 69200


  0%|          | 0/71000 [00:00<?, ?it/s]

dim=768, 69312 unique ids found, out of 69200 total vectors in 100 tries
Trying dim=1536, tot_count=71000
Created index test-1
Created 71000 vectors


  0%|          | 0/355 [00:00<?, ?it/s]

Error upserting vectors at index 50600
(500)
Reason: Internal Server Error
HTTP response headers: HTTPHeaderDict({'content-type': 'application/json', 'Content-Length': '150', 'x-pinecone-request-latency-ms': '493', 'date': 'Fri, 22 Dec 2023 08:31:33 GMT', 'x-envoy-upstream-service-time': '53', 'server': 'envoy', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"code":13,"message":"We were unable to process your request. If the problem persists, please contact us at https://support.pinecone.io","details":[]}

Successfully upserted 50600 vectors
Vector count(according to Pinecone): 50000


  0%|          | 0/50600 [00:00<?, ?it/s]

dim=1536, 50159 unique ids found, out of 50000 total vectors in 100 tries
Trying dim=768, tot_count=81000
Created index test-1
Created 81000 vectors


  0%|          | 0/405 [00:00<?, ?it/s]

KeyboardInterrupt: 