Pinecone Quick Start Guide
https://docs.pinecone.io/docs/quickstart

In [1]:
import itertools
import os
import random

from pinecone import Pinecone, ServerlessSpec
from tqdm.autonotebook import tqdm


In [2]:
# Read the Pinecone API key from the PINECONE_API_KEY environment variable.
# NOTE: os.environ.get() only looks at the process environment; it does not
# parse a .env file by itself, so the variable must already be exported
# (e.g. by loading .env beforehand). If it is unset, the result is None.
pinecone_api_key = os.environ.get('PINECONE_API_KEY')

In [3]:
# Initialize the Pinecone client, authenticating with the key held in
# pinecone_api_key
pc = Pinecone(api_key=pinecone_api_key)

In [4]:
# Create a serverless index: 8-dimensional vectors, dot-product metric,
# hosted on AWS in us-west-2.
# Creation is skipped when an index with this name already exists, so the
# cell can be re-run without the create call failing on a name conflict.
if "quickstart" not in pc.list_indexes().names():
    pc.create_index(
        name="quickstart",
        dimension=8,
        metric="dotproduct",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-west-2',
        ),
    )

In [5]:
# Get a handle to the 'quickstart' index; the upsert, query and stats calls
# in the following cells all go through this object
index = pc.Index('quickstart')

In [6]:
# Upsert vectors: each call writes four constant-valued, 8-dimensional
# vectors into its own namespace ("ns1", then "ns2").
index.upsert(
    namespace="ns1",
    vectors=[
        {"id": "vec1", "values": [0.1] * 8},
        {"id": "vec2", "values": [0.2] * 8},
        {"id": "vec3", "values": [0.3] * 8},
        {"id": "vec4", "values": [0.4] * 8},
    ],
)

index.upsert(
    namespace="ns2",
    vectors=[
        {"id": "vec5", "values": [0.5] * 8},
        {"id": "vec6", "values": [0.6] * 8},
        {"id": "vec7", "values": [0.7] * 8},
        {"id": "vec8", "values": [0.8] * 8},
    ],
)

{'upserted_count': 4}

In [7]:
# Check the index: the stats report the index dimension and the vector count
# of every namespace, which confirms the two upserts above were stored
index.describe_index_stats()

{'dimension': 8,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 4}, 'ns2': {'vector_count': 4}},
 'total_vector_count': 8}

In [9]:
# Run a similarity search: fetch the 3 best matches for the query vector
# within namespace "ns1", returning the stored vector values as well
index.query(
    vector=[0.3] * 8,
    namespace="ns1",
    top_k=3,
    include_values=True,
)

{'matches': [{'id': 'vec4',
              'score': 0.960000038,
              'values': [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4]},
             {'id': 'vec3',
              'score': 0.72,
              'values': [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]},
             {'id': 'vec2',
              'score': 0.480000019,
              'values': [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]}],
 'namespace': 'ns1',
 'usage': {'read_units': 6}}

In [10]:
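# Delete the index (left commented out because the cells below still use it;
# uncomment and run this once you are finished with the guide)
#pc.delete_index("quickstart")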

In [10]:
# Upsert vectors with metadata
# No namespace is passed here, so the vectors land in the default
# (empty-string '') namespace
index.upsert(
    vectors=[
        {
            "id": "A",
            "values": [0.1] * 8,
            "metadata": {"genre": "comedy", "year": 2020},
        },
        {
            "id": "B",
            "values": [0.2] * 8,
            "metadata": {"genre": "documentary", "year": 2019},
        },
        {
            "id": "C",
            "values": [0.3] * 8,
            "metadata": {"genre": "comedy", "year": 2019},
        },
        {
            "id": "D",
            "values": [0.4] * 8,
            "metadata": {"genre": "drama"},
        },
    ],
)

{'upserted_count': 4}

In [11]:
# Check the index: the vectors upserted without a namespace should now be
# counted under the default '' namespace, next to ns1 and ns2
index.describe_index_stats()

{'dimension': 8,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4},
                'ns1': {'vector_count': 4},
                'ns2': {'vector_count': 4}},
 'total_vector_count': 12}

In [12]:
# Run a similarity search restricted by a metadata filter
# See more filter types at: https://docs.pinecone.io/docs/metadata-filtering
index.query(
    top_k=1,
    vector=[0.1] * 8,
    include_metadata=True,
    filter={
        "genre": {"$eq": "documentary"},  # explicit "$eq" operator
        "year": 2019,  # bare value, no operator spelled out
    },
)

{'matches': [{'id': 'B',
              'metadata': {'genre': 'documentary', 'year': 2019.0},
              'score': 0.160000011,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}

In [13]:
# Upsert vectors in batches

# Define a helper function to break an iterable into chunks of size batch_size
def chunks(iterable, batch_size=100):
    """Lazily split *iterable* into consecutive tuples of up to *batch_size* items.

    Args:
        iterable: Any iterable; it is consumed once, front to back.
        batch_size: Maximum number of items per yielded tuple.

    Yields:
        Non-empty tuples in input order. Every tuple holds ``batch_size``
        items except possibly the last, which holds whatever remains.
    """
    source = iter(iterable)
    while True:
        # islice pulls at most batch_size items from the shared iterator, so
        # each pass resumes exactly where the previous batch stopped.
        batch = tuple(itertools.islice(source, batch_size))
        if not batch:
            # An empty tuple means the input is exhausted.
            return
        yield batch


# Set vector dimensions and number of vectors
vector_dim = 8
vector_count = 1000

# Lazy stream of (id, vector) pairs. Ids run from 'id-0' up to
# 'id-<vector_count - 1>'; each vector is a list of vector_dim floats drawn
# from random.random(), i.e. values in [0, 1). Nothing is generated until
# the stream is iterated.
example_data_generator = (
    (f'id-{i}', [random.random() for _ in range(vector_dim)])
    for i in range(vector_count)
)

# Upsert data with 100 vectors per upsert request
for ids_vectors_chunk in chunks(example_data_generator, batch_size=100):
    index.upsert(vectors=ids_vectors_chunk)

In [14]:
# Check the index: the default '' namespace should have grown by the 1000
# batch-upserted vectors
index.describe_index_stats()

{'dimension': 8,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1004},
                'ns1': {'vector_count': 4},
                'ns2': {'vector_count': 4}},
 'total_vector_count': 1012}

In [15]:
# Upsert vectors that carry sparse values in addition to their dense values.
# Each 'sparse_values' entry pairs a list of indices with a list of weights
# of the same length.
upsert_response = index.upsert(
    namespace='sparse-example',
    vectors=[
        {
            'id': 'vec1',
            'values': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
            'metadata': {'genre': 'drama'},
            'sparse_values': {
                'indices': [10, 45, 16],
                'values': [0.5, 0.5, 0.2],
            },
        },
        {
            'id': 'vec2',
            'values': [-0.1, -0.2, -0.3, -0.4, -0.5, -0.6, -0.7, -0.8],
            'metadata': {'genre': 'action'},
            'sparse_values': {
                'indices': [15, 40, 11],
                'values': [0.4, 0.5, 0.2],
            },
        },
    ],
)

In [16]:
# Query with a dense vector and a sparse vector together, returning the
# single best match from the 'sparse-example' namespace
index.query(
    vector=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
    sparse_vector={
        'indices': [10, 45, 12],
        'values': [0.5, 0.5, 0.2],
    },
    namespace="sparse-example",
    top_k=1,
)

{'matches': [{'id': 'vec1', 'score': 2.54, 'values': []}],
 'namespace': 'sparse-example',
 'usage': {'read_units': 5}}