# Pipelines 2: Scaling the Pinecone vector store

Copyright 2024, Denis Rothman





# Installing the environment

In [18]:
# API keys
# Store your keys in files and read them from there (you can type them directly
# in the notebook, but they will be visible to anybody next to you).
from google.colab import drive
drive.mount('/content/drive')  # mount Google Drive at /content/drive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
!pip install openai==1.33.0
!pip install pinecone-client==4.1.1



In [20]:
# Read the Pinecone API key from the first line of the key file.
# The context manager closes the file even if reading fails, and strip()
# removes the trailing newline that readline() keeps.
with open("drive/MyDrive/files/pinecone.txt", "r") as f:
    PINECONE_API_KEY = f.readline().strip()

In [21]:
# Read the OpenAI API key from the first line of the key file.
# The context manager closes the file even if reading fails, and strip()
# removes the trailing newline that readline() keeps.
with open("drive/MyDrive/files/api_key.txt", "r") as f:
    API_KEY = f.readline().strip()

In [22]:
# The OpenAI key
import os
import openai
# Export the key as an environment variable, then set it on the openai module
# by reading that same variable back.
os.environ['OPENAI_API_KEY'] =API_KEY
openai.api_key = os.getenv("OPENAI_API_KEY")

# Processing bank churn dataset

RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
    Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CustomerId          10000 non-null  int64  
 1   CreditScore         10000 non-null  int64  
 2   Age                 10000 non-null  int64  
 3   Tenure              10000 non-null  int64  
 4   Balance             10000 non-null  float64  
 5   NumOfProducts       10000 non-null  int64  
 6   HasCrCard           10000 non-null  int64  
 7   IsActiveMember      10000 non-null  int64  
 8   EstimatedSalary     10000 non-null  float64  
 9   Exited              10000 non-null  int64  
 10  Complain            10000 non-null  int64  
 11  Satisfaction Score  10000 non-null  int64  
 12  Card Type           10000 non-null  object   
 13  Point Earned        10000 non-null  int64    
dtypes: float64(2), int64(11), object(1)
memory usage: 1.1+ MB

In [None]:
# Copy the dataset from the mounted Google Drive to /content/data1.csv
!cp /content/drive/MyDrive/files/rag_c6/data1.csv /content/data1.csv

In [None]:
import pandas as pd
# Load the CSV file into a DataFrame
file_path = '/content/data1.csv'
data1 = pd.read_csv(file_path)

In [None]:
# Count the rows of the loaded DataFrame (one row per line of data)
number_of_lines = len(data1)
print("Number of lines: ",number_of_lines)

Number of lines:  10000


In [None]:
import pandas as pd

# Serialize every DataFrame row into one text line made of
# "column_name: value" pairs separated by single spaces.
output_lines = [
    ' '.join(f"{col}: {row[col]}" for col in data1.columns)
    for _, row in data1.iterrows()
]

# Preview the first 5 serialized lines
for preview_line in output_lines[:5]:
    print(preview_line)

CustomerId: 15634602 CreditScore: 619 Age: 42 Tenure: 2 Balance: 0.0 NumOfProducts: 1 HasCrCard: 1 IsActiveMember: 1 EstimatedSalary: 101348.88 Exited: 1 Complain: 1 Satisfaction Score: 2 Card Type: DIAMOND Point Earned: 464
CustomerId: 15647311 CreditScore: 608 Age: 41 Tenure: 1 Balance: 83807.86 NumOfProducts: 1 HasCrCard: 0 IsActiveMember: 1 EstimatedSalary: 112542.58 Exited: 0 Complain: 1 Satisfaction Score: 3 Card Type: DIAMOND Point Earned: 456
CustomerId: 15619304 CreditScore: 502 Age: 42 Tenure: 8 Balance: 159660.8 NumOfProducts: 3 HasCrCard: 1 IsActiveMember: 0 EstimatedSalary: 113931.57 Exited: 1 Complain: 1 Satisfaction Score: 3 Card Type: DIAMOND Point Earned: 377
CustomerId: 15701354 CreditScore: 699 Age: 39 Tenure: 1 Balance: 0.0 NumOfProducts: 2 HasCrCard: 0 IsActiveMember: 0 EstimatedSalary: 93826.63 Exited: 0 Complain: 0 Satisfaction Score: 5 Card Type: GOLD Point Earned: 350
CustomerId: 15737888 CreditScore: 850 Age: 43 Tenure: 2 Balance: 125510.82 NumOfProducts: 1 Ha

In [None]:
# Work on a shallow copy so that output_lines itself is left untouched
lines = list(output_lines)

In [None]:
# Count the serialized lines
number_of_lines = len(lines)
print("Number of lines: ",number_of_lines)

Number of lines:  10000


# Chunking and embedding the dataset

## Chunking

In [None]:
import time
start_time = time.time()  # Start the timer; the elapsed time is reported after chunking

In [None]:
# Each line becomes its own chunk: build the chunk list as a copy of the lines
chunks = list(lines)

# Report how many chunks were produced
print(f"Total number of chunks: {len(chunks)}")

Total number of chunks: 10000


In [None]:
response_time = time.time() - start_time  # Elapsed time since start_time was set
print(f"Response Time: {response_time:.2f} seconds")  # Print the elapsed time

Response Time: 0.03 seconds


In [None]:
# Print the length and content of the first 3 chunks.
# Slicing (instead of indexing with range(3)) avoids an IndexError
# when fewer than 3 chunks exist.
for chunk in chunks[:3]:
    print(len(chunk))
    print(chunk)

224
CustomerId: 15634602 CreditScore: 619 Age: 42 Tenure: 2 Balance: 0.0 NumOfProducts: 1 HasCrCard: 1 IsActiveMember: 1 EstimatedSalary: 101348.88 Exited: 1 Complain: 1 Satisfaction Score: 2 Card Type: DIAMOND Point Earned: 464
229
CustomerId: 15647311 CreditScore: 608 Age: 41 Tenure: 1 Balance: 83807.86 NumOfProducts: 1 HasCrCard: 0 IsActiveMember: 1 EstimatedSalary: 112542.58 Exited: 0 Complain: 1 Satisfaction Score: 3 Card Type: DIAMOND Point Earned: 456
229
CustomerId: 15619304 CreditScore: 502 Age: 42 Tenure: 8 Balance: 159660.8 NumOfProducts: 3 HasCrCard: 1 IsActiveMember: 0 EstimatedSalary: 113931.57 Exited: 1 Complain: 1 Satisfaction Score: 3 Card Type: DIAMOND Point Earned: 377


## Embedding

### Initializing the embedding model


In [23]:
import openai
import time

# Embedding model used by the notebook; the commented-out lines are alternatives.
# NOTE(review): the Pinecone index is created further down with dimension=1536,
# commented as the dimensionality of text-embedding-ada-002 — confirm the index
# dimension before switching to another model.
#embedding_model="text-embedding-3-small"
#embedding_model="text-embedding-3-large"
embedding_model="text-embedding-ada-002"

# Initialize the OpenAI client
client = openai.OpenAI()

def get_embedding(text, model=embedding_model):
    """Return the embedding vector of `text` computed by the OpenAI embeddings API.

    Newlines in `text` are replaced by spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model; defaults to the value that
            `embedding_model` had when this function was defined.

    Returns:
        The `embedding` field of the first item in the API response.
    """
    single_line_text = text.replace("\n", " ")
    api_response = client.embeddings.create(input=[single_line_text], model=model)
    return api_response.data[0].embedding
# The benchmark below is disabled by wrapping it in a triple-quoted string.
# It times get_embedding() on one sample chunk, 10 times for each of the three
# embedding models. Because the string is the last expression of the cell,
# the notebook echoes it as the cell output.
'''
# Uncomment the code to choose the best embedding model for a project
# Chunk
text = "CustomerId: 15634602 CreditScore: 619 Age: 42 Tenure: 2 Balance: 0.0 NumOfProducts: 1 HasCrCard: 1 IsActiveMember: 1 EstimatedSalary: 101348.88 Exited: 1 Complain: 1 Satisfaction Score: 2 Card Type: DIAMOND Point Earned: 464"


for t in range(10):
  # Choose the best embedding model for a project
  start_time = time.time()  # Start timing before the request
  embedding_model="text-embedding-ada-002"
  embedding = get_embedding(text,model=embedding_model)
  response_time = time.time() - start_time  # Measure response time
  print("Embedding:", embedding)
  print(f"ada response Time: {response_time:.2f} seconds")  # Print response time

  start_time = time.time()  # Start timing before the request
  embedding_model="text-embedding-3-small"
  embedding = get_embedding(text,model=embedding_model)
  response_time = time.time() - start_time  # Measure response time
  print("Embedding:", embedding)
  print(f"Text-small response Time: {response_time:.2f} seconds")  # Print response time

  start_time = time.time()  # Start timing before the request
  embedding_model="text-embedding-3-large"
  embedding = get_embedding(text,model=embedding_model)
  response_time = time.time() - start_time  # Measure response time
  print("Embedding:", embedding)
  print(f"Text-large response Time: {response_time:.2f} seconds")  # Print response time
'''

'\n# Uncomment the code to choose the best embedding model for a project\n# Chunk\ntext = "CustomerId: 15634602 CreditScore: 619 Age: 42 Tenure: 2 Balance: 0.0 NumOfProducts: 1 HasCrCard: 1 IsActiveMember: 1 EstimatedSalary: 101348.88 Exited: 1 Complain: 1 Satisfaction Score: 2 Card Type: DIAMOND Point Earned: 464"\n\n\nfor t in range(10):\n  # Choose the best embedding model for a project\n  start_time = time.time()  # Start timing before the request\n  embedding_model="text-embedding-ada-002"\n  embedding = get_embedding(text,model=embedding_model)\n  response_time = time.time() - start_time  # Measure response time\n  print("Embedding:", embedding)\n  print(f"ada response Time: {response_time:.2f} seconds")  # Print response time\n\n  start_time = time.time()  # Start timing before the request\n  embedding_model="text-embedding-3-small"\n  embedding = get_embedding(text,model=embedding_model)\n  response_time = time.time() - start_time  # Measure response time\n  print("Embedding:",

### Embedding

In [24]:
import openai
import time

# Initialize the OpenAI client
client = openai.OpenAI()

# Embed all chunks in fixed-size batches, pausing after each batch.
start_time = time.time()  # Start timing the whole embedding run
batch_size = 1000         # Number of chunks embedded per batch
chunk_start = 0
chunk_end = batch_size
pause_time = 3            # Seconds to sleep after each batch
embeddings = []
counter = 1

while chunk_end <= len(chunks):
    # Select the current batch of chunks
    chunks_to_embed = chunks[chunk_start:chunk_end]

    # Embed the batch chunk by chunk; the results keep the order of the chunks
    current_embeddings = [
        get_embedding(chunk, model=embedding_model) for chunk in chunks_to_embed
    ]
    print("All chunks processed.")

    # Append the embeddings of this batch to the overall list
    embeddings.extend(current_embeddings)

    # Print counter and sleep
    print(f"Batch {counter} embedded.")
    counter += 1
    time.sleep(pause_time)

    # Move the window to the next batch
    chunk_start += batch_size
    chunk_end += batch_size

# Process the remaining chunks, if any. When the loop exits, chunk_end is
# already past len(chunks), so the trailing partial batch starts at
# chunk_start; testing and slicing with chunk_end would always skip it.
if chunk_start < len(chunks):
    remaining_chunks = chunks[chunk_start:]
    remaining_embeddings = [get_embedding(chunk, model=embedding_model) for chunk in remaining_chunks]
    embeddings.extend(remaining_embeddings)

response_time = time.time() - start_time  # Measure response time
print(f"Response Time: {response_time:.2f} seconds")  # Print response time

All chunks processed.
Batch 1 embedded.
All chunks processed.
Batch 2 embedded.
All chunks processed.
Batch 3 embedded.
All chunks processed.
Batch 4 embedded.
All chunks processed.
Batch 5 embedded.
All chunks processed.
Batch 6 embedded.
All chunks processed.
Batch 7 embedded.
All chunks processed.
Batch 8 embedded.
All chunks processed.
Batch 9 embedded.
All chunks processed.
Batch 10 embedded.
Response Time: 3437.69 seconds


In [None]:
# Show the first embedding vector
print("First embedding:", embeddings[0])

First embedding overall structure: [-0.024449337273836136, -0.00936567410826683, 0.021215716376900673, -0.043824780732393265, -0.01115993969142437, 0.004347644280642271, -0.01975664310157299, 0.005810003727674484, -0.025146013125777245, -0.017167115584015846, 0.017206549644470215, 0.013033074326813221, -0.014761616475880146, -0.03141608461737633, 0.005382797680795193, 0.026026714593172073, 0.008866171352565289, -0.016076097264885902, 0.006907595321536064, -0.014827340841293335, -0.04198450967669487, 0.00994404498487711, -0.0409066341817379, 0.005885586608201265, -0.00723621528595686, -0.023923546075820923, 0.008787302300333977, -0.002464650897309184, 0.009733728133141994, -0.015182250179350376, 0.014051796868443489, -0.0027801264077425003, -0.01695679873228073, -0.008695288561284542, -0.019388588145375252, 0.004209624137729406, -0.003404504619538784, 0.02773553878068924, 0.016102386638522148, 0.005642407573759556, 0.019204560667276382, 0.008195785805583, -0.005287497770041227, 0.012487

In [26]:
# Check the lengths of the chunks and embeddings
num_chunks = len(chunks)
print(f"Number of chunks: {num_chunks}")
print(f"Number of embeddings: {len(embeddings)}")
# Fail fast on a mismatch: the upsert step pairs chunks and embeddings with
# zip(), which would silently drop the unmatched items.
assert len(embeddings) == num_chunks, (
    f"Expected {num_chunks} embeddings, got {len(embeddings)}"
)

Number of chunks: 10000
Number of embeddings: 10000


#  The Pinecone index

In [27]:
import os
from pinecone import Pinecone, ServerlessSpec

# Initialize the connection to Pinecone (get an API key at app.pinecone.io).
# Use the key read from the key file; fall back to the PINECONE_API_KEY
# environment variable only when that key is empty. The previous placeholder
# fallback string was never a valid key and the variable was never used.
api_key = PINECONE_API_KEY or os.environ.get('PINECONE_API_KEY')

pc = Pinecone(api_key=api_key)

In [40]:
from pinecone import ServerlessSpec

index_name = 'bank-index-2'
# Cloud provider and region are read from the environment; 'aws' and
# 'us-east-1' are used when the variable is unset or empty.
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

# Serverless deployment spec built from the selected cloud and region
spec = ServerlessSpec(cloud=cloud, region=region)

In [41]:
import time
import pinecone
# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of text-embedding-ada-002
        metric='cosine',
        spec=spec
    )
    # wait until the index reports that it is ready; a single fixed
    # one-second sleep does not guarantee that initialization has finished
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats (last expression, so it is displayed as the cell output)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1017581}},
 'total_vector_count': 1017581}

# Upserting

In [30]:
# upsert function
def upsert_to_pinecone(data, batch_size=100, target_index=None):
    """Upsert `data` in consecutive batches of at most `batch_size` vectors.

    Args:
        data: Sequence of vector records to upsert; it is sliced into batches.
        batch_size: Maximum number of records sent per upsert call.
        target_index: Object exposing `upsert(vectors=...)`. When omitted, the
            notebook-level `index` is used, as before.

    Raises:
        ValueError: If `batch_size` is not positive. A negative value used to
            upsert nothing without any error.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    destination = index if target_index is None else target_index
    for start in range(0, len(data), batch_size):
        batch = data[start:start + batch_size]
        destination.upsert(vectors=batch)
        #time.sleep(1)  # Optional: add delay to avoid rate limits

In [31]:
# Print the query results along with metadata
def display_results(query_results):
    """Print the id and score of every match, followed by its stored text.

    Args:
        query_results: Mapping with a 'matches' entry; each match provides
            'id' and 'score' and may carry a 'metadata' mapping with 'text'.

    For a match without 'metadata', or whose metadata has no 'text',
    "No metadata available." is printed instead of the text.
    """
    for hit in query_results['matches']:
        print(f"ID: {hit['id']}, Score: {hit['score']}")
        has_text = 'metadata' in hit and 'text' in hit['metadata']
        if not has_text:
            print("No metadata available.")
            continue
        print(f"Text: {hit['metadata']['text']}")

In [32]:
import openai
import pinecone
import time

# Number of upsert batches; every batch upserts all chunks again under new ids
bnb=2
# Initial seed for ids
#seed = 1000000000 #initial seed
seed = 10 #seed based on vector store metrics

# Upsert data to Pinecone
for b in range(bnb):
    start_time = time.time()  # Start timing before the request
    # Fresh consecutive ids for this batch, one per chunk. Sizing the id list
    # from the chunks (instead of a fixed 10000) keeps the id lookup valid
    # for any dataset size.
    ids = list(range(seed + 1, seed + len(chunks) + 1))
    seed += len(chunks)
    print("Id seed", seed)
    # Pair every chunk with its embedding and its id
    data_for_upsert = [
        {"id": str(vector_id), "values": emb, "metadata": {"text": chunk}}
        for vector_id, chunk, emb in zip(ids, chunks, embeddings)]
    upsert_to_pinecone(data_for_upsert)
    response_time = time.time() - start_time  # Measure response time
    print(f"Upsertion response time: {response_time:.2f} seconds")  # Print response time
    print("Index stats")
    print(index.describe_index_stats(include_metadata=True))
    print("Upsert batch", b,"complete")
    # Query the vector store after the first two batches
    if b==0 or b==1:
        print("Querying vector store")
        start_time = time.time()  # Start timing before the request
        query_text = "Customer Paulsen CreditScore 619Age 18 Tenure 2Balance 0.0NumOfProducts 1HasCrCard 1IsActiveMember 1EstimatedSalary 101348.88Exited 1Complain 1Satisfaction Score 2Card Type DIAMONDPoint Earned 464"
        query_embedding = get_embedding(query_text,model=embedding_model)
        query_results = index.query(vector=query_embedding, top_k=1, include_metadata=True)  # Request metadata
        #print("raw query_results",query_results)
        print("processed query results")
        display_results(query_results) #display results
        response_time = time.time() - start_time              # Measure response time
        print(f"Querying response time: {response_time:.2f} seconds")  # Print response time

Id seed 10010
Upsertion response time: 93.80 seconds
Index stats
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 9800}},
 'total_vector_count': 9800}
Upsert batch 0 complete
Querying vector store
processed query results
ID: 9531, Score: 0.941800833
Text: CustomerId: 15673180 CreditScore: 727 Age: 18 Tenure: 2 Balance: 93816.7 NumOfProducts: 2 HasCrCard: 1 IsActiveMember: 0 EstimatedSalary: 126172.11 Exited: 0 Complain: 0 Satisfaction Score: 1 Card Type: DIAMOND Point Earned: 523
Querying response time: 0.25 seconds
Id seed 20010
Upsertion response time: 94.05 seconds
Index stats
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 20300}},
 'total_vector_count': 20300}
Upsert batch 1 complete
Querying vector store
processed query results
ID: 9531, Score: 0.941800833
Text: CustomerId: 15673180 CreditScore: 727 Age: 18 Tenure: 2 Balance: 93816.7 NumOfProducts: 2 HasCrCard: 1 IsActiveMember: 0 EstimatedSalary: 126172.11 Exited: 

In [42]:
# Print the current statistics of the index
print(index.describe_index_stats(include_metadata=True))

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1017581}},
 'total_vector_count': 1017581}


In [None]:
# Close the Pinecone client (optional but good practice)
#pc.deinit()