# Pipelines 2: Scaling the Pinecone vector store

copyright 2024, Denis Rothman





# Installing the environment

In [1]:
# API keys
# Store your key in a file and read it (you can type it directly in the notebook, but it will be visible to anyone next to you).
# Mount Google Drive at /content/drive; later cells read the key files from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install openai==1.40.3
!pip install pinecone-client==5.0.1

Collecting openai==1.40.3
  Downloading openai-1.40.3-py3-none-any.whl.metadata (22 kB)
Collecting httpx<1,>=0.23.0 (from openai==1.40.3)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai==1.40.3)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai==1.40.3)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai==1.40.3)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.40.3-py3-none-any.whl (360 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m360.7/360.7 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5

In [3]:
# Read the Pinecone API key from the first line of the key file.
# The context manager closes the file even if reading fails, and strip()
# removes the trailing newline that readline() keeps, which would otherwise
# become part of the key.
with open("drive/MyDrive/files/pinecone.txt", "r") as f:
    PINECONE_API_KEY = f.readline().strip()

In [4]:
# Read the OpenAI API key from the first line of the key file.
# The context manager closes the file even if reading fails, and strip()
# removes the trailing newline that readline() keeps, which would otherwise
# become part of the key.
with open("drive/MyDrive/files/api_key.txt", "r") as f:
    API_KEY = f.readline().strip()

In [5]:
#The OpenAI Key
import os
import openai
# Export the key as an environment variable; openai.OpenAI() is later called without an explicit key.
os.environ['OPENAI_API_KEY'] =API_KEY
# Also set the module-level key on the openai package from the same environment variable.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Processing bank churn dataset

RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
    Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CustomerId          10000 non-null  int64  
 1   CreditScore         10000 non-null  int64  
 2   Age                 10000 non-null  int64  
 3   Tenure              10000 non-null  int64  
 4   Balance             10000 non-null  float64  
 5   NumOfProducts       10000 non-null  int64  
 6   HasCrCard           10000 non-null  int64  
 7   IsActiveMember      10000 non-null  int64  
 8   EstimatedSalary     10000 non-null  float64  
 9   Exited              10000 non-null  int64  
 10  Complain            10000 non-null  int64  
 11  Satisfaction Score  10000 non-null  int64  
 12  Card Type           10000 non-null  object   
 13  Point Earned        10000 non-null  int64    
dtypes: float64(2), int64(11), object(1)
memory usage: 1.1+ MB

You can retrieve `data1.csv` from your Google Drive if you generated it with `Pipeline_1_Collecting_and_preparing_the_dataset.ipynb` and saved it there

or

You can download the ready-to-use `data1.csv` from the GitHub repository.


In [6]:
#Run this cell if you saved data1.csv in your Google Drive
#or comment out this cell and run the next cell.
!cp /content/drive/MyDrive/files/rag_c6/data1.csv /content/data1.csv

cp: cannot stat '/content/drive/MyDrive/files/rag_c6/data1.csv': No such file or directory


In [8]:
#Uncomment the following command if you want to retrieve the ready-to-use data1.csv file
#!curl -o data1.csv https://raw.githubusercontent.com/Denis2054/RAG-Driven-Generative-AI/main/Chapter06/data1.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100    14  100    14    0     0    113      0 --:--:-- --:--:-- --:--:--   113


In [9]:
import pandas as pd
# Load the CSV file that one of the two previous cells placed at /content/data1.csv
file_path = '/content/data1.csv'
data1 = pd.read_csv(file_path)

In [10]:
# Count the rows of the DataFrame (each row becomes one text line, and later one chunk)
number_of_lines = len(data1)
print("Number of lines: ",number_of_lines)

Number of lines:  10000


In [11]:
import pandas as pd

# Render every DataFrame row as one text line made of "column_name: value"
# pairs separated by single spaces.
output_lines = [
    ' '.join(f"{col}: {row[col]}" for col in data1.columns)
    for _, row in data1.iterrows()
]

# Preview the first 5 rendered lines
for line in output_lines[:5]:
    print(line)

CustomerId: 15634602 CreditScore: 619 Age: 42 Tenure: 2 Balance: 0.0 NumOfProducts: 1 HasCrCard: 1 IsActiveMember: 1 EstimatedSalary: 101348.88 Exited: 1 Complain: 1 Satisfaction Score: 2 Card Type: DIAMOND Point Earned: 464
CustomerId: 15647311 CreditScore: 608 Age: 41 Tenure: 1 Balance: 83807.86 NumOfProducts: 1 HasCrCard: 0 IsActiveMember: 1 EstimatedSalary: 112542.58 Exited: 0 Complain: 1 Satisfaction Score: 3 Card Type: DIAMOND Point Earned: 456
CustomerId: 15619304 CreditScore: 502 Age: 42 Tenure: 8 Balance: 159660.8 NumOfProducts: 3 HasCrCard: 1 IsActiveMember: 0 EstimatedSalary: 113931.57 Exited: 1 Complain: 1 Satisfaction Score: 3 Card Type: DIAMOND Point Earned: 377
CustomerId: 15701354 CreditScore: 699 Age: 39 Tenure: 1 Balance: 0.0 NumOfProducts: 2 HasCrCard: 0 IsActiveMember: 0 EstimatedSalary: 93826.63 Exited: 0 Complain: 0 Satisfaction Score: 5 Card Type: GOLD Point Earned: 350
CustomerId: 15737888 CreditScore: 850 Age: 43 Tenure: 2 Balance: 125510.82 NumOfProducts: 1 Ha

In [12]:
# Work on a shallow copy so that output_lines itself is left untouched
lines = output_lines.copy()

In [13]:
# Count the lines; the copy should hold one line per DataFrame row
number_of_lines = len(lines)
print("Number of lines: ",number_of_lines)

Number of lines:  10000


# Chunking and embedding the dataset

## Chunking

In [14]:
import time
start_time = time.time()  # Start timing before chunking

# Every line is its own chunk, so the chunk list is a plain copy of the lines
chunks = list(lines)

print(f"Total number of chunks: {len(chunks)}")

# Report how long the chunking step took
response_time = time.time() - start_time
print(f"Response Time: {response_time:.2f} seconds")

Total number of chunks: 10000
Response Time: 0.01 seconds


In [15]:
# Print the length and content of the first 3 chunks
for i in range(3):
    print(len(chunks[i]))
    print(chunks[i])

224
CustomerId: 15634602 CreditScore: 619 Age: 42 Tenure: 2 Balance: 0.0 NumOfProducts: 1 HasCrCard: 1 IsActiveMember: 1 EstimatedSalary: 101348.88 Exited: 1 Complain: 1 Satisfaction Score: 2 Card Type: DIAMOND Point Earned: 464
229
CustomerId: 15647311 CreditScore: 608 Age: 41 Tenure: 1 Balance: 83807.86 NumOfProducts: 1 HasCrCard: 0 IsActiveMember: 1 EstimatedSalary: 112542.58 Exited: 0 Complain: 1 Satisfaction Score: 3 Card Type: DIAMOND Point Earned: 456
229
CustomerId: 15619304 CreditScore: 502 Age: 42 Tenure: 8 Balance: 159660.8 NumOfProducts: 3 HasCrCard: 1 IsActiveMember: 0 EstimatedSalary: 113931.57 Exited: 1 Complain: 1 Satisfaction Score: 3 Card Type: DIAMOND Point Earned: 377


## Embedding

### Initializing the embedding model


In [None]:
import openai
import time

# Embedding model used by the notebook; the commented lines are alternative models.
# NOTE(review): the Pinecone index is created further down with dimension=1536 — confirm the dimension if you switch models.
embedding_model="text-embedding-3-small"
#embedding_model="text-embedding-3-large"
#embedding_model="text-embedding-ada-002"

# Initialize the OpenAI client
client = openai.OpenAI()

def get_embedding(text, model=embedding_model):
    """Return the embedding vector of a single text.

    Newlines in `text` are replaced by spaces before the request is sent.

    NOTE: a batch function with the same name is defined right after this
    one in the same cell and replaces it.
    """
    cleaned = text.replace("\n", " ")
    result = client.embeddings.create(input=[cleaned], model=model)
    return result.data[0].embedding


def get_embedding(texts, model="text-embedding-3-small"):
    """Embed one text or a batch of texts with the global OpenAI `client`.

    Parameters:
        texts (str | list[str]): A single text, or a list of texts that are
            sent together in one request.
        model (str): Name of the embedding model.

    Returns:
        list: For a list input, one embedding per entry of `response.data`.
        For a single string, the embedding of that text alone. An empty
        list input returns an empty list without calling the API.
    """
    # A bare string would otherwise be iterated character by character and
    # every character would be embedded on its own.
    single = isinstance(texts, str)
    batch = [texts] if single else list(texts)

    batch = [text.replace("\n", " ") for text in batch]  # Clean input texts
    if not batch:
        return []  # Nothing to embed: skip the API call

    response = client.embeddings.create(input=batch, model=model)  # One API call for the whole batch
    embeddings = [res.embedding for res in response.data]  # Extract embeddings
    return embeddings[0] if single else embeddings


### Embedding the chunks

    Parameters:
        chunks (list): List of text chunks to be embedded.
        embedding_model (str): Model to be used for embedding.
        batch_size (int): Number of chunks to process per batch.
        pause_time (int): Time to wait between batches (in seconds).
    

In [16]:
def embed_chunks(chunks, embedding_model="text-embedding-3-small", batch_size=1000, pause_time=3):
    """Embed text chunks in batches and return all embeddings in one list.

    Parameters:
        chunks (list): List of text chunks to be embedded.
        embedding_model (str): Model to be used for embedding.
        batch_size (int): Number of chunks to process per batch (must be >= 1).
        pause_time (int): Time to wait between batches (in seconds).

    Returns:
        list: The embeddings returned by `get_embedding` for each batch,
        concatenated in batch order.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    # A negative step would make range() empty and silently return nothing.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    start_time = time.time()  # Start timing the operation
    embeddings = []  # Collected embeddings of all batches

    # Process chunks in batches, numbering them from 1 for the progress output
    for counter, i in enumerate(range(0, len(chunks), batch_size), start=1):
        chunk_batch = chunks[i:i + batch_size]  # Select a batch of chunks

        # Get the embeddings for the current batch and append them to the final list
        embeddings.extend(get_embedding(chunk_batch, model=embedding_model))

        print(f"Batch {counter} embedded.")

        # Pause only when another batch follows; waiting after the final
        # batch would just delay the result.
        if i + batch_size < len(chunks):
            time.sleep(pause_time)  # Optional: adjust or remove this depending on rate limits

    # Print total response time
    response_time = time.time() - start_time
    print(f"Total Response Time: {response_time:.2f} seconds")

    return embeddings

# Embed all chunks with the default model, batch size and pause time
embeddings = embed_chunks(chunks)


Batch 1 embedded.
Batch 2 embedded.
Batch 3 embedded.
Batch 4 embedded.
Batch 5 embedded.
Batch 6 embedded.
Batch 7 embedded.
Batch 8 embedded.
Batch 9 embedded.
Batch 10 embedded.
Total Response Time: 53.71 seconds


In [17]:
# Inspect the first embedding vector as a sanity check
print("First embedding:", embeddings[0])

First embedding: [0.00471159815788269, -0.008987484499812126, 0.000833576254080981, 0.03757462650537491, 0.022479787468910217, -0.02569962665438652, -0.010036147199571133, 0.07893034815788269, 0.00973336398601532, -0.04953824356198311, 0.05131063237786293, -0.04587531089782715, -0.028136659413576126, -0.007085859309881926, 0.0701865628361702, 0.046407025307416916, -0.03648165240883827, 0.020072294399142265, -0.025418998673558235, 0.04915422573685646, 0.03479788452386856, 0.06735074520111084, 0.015981031581759453, -0.021992381662130356, 0.0013634463539347053, 0.037545084953308105, -0.05754353106021881, -0.041621576994657516, 0.03550684079527855, -0.05760261043906212, 0.060911066830158234, -0.03317319601774216, 0.027796952053904533, 0.0382835790514946, 0.014378497377038002, 0.02530083805322647, -0.0003210145514458418, 0.050069961696863174, 0.036511193960905075, -0.04682058468461037, -0.05012904107570648, 0.02265702746808529, 0.017901118844747543, -9.767750452738255e-05, -0.01446711737662

Control output: check that the number of embeddings matches the number of chunks.

In [18]:
# Check the lengths of the chunks and embeddings: the two printed numbers should be equal
num_chunks = len(chunks)
print(f"Number of chunks: {num_chunks}")
print(f"Number of embeddings: {len(embeddings)}")

Number of chunks: 10000
Number of embeddings: 10000


In [19]:
# Duplication factor: every chunk/embedding pair is repeated dsize times,
# multiplying the volume of data sent to the vector store.
dsize = 5  # You can set this to any value between 1 and n as per your experimentation requirements
total = dsize * len(chunks)
print("Total size", total)

# Repeat each entry dsize times in place, keeping chunks and embeddings aligned by position
positions = range(len(chunks))
duplicated_chunks = [chunks[i] for i in positions for _ in range(dsize)]
duplicated_embeddings = [embeddings[i] for i in positions for _ in range(dsize)]

# Checking the lengths of the duplicated lists
print(f"Number of duplicated chunks: {len(duplicated_chunks)}")
print(f"Number of duplicated embeddings: {len(duplicated_embeddings)}")

Total size 50000
Number of duplicated chunks: 50000
Number of duplicated embeddings: 50000


#  The Pinecone index

In [21]:
import os
from pinecone import Pinecone, ServerlessSpec

# Initialize the connection to Pinecone (get an API key at app.pinecone.io).
# The key read from the key file is used; the PINECONE_API_KEY environment
# variable is only a fallback when that key is empty. Previously `api_key`
# fell back to the placeholder string 'PINECONE_API_KEY' and was never
# passed to the client.
api_key = PINECONE_API_KEY or os.environ.get('PINECONE_API_KEY')
pc = Pinecone(api_key=api_key)

In [20]:
from pinecone import ServerlessSpec

# Name of the Pinecone index used by the following cells
index_name = 'bank-index-50000'
# Cloud provider and region; each can be overridden through an environment variable
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

# Serverless specification passed to create_index when the index is created
spec = ServerlessSpec(cloud=cloud, region=region)

In [22]:
import time
import pinecone
# Create the index only if it does not exist yet (it shouldn't if this is the first run)
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=1536,  # dimension of the embedding model
        metric='cosine',
        spec=spec
    )
    # Poll until Pinecone reports the new index as ready; a fixed
    # one-second sleep does not guarantee that it can accept requests yet.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

# Upserting

In [23]:
# upsert function
def upsert_to_pinecone(data, batch_size):
    """Upsert `data` into the global Pinecone `index`, `batch_size` records per call.

    Parameters:
        data (list): Records to upsert; each slice is passed as `vectors`.
        batch_size (int): Maximum number of records per upsert call.
    """
    offsets = range(0, len(data), batch_size)
    for offset in offsets:
        index.upsert(vectors=data[offset:offset + batch_size])
        # Optional: add a delay here (e.g. time.sleep(1)) to avoid rate limits

In [24]:
import pinecone
import time
import sys

# The timer starts here, so the reported time also covers preparing the records further down, not only the upsert calls.
start_time = time.time()  # Start timing before the request

# Function to calculate the size of a batch
def get_batch_size(data, limit=4000000):  # limit set to 4MB to be safe
    """Return how many leading items of `data` fit into one batch.

    The size of an item is estimated as the sum of `sys.getsizeof` over its
    top-level values. This is a shallow in-memory estimate (nested contents
    are not counted), not the size of the serialized request.

    Parameters:
        data (iterable of dict): Items to be batched, in order.
        limit (int): Maximum estimated size of a batch, in bytes.

    Returns:
        int: Number of leading items whose estimated sizes sum to at most
        `limit`. It is 0 only when `data` is empty: a first item that is
        larger than `limit` on its own still forms a batch of one, so the
        caller never stops early and silently drops the remaining items.
    """
    total_size = 0
    batch_size = 0
    for item in data:
        item_size = sum(sys.getsizeof(v) for v in item.values())
        # The first item is always accepted (batch_size == 0); later items
        # are accepted only while the running total stays within the limit.
        if batch_size and total_size + item_size > limit:
            break
        total_size += item_size
        batch_size += 1
    return batch_size

def batch_upsert(data):
    """Upsert all of `data`, sizing each batch with `get_batch_size`.

    Progress is printed after every batch. The loop stops early if
    `get_batch_size` yields an empty batch for the remaining items.

    Parameters:
        data (list): Records to upsert.
    """
    total = len(data)
    position = 0
    while position < total:
        size = get_batch_size(data[position:])
        current = data[position:position + size]
        if not current:
            # No item fits into a batch: stop instead of looping forever
            break
        upsert_to_pinecone(current, size)
        position += size
        print(f"Upserted {position}/{total} items...")  # Display current progress
    print("Upsert complete.")

# Sequential string IDs starting at "1", one per duplicated chunk
ids = [str(n) for n in range(1, len(duplicated_chunks) + 1)]

# Build one record per vector: its ID, the embedding values and the chunk text as metadata
data_for_upsert = [
    {"id": vector_id, "values": emb, "metadata": {"text": chunk}}
    for vector_id, chunk, emb in zip(ids, duplicated_chunks, duplicated_embeddings)
]

# Upsert data in batches
batch_upsert(data_for_upsert)

# Report the time elapsed since start_time was set
response_time = time.time() - start_time
print(f"Upsertion response time: {response_time:.2f} seconds")

Upserted 316/50000 items...
Upserted 632/50000 items...
Upserted 948/50000 items...
Upserted 1264/50000 items...
Upserted 1580/50000 items...
Upserted 1896/50000 items...
Upserted 2212/50000 items...
Upserted 2528/50000 items...
Upserted 2844/50000 items...
Upserted 3160/50000 items...
Upserted 3476/50000 items...
Upserted 3792/50000 items...
Upserted 4108/50000 items...
Upserted 4424/50000 items...
Upserted 4740/50000 items...
Upserted 5056/50000 items...
Upserted 5372/50000 items...
Upserted 5688/50000 items...
Upserted 6004/50000 items...
Upserted 6320/50000 items...
Upserted 6636/50000 items...
Upserted 6952/50000 items...
Upserted 7268/50000 items...
Upserted 7584/50000 items...
Upserted 7900/50000 items...
Upserted 8216/50000 items...
Upserted 8532/50000 items...
Upserted 8848/50000 items...
Upserted 9164/50000 items...
Upserted 9480/50000 items...
Upserted 9796/50000 items...
Upserted 10112/50000 items...
Upserted 10428/50000 items...
Upserted 10744/50000 items...
Upserted 11060

In [25]:
# Show the index statistics after the upsert
print("Index stats")
# NOTE(review): confirm that describe_index_stats accepts include_metadata in the installed client version
print(index.describe_index_stats(include_metadata=True))

Index stats
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 50000}},
 'total_vector_count': 50000}


## Querying the index

In [26]:
# Print the query results along with metadata
def display_results(query_results):
    """Print the ID, score and stored text of every match in `query_results`.

    Parameters:
        query_results: Mapping with a 'matches' entry; each match provides
            'id' and 'score' and may carry a 'metadata' mapping with 'text'.
    """
    for match in query_results['matches']:
        print(f"ID: {match['id']}, Score: {match['score']}")
        has_text = 'metadata' in match and 'text' in match['metadata']
        if not has_text:
            print("No metadata available.")
            continue
        print(f"Text: {match['metadata']['text']}")

In [27]:
embedding_model = "text-embedding-3-small"
def get_embedding(text, model=embedding_model):
    """Return the embedding vector of a single text, used here for the query.

    Newlines in `text` are replaced by spaces before the request is sent
    through the global `client`. This definition replaces any
    `get_embedding` defined earlier in the notebook.
    """
    single_line = text.replace("\n", " ")
    result = client.embeddings.create(input=[single_line], model=model)
    return result.data[0].embedding

In [28]:
import openai
# Initialize the OpenAI client
client = openai.OpenAI()

print("Querying vector store")
start_time = time.time()  # Start timing before the request
# Free-text description of the customer profile to search for
query_text = "Customer Robertson CreditScore 632Age 21 Tenure 2Balance 0.0NumOfProducts 1HasCrCard 1IsActiveMember 1EstimatedSalary 99000 Exited 1Complain 1Satisfaction Score 2Card Type DIAMONDPoint Earned 399"
# Embed the query text, then retrieve the single closest vector (top_k=1) together with its metadata
query_embedding = get_embedding(query_text,model=embedding_model)
query_results = index.query(vector=query_embedding, top_k=1, include_metadata=True)  # Request metadata
#print("raw query_results",query_results)
print("processed query results")
display_results(query_results) #display results
# The measured time covers embedding the query, the index query and printing the results
response_time = time.time() - start_time              # Measure response time
print(f"Querying response time: {response_time:.2f} seconds")  # Print response time

Querying vector store
processed query results
ID: 1686, Score: 0.824840188
Text: CustomerId: 15648064 CreditScore: 649 Age: 33 Tenure: 2 Balance: 0.0 NumOfProducts: 2 HasCrCard: 1 IsActiveMember: 0 EstimatedSalary: 2010.98 Exited: 0 Complain: 0 Satisfaction Score: 3 Card Type: DIAMOND Point Earned: 720
Querying response time: 0.52 seconds


In [None]:
# Close the Pinecone client (optional but good practice)
#pc.deinit()