# Generate Embeddings for text
Author: Cristian Velandia

Creation Date: 2024-03-02

Create embeddings for the vector DB using the OpenAI Embeddings API.

In [2]:
from openai import OpenAI
from tqdm.auto import tqdm
from time import sleep
import pandas as pd
import json

### Load Dataset

In [4]:
# Parquet file holding the text chunks to embed; read with the pyarrow engine.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
data_path = r"D:\Documents\GitHub\knowledge_pal_assistant\2_outputs\chunks.parquet"
chunks = pd.read_parquet(path=data_path, engine="pyarrow")

In [5]:
# Preview the first five rows of the loaded chunks.
chunks.head(n=5)

Unnamed: 0,id,page_content,metadata,tokens,nostopw_page_content
0,0-0,# AWS::Events::Rule SageMakerPipelineParameter...,{'Header 1': 'AWS::Events::Rule SageMakerPipel...,"[#, AWS, :, :Events, :, :Rule, SageMakerPipeli...",# AWS : :Events : :Rule SageMakerPipelineParam...
1,0-1,"## Syntax<a name=""aws-properties-events-rule-s...",{'Header 1': 'AWS::Events::Rule SageMakerPipel...,"[#, #, Syntax, <, a, name=, '', aws-properties...",# # Syntax < name= '' aws-properties-events-ru...
2,0-2,"## Properties<a name=""aws-properties-events-ru...",{'Header 1': 'AWS::Events::Rule SageMakerPipel...,"[#, #, Properties, <, a, name=, '', aws-proper...",# # Properties < name= '' aws-properties-event...
3,1-0,# Automating Amazon SageMaker with Amazon Even...,{'Header 1': 'Automating Amazon SageMaker with...,"[#, Automating, Amazon, SageMaker, with, Amazo...",# Automating Amazon SageMaker Amazon EventBrid...
4,1-1,"## Training job state change<a name=""eventbrid...",{'Header 1': 'Automating Amazon SageMaker with...,"[#, #, Training, job, state, change, <, a, nam...",# # Training job state change < name= '' event...


In [6]:
# (rows, columns) of the loaded chunks frame.
chunks.shape

(1046, 5)

In [7]:
# Number of entries in each row's `tokens` list; the batching loop sums this per batch.
chunks["token_length"] = chunks["tokens"].apply(len)

# Expose the row index as an `index` column, which the batching loop filters on.
# Guarded so that re-running this cell does not insert another index column.
if "index" not in chunks.columns:
    chunks = chunks.reset_index(drop=False)

### Connect to OpenAI

In [6]:
# Get API keys from a local JSON file so the secret is not written in the notebook.
# The context manager closes the file handle as soon as the JSON has been parsed.
with open('personal_creds.json', encoding="utf-8") as creds_file:
    creds = json.load(creds_file)

OPENAI_API_KEY = creds["OPENAI_API_KEY"]

In [7]:
# Client used for the embeddings requests below, authenticated with the loaded API key.
openai_client = OpenAI(
    api_key=OPENAI_API_KEY,
)

### Generate Embeddings

In [8]:
# Create batches and request embeddings.
#
# Several inputs can be embedded in a single request by passing a list of strings. The
# combined input must not exceed the model's max input tokens (8192 for
# text-embedding-ada-002). Free-tier API limits noted by the author: 200 requests per
# day, 150000 tokens per minute, 3 requests per minute.
# NOTE(review): `token_length` is the length of the `tokens` column, which may not match
# the model's own tokenizer -- confirm the 8192 check is conservative enough.

corpus_size = chunks.shape[0]   # rows still waiting to be batched
counter = 1                     # 1-based batch number
batch_size = 5                  # rows per request
valid_batch = 0                 # batches under the token limit, i.e. batches sent to the API
toggle = False                  # set when the daily request quota (RPD) is hit
max_calls = 100                 # stop after this many batches have been sent
max_tokens_per_request = 8192
max_attempts = 10
retry_wait_seconds = 60

# Collected in plain lists and turned into a DataFrame once, after the loop; this avoids
# re-copying the whole frame with pd.concat on every batch.
embedded_ids = []
embedded_vectors = []

while corpus_size > 0:
    lower = batch_size * (counter - 1)
    upper = batch_size * counter
    batch = chunks[(chunks["index"] >= lower) & (chunks["index"] < upper)]
    corpus_size -= batch.shape[0]
    text_list = batch["page_content"].to_list()
    total_tokens = batch["token_length"].sum()

    if total_tokens <= max_tokens_per_request:
        valid_batch += 1
        print("ammount of valid batches: ", valid_batch)
        valid_ids = batch["id"].tolist()

        # Reset per batch so a failed request can never reuse the previous batch's response.
        embeddings = None

        # Retry up to `max_attempts` times, waiting between attempts to stay under the
        # per-minute request limit.
        for attempt in range(max_attempts):
            try:
                # Call Embeddings API
                embeddings = openai_client.embeddings.create(input=text_list, model="text-embedding-ada-002")
            except Exception as error:
                print(type(error).__name__, "-", error, "now sleep")
                if "RPD" in str(error):  # daily quota exhausted: retrying will not help
                    toggle = True
                    break
                if attempt < max_attempts - 1:  # no point sleeping after the final attempt
                    sleep(retry_wait_seconds)
            else:
                print("ok")
                break

        if embeddings is not None:
            # Only store vectors when this batch's request actually succeeded.
            embedded_ids.extend(valid_ids)
            embedded_vectors.extend(item.embedding for item in embeddings.data)
        elif not toggle:
            print("Skipping batch {}: no embeddings after {} attempts".format(counter, max_attempts))
    else:
        print("Skipping batch {}: {} tokens exceed the {}-token request limit".format(counter, total_tokens, max_tokens_per_request))

    if toggle or valid_batch >= max_calls:
        print("Terminating due to max requests exceeded")
        break

    print("count {}, batch {}, tokens in batch {}, remaining rows {}, vectors generated {}".format(counter, batch.shape[0], total_tokens, corpus_size, len(embedded_ids)))

    counter += 1

# One row per embedded chunk: the index holds the chunk id, `vect` holds the embedding list.
vect_df = pd.DataFrame({"vect": embedded_vectors}, index=embedded_ids)

valid_batch

ammount of valid batches:  1
ok
count 1, batch 5, tokens in batch 1077, remaining rows 1041, vectors generated 5
ammount of valid batches:  2
ok
count 2, batch 5, tokens in batch 1757, remaining rows 1036, vectors generated 10
ammount of valid batches:  3
ok
count 3, batch 5, tokens in batch 1897, remaining rows 1031, vectors generated 15
ammount of valid batches:  4
ok
count 4, batch 5, tokens in batch 1030, remaining rows 1026, vectors generated 20
ammount of valid batches:  5
ok
count 5, batch 5, tokens in batch 777, remaining rows 1021, vectors generated 25
ammount of valid batches:  6
ok
count 6, batch 5, tokens in batch 474, remaining rows 1016, vectors generated 30
ammount of valid batches:  7
ok
count 7, batch 5, tokens in batch 483, remaining rows 1011, vectors generated 35
ammount of valid batches:  8
ok
count 8, batch 5, tokens in batch 804, remaining rows 1006, vectors generated 40
ammount of valid batches:  9
ok
count 9, batch 5, tokens in batch 575, remaining rows 1001, v

100

In [9]:
# Move the chunk ids out of vect_df's index into an `index` column so they can be used as
# the join key, then keep only the chunks that received an embedding (inner join).
vector_lookup = vect_df.reset_index(drop=False)
vectorized_chunks = chunks.merge(
    right=vector_lookup,
    how="inner",
    left_on="id",
    right_on="index",
)

In [10]:
# Preview the first five merged rows, including the new `vect` column.
vectorized_chunks.head(n=5)

Unnamed: 0,index_x,id,page_content,metadata,tokens,nostopw_page_content,token_length,index_y,vect
0,0,0-0,# AWS::Events::Rule SageMakerPipelineParameter...,{'Header 1': 'AWS::Events::Rule SageMakerPipel...,"[#, AWS, :, :Events, :, :Rule, SageMakerPipeli...",# AWS : :Events : :Rule SageMakerPipelineParam...,39,0-0,"[-0.036196205765008926, 0.024841105565428734, ..."
1,1,0-1,"## Syntax<a name=""aws-properties-events-rule-s...",{'Header 1': 'AWS::Events::Rule SageMakerPipel...,"[#, #, Syntax, <, a, name=, '', aws-properties...",# # Syntax < name= '' aws-properties-events-ru...,114,0-1,"[-0.03795338422060013, 0.021498069167137146, 0..."
2,2,0-2,"## Properties<a name=""aws-properties-events-ru...",{'Header 1': 'AWS::Events::Rule SageMakerPipel...,"[#, #, Properties, <, a, name=, '', aws-proper...",# # Properties < name= '' aws-properties-event...,164,0-2,"[-0.03936085104942322, 0.019204849377274513, 0..."
3,3,1-0,# Automating Amazon SageMaker with Amazon Even...,{'Header 1': 'Automating Amazon SageMaker with...,"[#, Automating, Amazon, SageMaker, with, Amazo...",# Automating Amazon SageMaker Amazon EventBrid...,315,1-0,"[-0.0347835049033165, -0.01207298319786787, -0..."
4,4,1-1,"## Training job state change<a name=""eventbrid...",{'Header 1': 'Automating Amazon SageMaker with...,"[#, #, Training, job, state, change, <, a, nam...",# # Training job state change < name= '' event...,445,1-1,"[-0.035871874541044235, 0.0021414232905954123,..."


In [11]:
# Output folder for the embedded chunks. Every backslash is escaped: "\G" is not a valid
# escape sequence and triggers a warning on recent Python versions. The resulting path
# string is unchanged.
# NOTE(review): absolute, machine-specific path -- consider a configurable output directory.
save_folder = "D:\\Documents\\GitHub\\knowledge_pal_assistant\\2_outputs\\"

# Persist the chunks with their embeddings as brotli-compressed parquet, without the index.
vectorized_chunks.to_parquet(save_folder + "vectors.parquet", index=False, engine="pyarrow", compression="brotli")