In [12]:
import pandas as pd
from openai import OpenAI

import os
from dotenv import load_dotenv

# Call python-dotenv's load_dotenv() before the API key is looked up below.
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY_PERSONAL is not set; that value is passed straight to OpenAI(...).
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY_PERSONAL'))

# Read the processed dataset as line-delimited JSON (lines=True) into a DataFrame.
df = pd.read_json('../data/processed_data.jsonl', lines=True)

In [13]:
# Preview the first rows of the loaded DataFrame.
df.head()

Unnamed: 0,title,average_rating,rating_number,features,price,store,thumbnail
0,Mento Streamtail,2.0,1,"[Thermoplastic Rubber sole, High Density Premi...",29.81,Guy Harvey,https://m.media-amazon.com/images/I/31P-uHUUIX...
1,RONNOX Women's 3-Pairs Bright Colored Calf Com...,4.3,3032,"[Pull On closure, Size Guide: ""S"" fits calf 10...",17.99,RONNOX,https://m.media-amazon.com/images/I/51CqMDJOOD...
2,LYCKYY Women's Tie Dye Sweatshirt Crewneck Lon...,3.7,52,[Pull On closure],9.99,LYCKYY,https://m.media-amazon.com/images/I/41DW4uwWLQ...
3,Sexyshine Women's Casual Fall Knit Long Sleeve...,3.6,7,"[Cotton Blend, Asian Size,Smaller than US Size...",26.99,Sexyshine,https://m.media-amazon.com/images/I/51Cijpzflv...
4,Result Core Men's Soft Shell II Gilet Red 3XL,5.0,1,"[Polyester,Microfleece,Elastane]",53.95,Result Core,https://m.media-amazon.com/images/I/41vLlRDu6V...


In [19]:
# Create a random sample of 4000 records (random_state=42 keeps the sample reproducible).
# NOTE: this rebinds `df`, so the full dataset is no longer reachable under that name.
df = df.sample(n=4000, random_state=42)


In [20]:
import tiktoken
import concurrent.futures
from tqdm import tqdm

from tenacity import retry, wait_random_exponential, stop_after_attempt

# Model name passed as `model` to client.embeddings.create in get_embeddings.
EMBEDDING_MODEL = "text-embedding-3-small"

# Request embeddings for one batch of inputs and return them as a list of vectors
@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(10))
def get_embeddings(input):
    """Call the embeddings endpoint once and return the embedding of each response item.

    The tenacity decorator retries failed calls with a random exponential
    wait (min=1, max=40) and stops after 10 attempts.

    Args:
        input: The batch forwarded unchanged as ``input`` to
            ``client.embeddings.create``.

    Returns:
        A list holding ``.embedding`` of every item in the response's
        ``data``, in the order the response lists them.
    """
    response = client.embeddings.create(model=EMBEDDING_MODEL, input=input)
    vectors = []
    for item in response.data:
        vectors.append(item.embedding)
    return vectors


# Splits a sized, sliceable sequence into consecutive batches of at most n items.
def batchify(iterable, n=1):
    """Lazily yield consecutive slices of ``iterable``, each at most ``n`` long.

    Args:
        iterable: A sequence supporting ``len()`` and slicing.
        n: Batch size; the final batch may be shorter.

    Yields:
        ``iterable[start:stop]`` for each batch, in order.
    """
    total = len(iterable)
    starts = range(0, total, n)
    yield from (iterable[start : min(start + n, total)] for start in starts)
     

# Function for batching and parallel processing the embeddings
def embed_text(
    text,
    batch_size=64,
    num_workers=12,
    max_context_len=8191,
):
    """Embed a list of strings using batched, multi-threaded calls to get_embeddings.

    Args:
        text: List of strings to embed.
        batch_size: Number of tokenized texts sent per get_embeddings call.
        num_workers: Maximum number of threads issuing requests concurrently.
        max_context_len: Each text is truncated to this many cl100k_base tokens.

    Returns:
        A flat list of embeddings, concatenated in batch submission order
        (i.e. input order, assuming the API returns one embedding per input
        in the order given).

    Raises:
        Any exception raised by get_embeddings for a batch is re-raised when
        that batch's result is collected.
    """
    # Encode the text, truncating to max_context_len
    encoding = tiktoken.get_encoding("cl100k_base")
    encoded_text = [
        encoded_article[:max_context_len] for encoded_article in encoding.encode_batch(text)
    ]

    # Materialize the batches so each future can be paired with its real size
    batches = list(batchify(encoded_text, batch_size))

    # Embed the text
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(get_embeddings, text_batch) for text_batch in batches]
        # The last batch may be shorter than batch_size; advancing the bar by the
        # actual batch length keeps it from overshooting the total.
        items_per_future = {
            future: len(text_batch) for future, text_batch in zip(futures, batches)
        }

        with tqdm(total=len(encoded_text)) as pbar:
            for future in concurrent.futures.as_completed(futures):
                pbar.update(items_per_future[future])

        # Collect in submission order (not completion order) so results line up with the input
        embeddings = []
        for future in futures:
            embeddings.extend(future.result())

    return embeddings
    

In [21]:
# Cast titles to str so every entry handed to embed_text is a string, then take a plain list.
product_names = df['title'].astype(str).tolist()

# Embed every title with the batched, threaded helper defined above.
embeddings = embed_text(product_names)

# Assigning a list sets the column by position, so len(embeddings) must equal len(df).
df['embedding'] = embeddings


4032it [00:12, 319.20it/s]                          


In [22]:
# Persist the sampled rows with their embeddings as line-delimited JSON (one record per line).
df.to_json('../data/sample_data_with_embeddings.jsonl', orient='records', lines=True)