In [12]:
from openai import OpenAI
import os
from dotenv import load_dotenv
import pdfplumber
import pandas as pd
import tiktoken
import numpy as np
from numpy.linalg import norm

In [2]:

# Populate the process environment from a local .env file, then build the
# OpenAI client with the key stored under OPENAI_API_KEY.
load_dotenv()
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

In [3]:
# Embedding configuration shared by the cells below.
# Model name passed to get_embedding() when embedding the reviews and the sample strings.
embedding_model = "text-embedding-3-small"
# Name of the tiktoken encoding used to count tokens in each review.
embedding_encoding = "cl100k_base"
# Reviews with more tokens than this are filtered out before embedding.
max_tokens = 8000  # the maximum for text-embedding-3-small is 8191

In [4]:
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The string to embed. Newlines are replaced with single spaces
            before the request is sent.
        model: Name of the embedding model to query.

    Returns:
        The ``embedding`` attribute of the first item in the API response.

    Raises:
        ValueError: If ``text`` is an empty string. The embeddings endpoint
            rejects empty input, so fail here with a clear message instead
            of spending a network round trip on a request that cannot succeed.
    """
    if text == "":
        raise ValueError("text must be a non-empty string")

    # Flatten the text onto one line before sending it.
    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    # One input string was sent, so the embedding is read from the first result.
    return response.data[0].embedding


In [5]:
# Read the reviews CSV, keep only the columns used below, and drop any row
# that has a missing value in one of them.
input_datapath = "fine_food_reviews_1k.csv"
df = (
    pd.read_csv(input_datapath, index_col=0)
    [["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
    .dropna()
)
# Join the stripped summary and review text into the single string that is
# token-counted and embedded in later cells.
df = df.assign(
    combined="Title: " + df["Summary"].str.strip() + "; Content: " + df["Text"].str.strip()
)
df.head(2)

Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text,combined
0,1351123200,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...,Title: where does one start...and stop... wit...
1,1351123200,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos...",Title: Arrived in pieces; Content: Not pleased...


In [6]:
# Keep the top_n most recent reviews whose token count fits the embedding limit.
top_n = 1000

# Sort by time and keep the LAST (most recent) 2 * top_n rows, on the
# assumption that fewer than half of them will be removed by the length
# filter below. The Time column is only needed for the ordering, so it is
# dropped afterwards.
df = (
    df.sort_values("Time")
    .tail(top_n * 2)
    .drop(columns="Time")
)

encoding = tiktoken.get_encoding(embedding_encoding)

# Count tokens per review, then discard the reviews that are too long to embed.
df = df.assign(
    n_tokens=df["combined"].apply(lambda review: len(encoding.encode(review)))
)
df = df.loc[df["n_tokens"] <= max_tokens].tail(top_n)
len(df)

1000

In [7]:
# Slow cell: get_embedding is called once per review (one API request each),
# so this may take a few minutes. The result is written to CSV afterwards.
df["embedding"] = df["combined"].apply(get_embedding, model=embedding_model)
df.to_csv("fine_food_reviews_with_embeddings_1k.csv")

In [25]:
# Embed a short sample string; the cell displays the resulting vector.
a = get_embedding(text="hello", model=embedding_model)
a

[0.016799768432974815,
 -0.055695999413728714,
 0.005626821890473366,
 0.0662013590335846,
 0.0089280866086483,
 -0.0473034530878067,
 -0.0281414445489645,
 0.061212778091430664,
 -0.002217349363490939,
 -0.0437527596950531,
 0.009852440096437931,
 -0.03330609202384949,
 -0.012244023382663727,
 -0.028875060379505157,
 0.014598924666643143,
 0.05827832221984863,
 -0.07001615315675735,
 0.03747301921248436,
 0.010087196715176105,
 0.040788955986499786,
 0.06649480015039444,
 0.01012387778609991,
 -0.005571800749748945,
 0.02363705262541771,
 0.021759001538157463,
 0.011877215467393398,
 -0.0047391485422849655,
 0.019338073208928108,
 0.024341322481632233,
 -0.06491019576787949,
 0.03330609202384949,
 -0.04457440599799156,
 0.02036513388156891,
 -0.004966569133102894,
 0.010791466571390629,
 -0.014826345257461071,
 -0.026278065517544746,
 0.034655939787626266,
 0.0023347276728600264,
 -0.033276744186878204,
 -0.022125808522105217,
 -0.008539270609617233,
 0.05405270308256149,
 0.024400012

In [26]:
# Embed a second sample string to compare against the first one.
b = get_embedding(text="hey", model=embedding_model)
b

[0.03989354893565178,
 -0.042499154806137085,
 -5.9438338212203234e-05,
 0.09526893496513367,
 -0.0004415137227624655,
 0.0032570036128163338,
 0.015165620483458042,
 0.04358692839741707,
 0.022109678015112877,
 -0.05155552178621292,
 0.03731324523687363,
 -0.04075365141034126,
 -0.034783534705638885,
 -0.030938372015953064,
 -0.00659622298553586,
 0.0070136249996721745,
 -0.07022479176521301,
 -0.004192996770143509,
 -0.00796226691454649,
 0.03746502846479416,
 0.04088013619184494,
 0.040070630609989166,
 0.022286757826805115,
 0.03961528092622757,
 0.01041608676314354,
 0.01071965228766203,
 0.009777335450053215,
 0.00767135014757514,
 0.027750935405492783,
 -0.0033423814456909895,
 0.039362311363220215,
 -0.026207812130451202,
 0.021287523210048676,
 -0.00839864183217287,
 0.003924215212464333,
 -0.022767404094338417,
 -0.03637725114822388,
 0.020351529121398926,
 0.02512003481388092,
 -0.023893125355243683,
 0.01763208955526352,
 0.006023875437676907,
 0.053629882633686066,
 0.0414

In [27]:
# Cosine similarity: dot product of the two embeddings divided by the
# product of their Euclidean norms.
norm_product = norm(a) * norm(b)
cosine = np.dot(a, b) / norm_product
print("Cosine Similarity:", cosine)

Cosine Similarity: 0.7407149443176217
