In [1]:
import openai
import pandas as pd
import numpy as np
import regex as re
import pickle
from dotenv import load_dotenv
import os

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_json("data/Amazon_reviews.json", lines=True)

In [3]:
# What are we working with? 
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [4]:
# Remove unnecessary columns & Cleanup 
df = df[["reviewText", "overall", "summary"]].dropna()
df = df.sample(250)

In [5]:
def clean_text(text):
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\&", " and ", text)
    text = re.sub(r"\|", " ", text)
    text = re.sub(r"\s+", " ", text)
    # Eliminate all punctuation
    text = re.sub(r"[^\w\d\s]", "", text)
    return text.strip()

df["reviewText"] = df["reviewText"].apply(clean_text)
df["summary"] = df["summary"].apply(clean_text)

In [6]:
for _, row in df.head(5).iterrows():
    print("Summary:", row["summary"])
    print("Rating:", row["overall"])
    print("Review:", row["reviewText"])
    print()

Summary: AMAZING
Rating: 5
Review: I bought these because they were the only screen protectors that were less than one dollar when including shipping To be honest i didnt expect much from them I was really surprised when I received them quickly Let me tell you something about these screen protectors THEY ARE AWESOME Ok i only had them for a few days so i dont know how durable they are but right now the are amazing I got the matte one and i really does reduce fingerprints like a lot Applying the screen protector was easy If dust gets on it just use scotch tape to get it off IT REALLY DOES WORK SURPRISING RIGHTAnyways this was a good purchase

Summary: Good product
Rating: 4
Review: Its a good product covers the phone nicely and has cut out holes for all the required connections and antennas

Summary: Nice
Rating: 4
Review: Was upset the head phones that came with my note 2 broke so ordered more there ok but it seems like cheap material guess I have to order some every time they break bu

In [7]:
# make new column that appends Summary and review. 
# this will be the input to the model
df["text"] = ("Summary: "+ df["summary"] + "; Review: " + df["reviewText"])

In [12]:
EMBEDDING_MODEL = "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
client = openai.OpenAI()

def get_embedding(text: str, model: str = EMBEDDING_MODEL):
    return client.embeddings.create(input = [text], model=model).data[0].embedding
get_embedding("Hello world")

[-0.00554704200476408,
 0.00476812245324254,
 -0.015023370273411274,
 -0.0271455030888319,
 -0.01516212522983551,
 0.015174739062786102,
 -0.017647098749876022,
 0.009555166587233543,
 -0.009347033686935902,
 -0.03085404261946678,
 0.026186833158135414,
 0.011176075786352158,
 -0.023348664864897728,
 -0.0095047103241086,
 0.007745045702904463,
 0.01047599408775568,
 0.02752392552793026,
 -0.012506861239671707,
 0.012860056012868881,
 0.014846773818135262,
 -0.007152183912694454,
 -0.003304889192804694,
 0.0026363430079072714,
 0.007202640175819397,
 -0.01984195038676262,
 -0.004008124116808176,
 0.010633670724928379,
 -0.017394818365573883,
 0.02810417301952839,
 -0.03090449795126915,
 0.003434183541685343,
 -0.00632596155628562,
 -0.007688282523304224,
 -0.019627509638667107,
 0.009422718547284603,
 -0.016902867704629898,
 0.002308376831933856,
 -0.01330785546451807,
 0.020069003105163574,
 -0.017785854637622833,
 0.007215254474431276,
 0.009649772197008133,
 0.012096903286874294,
 -0

In [13]:
# Establish a cache of embeddings to avoid recomputing - saves time and money
# Cache is a dict of tuples (text, model) -> embedding, saved as a pickle file

# Set path to embedding cache
embedding_cache_path = "amazon_reviews_embeddings_cache.pkl"

# Load the cache if it exists, and save a copy to disk
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
with open(embedding_cache_path, "wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)

def embedding_from_string(
    string: str,
    model: str = EMBEDDING_MODEL,
    embedding_cache=embedding_cache
) -> list:
    # Return embedding of given string, using a cache to avoid recomputing.
    if (string, model) not in embedding_cache.keys():
        embedding_cache[(string, model)] = get_embedding(string, model)
        with open(embedding_cache_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
    return embedding_cache[(string, model)]

In [18]:
# Example embedding on our dataset
example_string = df["text"].values[0]
print(example_string)

example_embedding = embedding_from_string(example_string)
print(f"\nExample embedding: {example_embedding[:3]}...")

Summary: AMAZING; Review: I bought these because they were the only screen protectors that were less than one dollar when including shipping To be honest i didnt expect much from them I was really surprised when I received them quickly Let me tell you something about these screen protectors THEY ARE AWESOME Ok i only had them for a few days so i dont know how durable they are but right now the are amazing I got the matte one and i really does reduce fingerprints like a lot Applying the screen protector was easy If dust gets on it just use scotch tape to get it off IT REALLY DOES WORK SURPRISING RIGHTAnyways this was a good purchase

Example embedding: [-0.007201330736279488, 0.006615251302719116, -0.0055175661109387875]...


In [16]:
#TODO
'''
- Use cosine similarity to find distance between embedding vectors
- Using the above to obtain these distances in a sorted descending order, obtaining most similar reviews.
- Print top recommended reviews using K nearest neighbor (for string similarity)
'''

'\n- Use cosine similarity to find distance between embedding vectors\n- Using the above to obtain these distances in a sorted descending order, obtaining most similar reviews.\n- Print top recommended reviews using K nearest neighbor (for string similarity)\n'

In [28]:
def distances_from_embeddings(query_embedding: list, embeddings: list) -> list:
    """Return the distance between query and each embedding in our embeddings"""
    def cosine_similarity(embedding1, embedding2):
        return np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))

    return [cosine_similarity(query_embedding, embedding) for embedding in embeddings]

In [29]:
def indices_of_closest_matches_from_distances(distance: list) -> list:
    return (sorted(range(len(distances)), key=lambda i: distances[i]))[::-1]

In [30]:
def print_recommendations_from_strings(
    strings: list[str],
    index_of_source_string: int,
    k_nearest_neighbors: int = 1,
    model=EMBEDDING_MODEL,
) -> list[int]:
    """Print out the k nearest neighbors of a given string."""
    # get embeddings for all strings
    embeddings = [embedding_from_string(string, model=model) for string in strings]
    # get the embedding of the source string
    query_embedding = embeddings[index_of_source_string]
    # get distances between the source embedding and other embeddings
    distances = distances_from_embeddings(query_embedding, embeddings)
    
    indices_of_nearest_neighbors = indices_of_closest_matches_from_distances(distances)

    # print out source string
    query_string = strings[index_of_source_string]
    # print out its k nearest neighbors
    k_counter = 0
    for i in indices_of_nearest_neighbors:
        # skip any strings that are identical matches to the starting string
        if query_string == strings[i]:
            continue
        # stop after printing out k articles
        if k_counter >= k_nearest_neighbors:
            break
        k_counter += 1

        # print out the similar strings and their distances
        print(
            f"""
        --- Recommendation #{k_counter} (nearest neighbor {k_counter} of {k_nearest_neighbors}) ---
        String: {strings[i]}
        Distance: {distances[i]:0.3f}"""
        )

    return indices_of_nearest_neighbors

In [31]:
review_no = 0

print("Summary:", df.iloc[review_no]["summary"])
print("review:", df.iloc[review_no]["reviewText"])

Summary: AMAZING
review: I bought these because they were the only screen protectors that were less than one dollar when including shipping To be honest i didnt expect much from them I was really surprised when I received them quickly Let me tell you something about these screen protectors THEY ARE AWESOME Ok i only had them for a few days so i dont know how durable they are but right now the are amazing I got the matte one and i really does reduce fingerprints like a lot Applying the screen protector was easy If dust gets on it just use scotch tape to get it off IT REALLY DOES WORK SURPRISING RIGHTAnyways this was a good purchase


In [32]:
df["text"].values[review_no]

'Summary: AMAZING; Review: I bought these because they were the only screen protectors that were less than one dollar when including shipping To be honest i didnt expect much from them I was really surprised when I received them quickly Let me tell you something about these screen protectors THEY ARE AWESOME Ok i only had them for a few days so i dont know how durable they are but right now the are amazing I got the matte one and i really does reduce fingerprints like a lot Applying the screen protector was easy If dust gets on it just use scotch tape to get it off IT REALLY DOES WORK SURPRISING RIGHTAnyways this was a good purchase'

In [33]:
descriptions = df["text"].values
print_recommendations_from_strings(descriptions, review_no, k_nearest_neighbors=10)

NameError: name 'distances' is not defined