In [41]:
import openai
import pandas as pd
import numpy as np
import regex as re
import pickle
from dotenv import load_dotenv
import os

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

In [42]:
df = pd.read_json("data/Amazon_reviews.json", lines=True)

In [43]:
# What are we working with? 
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [44]:
# Remove unnecessary columns & Cleanup 
df = df[["reviewText", "overall", "summary"]].dropna()
df = df.sample(250)

In [45]:
def clean_text(text):
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\&", " and ", text)
    text = re.sub(r"\|", " ", text)
    text = re.sub(r"\s+", " ", text)
    # Eliminate all punctuation
    text = re.sub(r"[^\w\d\s]", "", text)
    return text.strip()

df["reviewText"] = df["reviewText"].apply(clean_text)
df["summary"] = df["summary"].apply(clean_text)

In [46]:
for _, row in df.head(5).iterrows():
    print("Summary:", row["summary"])
    print("Rating:", row["overall"])
    print("Review:", row["reviewText"])
    print()

Summary: Awesome case
Rating: 5
Review: Love this case Sturdy and good for what I needed it to keep my Pikachu 3DS scratch free and still be able to see my Pikachu on the cover

Summary: Just right for the price
Rating: 4
Review: I needed an urgent upgrade for my old cell This phone provides a good performance Updates runs very easily I like the size since it fits fine if my shirt pocket try to do that with the regular Galaxy Its light and the screen resolution is perfect for reading news and basic internet browsing Only problem is trying to use the keyboard in portrait I gave up and just rotate to landscape and its less difficult In general I recommend this phone if you dont want to spend to much money and want to use your previous SIM card

Summary: never worked
Rating: 1
Review: a compete waste of money and expectations too small for apple category products and untrustworthy a completely meaningless item and shouldnt be admitted not amazon servers

Summary: Love it
Rating: 5
Review:

In [47]:
# make new column that appends Summary and review. 
# this will be the input to the model
df["text"] = ("Summary: "+ df["summary"] + "; Review: " + df["reviewText"])

In [48]:
EMBEDDING_MODEL = "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
client = openai.OpenAI()

def get_embedding(text: str, model: str = EMBEDDING_MODEL):
    return client.embeddings.create(input = [text], model=model).data[0].embedding
get_embedding("Hello world")

[-0.004755867645144463,
 0.004859873093664646,
 -0.016640810295939445,
 -0.02446955442428589,
 -0.017346784472465515,
 0.012631887570023537,
 -0.01919996552169323,
 0.009259602054953575,
 -0.01024292316287756,
 -0.026852216571569443,
 0.02288111299276352,
 0.010287046432495117,
 -0.02353665977716446,
 -0.006599593907594681,
 0.0080619677901268,
 0.0025307899340987206,
 0.024986427277326584,
 -0.012115013785660267,
 0.013173974119126797,
 0.013262221589684486,
 -0.01068415679037571,
 -0.0035487788263708353,
 0.0038261255249381065,
 0.008351922035217285,
 -0.020750585943460464,
 -0.0019288210896775126,
 0.012379754334688187,
 -0.01897304505109787,
 0.030583791434764862,
 -0.03118891268968582,
 0.003621267154812813,
 -0.0077026779763400555,
 -0.006114237010478973,
 -0.017775410786271095,
 0.004922906402498484,
 -0.015682702884078026,
 0.0013567933347076178,
 -0.015632275491952896,
 0.01972944475710392,
 -0.0159978698939085,
 0.007223624270409346,
 0.008219551295042038,
 0.0116296568885445

In [49]:
# Establish a cache of embeddings to avoid recomputing - saves time and money
# Cache is a dict of tuples (text, model) -> embedding, saved as a pickle file

# Set path to embedding cache
embedding_cache_path = "amazon_reviews_embeddings_cache.pkl"

# Load the cache if it exists, and save a copy to disk
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
with open(embedding_cache_path, "wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)

def embedding_from_string(
    string: str,
    model: str = EMBEDDING_MODEL,
    embedding_cache=embedding_cache
) -> list:
    # Return embedding of given string, using a cache to avoid recomputing.
    if (string, model) not in embedding_cache.keys():
        embedding_cache[(string, model)] = get_embedding(string, model)
        with open(embedding_cache_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
    return embedding_cache[(string, model)]

In [50]:
# Example embedding on our dataset
example_string = df["text"].values[0]
print(example_string)

example_embedding = embedding_from_string(example_string)
print(f"\nExample embedding: {example_embedding[:3]}...")

Summary: Awesome case; Review: Love this case Sturdy and good for what I needed it to keep my Pikachu 3DS scratch free and still be able to see my Pikachu on the cover

Example embedding: [0.02030399814248085, 0.018438013270497322, 0.019142650067806244]...


In [51]:
#TODO
'''
- Use cosine similarity to find distance between embedding vectors
- Using the above to obtain these distances in a sorted descending order, obtaining most similar reviews.
- Print top recommended reviews using K nearest neighbor (for string similarity)
'''

'\n- Use cosine similarity to find distance between embedding vectors\n- Using the above to obtain these distances in a sorted descending order, obtaining most similar reviews.\n- Print top recommended reviews using K nearest neighbor (for string similarity)\n'

In [52]:
def distances_from_embeddings(query_embedding: list, embeddings: list) -> list:
    """Return the distance between query and each embedding in our embeddings"""
    def cosine_similarity(embedding1, embedding2):
        return np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))

    return [cosine_similarity(query_embedding, embedding) for embedding in embeddings]

In [53]:
def indices_of_closest_matches_from_distances(distances: list) -> list:
    return (sorted(range(len(distances)), key=lambda i: distances[i]))[::-1]

In [54]:
def print_recommendations_from_strings(
    strings: list[str],
    index_of_source_string: int,
    k_nearest_neighbors: int = 1,
    model=EMBEDDING_MODEL,
) -> list[int]:
    """Print out the k nearest neighbors of a given string."""
    # get embeddings for all strings
    embeddings = [embedding_from_string(string, model=model) for string in strings]
    # get the embedding of the source string
    query_embedding = embeddings[index_of_source_string]
    # get distances between the source embedding and other embeddings
    distances = distances_from_embeddings(query_embedding, embeddings)
    
    indices_of_nearest_neighbors = indices_of_closest_matches_from_distances(distances)

    # print out source string
    query_string = strings[index_of_source_string]
    # print out its k nearest neighbors
    k_counter = 0
    for i in indices_of_nearest_neighbors:
        # skip any strings that are identical matches to the starting string
        if query_string == strings[i]:
            continue
        # stop after printing out k articles
        if k_counter >= k_nearest_neighbors:
            break
        k_counter += 1

        # print out the similar strings and their distances
        print(
            f"""
        --- Recommendation #{k_counter} (nearest neighbor {k_counter} of {k_nearest_neighbors}) ---
        String: {strings[i]}
        Distance: {distances[i]:0.3f}"""
        )

    return indices_of_nearest_neighbors

In [55]:
review_no = 0

print("Summary:", df.iloc[review_no]["summary"])
print("review:", df.iloc[review_no]["reviewText"])

Summary: Awesome case
review: Love this case Sturdy and good for what I needed it to keep my Pikachu 3DS scratch free and still be able to see my Pikachu on the cover


In [56]:
df["text"].values[review_no]

'Summary: Awesome case; Review: Love this case Sturdy and good for what I needed it to keep my Pikachu 3DS scratch free and still be able to see my Pikachu on the cover'

In [57]:
descriptions = df["text"].values
print_recommendations_from_strings(descriptions, review_no, k_nearest_neighbors=10)


        --- Recommendation #1 (nearest neighbor 1 of 10) ---
        String: Summary: Beautiful; Review: this case is amazing it adds a certain appearance to my phone and the colors are great it came right on time and i love it
        Distance: 0.896

        --- Recommendation #2 (nearest neighbor 2 of 10) ---
        String: Summary: awesome case; Review: Always love ballistic cases they are great protectingIt gives the phone a better grip feels bigger in my hands and I love that Also like that i can change the color of the little rubber edges
        Distance: 0.896

        --- Recommendation #3 (nearest neighbor 3 of 10) ---
        String: Summary: Great Case; Review: I love this case have it in multiple colors to match look of the day Fits phone perfectly and allows it to be docked without removing case Texture makes phone less slippery and easier to hold
        Distance: 0.895

        --- Recommendation #4 (nearest neighbor 4 of 10) ---
        String: Summary: Excellent; R

[0,
 134,
 224,
 218,
 247,
 89,
 188,
 190,
 182,
 83,
 151,
 41,
 226,
 181,
 240,
 21,
 170,
 39,
 186,
 248,
 222,
 171,
 96,
 205,
 17,
 23,
 3,
 155,
 114,
 112,
 116,
 210,
 132,
 53,
 40,
 234,
 137,
 126,
 233,
 80,
 121,
 192,
 142,
 66,
 88,
 180,
 185,
 168,
 101,
 43,
 145,
 176,
 15,
 8,
 242,
 177,
 51,
 156,
 129,
 104,
 103,
 194,
 108,
 212,
 22,
 207,
 172,
 19,
 100,
 33,
 195,
 105,
 28,
 71,
 236,
 123,
 76,
 36,
 61,
 217,
 68,
 166,
 18,
 20,
 27,
 160,
 141,
 169,
 32,
 113,
 219,
 206,
 78,
 157,
 187,
 56,
 67,
 73,
 150,
 208,
 216,
 5,
 198,
 84,
 98,
 58,
 148,
 90,
 128,
 122,
 11,
 202,
 13,
 14,
 81,
 229,
 133,
 55,
 86,
 57,
 228,
 174,
 95,
 238,
 65,
 230,
 131,
 35,
 10,
 135,
 201,
 125,
 204,
 12,
 49,
 59,
 16,
 179,
 196,
 243,
 221,
 173,
 146,
 232,
 197,
 115,
 130,
 209,
 144,
 164,
 82,
 237,
 94,
 4,
 74,
 245,
 31,
 63,
 72,
 1,
 200,
 159,
 175,
 34,
 211,
 87,
 124,
 91,
 119,
 42,
 138,
 37,
 246,
 184,
 97,
 239,
 235,
 92,
 30,
 25,