In [None]:
%%capture
%pip install openai
%pip install langchain-openai


In [1]:
from openai import OpenAI

# Amazon Data

## Data Loader

In [14]:
## Dataset format
# NOTE(review): the fields listed here (author, subreddit, summary, ...) do not
# match the Amazon review records used in this notebook, whose keys look like
# product_productId, review_text, review_score. This string appears to be left
# over from a different dataset -- confirm, then update or remove it.
"""FeaturesDict({
    'author': string,
    'body': string,
    'content': string,
    'id': string,
    'normalizedBody': string,
    'subreddit': string,
    'subreddit_id': string,
    'summary': string,
})"""

"FeaturesDict({\n    'author': string,\n    'body': string,\n    'content': string,\n    'id': string,\n    'normalizedBody': string,\n    'subreddit': string,\n    'subreddit_id': string,\n    'summary': string,\n})"

In [17]:
# Files downloaded from http://snap.stanford.edu/data/web-Amazon-links.html
# Base name of the review dump: '.txt' is appended to read the raw file and
# '_grouped.json' is appended to name the grouped output.
reviews_file = 'Watches'

In [19]:
import json

def parse_file_to_json_grouped_by_product_id(file_path):
    """Parse a review text file and group its records by product ID.

    The file is read as blank-line-separated records made of ``key: value``
    lines. Every ``/`` in a key is replaced with ``_`` (``product/productId``
    becomes ``product_productId``) and values are stripped of surrounding
    whitespace.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A JSON string (indented by 4 spaces) mapping each product ID to the
        list of records found for it, in file order.

    Raises:
        ValueError: If a non-blank line does not contain ``': '``.
        KeyError: If a record has no ``product/productId`` line.
    """
    products = {}

    def _store(entry):
        # File a finished record under its product ID.
        products.setdefault(entry['product_productId'], []).append(entry)

    with open(file_path, 'r') as file:
        current_entry = {}
        for line in file:
            if line.strip():
                key, value = line.split(': ', 1)
                # Adjusting for keys that contain '/'
                current_entry[key.replace('/', '_')] = value.strip()
            elif current_entry:
                # A blank line closes the record being built.
                _store(current_entry)
                current_entry = {}
        # The last record is not necessarily followed by a blank line, so
        # flush whatever is still pending once the whole file has been read.
        if current_entry:
            _store(current_entry)

    # Convert the dictionary to JSON
    return json.dumps(products, indent=4)

# Parse "<reviews_file>.txt" and save the grouped result as
# "<reviews_file>_grouped.json".
file_path = f'{reviews_file}.txt'
json_output = parse_file_to_json_grouped_by_product_id(file_path)

grouped_json_path = f'{reviews_file}_grouped.json'
with open(grouped_json_path, 'w') as grouped_file:
    grouped_file.write(json_output)


In [20]:
def load_products_from_json(file_path):
    """Read the JSON file at ``file_path`` and return the decoded object."""
    with open(file_path, 'r') as json_source:
        return json.load(json_source)

# Reload the grouped reviews from the '<reviews_file>_grouped.json' file.
products = load_products_from_json(reviews_file + '_grouped.json')

In [21]:
# Tally the reviews per product, then report the product with the largest tally.
review_counts = {pid: len(entries) for pid, entries in products.items()}

max_reviews_product_id = max(review_counts, key=lambda pid: review_counts[pid])
max_reviews_count = review_counts[max_reviews_product_id]

print(f'Most Reviews: {max_reviews_product_id} with {max_reviews_count} reviews')

Most Reviews: B0006AAS4M with 685 reviews


In [24]:
import random
# Print one randomly chosen review record of the most-reviewed product.
max_reviews = products[max_reviews_product_id]
# random.choice always picks a valid element. random.randint(0, len(max_reviews))
# includes its upper bound, so it could index one past the end of the list.
print(json.dumps(random.choice(max_reviews), indent=4, sort_keys=True))

{
    "product_price": "79.85",
    "product_productId": "B0006AAS4M",
    "product_title": "Invicta Men's 8926 Pro Diver Collection Automatic Watch",
    "review_helpfulness": "0/1",
    "review_profileName": "Bubba",
    "review_score": "2.0",
    "review_summary": "What customer service.",
    "review_text": "I emailed Invicta for advice in adjusting the watchband and was ignored. I bought the watch to use while snorkling and I got the automatic because I do not think the quartz watches are much different among brands except for the case and therefore not worth the premium price. I am sorry I bought an Invicta because the total lack of customer service.",
    "review_time": "1360540800",
    "review_userId": "A1U6FVN74QJ341"
}


### Finally getting list of Reviews
We may also be able to use other fields, such as helpfulness.

In [35]:
def extract_reviews_most_freq(products, most_freq_id):
    """Return the non-empty ``review_text`` values of one product's entries.

    Args:
        products: Mapping from product ID to a list of review entry dicts.
        most_freq_id: Key of the product whose review texts are wanted.

    Returns:
        List of review texts in entry order. Entries whose ``review_text`` is
        missing or empty are left out.
    """
    texts = (entry.get("review_text", "") for entry in products[most_freq_id])
    return [text for text in texts if text]

reviews = extract_reviews_most_freq(products, max_reviews_product_id)
# Print a random review. random.choice is used because
# random.randint(0, len(reviews)) includes its upper bound and can therefore
# raise IndexError by indexing one past the end of the list.
print(random.choice(reviews))
reviews_stringified = str(reviews[0:10]) # test with 10 reviews for now

I first heard about this watch on the Poor Man's Watch site. I found it on Amazon.com. I then performed a product review by reading these types of reviews and then I ordered the watch. It arrived in less than seven days. there were no S/H fees or taxes. I paid $79.00 which included a beautiful box, cleaning cloth, warranty and instruction. By the way the box alone is probably worth the money I paid for the watch and yes on some sites, it is advertisted and you are given the impression that it is a separate cost item that does not come with the watch. The price on this site was the lowest price I've found for this watch.Now here is the good part: This watch is an extraordinary value for this price or higher, (much higher). The quality of the bracelet and case is wonderful. the watch has a Japanese automatic movement not a Swiss movement and has been keeping very good time since I've had it (less than a 30 second gain in a week) with constant wear and the sweep second hand is fluid but n

## Review Preprocessing

In [None]:
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# Fetch the NLTK data used by this notebook: 'stopwords' backs
# stopwords.words('english'); 'punkt' is presumably the tokenizer model that
# word_tokenize needs -- TODO confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
def extract_keywords(response, keyword_limit=10):
    """Return the most frequent non-stopword tokens of ``response``.

    Args:
        response: Text to tokenize with ``word_tokenize``.
        keyword_limit: Maximum number of keywords to return.

    Returns:
        Lowercased tokens ordered by ``FreqDist.most_common``. Only tokens that
        are alphanumeric and not English stopwords are counted.
    """
    raw_tokens = word_tokenize(response)
    english_stopwords = set(stopwords.words('english'))

    # Keep lowercase forms of alphanumeric tokens that are not stopwords.
    kept = []
    for token in raw_tokens:
        lowered = token.lower()
        if lowered not in english_stopwords and token.isalnum():
            kept.append(lowered)

    # Rank by frequency and drop the counts.
    ranked = FreqDist(kept).most_common(keyword_limit)
    return [term for term, _count in ranked]

## LLM Interpreter

In [37]:
# System prompt for the model: read a list of Amazon reviews for one product,
# weigh each review by significance, and return a markdown list of the good
# and bad points overall.
# NOTE(review): the prompt text contains typos ("missions", "the you",
# "evalute", "evalutation"). It is sent to the model as-is, so fixing them may
# change the model's output -- decide deliberately.
system_message = 'You are a sentiment analyzer. Your missions is to read through a list of amazon reviews for a specific product and summarize the key aspect of significant reviews. It is crucial the you weigh the importance of the review by how meaningful they are and how relevant the review is to the product. You are to look at one review at a time and evalute the significance of each. Then, from your evalutation, combine these reviews based on the weights you found. This resulting compilation should be a list of good and bad parts of all the reviews overall. Please print responses in markdown.'



In [38]:

import langchain_openai
from langchain_openai import ChatOpenAI

# Chat model client built with library defaults (no model name or key is passed here).
# NOTE(review): presumably credentials are read from the environment
# (e.g. OPENAI_API_KEY) -- confirm before running on a fresh kernel.
llm = ChatOpenAI()

In [39]:
from langchain_core.prompts import ChatPromptTemplate
# Two-message template: the fixed system instructions, followed by a user
# message whose "{input}" placeholder is supplied when the chain is invoked.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("user", "{input}")
])

In [40]:
# Pipe the prompt template into the chat model so one invoke() call formats
# the messages and sends them to the model.
chain = prompt | llm 

In [41]:
# Send the stringified sample of reviews as the user message. The call is the
# last expression of the cell, so the returned message is shown as its output.
request = reviews_stringified
chain.invoke({"input": request})

AIMessage(content='### Positive Aspects:\n1. **Quality and Design:** \n   - Multiple reviewers praised the beautiful design and quality of the watch, especially noting the upgraded movement with 24 jewels and engine-turned rotor.\n   - The automatic movement was well-received for its accuracy and value for the price.\n   - The watch received compliments for its appearance and versatility for different occasions.\n\n2. **Value for Money:**\n   - Reviewers appreciated the affordable price for the quality of the watch, stating it was the best watch they had purchased in terms of value.\n   - The fast shipping and good delivery service were also positively mentioned.\n\n3. **Durability and Performance:**\n   - The watch was noted for its durability and ability to keep accurate time even after being exposed to water and various tasks.\n   - The self-winding mechanism was appreciated, and the watch was reported to keep time well.\n\n### Negative Aspects:\n1. **Accuracy Issues:**\n   - One re