# Import Data

Import the roughly 100,000 reviews of the most reviewed Amazon products.

In [1]:
import pandas as pd

# Read the review dump, drop the 'index' column that comes with the CSV and
# discard every row that has a missing value. reset_index() is called without
# drop=True, so the previous row labels end up in a new 'index' column.
raw_reviews = pd.read_csv('../data/amazon_sorted/most_populair_products.csv')
amazon_df = (
    raw_reviews
    .drop(columns='index')
    .dropna()
    .reset_index()
)

Select the reviews of the most reviewed product, which is taken to be the product of the first row of the data.

In [2]:
# The product id found in the first row decides which reviews are analysed.
first_product_id = amazon_df['product_id'].iloc[0]
is_first_product = amazon_df['product_id'] == first_product_id
product_df = amazon_df.loc[is_first_product]
print(len(product_df))

8799


# Topic Modeling

Fit a topic model that contains the 10 most important topics of the reviews. \
The topic model's `fit_transform` function returns the predictions as two lists: the first holds the predicted topic of each review, the second holds the topic probability distribution of each review.

In [3]:
from bertopic import BERTopic

# Number of topics the fitted model is reduced to.
nr_topics = 10
topic_model = BERTopic(nr_topics=nr_topics, calculate_probabilities=True)

# BERTopic documents its input as a list of strings. Handing over the pandas
# Series would also hand over the row labels of the filtered frame, so the
# texts are converted to a plain list first.
review_texts = product_df['review_body'].tolist()

# predictions[0]: predicted topic per review
# predictions[1]: topic probability distribution per review
predictions = topic_model.fit_transform(review_texts)
print(len(predictions[1]))
print(topic_model.get_topic_info())

8799
    Topic  Count                           Name
0      -1   5942               -1_the_to_and_is
1       0    490                0_the_and_to_of
2       1    487        1_easy_sound_good_great
3       2    473                 2_tv_he_my_and
4       3    254                 3_to_the_tv_it
5       4    224  4_wireless_the_headphones_and
6       5    220          5_works_great_it_good
7       6    195        6_the_tv_and_headphones
8       7    191              7_these_tv_and_to
9       8    164        8_the_sennheiser_and_is
10      9    159            9_static_the_and_to


In [4]:
# Fetch the representative documents once and reuse them for both checks:
# show the documents stored under key 3, then run the fitted model on the
# documents stored under key 4.
representative_docs = topic_model.get_representative_docs()
print(representative_docs[3])
print(topic_model.transform(representative_docs[4]))

["Didn't work with the tv for which it was purchased.", "I purchased this used from Amazon, though the product in the box looked as if it hadn't been opened.<br />It was easy to connect and works well. I use it for watching TV after my spouse is asleep. It's perfect for that, though with my 40&#34; Sony TV, the sound from the TV speakers shuts off when the 3.5mm headphone jack has something plugged into it. This means you need to plug the base connector in when these are in use, and unplug when you want sound from the TV speakers. To get around this I purchased a Fiio D3 (D03K) Digital to Analog Converter (comes with optical cable) here on Amazon and connected the headphone base unit with an optical cable from the optical audio out port on my TV. Now I only need to mute the TV speakers using the mute button on the TV remote and listen to the headphones adjusting their volume with the volume control on the headset.<br />The reception on the headphones work best when the volume level on 

In [5]:
## TODO Try topic modeling per sentence instead of per review

# Ranking

For each review, compute the compound sentiment score and shift it by 1, so that every value lies between 0 and 2.

In [6]:
## This is only needed once
# import nltk

# nltk.download()

from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
reviews = product_df['review_body'].to_list()

# One score per review: the 'compound' polarity score shifted up by 1.
compounds = [sia.polarity_scores(text)['compound'] + 1 for text in reviews]

Compute the probability-weighted average sentiment of each topic, then calculate the absolute deviation of each review's sentiment from each topic average.

In [7]:
import numpy as np

# Topic table without its first row. It is kept so that this cell still
# defines the same names as before; the calculation below does not use it and
# works on the probability matrix directly instead.
topic_model_info = topic_model.get_topic_info().drop([0]).reset_index()

# predictions[1] holds one row of topic probabilities per review.
topic_probabilities = np.asarray(predictions[1], dtype=float)
review_sentiments = np.asarray(compounds, dtype=float)
assert topic_probabilities.ndim == 2, 'expected one row of topic probabilities per review'
assert topic_probabilities.shape[0] == len(review_sentiments), 'expected one sentiment score per probability row'

# Per topic: the probability-weighted sum of the review sentiments and the
# total probability mass assigned to that topic.
sents = (topic_probabilities.T @ review_sentiments).tolist()
cum_weights = topic_probabilities.sum(axis=0).tolist()

# A topic that received no probability mass has no defined average (nan).
average_sents = [
    weighted_sum / weight if weight else float('nan')
    for weighted_sum, weight in zip(sents, cum_weights)
]

## average_sents is now a value between 0 and 2 for each topic

# deviations[review][topic] = |sentiment of the review - average sentiment of the topic|
deviations = np.abs(
    review_sentiments[:, np.newaxis] - np.asarray(average_sents, dtype=float)
).tolist()

Calculate the $L_2$ norm of each word vector of each review, then calculate the average per review

In [8]:
import numpy as np
import gensim.downloader

# Name of the pre-trained vectors requested from the gensim downloader.
w2v_model_name = 'word2vec-google-news-300'
w2v = gensim.downloader.load(w2v_model_name)


In [9]:
# Average L2 norm of the word vectors of every review.
avg_norms = []
for review_text in reviews:
    word_norms = []
    for token in review_text.split():
        try:
            word_norms.append(np.linalg.norm(w2v[token]))
        except KeyError:
            # w2v has no vector for this token, so it does not count
            pass
    if word_norms:
        avg_norms.append(sum(word_norms) / len(word_norms))
    else:
        # none of the tokens has a vector: fall back to 1
        avg_norms.append(1)

For each topic, rank all the reviews by a weighted sum of topic relevance, sentiment deviation and information estimation.

In [10]:
# Weights of the sentiment-deviation and information terms of the score.
alpha = 0.1
beta = 0.05
topic_relevance = predictions[1]

# Transposed views: row `topic` holds the values of all reviews for that topic.
relevance_by_topic = np.transpose(topic_relevance)
deviation_by_topic = np.transpose(deviations)

# One DataFrame per topic, sorted from the highest to the lowest score.
rankings = []
for topic in range(nr_topics):
    ranking = pd.DataFrame({
        'review': reviews,
        'relevance': relevance_by_topic[topic],
        'sentiment_deviation': deviation_by_topic[topic],
        'information': avg_norms,
    })
    ranking['score'] = (
        ranking['relevance']
        + alpha * ranking['sentiment_deviation']
        + beta * ranking['information']
    )
    rankings.append(ranking.sort_values(by='score', ascending=False))

# Summarize

Summarize the 100 re-ranked reviews of each topic for the most reviewed Amazon product.

In [11]:
import os
import sys
# Make the parent directory importable so that packages located there can be
# imported from this notebook. The path is only added when it is missing.
module_path = os.path.abspath('..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [12]:
from src.summarization.huggingface_summarizer import summarize_text

# Try the summarizer on the first review only.
first_review_summary = summarize_text(reviews[0])
print(first_review_summary)

: 

: 

# Measure Results

Estimate the information value of the summary with the next word predictor

Calculate the relevance of the summary to the product category

Count the most common English words