# Import Data

Import the roughly 100,000 reviews of the most-reviewed Amazon products.

In [10]:
import pandas as pd

# Load the review table, discard the 'index' column stored in the CSV and every
# row with a missing value. reset_index() then renumbers the rows from 0 and
# keeps the previous row labels in a fresh 'index' column.
amazon_df = (
    pd.read_csv('../data/amazon_sorted/most_populair_products.csv')
    .drop(columns='index')
    .dropna()
    .reset_index()
)


Take the most reviewed product

In [11]:
# Keep only the reviews whose product_id equals that of the first row.
# NOTE(review): this assumes the CSV is ordered so that the first row belongs
# to the most reviewed product -- confirm against how the file was generated.
first_product_id = amazon_df['product_id'].iloc[0]
product_df = amazon_df[amazon_df['product_id'] == first_product_id]
print(len(product_df))

8799


# Topic Modeling

Generate a topic model containing the 10 most important topics of the reviews. \
The topic model's `fit_transform` function returns the predictions as a pair: the first element is the predicted topic per review, the second is the topic probability distribution per review.

In [12]:
from bertopic import BERTopic

# Number of topics the model is reduced to.
nr_topics = 10
# calculate_probabilities=True makes fit_transform return a per-review
# probability for every topic in addition to the single assigned topic.
topic_model = BERTopic(nr_topics=nr_topics, calculate_probabilities=True)

# BERTopic documents its input as a list of strings. product_df is a filtered
# slice whose index is not guaranteed to be 0..n-1, so hand over a plain list
# instead of the Series to avoid any index-based alignment surprises.
documents = product_df['review_body'].to_list()

# predictions[0]: assigned topic per review
# predictions[1]: topic probability distribution per review
predictions = topic_model.fit_transform(documents)
print(len(predictions[1]))
print(topic_model.get_topic_info())

8799
    Topic  Count                           Name
0      -1   6038               -1_the_to_and_is
1       0    531        0_sound_easy_great_good
2       1    328  1_wireless_the_headphones_and
3       2    326            2_it_works_this_was
4       3    320            3_hearing_he_for_my
5       4    294                 4_the_to_tv_it
6       5    225            5_the_range_is_good
7       6    210             6_the_rs120_to_and
8       7    189               7_tv_watch_to_my
9       8    189               8_tv_the_and_can
10      9    149            9_head_they_off_the


In [17]:
# Representative documents per topic, looked up once and indexed by topic id.
representative_docs = topic_model.get_representative_docs()

# Show the representative reviews of topic 3 ...
print(representative_docs[3])
# ... and run the representative reviews of topic 4 back through the model,
# printing the predicted topics together with their probability rows.
print(topic_model.transform(representative_docs[4]))

([4, 4, 4, 4, 4, 4, 4, 4, 4], array([[4.91686497e-21, 3.36651807e-21, 3.92007461e-21, 3.90391301e-21,
        9.99999999e-01, 1.93956031e-21, 1.37972835e-21, 2.46451028e-21,
        3.70640189e-21, 5.74249369e-22],
       [4.71774979e-02, 3.21554577e-02, 3.75492545e-02, 3.74333981e-02,
        3.66036750e-01, 1.85467634e-02, 1.30397106e-02, 2.35903865e-02,
        3.55075806e-02, 5.47057395e-03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        9.99999999e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [8.75933342e-05, 5.07389297e-05, 1.04429993e-04, 6.39363631e-05,
        8.25576562e-01, 2.05937664e-05, 1.00284895e-05, 1.77810271e-05,
        2.32044744e-05, 6.68770634e-06],
       [2.69231223e-13, 1.56000422e-13, 3.21461618e-13, 1.96714747e-13,
        9.35967213e-01, 6.32315447e-14, 3.08093807e-14, 5.46390034e-14,
        7.13052538e-14, 2.05409653e-14],
       [0.00000000e+00, 0.00000000e+00, 0.000

In [14]:
## TODO Try topic modeling per sentence instead of per review

# Ranking

For each review, compute its overall sentiment value (the VADER compound score, shifted by 1 so that it lies between 0 and 2)

In [15]:
## This is only needed once, to download the VADER lexicon used by SentimentIntensityAnalyzer
# import nltk

# nltk.download('vader_lexicon')

from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
reviews = product_df['review_body'].to_list()

# One value per review, in review order: the analyzer's 'compound' score
# shifted by +1 (the topic-averaging cell notes the resulting 0-2 range).
compounds = [sia.polarity_scores(text)['compound'] + 1 for text in reviews]

Take the average sentiment value of each topic, weighting every review by its probability of belonging to that topic

In [16]:
import numpy as np

# Topic probability matrix from fit_transform: one row per review, one column
# per (non-outlier) topic. The number of topics is taken from this matrix
# rather than from nr_topics, so a model that ends up with fewer topics than
# requested no longer leaves untouched zero entries that break the division.
topic_probs = np.asarray(predictions[1], dtype=float)
review_sents = np.asarray(compounds, dtype=float)

# Topic overview without the outlier topic (-1). The outlier row is selected
# by its topic id instead of by row label 0, so a model without outliers does
# not silently lose topic 0. Kept in the namespace under its original name.
topic_info = topic_model.get_topic_info()
topic_model_info = topic_info[topic_info['Topic'] != -1].reset_index()

# Per topic: probability-weighted sum of the review sentiments and the total
# probability mass assigned to the topic.
sents = (topic_probs.T @ review_sents).tolist()
cum_weights = topic_probs.sum(axis=0).tolist()

# Weighted mean sentiment per topic; a topic that received no weight at all
# gets nan instead of raising a division error.
average_sents = [
    weighted_sum / total_weight if total_weight else float('nan')
    for weighted_sum, total_weight in zip(sents, cum_weights)
]

## average_sents is now a value between 0 and 2 for each topic
print(sents)
print(cum_weights)
print(average_sents)

[756.1321090567033, 511.6332144563165, 543.2662668539549, 548.4678688851308, 243.84466015925722, 298.2141129570286, 227.2045497818215, 306.21864006143727, 445.87074947620835, 133.25638921140825]
[485.9275532114037, 323.46324071801484, 394.35791634886056, 354.96863762192135, 173.98700611099204, 187.3688032197755, 148.67480777751254, 191.6105828144392, 286.56074147196097, 102.6906799796142]
[1.5560593427138851, 1.581735264015185, 1.3775969603545772, 1.5451164152403412, 1.4015107542209255, 1.5915889296001766, 1.5281980395887003, 1.5981301009766646, 1.555938008765361, 1.2976483283377016]


Calculate the $L_2$ norm of each word vector of each review, then calculate the average per review

Rank all the reviews according to the topic relevance, average sentiment value and information estimation

# Summarize

Summarize the reranked 300 reviews of the most reviewed amazon product

# Measure Results

Estimate the information value of the summary with the next word predictor

Calculate the relevance of the summary to the product category

Count the most common English words