# Import Data

Import the roughly 100,000 reviews of the most-reviewed Amazon products.

In [1]:
import pandas as pd

# Load the review export, discard the stored 'index' column and every row with
# a missing value, then renumber the remaining rows (the previous row labels
# are kept as a fresh 'index' column by reset_index()).
amazon_df = (
    pd.read_csv('../data/amazon_sorted/most_populair_products.csv')
    .drop(columns='index')
    .dropna()
    .reset_index()
)

Take the most reviewed product

In [2]:
# The product of the very first row is used as "the most reviewed product".
# NOTE(review): this presumably relies on the CSV being sorted by review count
# -- confirm against the script that produced it.
top_product_id = amazon_df.iloc[0]['product_id']
is_top_product = amazon_df['product_id'] == top_product_id
product_df = amazon_df[is_top_product]
print(len(product_df))

8799


# Topic Modeling

Generate a topic model containing the 10 most important topics of the reviews. \
The topic model's `fit_transform` function returns the predictions as a pair: the first element holds the predicted topic of each review, the second holds each review's probability distribution over the topics.

In [3]:
from bertopic import BERTopic

# Number of topics BERTopic is asked to reduce the model to.
nr_topics = 10
# calculate_probabilities=True makes fit_transform also return, for every
# review, a probability distribution over the topics (used for ranking later).
topic_model = BERTopic(nr_topics=nr_topics, calculate_probabilities=True)

# Hand BERTopic a plain list of strings rather than a pandas Series: the later
# cells address `predictions[1]` by position against `reviews`, which is this
# same list, so positions are guaranteed to line up regardless of the
# DataFrame's row labels.
documents = product_df['review_body'].to_list()

# `predictions` is the pair (topic per review, topic probabilities per review).
predictions = topic_model.fit_transform(documents)
print(topic_model.get_topic_info())

    Topic  Count                           Name
0      -1   6544               -1_the_to_and_is
1       0    488       0_easy_sound_great_works
2       1    250         1_hearing_he_these_and
3       2    247               2_tv_watch_my_to
4       3    225                3_tv_it_the_can
5       4    221             4_the_to_rs120_and
6       5    196  5_wireless_the_headphones_and
7       6    182         6_the_reception_is_and
8       7    154            7_head_they_the_off
9       8    151            8_static_the_and_of
10      9    141   9_product_good_abc_excellent


In [4]:
## TODO Try topic modeling per sentence instead of per review

# Ranking

For every review, compute its overall (compound) sentiment score, shifted by 1 so that it lies between 0 and 2

In [5]:
## One-time setup: the sentiment analyzer needs NLTK's VADER lexicon to be
## available locally. Uncomment and run once per environment.
# import nltk

# nltk.download('vader_lexicon')

from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
reviews = product_df['review_body'].to_list()

# One score per review, in review order: the analyzer's 'compound' value
# shifted up by 1 so that every score is non-negative.
compounds = [
    sia.polarity_scores(review_text)['compound'] + 1
    for review_text in reviews
]

Take the average sentiment value of each topic, and calculate the deviation of each review from each topic

In [6]:
import numpy as np

# Probability-weighted mean sentiment per topic, followed by every review's
# absolute distance to each of those topic means.
#
# Inputs (defined in earlier cells):
#   predictions[1] -- per-review topic probabilities returned by fit_transform
#   compounds      -- per-review sentiment score (compound + 1)
#   nr_topics      -- number of topics requested from the topic model
# Outputs (plain lists, as before):
#   sents          -- per topic: sum of weight * sentiment over all reviews
#   cum_weights    -- per topic: sum of weights over all reviews
#   average_sents  -- per topic: sents / cum_weights
#   deviations     -- per review: |sentiment - average_sents[topic]| per topic
#
# The topic axis is taken from the probability matrix itself instead of from
# the rows of get_topic_info(): that table only starts with the -1 outlier row
# when outliers exist, so dropping its first row could silently drop topic 0.
topic_probabilities = np.asarray(predictions[1], dtype=float)
review_sentiments = np.asarray(compounds, dtype=float)

# Fail early with a readable message instead of an IndexError or a 0/0
# further down when the model produced a different number of topics.
expected_shape = (len(review_sentiments), nr_topics)
if topic_probabilities.shape != expected_shape:
    raise ValueError(
        f"Expected topic probabilities of shape {expected_shape} "
        f"(reviews x topics), got {topic_probabilities.shape}"
    )

# Column-wise reductions replace the former Python loop over every
# (review, topic) pair.
weighted_sentiments = topic_probabilities * review_sentiments[:, np.newaxis]
sents = weighted_sentiments.sum(axis=0).tolist()
cum_weights = topic_probabilities.sum(axis=0).tolist()
average_sents = (np.asarray(sents) / np.asarray(cum_weights)).tolist()

## average_sents is now a value between 0 and 2 for each topic

# Broadcasting (reviews x 1) against (topics,) yields a reviews x topics grid.
deviations = np.abs(
    review_sentiments[:, np.newaxis] - np.asarray(average_sents)
).tolist()

Calculate the $L_2$ norm of each word vector of each review, then calculate the average per review

In [7]:
import numpy as np
import gensim.downloader

# Name of the pretrained embedding set requested from gensim's downloader;
# the returned object is indexed by word in the next cell.
W2V_MODEL_NAME = 'word2vec-google-news-300'
w2v = gensim.downloader.load(W2V_MODEL_NAME)


In [8]:
# For each review: the mean L2 norm of the embeddings of its
# whitespace-separated tokens. Tokens the embedding model does not know are
# skipped; a review without a single known token gets the fallback value 1.
avg_norms = []
for review_text in reviews:
    known_word_norms = [
        np.linalg.norm(w2v[token])
        for token in review_text.split()
        if token in w2v
    ]
    if known_word_norms:
        avg_norms.append(sum(known_word_norms) / len(known_word_norms))
    else:
        avg_norms.append(1)

Rank all the reviews per topic according to their topic relevance, their sentiment deviation from the topic's average sentiment, and their information estimate

In [9]:
# Build one ranking table per topic. Each review is scored as
#     relevance + alpha * sentiment_deviation + beta * information
# and the table is ordered from the highest to the lowest score.
rankings = list()
alpha = 0.1   # weight of the sentiment-deviation term
beta = 0.05   # weight of the information term (average word-vector norm)
topic_relevance = predictions[1]

# Transpose once, outside the loop, so that row `topic` holds that topic's
# value for every review; previously both conversions ran on every iteration.
relevance_by_topic = np.transpose(topic_relevance)
deviation_by_topic = np.transpose(deviations)

for topic in range(nr_topics):
    ranking = pd.DataFrame(data={
        'review': reviews,
        'relevance': relevance_by_topic[topic],
        'sentiment_deviation': deviation_by_topic[topic],
        'information': avg_norms,
    })
    ranking['score'] = (
        ranking['relevance']
        + alpha * ranking['sentiment_deviation']
        + beta * ranking['information']
    )
    ranking = ranking.sort_values(by=['score'], ascending=False)
    # reset_index returns a new frame; the result used to be thrown away, so
    # the table was never re-indexed. After this line the row label is the
    # review's rank within the topic (0 = best).
    ranking = ranking.reset_index(drop=True)
    rankings.append(ranking)

# Summarize

Summarize the 100 top-ranked reviews of each topic of the most reviewed Amazon product, then summarize those topic summaries into one final summary

In [10]:
import os
import sys
# Put the notebook's parent directory on the import path (once), so that the
# `src.*` imports in the following cells can be resolved.
parent_dir = os.path.join('..')
module_path = os.path.abspath(parent_dir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [11]:
from os import truncate
from transformers import pipeline

# NOTE(review): truncation=True presumably cuts over-long inputs down to the
# model's input limit, in which case only the beginning of each concatenated
# text below actually reaches the model -- confirm.
summarizer = pipeline("summarization", model="sshleifer/textsum-cnn-12-6", truncation=True)

# Generation settings shared by the per-topic and the final summarization call.
summary_options = dict(max_length=130, min_length=30, do_sample=False)

# Stage 1: one summary per topic, built from its 100 best-scored reviews.
# Every review is prefixed with a newline before concatenation.
per_topic_summaries = []
for ranking in rankings:
    top_reviews = ranking.head(100)['review']
    full_text = ''.join('\n' + review_text for review_text in top_reviews)
    topic_result = summarizer(full_text, **summary_options)
    per_topic_summaries.append(topic_result[0]['summary_text'])

# Stage 2: summarize the newline-prefixed topic summaries into one text.
topic_summaries = ''.join('\n' + summary for summary in per_topic_summaries)
final_summary = summarizer(topic_summaries, **summary_options)[0]['summary_text']

In [12]:
from src.helpers.serialization import df_to_json

# Destination of the serialized summary; the metric cells below read it back.
topsum_path = '../data/topsum_summaries.json'

# Wrap the final summary in a one-row frame with a single 'text' column and
# hand it to the project's JSON serializer.
summary_frame = pd.DataFrame(data={'text': [final_summary]})
df_to_json(summary_frame, path=topsum_path)

# Measure Results

Estimate the information value of the summary with the next word predictor

In [13]:
from src.metrics.information_estimator import test_summaries

# Run the project's information-estimation metric on the saved summary. The
# bare name on the last line keeps the result as the cell's displayed value.
information_result = test_summaries(topsum_path)
information_result

{'correct': 0, 'total': 50, 'ratio': 0.0}

Calculate the relevance of the summary to the product category

In [14]:
from src.metrics.relevance_calculator import calculate_relevance

# Run the project's relevance metric on the saved summary. The bare name on
# the last line keeps the result as the cell's displayed value.
relevance_result = calculate_relevance(topsum_path)
relevance_result

5.161898877885607

Count the most common English words

In [15]:
from src.metrics.word_frequency import count_words

# Run the project's word-frequency metric on the saved summary. The bare name
# on the last line keeps the result as the cell's displayed value.
word_frequency_result = count_words(topsum_path)
word_frequency_result

'2824193.94'