# REVIEW SUMMARIZER
## TRIPADVISOR: HOTELS

*   Esteban Ariza
*   Johan Giraldo
*   Mateo Valdes

## Prerequisites

In [None]:
%pip install transformers
%pip install torch
%pip install sentencepiece
%pip install rouge-score
%pip install evaluate

In [1]:
import torch
import json
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
import pandas as pd
import csv
from rouge_score import rouge_scorer, scoring
import evaluate
from nltk.translate.bleu_score import sentence_bleu

## Normal

Read csv

In [None]:
# Relative path of the TripAdvisor hotel reviews CSV used as input.
INPUT_CSV_PATH = "../data/exploratory_analysis/tripadvisor_hotels_clean.csv"
# Load the whole file into a DataFrame; later cells read its REVIEW_TEXT column.
HOTEL_DATA = pd.read_csv(filepath_or_buffer=INPUT_CSV_PATH)

Download models

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the pretrained 't5-small' checkpoint and its tokenizer.
# The model is moved to `device` because summarize() sends its input tensors
# there; a model left on the CPU fails with a device mismatch on CUDA machines.
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
tokenizer = T5Tokenizer.from_pretrained('t5-small')

Write csv file

In [None]:
# Header of the output CSV: the source review first, its generated summary second.
COLUMNS_NAME = [
    'ORIGINAL_TEXT',
    'SUMMARIZED_TEXT',
]

In [None]:
# Open the output CSV and write its header row.
# buffering=1 enables line buffering, so every row is flushed to the file as soon
# as it is written. The file object is never closed in this notebook, and without
# the flush a later pd.read_csv('summarized_reviews.csv') could read a partial file.
# NOTE(review): lineterminator='\r' is kept as in the original output format.
try:
    summary_file = open('summarized_reviews.csv', 'w', encoding='UTF8', newline='', buffering=1)
    writer = csv.DictWriter(summary_file, fieldnames=COLUMNS_NAME, delimiter=',', lineterminator='\r')
    writer.writeheader()
except IOError:
    print("I/O error")
    # Re-raise: carrying on without a usable `writer` only fails later, far from the cause.
    raise

In [None]:
def summarize(review, max_input_tokens=512):
    """Summarize one review with T5 and append the (review, summary) pair to the CSV.

    Args:
        review: Review text to summarize.
        max_input_tokens: Upper bound on the tokenized input length. Longer
            reviews are truncated so the encoder input stays within this budget.

    Returns:
        None. The pair is written through the module-level ``writer`` and
        echoed with ``print``.
    """
    tokenized_text = tokenizer.encode(
        'summarize: ' + review,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    ).to(model.device)  # follow the model so inputs and weights share one device
    summary_ids = model.generate(tokenized_text,
                                 num_beams=4,
                                 no_repeat_ngram_size=2,
                                 min_length=30,
                                 max_length=100,
                                 early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    row = {COLUMNS_NAME[0]: review, COLUMNS_NAME[1]: summary}
    try:
        writer.writerow(row)
    except IOError:
        # Best effort: report the failed write and keep processing the next reviews.
        print("I/O error")
    print('Summarized: ' + row[COLUMNS_NAME[0]] + ' to: ' + row[COLUMNS_NAME[1]])

In [None]:
# summarize() works through side effects (CSV row + print) and returns None, so
# iterate explicitly instead of building and displaying a Series of None via apply().
for review in HOTEL_DATA['REVIEW_TEXT']:
    summarize(review)

In [None]:
# Read back the CSV of (review, summary) pairs so the summaries can be scored.
HOTEL_SUMMARY = pd.read_csv(filepath_or_buffer='summarized_reviews.csv')

ROUGE

In [None]:
# ROUGE input: the original reviews are the references, the generated summaries the predictions.
references = HOTEL_SUMMARY[COLUMNS_NAME[0]].tolist()
predictions = HOTEL_SUMMARY[COLUMNS_NAME[1]].tolist()

# Load the metric and score all pairs at once.
rouge = evaluate.load('rouge')
results = rouge.compute(references=references, predictions=predictions)
print(results)

BLEU

In [None]:
def splitter(value):
    """Break ``value`` on runs of whitespace and return the resulting list of tokens."""
    tokens = value.split()
    return tokens

# Tokenize both columns: original reviews are the references, summaries the candidates.
reference = [splitter(text) for text in HOTEL_SUMMARY[COLUMNS_NAME[0]].tolist()]
candidates = [splitter(text) for text in HOTEL_SUMMARY[COLUMNS_NAME[1]].tolist()]

def bleu(reference, candidates, weights=(0.25, 0.25, 0.25, 0.25)):
    """Average sentence-level BLEU over position-aligned (reference, candidate) pairs.

    Args:
        reference: List of tokenized reference texts, one per candidate.
        candidates: List of tokenized candidate texts, aligned with
            ``reference`` by position.
        weights: N-gram weights forwarded to ``sentence_bleu``.

    Returns:
        Mean BLEU score as a float; ``0.0`` when there are no candidates.

    Raises:
        ValueError: If ``reference`` and ``candidates`` differ in length.
    """
    if len(reference) != len(candidates):
        raise ValueError("reference and candidates must have the same length")
    if not candidates:
        return 0.0
    total = 0.0
    for expected, candidate in zip(reference, candidates):
        # sentence_bleu expects a list of references for ONE hypothesis; pass only
        # the matching text, not every text in the corpus.
        total += sentence_bleu([expected], candidate, weights=weights)
    return total / len(candidates)

# Score with the default uniform weights first, then with each single n-gram order.
print(f'BLEU: {bleu(reference, candidates):f}')
print(f'BLEU 1-gram: {bleu(reference, candidates, (1, 0, 0, 0)):f}')
print(f'BLEU 2-gram: {bleu(reference, candidates, (0, 1, 0, 0)):f}')
print(f'BLEU 3-gram: {bleu(reference, candidates, (0, 0, 1, 0)):f}')
print(f'BLEU 4-gram: {bleu(reference, candidates, (0, 0, 0, 1)):f}')

## By Hotel and Year

First, we import the csv file which contains all the reviews from all the hotels

In [2]:
# Relative path of the TripAdvisor hotel reviews CSV used as input.
INPUT_CSV_PATH = "../data/exploratory_analysis/tripadvisor_hotels_clean.csv"
# Load all reviews; they are grouped by hotel and year further down.
HOTEL_DATA = pd.read_csv(filepath_or_buffer=INPUT_CSV_PATH)

We also need to load the T5 model (using 't5-base' is recommended)

In [3]:
# Checkpoint name shared by the model and the tokenizer.
PRETRAINED_MODEL = 't5-small'

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The model is moved to `device` because summarizeByYearHotel() sends its input
# tensors there; a model left on the CPU fails with a device mismatch on CUDA machines.
model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_MODEL).to(device)
tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Then, we create a file writer to save all the summarized reviews at run time

In [4]:
# Column order of the output CSV: hotel, review year, summarized text, final summary.
COLUMNS_NAME = ['HOTEL_NAME','REVIEW_DATE','REVIEW_TEXT','REVIEW_SUMMARY']
OUTPUT_CSV_PATH = "../data/review_summarizer/summarized_reviews_by_year_and_hotel.csv"

# Open the output CSV and write its header row.
# buffering=1 enables line buffering, so every row is flushed to the file as soon
# as it is written. The file object is never closed in this notebook, and without
# the flush an interrupted run or a later pd.read_csv(OUTPUT_CSV_PATH) could see a partial file.
# NOTE(review): lineterminator='\r' is kept as in the original output format.
try:
    summary_file = open(OUTPUT_CSV_PATH, 'w', encoding='UTF8', newline='', buffering=1)
    writer = csv.DictWriter(summary_file, fieldnames=COLUMNS_NAME, delimiter=',', lineterminator='\r')
    writer.writeheader()
except IOError:
    print("I/O error")
    # Re-raise: carrying on without a usable `writer` only fails later, far from the cause.
    raise

The "concatReviewsByYearAndHotel" method will help us to group all the reviews by year and hotel name

In [5]:
# Marker placed between the reviews of one group so they can be split apart again.
REVIEW_CONCATCHAR1 = "{review concat}"


def fromDateToYear(value):
    """Return the year part of a ``yyyy-mm-dd`` date string (text before the first '-')."""
    year, _sep, _rest = value.partition("-")
    return year


def concatReviewsByYearAndHotel(df):
    """Build one row per (hotel, year) holding all of that group's reviews joined together.

    Works on a copy, so the caller's frame is left untouched. REVIEW_DATE is
    reduced to its year, every REVIEW_TEXT is replaced by the texts of its
    (HOTEL_NAME, REVIEW_DATE) group joined with REVIEW_CONCATCHAR1, and the
    resulting duplicate rows are dropped.

    Returns:
        DataFrame with the columns HOTEL_NAME, REVIEW_DATE and REVIEW_TEXT.
    """
    group_keys = ["HOTEL_NAME", "REVIEW_DATE"]
    output_columns = ['HOTEL_NAME', 'REVIEW_DATE', 'REVIEW_TEXT']

    yearly = df.copy()
    yearly["REVIEW_DATE"] = yearly["REVIEW_DATE"].map(fromDateToYear)

    texts_by_group = yearly[['HOTEL_NAME', 'REVIEW_TEXT', 'REVIEW_DATE']].groupby(group_keys)["REVIEW_TEXT"]
    yearly['REVIEW_TEXT'] = texts_by_group.transform(lambda texts: REVIEW_CONCATCHAR1.join(texts))

    return yearly[output_columns].drop_duplicates()

The "summarizeEachAndConcat" function uses the T5 model to summarize each review of a group and joins the partial summaries. The "summarizeByYearHotel" function then summarizes that joined text with the T5 model and saves the result in the output csv with the writer

In [10]:
# Separator placed between the individual summaries of one group.
REVIEW_CONCATCHAR2 = "."

def summarizeEachAndConcat(reviews, batch_size=8, max_input_tokens=512):
    """Summarize every review on its own and join the summaries with REVIEW_CONCATCHAR2.

    Args:
        reviews: List of review texts belonging to one group.
        batch_size: Positive number of reviews sent through the model at once.
            Batching keeps the padded input tensor bounded instead of growing
            with the size of the group.
        max_input_tokens: Upper bound on the tokenized length of each review;
            longer reviews are truncated.

    Returns:
        The joined summaries as one string. A group with zero or one review is
        returned joined as-is, without calling the model.
    """
    if len(reviews) <= 1:
        return REVIEW_CONCATCHAR2.join(reviews)

    summary_prefix = 'summarize'
    prompts = [f'{summary_prefix}: ' + sequence for sequence in reviews]
    summaries = []
    for start in range(0, len(prompts), batch_size):
        inputs = tokenizer(
            prompts[start:start + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_input_tokens,
        ).to(model.device)  # follow the model so inputs and weights share one device
        output_sequences = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"]
        )
        summaries.extend(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))
    return REVIEW_CONCATCHAR2.join(summaries)


def summarizeByYearHotel(actRow, max_input_tokens=512):
    """Summarize the REVIEW_TEXT of one (hotel, year) row and append the result to the CSV.

    Args:
        actRow: Mapping/Series with the keys HOTEL_NAME, REVIEW_DATE and REVIEW_TEXT.
        max_input_tokens: Upper bound on the tokenized input length; longer
            texts are truncated.

    Returns:
        None. The row is written through the module-level ``writer`` and a
        progress line is printed.
    """
    summary_prefix = 'summarize'
    tokenized_text = tokenizer.encode(
        f'{summary_prefix}: ' + actRow["REVIEW_TEXT"],
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    ).to(model.device)  # follow the model so inputs and weights share one device
    summary_ids = model.generate(tokenized_text,
                                 num_beams=4,
                                 no_repeat_ngram_size=2,
                                 min_length=30,
                                 max_length=100,
                                 early_stopping=True)
    row = {
        COLUMNS_NAME[0]: actRow["HOTEL_NAME"],
        COLUMNS_NAME[1]: actRow["REVIEW_DATE"],
        COLUMNS_NAME[2]: actRow["REVIEW_TEXT"],
        COLUMNS_NAME[3]: tokenizer.decode(summary_ids[0], skip_special_tokens=True),
    }
    # The per-row model.deparallelize() call was removed: this notebook never calls
    # parallelize(), and deparallelize() would send the model back to the CPU
    # while the next row's input is still placed on the accelerator.
    try:
        writer.writerow(row)
    except IOError:
        # Best effort: report the failed write and keep processing the next rows.
        print("I/O error")
    print(row[COLUMNS_NAME[0]] + '-' + row[COLUMNS_NAME[1]] + ' -> len: '+ str( len(row[COLUMNS_NAME[2]]) ) )

Here, we use the two above methods to summarize by year and hotel

In [11]:
# Group by NAME and YEAR
# One row per (hotel, year), with that group's reviews joined into REVIEW_TEXT.
HOTEL_DATA_BY_YEARHOTEL = concatReviewsByYearAndHotel(HOTEL_DATA)

# Order the groups from the longest joined text to the shortest.
text_lengths = HOTEL_DATA_BY_YEARHOTEL["REVIEW_TEXT"].map(len)
HOTEL_DATA_BY_YEARHOTEL["REVIEW_TEXT_LENGTH"] = text_lengths
HOTEL_DATA_BY_YEARHOTEL = HOTEL_DATA_BY_YEARHOTEL.sort_values(by='REVIEW_TEXT_LENGTH', ascending=False)

# Two passes per group: summarize each review, then summarize the joined summaries.
for index, row in HOTEL_DATA_BY_YEARHOTEL.iterrows():
    single_reviews = row["REVIEW_TEXT"].split(REVIEW_CONCATCHAR1)
    row["REVIEW_TEXT"] = summarizeEachAndConcat(single_reviews)
    summarizeByYearHotel(row)

Corpo Santo Lisbon Historical Hotel-2022 -> len: 30120
Kimpton De Witt Amsterdam-2019 -> len: 23372
Corinthia Budapest-2019 -> len: 24385
Grandior Hotel Prague-2019 -> len: 24450


KeyboardInterrupt: 

When the summarizing is done, we can import the generated csv into a new dataframe

In [None]:
# Read back the per-hotel, per-year summaries written during the run above.
HOTEL_SUMMARY = pd.read_csv(filepath_or_buffer=OUTPUT_CSV_PATH)

The ROUGE value of the summaries is

In [None]:
# Score the final summaries against the text they were generated from.
# Columns are selected by name: in this section COLUMNS_NAME[0] and [1] are
# HOTEL_NAME and REVIEW_DATE, so the positional indices used in the first
# experiment would compare hotel names with years instead of texts.
rouge = evaluate.load('rouge')
predictions = HOTEL_SUMMARY['REVIEW_SUMMARY'].tolist()
references = HOTEL_SUMMARY['REVIEW_TEXT'].tolist()
results = rouge.compute(predictions=predictions, references=references)
print(results)

And the BLEU value of the summaries is

In [None]:
def splitter(value):
    """Break ``value`` on runs of whitespace and return the resulting list of tokens."""
    tokens = value.split()
    return tokens

# Tokenize the summarized text (reference) and the final summary (candidate).
# Columns are selected by name: in this section COLUMNS_NAME[0] and [1] are
# HOTEL_NAME and REVIEW_DATE, which are not the texts to compare.
reference = list(map(splitter, HOTEL_SUMMARY['REVIEW_TEXT'].tolist()))
candidates = list(map(splitter, HOTEL_SUMMARY['REVIEW_SUMMARY'].tolist()))

def bleu(reference, candidates, weights=(0.25, 0.25, 0.25, 0.25)):
    """Average sentence-level BLEU over position-aligned (reference, candidate) pairs.

    Args:
        reference: List of tokenized reference texts, one per candidate.
        candidates: List of tokenized candidate texts, aligned with
            ``reference`` by position.
        weights: N-gram weights forwarded to ``sentence_bleu``.

    Returns:
        Mean BLEU score as a float; ``0.0`` when there are no candidates.

    Raises:
        ValueError: If ``reference`` and ``candidates`` differ in length.
    """
    if len(reference) != len(candidates):
        raise ValueError("reference and candidates must have the same length")
    if not candidates:
        return 0.0
    total = 0.0
    for expected, candidate in zip(reference, candidates):
        # sentence_bleu expects a list of references for ONE hypothesis; pass only
        # the matching text, not every text in the corpus.
        total += sentence_bleu([expected], candidate, weights=weights)
    return total / len(candidates)

# Score with the default uniform weights first, then with each single n-gram order.
print(f'BLEU: {bleu(reference, candidates):f}')
print(f'BLEU 1-gram: {bleu(reference, candidates, (1, 0, 0, 0)):f}')
print(f'BLEU 2-gram: {bleu(reference, candidates, (0, 1, 0, 0)):f}')
print(f'BLEU 3-gram: {bleu(reference, candidates, (0, 0, 1, 0)):f}')
print(f'BLEU 4-gram: {bleu(reference, candidates, (0, 0, 0, 1)):f}')