# REVIEW SUMMARIZAR
## TRIPADVISOR: HOTELS

*   Esteban Ariza
*   Johan Giraldo
*   Mateo Valdes

## Prerequisites

In [4]:
#%pip install transformers
#%pip install torch
#%pip install sentencepiece

In [5]:
import torch
import json
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
import pandas as pd
import csv

In [6]:
COLUMNS_NAME = ['ORIGINAL_TEXT', 'SUMMARIZED_TEXT']

In [7]:
try:
    writer = csv.DictWriter(open('summarized_reviews.csv', 'w', encoding='UTF8', newline=''), fieldnames=COLUMNS_NAME, delimiter=',', lineterminator='\r')
    writer.writeheader()
except IOError:
    print("I/O error")

In [8]:
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [9]:
def summarize(review):
    tokenized_text = tokenizer.encode('summarize: ' + review, return_tensors="pt").to(device)
    summary_ids = model.generate(tokenized_text,
                                    num_beams=4,
                                    no_repeat_ngram_size=2,
                                    min_length=30,
                                    max_length=100,
                                    early_stopping=True)
    row = {}
    row[COLUMNS_NAME[0]] = review
    row[COLUMNS_NAME[1]] = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    try:
        writer.writerow(row)
    except IOError:
                print("I/O error")
    print('Summarized: ' + row[COLUMNS_NAME[0]] + ' to: ' + row[COLUMNS_NAME[1]])

In [10]:
data = pd.read_csv('tripadvisor_hotels_sustainables_v2.csv')
data['REVIEW_TEXT'].apply(summarize)

TypeError: can only concatenate str (not "dict") to str