In [None]:
import pandas as pd
%pip install rouge
from rouge import Rouge
import torch
import sklearn
%pip install transformers
import time
import json
import numpy as np
import nltk
%pip install sumy
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
nltk.download('punkt')
%pip install simpletransformers

In [2]:
import logging

In [3]:
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [4]:
euans_reviews = pd.read_csv("/content/data/euans_reviews.csv")

In [5]:
google_reviews = pd.read_csv("/content/data/trial labels.csv")

Cleaning Euan's Guide and Google dataframes to prepare for summarisation.

In [6]:
euans_reviews = euans_reviews.drop(columns=['Rating', 'Unnamed: 0'])

In [7]:
google_reviews = google_reviews.drop(columns=['Unnamed: 0'])

In [8]:
google_reviews = google_reviews.dropna(subset=['Aspect Label'])

In [9]:
euans_reviews = euans_reviews[euans_reviews['SentenceCount'] >= 10]

In [10]:
euans_reviews = euans_reviews[:843]

In [11]:
grouped_google_reviews = google_reviews.groupby(['Venue Name', 'Aspect Label', 'Sentiment'])['Text'].apply(' '.join).reset_index()


In [12]:
grouped_euans_reviews = euans_reviews.groupby(['Venue', 'Aspect', 'Sentiment'])['Text'].apply(lambda x: '. '.join(x)).reset_index()

In [13]:
grouped_google_reviews['sentence count'] = grouped_google_reviews['Text'].apply(lambda x: len(nltk.sent_tokenize(x)))
grouped_google_reviews = grouped_google_reviews[grouped_google_reviews['sentence count'] > 1]

In [15]:
grouped_euans_reviews.rename(columns={'Text': 'input_text'}, inplace=True)

In [20]:
display(grouped_euans_reviews['input_text'][:0])

Series([], Name: input_text, dtype: object)

In [17]:
grouped_google_reviews.rename(columns={'Text': 'input_text'}, inplace=True)

In [None]:
display(grouped_google_reviews['input_text'][:7])

Firstly I create summaries with the Text Rank models

In [26]:
def text_rank_summarizer(review):
    tokenizer = Tokenizer("en")
    text_parser = PlaintextParser.from_string(review,tokenizer)
    text_rank_obj = TextRankSummarizer()
    summary = text_rank_obj(text_parser.document, sentences_count=1)
    return summary

In [28]:
grouped_euans_reviews['Text Rank Summaries'] = grouped_euans_reviews['input_text'].apply(lambda x: text_rank_summarizer(x))

In [29]:
grouped_euans_reviews['Text Rank Summaries'] = grouped_euans_reviews['Text Rank Summaries'].apply(lambda x: str(x[0]))

In [36]:
display(grouped_euans_reviews['Text Rank Summaries'][:1])

0    Depending on the type of wheelchair you have a...
Name: Text Rank Summaries, dtype: object

In [31]:
grouped_google_reviews['Text Rank Summaries'] = grouped_google_reviews['input_text'].apply(lambda x: text_rank_summarizer(x))

In [32]:
grouped_google_reviews = grouped_google_reviews[grouped_google_reviews['Text Rank Summaries'].apply(lambda x: isinstance(x, tuple) and len(x) > 0)]

In [33]:
display(grouped_google_reviews['Text Rank Summaries'][:1])

8    (We asked for a place by the window and that w...
Name: Text Rank Summaries, dtype: object

In [34]:
grouped_google_reviews['Text Rank Summaries'] = grouped_google_reviews['Text Rank Summaries'].apply(lambda x: str(x[0]))

In [35]:
display(grouped_google_reviews['input_text'][:4])

8     Did not take the time to serve you properly. W...
10    Why is English spoken all the time, even thoug...
23    Nice pizzas, the owners is an italian who know...
30    Also good for takeaways when you're at uni, hi...
Name: input_text, dtype: object

BERT Setup

In [39]:
from simpletransformers.seq2seq import (
    Seq2SeqModel,
    Seq2SeqArgs,
)
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)


In [37]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [40]:
model_args = Seq2SeqArgs()
model_args.num_train_epochs = 10
model_args.no_save = True
model_args.evaluate_generated_text = True
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True

In [41]:
# Initialize model
model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-large",
    args=model_args,
    use_cuda=True,
)

In [44]:
grouped_google_reviews.rename(columns={'Text Rank Summaries': 'target_text'}, inplace=True)
grouped_euans_reviews.rename(columns={'Text Rank Summaries': 'target_text'}, inplace=True)

In [48]:
pd.set_option('display.max_colwidth', None)

In [52]:
display(grouped_google_reviews['input_text'][:1])

8    Did not take the time to serve you properly. We asked for a place by the window and that was no problem.
Name: input_text, dtype: object

In [53]:
display(grouped_euans_reviews['target_text'][0])

'Depending on the type of wheelchair you have and the recent weather it may be possible to get on the grass amongst some of the ruins.'

In [54]:
input_text = grouped_euans_reviews.input_text.values.tolist()
n = len(input_text)

In [55]:
X_train = grouped_euans_reviews[:round(n*0.8)]
X_val = grouped_euans_reviews[:round(n*0.2)]
print(len(X_train), len(X_val))

640 160


In [None]:
model.train_model(X_train, eval_data=X_val, output_dir='/content/model')

In [None]:
# Evaluate the model
result = model.eval_model(X_val)

In [60]:
print(result)

{'eval_loss': 0.0833960535004735}


In [61]:
google_review_text = grouped_google_reviews.input_text.values.tolist()

In [62]:
grouped_google_reviews['BART Summaries'] = model.predict(google_review_text)

Generating outputs:   0%|          | 0/13 [00:00<?, ?it/s]

In [63]:
google_summaries = grouped_google_reviews['BART Summaries'].values.tolist()

Evaluation

In [67]:
rouge = Rouge()
google_eval = rouge.get_scores(google_summaries, google_review_text, avg=True, ignore_empty=True)

In [68]:
print(google_eval)

{'rouge-1': {'r': 0.4305793224636383, 'p': 0.8206940004440003, 'f': 0.5521754000275468}, 'rouge-2': {'r': 0.3693534394322486, 'p': 0.7568481795981795, 'f': 0.4831863378974397}, 'rouge-l': {'r': 0.4270162834320835, 'p': 0.8136963314463312, 'f': 0.5476364949797173}}


Saving summaries to csv

In [69]:
grouped_google_reviews['BART Summaries'].to_csv('/content/data/BART_google_summaries.csv')
grouped_google_reviews['target_text'].to_csv('/content/data/textrank_google_summaries.csv')

In [71]:
grouped_euans_reviews['target_text'].to_csv('/content/data/textrank_euans_summaries.csv')