In [1]:
# Mount to Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# CD to correct folder 
!cd "/content/drive/MyDrive/Colab_Notebooks/Thesis/"

# Initialize path
import sys
sys.path.append('/content/drive/MyDrive/Colab_Notebooks/Thesis/')

In [3]:
%%capture
!pip install bert-extractive-summarizer
!pip install sacremoses

!pip install -U sentence-transformers

#!pip install spacy
!pip install transformers # > 4.0.0
#!pip install neuralcoref

#!python -m spacy download en_core_web_md
#!python -m nltk.downloader('stopwords')

In [4]:
%%capture
import pandas as pd
import numpy as np
import re
from pprint import pprint

import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
stop_words = stopwords.words('english')

from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim.matutils import kullback_leibler

from scipy.spatial import distance

from summarizer import Summarizer
from summarizer.sbert import SBertSummarizer

class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [5]:
# ---------- LOAD DATA ----------
import pickle
from pprint import pprint

# Define path
path = "/content/drive/MyDrive/Colab_Notebooks/Thesis/Data/"

# Save cleaned data in dataframe
train_data = pd.read_csv(path + 'train_data_cleaned.csv').rename(columns={"Unnamed: 0": "Index"})
test_data = pd.read_csv(path + 'test_data_cleaned.csv').rename(columns={"Unnamed: 0": "Index"})
X_train = np.array(pd.read_csv(path + 'X_train.csv').loc[:,'0'])
X_val = np.array(pd.read_csv(path + 'X_val.csv').loc[:,'0'])
y_train = np.array(pd.read_csv(path + 'y_train.csv').loc[:,'0'])
y_val = np.array(pd.read_csv(path + 'y_val.csv').loc[:,'0'])

# Preview of data
test_data.head(3)

Unnamed: 0,Index,Index.1,Aspect,Rating,Text,Venue,SentenceCount,Sentiment,Label
0,0,22175,Staff,4.5,Very cheerful and happy to talk. The cafe was ...,the battle of bannockburn visitor centre stirling,2,positive,1
1,12,307,Overview,4.5,The entry to King Koby is glass double doors t...,king koby leeds,10,positive,1
2,13,22475,Overview,5.0,Staff were friendly and helpful. Not easy to f...,paradise island adventure golf stretford,6,positive,1


In [6]:
# Set-up GPU for training
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


# Create input for summaries

In [8]:
summary_df = pd.DataFrame(columns=["Venue", "Aspect", "Sentiment", "Input", "Summary"])
temp_df = pd.DataFrame(columns=["Venue", "Aspect", "Sentiment", "Input", "Summary"])
venue_lst = []
aspect_lst = []
sent_lst = []
input_lst = []

all_data = pd.concat([train_data, test_data])

# Count nr. of sentences in the input and set a threshold of 5 sentences
for v in all_data["Venue"].unique():
    venue_df = all_data[all_data["Venue"]==v]
    for a in venue_df["Aspect"].unique():
        aspect_df = venue_df[venue_df["Aspect"]==a]
        for sent in aspect_df["Sentiment"].unique():
            sent_df = aspect_df[aspect_df["Sentiment"]==sent]
            input = ' '.join(sent_df.Text).replace('\n', '').replace('.', '. ').replace('  ', ' ')
            if len(sent_tokenize(input)) > 10:
                venue_lst.append(v)
                aspect_lst.append(a)
                sent_lst.append(sent)
                input_lst.append(input)

summary_df["Venue"] = venue_lst
summary_df["Aspect"] = aspect_lst
summary_df["Sentiment"] = sent_lst
summary_df["Input"] = input_lst

summary_df.shape[0]

1978

In [9]:
input_summaries_df = summary_df

writer = pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/input_summary_all_data.xlsx')

input_summaries_df.to_excel(writer, sheet_name="input_text")

writer.save()

1978

# Load input for summaries

In [5]:
# Define path
path = "/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/"

# Save data in dataframe
summary_df = pd.read_excel(path + 'input_summary_all_data.xlsx').rename(columns={"Unnamed: 0": "Index"})
summary_df.head()

Unnamed: 0,Index,Venue,Aspect,Sentiment,Input,Summary
0,0,leuchie house north berwick,Transport & Parking,positive,Ample parking spaces and space. It is a beauti...,
1,1,leuchie house north berwick,Access,positive,Disabled access is very good. Although Leuchi...,
2,2,leuchie house north berwick,Overview,positive,Grace staff/venue/quality of care. Within 24 h...,
3,3,leuchie house north berwick,Staff,positive,All the staff and volunteers were very helpful...,
4,4,leuchie house north berwick,Toilets,positive,All loos are accessible with plenty of room fo...,


In [None]:
summary_df.shape[0]

24203

In [None]:
summary_df[:-1]

Unnamed: 0,Index,Venue,Aspect,Sentiment,Input,Summary
0,0,ikea glasgow,Toilets,positive,"Again, very spacious. The only issue is that t...",
1,1,ikea glasgow,Access,positive,"Very spacious Lift, the store itself is very s...",
2,2,ikea glasgow,Transport & Parking,positive,Fantastic! Plenty of spaces with lots of room ...,
3,3,ikea glasgow,Staff,positive,Always willing to help and assist. My only con...,
4,4,ikea glasgow,Overview,positive,Amazing access from the moment you leave the c...,
...,...,...,...,...,...,...
24197,24197,lush glasgow,Access,positive,The store is bright and certainly colourful. G...,
24198,24198,the leadburn inn west linton,Overview,positive,This accessible venue has it’s own car park an...,
24199,24199,queen victoria hospital morecambe,Overview,positive,Flat and Level access although there is a slig...,
24200,24200,woodstock woodstock,Transport & Parking,negative,The main car park involves a walk to the old q...,


# Functions

In [6]:
# Summary Evaluation

# Creating a dictionary and corpus
def create_corpus(s):
    """
    - Lowercase the sentence
    - Change "'t" to "not"
    - Isolate and remove punctuations
    - Remove other special characters
    - Remove stop words except "not" and "can"
    - Remove trailing whitespace
    - Create dictionary
    - Create corpus
    """
    s = s.lower()
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    # Remove @name
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    # Isolate and remove punctuations
    s = re.sub(r'[^\w\s]','',s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'
    s = " ".join([word for word in s.split()
                  if word not in stopwords.words('english')
                  or word in ['not', 'can']])
    # Remove trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()

    tokens = word_tokenize(s)

    dictionary = Dictionary([tokens])

    corpus = [dictionary.doc2bow(text) for text in [tokens]]

    return dictionary, corpus

# Evaluate with Distance Metrics
def distance_metrics(input, output):
    stop_words = stopwords.words('english')
    dictionary, corpus = create_corpus(input)

    np.random.seed(1)

    model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=2, minimum_probability=1e-8)

    input_lst = [word for word in word_tokenize(input) if not word.lower() in stop_words]
    output_lst = [word for word in word_tokenize(output) if not word.lower() in stop_words]

    # now let's make these into a bag of words format
    bow_input = model.id2word.doc2bow(input_lst)
    bow_output = model.id2word.doc2bow(output_lst)

    # we can now get the LDA topic distributions for these
    lda_input = model[bow_input]
    lda_output = model[bow_output]

    kull = kullback_leibler(lda_input, lda_output)

    jens = distance.jensenshannon(lda_input, lda_output)

    return kull, jens[1]

In [24]:
distance_metrics("Ample parking spaces and space.", "There is ample parking")

(0.0011497699, 0.016923191014731418)

# Machine Learning Model: Gensim

In [10]:
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords
from gensim.summarization.textcleaner import get_sentences, split_sentences

# Create summaies with gensim
output_lst = []

for input in summary_df["Input"]:
    nr_words = (len(word_tokenize(input)))
 
    try:
        output = summarize(input, word_count=50).replace('\n', '').replace('.', '. ')
        if output == '':
            output = input
    except:
        output_lst.append(input)
    else:
        output_lst.append(output)

summary_df["Summary"] = output_lst

In [25]:
# Evaluate gensim summaries
input_lst = summary_df["Input"]
output_lst = summary_df["Summary"]

kull_lst = []
jens_lst = []

for i in range(len(input_lst)):
    try:
        kull, jens = distance_metrics(input_lst[i], output_lst[i])
    except:
        kull, jens = -1, -1
        kull_lst.append(kull)
        jens_lst.append(jens)
    else:
        kull_lst.append(kull)
        jens_lst.append(jens)

summary_df["kullback_leibler"] = kull_lst
summary_df["jensen_shannon"] = jens_lst

In [26]:
gensim_summaries = pd.DataFrame(summary_df)

writer = pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/gensim_summaries_all_data.xlsx')

gensim_summaries.to_excel(writer, sheet_name="gensim_summaries_train_data")

writer.save()

In [27]:
gensim_summaries

Unnamed: 0,Venue,Aspect,Sentiment,Input,Summary,kullback_leibler,jensen_shannon
0,leuchie house north berwick,Transport & Parking,positive,Ample parking spaces and space. It is a beauti...,There is ample parking including disabled park...,8.352299e-03,0.049038
1,leuchie house north berwick,Access,positive,Disabled access is very good. Although Leuchi...,They had to use lift to 2nd and 3rd floor From...,2.754421e-02,0.078816
2,leuchie house north berwick,Overview,positive,Grace staff/venue/quality of care. Within 24 h...,The staff and volunteers at Leuchie did all th...,2.318858e-02,0.077067
3,leuchie house north berwick,Staff,positive,All the staff and volunteers were very helpful...,All the staff and volunteers were very helpful...,1.355034e-02,0.064499
4,leuchie house north berwick,Toilets,positive,All loos are accessible with plenty of room fo...,On the lower ground level there is a second to...,1.682215e-02,0.070738
...,...,...,...,...,...,...,...
1973,thistle flat seafront apartment anstruther,Overview,positive,I had a wonderful five days based in Thistle F...,It's also easy enough to wheel around the harb...,2.488326e-02,0.087402
1974,firstontario concert hall hamilton,Overview,positive,The Debaters: A Canadian Comedy Company. Our n...,Texted by Blueline cabs from 5 mins before arr...,3.315472e-08,0.000042
1975,huntingdon hall worcester,Overview,positive,Went on Sunday to see Gyles Brandreth at Hunti...,Unfortunately 3 people decided to come up in t...,2.514483e-02,0.089991
1976,truro cathedral truro,Overview,positive,"Truro is a beautiful, small, historic city. Th...","Truro is a beautiful, small, historic city. Th...",1.916886e-02,0.069657


# BERT

In [7]:
# Initialize models
model_bert = Summarizer()
model_distilbert = Summarizer('distilbert-base-uncased', hidden=[-1,-2], hidden_concat=True)
model_sbert = SBertSummarizer('paraphrase-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

## BERT


In [29]:
bert_summaries = summary_df[["Venue", "Aspect", "Sentiment", "Input"]]
#bert_summaries1 = bert_summaries[:12101]
#bert_summaries2 = bert_summaries[12101:]
bert_summaries.head()

Unnamed: 0,Venue,Aspect,Sentiment,Input
0,leuchie house north berwick,Transport & Parking,positive,Ample parking spaces and space. It is a beauti...
1,leuchie house north berwick,Access,positive,Disabled access is very good. Although Leuchi...
2,leuchie house north berwick,Overview,positive,Grace staff/venue/quality of care. Within 24 h...
3,leuchie house north berwick,Staff,positive,All the staff and volunteers were very helpful...
4,leuchie house north berwick,Toilets,positive,All loos are accessible with plenty of room fo...


In [30]:
# Create summaies with BERT
output_lst = []

for input in bert_summaries["Input"]:
    nr_sentences = (len(sent_tokenize(input)))
 
    try:
        output = model_bert(input, num_sentences=3).replace('\n', '').replace('.', '. ')
        if output == '':
            output = input
    except:
        output_lst.append(input)
    else:
        output_lst.append(output)

bert_summaries["Summary"] = output_lst

# Evaluate summaries
input_lst = bert_summaries["Input"]
output_lst = bert_summaries["Summary"]

kull_lst = []
jens_lst = []

for i in range(len(input_lst)):
    try:
        kull, jens = distance_metrics(input_lst[i], output_lst[i])
    except:
        kull, jens = -1, -1
        kull_lst.append(kull)
        jens_lst.append(jens)
    else:
        kull_lst.append(kull)
        jens_lst.append(jens)
bert_summaries["kullback_leibler"] = kull_lst
bert_summaries["jensen_shannon"] = jens_lst

# Save generate summaries in between
bert_summaries_all_data = bert_summaries

writer = pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/bert_summaries_all_data.xlsx')

bert_summaries_all_data.to_excel(writer, sheet_name="bert_summaries_all_data")

writer.save()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [32]:
# Evaluate summaries
input_lst = bert_summaries["Input"]
output_lst = bert_summaries["Summary"]

kull_lst = []
jens_lst = []

for i in range(len(input_lst)):
    try:
        kull, jens = distance_metrics(input_lst[i], output_lst[i])
    except:
        kull, jens = -1, -1
        kull_lst.append(kull)
        jens_lst.append(jens)
    else:
        kull_lst.append(kull)
        jens_lst.append(jens)
bert_summaries["kullback_leibler"] = kull_lst
bert_summaries["jensen_shannon"] = jens_lst

# Save generate summaries in between
bert_summaries_all_data = bert_summaries

writer = pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/bert_summaries_all_data.xlsx')

bert_summaries_all_data.to_excel(writer, sheet_name="bert_summaries_all_data")

writer.save()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [34]:
bert_summaries.head()

Unnamed: 0,Venue,Aspect,Sentiment,Input,Summary,kullback_leibler,jensen_shannon
0,leuchie house north berwick,Transport & Parking,positive,Ample parking spaces and space. It is a beauti...,It is a beautiful country house set out in a s...,0.034647,0.103083
1,leuchie house north berwick,Access,positive,Disabled access is very good. Although Leuchi...,Although Leuchie House is on four levels there...,0.034553,0.095786
2,leuchie house north berwick,Overview,positive,Grace staff/venue/quality of care. Within 24 h...,Within 24 hours they knew my individual needs....,0.125981,0.163815
3,leuchie house north berwick,Staff,positive,All the staff and volunteers were very helpful...,All the staff and volunteers were very helpful...,0.017156,0.072951
4,leuchie house north berwick,Toilets,positive,All loos are accessible with plenty of room fo...,All loos are accessible with plenty of room fo...,0.008131,0.048372


### Split summaries

In [None]:
# Create summaies with BERT
output_lst = []

for input in bert_summaries1["Input"]:
    nr_sentences = (len(sent_tokenize(input)))
 
    try:
        output = model_bert(input, num_sentences=3).replace('\n', '').replace('.', '. ')
        if output == '':
            output = input
    except:
        output_lst.append(input)
    else:
        output_lst.append(output)

bert_summaries1["Summary"] = output_lst

# Evaluate summaries
input_lst = summary_df["Input"]
output_lst = summary_df["Summary"]

kull_lst = []
jens_lst = []

for i in range(len(input_lst)):
    try:
        kull, jens = distance_metrics(input_lst[i], output_lst[i])
    except:
        kull, jens = -1, -1
        kull_lst.append(kull)
        jens_lst.append(jens)
    else:
        kull_lst.append(kull)
        jens_lst.append(jens)

summary_df["kullback_leibler"] = kull_lst
summary_df["jensen_shannon"] = jens_lst

# Save generate summaries in between
bert_summaries_all_data = bert_summaries

writer = pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/bert_summaries_all_data.xlsx')

bert_summaries_all_data.to_excel(writer, sheet_name="bert_summaries_all_data")

writer.save()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [None]:
# Create summaies with BERT
output_lst = []

for input in bert_summaries2["Input"]:
    nr_sentences = (len(sent_tokenize(input)))
 
    try:
        output = model_bert(input, num_sentences=3).replace('\n', '').replace('.', '. ')
        if output == '':
            output = input
    except:
        output_lst.append(input)
    else:
        output_lst.append(output)

bert_summaries2["Summary"] = output_lst

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [None]:
# Evaluate summaries
input_lst = summary_df["Input"]
output_lst = summary_df["Summary"]

kull_lst = []
jens_lst = []

for i in range(len(input_lst)):
    try:
        kull, jens = distance_metrics(input_lst[i], output_lst[i])
    except:
        kull, jens = -1, -1
        kull_lst.append(kull)
        jens_lst.append(jens)
    else:
        kull_lst.append(kull)
        jens_lst.append(jens)

summary_df["kullback_leibler"] = kull_lst
summary_df["jensen_shannon"] = jens_lst

# Save generate summaries in between
bert_summaries_all_data = bert_summaries

writer = pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/bert_summaries_all_data.xlsx')

bert_summaries_all_data.to_excel(writer, sheet_name="bert_summaries_all_data")

writer.save()

## DISTILBERT

In [8]:
# DISTIL BERT
distilbert_summaries = summary_df[["Venue", "Aspect", "Sentiment", "Input"]]
#distilbert_summaries1 = distilbert_summaries[:12101]
#distilbert_summaries2 = distilbert_summaries[12101:]
distilbert_summaries.head()

Unnamed: 0,Venue,Aspect,Sentiment,Input
0,leuchie house north berwick,Transport & Parking,positive,Ample parking spaces and space. It is a beauti...
1,leuchie house north berwick,Access,positive,Disabled access is very good. Although Leuchi...
2,leuchie house north berwick,Overview,positive,Grace staff/venue/quality of care. Within 24 h...
3,leuchie house north berwick,Staff,positive,All the staff and volunteers were very helpful...
4,leuchie house north berwick,Toilets,positive,All loos are accessible with plenty of room fo...


In [9]:
# Create summaies with DISTILBERT
output_lst = []

for input in distilbert_summaries["Input"]:
    nr_sentences = (len(sent_tokenize(input)))
 
    try:
        output = model_distilbert(input, num_sentences=3).replace('\n', '').replace('.', '. ')
        if output == '':
            output = input
    except:
        output_lst.append(input)
    else:
        output_lst.append(output)

distilbert_summaries["Summary"] = output_lst

# Evaluate summaries
input_lst = distilbert_summaries["Input"]
output_lst = distilbert_summaries["Summary"]

kull_lst = []
jens_lst = []

for i in range(len(input_lst)):
    try:
        kull, jens = distance_metrics(input_lst[i], output_lst[i])
    except:
        kull, jens = -1, -1
        kull_lst.append(kull)
        jens_lst.append(jens)
    else:
        kull_lst.append(kull)
        jens_lst.append(jens)
        
distilbert_summaries["kullback_leibler"] = kull_lst
distilbert_summaries["jensen_shannon"] = jens_lst

# Save generate summaries in between
distilbert_summaries_train_data = distilbert_summaries

writer = pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/distilbert_summaries_all_data.xlsx')

distilbert_summaries_train_data.to_excel(writer, sheet_name="distilbert_summaries_all_data")

writer.save()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [16]:
distilbert_summaries

Unnamed: 0,Venue,Aspect,Sentiment,Input,Summary,kullback_leibler,jensen_shannon
0,leuchie house north berwick,Transport & Parking,positive,Ample parking spaces and space. It is a beauti...,It is a beautiful country house set out in a s...,0.016643,0.070319
1,leuchie house north berwick,Access,positive,Disabled access is very good. Although Leuchi...,Although Leuchie House is on four levels there...,0.000893,0.014832
2,leuchie house north berwick,Overview,positive,Grace staff/venue/quality of care. Within 24 h...,Within 24 hours they knew my individual needs....,0.035467,0.095433
3,leuchie house north berwick,Staff,positive,All the staff and volunteers were very helpful...,All the staff and volunteers were very helpful...,0.008952,0.051927
4,leuchie house north berwick,Toilets,positive,All loos are accessible with plenty of room fo...,All loos are accessible with plenty of room fo...,0.009618,0.052809
...,...,...,...,...,...,...,...
1973,thistle flat seafront apartment anstruther,Overview,positive,I had a wonderful five days based in Thistle F...,I had a wonderful five days based in Thistle F...,0.022864,0.083630
1974,firstontario concert hall hamilton,Overview,positive,The Debaters: A Canadian Comedy Company. Our n...,Controlled rear ramp entry taxi to downtown Ha...,0.009081,0.049782
1975,huntingdon hall worcester,Overview,positive,Went on Sunday to see Gyles Brandreth at Hunti...,Went on Sunday to see Gyles Brandreth at Hunti...,0.014394,0.067398
1976,truro cathedral truro,Overview,positive,"Truro is a beautiful, small, historic city. Th...","Truro is a beautiful, small, historic city. T...",0.231740,0.218172


### Split summaries

In [None]:
# Evaluate gensim summaries
input_lst = distilbert_summaries["Input"]
output_lst = distilbert_summaries["Summary"]

hell_lst = []
kull_lst = []

for i in range(len(input_lst)):
    try:
        hell, kull = distance_metrics(input_lst[i], output_lst[i])
    except:
        hell_lst.append(-1)
        kull_lst.append(-1)
    else:
        hell_lst.append(hell)
        kull_lst.append(kull)

distilbert_summaries["hellinger"] = hell_lst
distilbert_summaries["kullback_leibler"] = kull_lst

# Save generate summaries in between
distilbert_summaries_train_data = distilbert_summaries

writer = pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/distilbert_summaries_all_data.xlsx')

distilbert_summaries_train_data.to_excel(writer, sheet_name="distilbert_summaries_all_data")

writer.save()

In [None]:
# Create summaies with DISTILBERT
output_lst = []

for input in distilbert_summaries1["Input"]:
    nr_sentences = (len(sent_tokenize(input)))
 
    try:
        output = model_distilbert(input, num_sentences=3).replace('\n', '').replace('.', '. ')
        if output == '':
            output = input
    except:
        output_lst.append(input)
    else:
        output_lst.append(output)

distilbert_summaries1["Summary"] = output_lst


# Save generate summaries in between
distilbert_summaries1_train_data = distilbert_summaries1

writer = pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/distilbert_summaries1_train_data.xlsx')

distilbert_summaries1_train_data.to_excel(writer, sheet_name="distilbert_summaries1_train_data")

writer.save()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [None]:
# Create summaies with DISTILBERT
output_lst = []

for input in distilbert_summaries2["Input"]:
    nr_sentences = (len(sent_tokenize(input)))
 
    try:
        output = model_distilbert(input, num_sentences=3).replace('\n', '').replace('.', '. ')
        if output == '':
            output = input
    except:
        output_lst.append(input)
    else:
        output_lst.append(output)

distilbert_summaries2["Summary"] = output_lst

# Save generate summaries in between
distilbert_summaries2_train_data = distilbert_summaries2

writer = pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/distilbert_summaries2_train_data.xlsx')

distilbert_summaries2_train_data.to_excel(writer, sheet_name="distilbert_summaries2_train_data")

writer.save()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [None]:
distilbert_summaries = pd.concat([distilbert_summaries1, distilbert_summaries2])

# Evaluate gensim summaries
input_lst = distilbert_summaries["Input"]
output_lst = distilbert_summaries["Summary"]

hell_lst = []
kull_lst = []

for i in range(len(input_lst)):
    try:
        hell, kull = distance_metrics(input_lst[i], output_lst[i])
    except:
        hell_lst.append(-1)
        kull_lst.append(-1)
    else:
        hell_lst.append(hell)
        kull_lst.append(kull)

distilbert_summaries["hellinger"] = hell_lst
distilbert_summaries["kullback_leibler"] = kull_lst

In [None]:
# Save generate summaries in between
distilbert_summaries_train_data = distilbert_summaries

writer = pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/distilbert_summaries_train_data.xlsx')

distilbert_summaries_train_data.to_excel(writer, sheet_name="distilbert_summaries_train_data")

writer.save()

## SBERT

In [12]:
# DISTIL BERT
sbert_summaries = summary_df[["Venue", "Aspect", "Sentiment", "Input"]]
#sbert_summaries1 = sbert_summaries[:12101]
#sbert_summaries2 = sbert_summaries[12101:]
sbert_summaries.head()

Unnamed: 0,Venue,Aspect,Sentiment,Input
0,leuchie house north berwick,Transport & Parking,positive,Ample parking spaces and space. It is a beauti...
1,leuchie house north berwick,Access,positive,Disabled access is very good. Although Leuchi...
2,leuchie house north berwick,Overview,positive,Grace staff/venue/quality of care. Within 24 h...
3,leuchie house north berwick,Staff,positive,All the staff and volunteers were very helpful...
4,leuchie house north berwick,Toilets,positive,All loos are accessible with plenty of room fo...


In [13]:
# Create summaies with SBERT
output_lst = []

for input in sbert_summaries["Input"]:
    nr_sentences = (len(sent_tokenize(input)))
 
    try:
        output = model_sbert(input, num_sentences=3).replace('\n', '').replace('.', '. ')
        if output == '':
            output = input
    except:
        output_lst.append(input)
    else:
        output_lst.append(output)

sbert_summaries["Summary"] = output_lst

# Evaluate summaries
input_lst = sbert_summaries["Input"]
output_lst = sbert_summaries["Summary"]

kull_lst = []
jens_lst = []

for i in range(len(input_lst)):
    try:
        kull, jens = distance_metrics(input_lst[i], output_lst[i])
    except:
        kull, jens = -1, -1
        kull_lst.append(kull)
        jens_lst.append(jens)
    else:
        kull_lst.append(kull)
        jens_lst.append(jens)
        
sbert_summaries["kullback_leibler"] = kull_lst
sbert_summaries["jensen_shannon"] = jens_lst

# Save generate summaries in between
sbert_summaries_all_data = sbert_summaries

writer = pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/sbert_summaries_all_data.xlsx')

sbert_summaries_all_data.to_excel(writer, sheet_name="sbert_summaries_all_data")

writer.save()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [15]:
sbert_summaries

Unnamed: 0,Venue,Aspect,Sentiment,Input,Summary,kullback_leibler,jensen_shannon
0,leuchie house north berwick,Transport & Parking,positive,Ample parking spaces and space. It is a beauti...,It is a beautiful country house set out in a s...,0.005228,0.038403
1,leuchie house north berwick,Access,positive,Disabled access is very good. Although Leuchi...,Although Leuchie House is on four levels there...,0.010998,0.050921
2,leuchie house north berwick,Overview,positive,Grace staff/venue/quality of care. Within 24 h...,Within 24 hours they knew my individual needs....,0.175037,0.210967
3,leuchie house north berwick,Staff,positive,All the staff and volunteers were very helpful...,All the staff and volunteers were very helpful...,0.012314,0.061353
4,leuchie house north berwick,Toilets,positive,All loos are accessible with plenty of room fo...,All loos are accessible with plenty of room fo...,0.008131,0.048372
...,...,...,...,...,...,...,...
1973,thistle flat seafront apartment anstruther,Overview,positive,I had a wonderful five days based in Thistle F...,I had a wonderful five days based in Thistle F...,0.007440,0.046496
1974,firstontario concert hall hamilton,Overview,positive,The Debaters: A Canadian Comedy Company. Our n...,Controlled rear ramp entry taxi to downtown Ha...,0.009081,0.049782
1975,huntingdon hall worcester,Overview,positive,Went on Sunday to see Gyles Brandreth at Hunti...,Went on Sunday to see Gyles Brandreth at Hunti...,0.013818,0.065980
1976,truro cathedral truro,Overview,positive,"Truro is a beautiful, small, historic city. Th...","Truro is a beautiful, small, historic city. T...",0.042870,0.100845


### Split

In [None]:
# Evaluate gensim summaries
input_lst = sbert_summaries["Input"]
output_lst = sbert_summaries["Summary"]

hell_lst = []
kull_lst = []

for i in range(len(input_lst)):
    try:
        hell, kull = distance_metrics(input_lst[i], output_lst[i])
    except:
        hell_lst.append(-1)
        kull_lst.append(-1)
    else:
        hell_lst.append(hell)
        kull_lst.append(kull)

sbert_summaries["hellinger"] = hell_lst
sbert_summaries["kullback_leibler"] = kull_lst

# Save generate summaries in between
sbert_summaries_all_data = sbert_summaries

writer = pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/sbert_summaries_all_data.xlsx')

sbert_summaries_all_data.to_excel(writer, sheet_name="sbert_summaries_all_data")

writer.save()

In [None]:
# Create summaies with DISTILBERT
output_lst = []

for input in sbert_summaries1["Input"]:
    nr_sentences = (len(sent_tokenize(input)))
 
    try:
        output = model_sbert(input, num_sentences=3).replace('\n', '').replace('.', '. ')
        if output == '':
            output = input
    except:
        output_lst.append(input)
    else:
        output_lst.append(output)

sbert_summaries1["Summary"] = output_lst

# Save generate summaries in between
sbert_summaries1_train_data = sbert_summaries1

writer = pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/sbert_summaries1_train_data.xlsx')

sbert_summaries1_train_data.to_excel(writer, sheet_name="sbert_summaries1_train_data")

writer.save()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [None]:
# Create summaies with DISTILBERT
output_lst = []

for input in sbert_summaries2["Input"]:
    nr_sentences = (len(sent_tokenize(input)))
 
    try:
        output = model_sbert(input, num_sentences=3).replace('\n', '').replace('.', '. ')
        if output == '':
            output = input
    except:
        output_lst.append(input)
    else:
        output_lst.append(output)

sbert_summaries2["Summary"] = output_lst

# Save generate summaries in between
sbert_summaries2_train_data = sbert_summaries2

writer = pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/sbert_summaries2_train_data.xlsx')

sbert_summaries2_train_data.to_excel(writer, sheet_name="sbert_summaries2_train_data")

writer.save()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [None]:
sbert_summaries1.head()

Unnamed: 0,Venue,Aspect,Sentiment,Input,Summary
0,ikea glasgow,Toilets,positive,"Again, very spacious. The only issue is that t...",The only issue is that they are at the end of ...
1,ikea glasgow,Access,positive,"Very spacious Lift, the store itself is very s...","Very spacious Lift, the store itself is very s..."
2,ikea glasgow,Transport & Parking,positive,Fantastic! Plenty of spaces with lots of room ...,Plenty of spaces with lots of room to get in a...
3,ikea glasgow,Staff,positive,Always willing to help and assist. My only con...,My only concern is that it can be quite diffic...
4,ikea glasgow,Overview,positive,Amazing access from the moment you leave the c...,Amazing access from the moment you leave the c...


In [None]:
sbert_summaries2.head()

Unnamed: 0,Venue,Aspect,Sentiment,Input,Summary
12101,mercure box hill burford bridge hotel dorking,Access,positive,There is ramped access to 2 sets of double doo...,There is ramped access to 2 sets of double doo...
12102,mercure box hill burford bridge hotel dorking,Staff,positive,Always friendly and helpful.,Always friendly and helpful.
12103,mercure box hill burford bridge hotel dorking,Transport & Parking,negative,The only negative about the Burford Bridge is ...,The only negative about the Burford Bridge is ...
12104,turner contemporary margate,Transport & Parking,positive,There are disabled spaces just outside the gal...,There are disabled spaces just outside the gal...
12105,turner contemporary margate,Overview,positive,This is a very accessible venue. The main entr...,The main entrance is up a large flight of step...


In [None]:
sbert_summaries = pd.concat([sbert_summaries1, sbert_summaries2])

# Evaluate gensim summaries
input_lst = sbert_summaries["Input"]
output_lst = sbert_summaries["Summary"]

hell_lst = []
kull_lst = []

for i in range(len(input_lst)):
    try:
        hell, kull = distance_metrics(input_lst[i], output_lst[i])
    except:
        hell_lst.append(-1)
        kull_lst.append(-1)
    else:
        hell_lst.append(hell)
        kull_lst.append(kull)

sbert_summaries["hellinger"] = hell_lst
sbert_summaries["kullback_leibler"] = kull_lst

In [None]:
# Save all
sbert_summaries_train_data = sbert_summaries

writer = pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/sbert_summaries_train_data.xlsx')

sbert_summaries_train_data.to_excel(writer, sheet_name="sbert_summaries_train_data")

writer.save()

## Evaluation

In [17]:
bert_summaries

NameError: ignored

In [18]:
distilbert_summaries

Unnamed: 0,Venue,Aspect,Sentiment,Input,Summary,kullback_leibler,jensen_shannon
0,leuchie house north berwick,Transport & Parking,positive,Ample parking spaces and space. It is a beauti...,It is a beautiful country house set out in a s...,0.016643,0.070319
1,leuchie house north berwick,Access,positive,Disabled access is very good. Although Leuchi...,Although Leuchie House is on four levels there...,0.000893,0.014832
2,leuchie house north berwick,Overview,positive,Grace staff/venue/quality of care. Within 24 h...,Within 24 hours they knew my individual needs....,0.035467,0.095433
3,leuchie house north berwick,Staff,positive,All the staff and volunteers were very helpful...,All the staff and volunteers were very helpful...,0.008952,0.051927
4,leuchie house north berwick,Toilets,positive,All loos are accessible with plenty of room fo...,All loos are accessible with plenty of room fo...,0.009618,0.052809
...,...,...,...,...,...,...,...
1973,thistle flat seafront apartment anstruther,Overview,positive,I had a wonderful five days based in Thistle F...,I had a wonderful five days based in Thistle F...,0.022864,0.083630
1974,firstontario concert hall hamilton,Overview,positive,The Debaters: A Canadian Comedy Company. Our n...,Controlled rear ramp entry taxi to downtown Ha...,0.009081,0.049782
1975,huntingdon hall worcester,Overview,positive,Went on Sunday to see Gyles Brandreth at Hunti...,Went on Sunday to see Gyles Brandreth at Hunti...,0.014394,0.067398
1976,truro cathedral truro,Overview,positive,"Truro is a beautiful, small, historic city. Th...","Truro is a beautiful, small, historic city. T...",0.231740,0.218172


In [19]:
sbert_summaries  

Unnamed: 0,Venue,Aspect,Sentiment,Input,Summary,kullback_leibler,jensen_shannon
0,leuchie house north berwick,Transport & Parking,positive,Ample parking spaces and space. It is a beauti...,It is a beautiful country house set out in a s...,0.005228,0.038403
1,leuchie house north berwick,Access,positive,Disabled access is very good. Although Leuchi...,Although Leuchie House is on four levels there...,0.010998,0.050921
2,leuchie house north berwick,Overview,positive,Grace staff/venue/quality of care. Within 24 h...,Within 24 hours they knew my individual needs....,0.175037,0.210967
3,leuchie house north berwick,Staff,positive,All the staff and volunteers were very helpful...,All the staff and volunteers were very helpful...,0.012314,0.061353
4,leuchie house north berwick,Toilets,positive,All loos are accessible with plenty of room fo...,All loos are accessible with plenty of room fo...,0.008131,0.048372
...,...,...,...,...,...,...,...
1973,thistle flat seafront apartment anstruther,Overview,positive,I had a wonderful five days based in Thistle F...,I had a wonderful five days based in Thistle F...,0.007440,0.046496
1974,firstontario concert hall hamilton,Overview,positive,The Debaters: A Canadian Comedy Company. Our n...,Controlled rear ramp entry taxi to downtown Ha...,0.009081,0.049782
1975,huntingdon hall worcester,Overview,positive,Went on Sunday to see Gyles Brandreth at Hunti...,Went on Sunday to see Gyles Brandreth at Hunti...,0.013818,0.065980
1976,truro cathedral truro,Overview,positive,"Truro is a beautiful, small, historic city. Th...","Truro is a beautiful, small, historic city. T...",0.042870,0.100845
