## Loading files and Environment

This notebook will be separated out.

In [69]:
#%conda env export > environment.yml

In [2]:
import os
import glob
import pandas as pd
import numpy as np
import sys
import pathlib
import sklearn
import numpy
import nltk
import wordcloud
import gensim
import scipy.stats as stats
from wordcloud import WordCloud
from sklearn.metrics import classification_report
from sklearn.datasets import load_iris
from PIL import Image, ImageDraw, ImageFont
from nltk.corpus import stopwords
from collections import Counter
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support

In [3]:
# Get current directory
current_dir = os.getcwd()
# Get parent directory
parent_dir = os.path.join(current_dir, '..')
# Append parent directory to sys.path
sys.path.append(parent_dir)
from src import data_cleaning as dc

In [4]:
cwd = pathlib.Path.cwd().parent
training_file_path = cwd.joinpath("datasets/EuansGuideData.xlsx")
test_file_path = cwd.joinpath("datasets/GoogleReviews")
print('path:', training_file_path)
print('path:', test_file_path)

path: /Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/datasets/EuansGuideData.xlsx
path: /Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/datasets/GoogleReviews


In [5]:
all_file_names = glob.glob(str(test_file_path) + "/*.csv")
google_df = [pd.read_csv(file_name, index_col=None, header=0) for file_name in all_file_names]
test_data = pd.concat(google_df, axis=0, ignore_index=True)

## Cleaning & Preprocessing

In [6]:
training_data = pd.read_excel(training_file_path)
clean_train_df = dc.clean_and_select(training_data, ["Aspect", "Rating", "Review", "Venue"])
clean_test_df = dc.clean_and_select(test_data, ["Name","Review Rate", "Review Text"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Text"] = df["Text"].apply(lambda x: x.replace("\n", ' '))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_aspects["Venue"] = selected_aspects["Venue"].apply(lambda x: get_venue_name(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Sentiment"] = df["Rating"].apply(lambda x : pick_s

In [7]:
import string
translator = str.maketrans('', '', string.punctuation+'\u2026')

def remove_small_words(sentence):
   
    if len(nltk.word_tokenize(sentence.translate(translator))) >= 5:
       return sentence
    else:
       return "useless"


In [8]:
import re

pattern = r'\(Translated by Google\)|\(Original\)'  # Define the regular expression pattern
clean_test_df['Text'] = clean_test_df['Text'].str.replace(pattern, '', regex=True)

In [9]:
clean_test_df[400:].head()

Unnamed: 0,Name,Sentiment,Text,Label
792,House of Watt,negative,nowadays no coffee workplace but you can only...,0
793,House of Watt,negative,"Nicely dressed, but the prices are high, I ha...",0
794,House of Watt,positive,"Hidden gem; delicious brisket burgers, cozy a...",1
795,House of Watt,positive,House of Watt is close to the Amstel station....,1
796,House of Watt,positive,Super place to watch Ajax (during corona). Go...,1


In [10]:
sample_df = clean_test_df.assign(text=clean_test_df['Text'].apply(nltk.sent_tokenize)).explode('Text').reset_index(drop=True)
sample_df.head()

Unnamed: 0,Name,Sentiment,Text,Label,text
0,Ellis,positive,"It was a bit quite when we went in, but don’t ...",1,"[It was a bit quite when we went in, but don’t..."
1,Ellis,positive,Nice cozy place which serves very tasty burger...,1,[Nice cozy place which serves very tasty burge...
2,Ellis,positive,Really nice place. One of my favourite burger ...,1,"[Really nice place., One of my favourite burge..."
3,Ellis,negative,The Service was quite good but the burgers we ...,0,[The Service was quite good but the burgers we...
4,Ellis,positive,I had a very nice experience! The staff were r...,1,"[I had a very nice experience!, The staff were..."


In [11]:
sample_df = sample_df.explode('text').reset_index(drop=True)
sample_df.head()

Unnamed: 0,Name,Sentiment,Text,Label,text
0,Ellis,positive,"It was a bit quite when we went in, but don’t ...",1,"It was a bit quite when we went in, but don’t ..."
1,Ellis,positive,"It was a bit quite when we went in, but don’t ...",1,After a little while the place was cozily busy.
2,Ellis,positive,"It was a bit quite when we went in, but don’t ...",1,Rightfully so!
3,Ellis,positive,"It was a bit quite when we went in, but don’t ...",1,"The burgers (and nachos) were lovely, as was t..."
4,Ellis,positive,"It was a bit quite when we went in, but don’t ...",1,I would definitely recommend this place if you...


In [12]:
sample_df = sample_df.drop('Text', axis=1)
sample_df.head()

Unnamed: 0,Name,Sentiment,Label,text
0,Ellis,positive,1,"It was a bit quite when we went in, but don’t ..."
1,Ellis,positive,1,After a little while the place was cozily busy.
2,Ellis,positive,1,Rightfully so!
3,Ellis,positive,1,"The burgers (and nachos) were lovely, as was t..."
4,Ellis,positive,1,I would definitely recommend this place if you...


In [13]:
sample_df['text'] = sample_df['text'].apply(remove_small_words)

In [14]:
sample_df.head()

Unnamed: 0,Name,Sentiment,Label,text
0,Ellis,positive,1,"It was a bit quite when we went in, but don’t ..."
1,Ellis,positive,1,After a little while the place was cozily busy.
2,Ellis,positive,1,useless
3,Ellis,positive,1,"The burgers (and nachos) were lovely, as was t..."
4,Ellis,positive,1,I would definitely recommend this place if you...


In [15]:
sample_df = sample_df[sample_df['text'].str.contains("useless") == False]

In [None]:
reg_pattern = r".*([Pp]ancakes|[Dd]rink|[Dd]esserts|[Gg]in|[Ww]ine|[Bb]reakfast|[Ll]unch|[Pp]asta|[Vv]egeterian|[Vv]egan|[Bb]urgers|[Pp]asta|[Dd]ish|[Bb]eer|[Pp]izza|[Tt]aste|[Ff]ood|[Cc]ocktail|[Cc]offee|[Mm]enu|[Tt]asty|[Dd]elicious|[Ss]taff|[Hh]ost|[Aa]mbience|[Aa]tmosphere|[Cc]o[sz]y|[Gg]ezellig|[Ss]ervice]|[Pp]rice[y]|[Cc]heap|([Nn]ice|[Gg]reat|[Aa]mazing) place|([Gg]ood|[Bb]ad|[Tt]errible|[Gg]reat) experience).*"

mask = sample_df['text'].str.contains(reg_pattern)

In [17]:
sample_df = sample_df[~mask]

In [18]:
sample_df.head()

Unnamed: 0,Name,Sentiment,Label,text
0,Ellis,positive,1,"It was a bit quite when we went in, but don’t ..."
1,Ellis,positive,1,After a little while the place was cozily busy.
4,Ellis,positive,1,I would definitely recommend this place if you...
8,Ellis,positive,1,The best thing is you can even get a gluten-fr...
11,Ellis,positive,1,One of my favourite burger joints whole visiti...


In [30]:
# from nltk.corpus import wordnet as wn

# # define a function to check if a word is related to food or drink
# def is_food_or_drink(word):
#     food_synsets = wn.synsets('food')
#     drink_synsets = wn.synsets('drink')
#     food_and_drink_words = set()
#     for synset in food_synsets + drink_synsets:
#         for lemma in synset.lemmas():
#             food_and_drink_words.add(lemma.name())
#     other_food_and_drink_words = {'breakfast', 'lunch', 'dinner', 'alcohol'}
#     food_and_drink_words |= other_food_and_drink_words
#     return word in food_and_drink_words

# # identify the rows that contain food or drink related words
# food_rows = []
# for i, row in sample_df.iterrows():
#     for word in nltk.word_tokenize(row['text']):
#         if is_food_or_drink(word):
#             food_rows.append(i)

# # drop the rows that contain food or drink related words
# sample_df.drop(food_rows, inplace=True)

In [65]:
google_random_sample = sample_df.sample(frac=0.02, random_state=42)

In [56]:
print((google_random_sample).shape[0])

2764


In [66]:
google_random_sample.to_excel('/Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/datasets/labelling data/improved2.xlsx')

In [51]:
sample_df.to_excel('/Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/datasets/11random_google_reviews_excel.xlsx')

In [42]:
# Assuming your dataframe is named df
search_terms = ['Parking', 'Transport', 'Toilets', 'Access', 'Entrance', 'Accessibility', 'Wheelchair', 'Staff']
num_rows = sample_df[sample_df['text'].str.contains('|'.join(search_terms), case=False, na=False)].shape[0]
print(f"There are {num_rows} rows that mention the search terms.")


There are 1937 rows that mention the search terms.


In [None]:
clean_test_df.head()

## Exploratory Data Analysis

In [None]:
wordcloud = WordCloud(background_color='white', max_words=1000, contour_width=3, contour_color='steeleblue')

clustered_reviews_train = ','.join(list(clean_train_df['Text'].values))
clustered_reviews_test = ','.join(list(clean_test_df['Text'].values))
wordcloud.generate(clustered_reviews_train)
wordcloud.generate(clustered_reviews_test)
wordcloud.to_image()
wordcloud.to_image()

In [None]:
amount_per_aspect = clean_train_df.groupby(['Aspect']).count()
amount_per_aspect = amount_per_aspect['Text']
amount_per_aspect.plot(kind='bar', title="Overview of Aspects in Euan's Guide data", ylabel='Amount of aspects', xlabel='Aspect Types', figsize=(6,5))

In [None]:
amount_per_aspect

In [None]:
amount_per_sentiment = clean_train_df.groupby(['Sentiment']).count()
amount_per_sentiment = amount_per_sentiment['Text']
amount_per_sentiment.plot(kind='bar', title="Overview of Sentiments in Euan's Guide data", ylabel='Amount of Each Sentiment', xlabel='Sentiment Types', figsize=(6,5))

In [None]:
amount_per_sentiment 

## Aspect Classification

### Pipeline setup

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics.pairwise import cosine_similarity



In [None]:
vectorizer = CountVectorizer()
n = round(len(clean_train_df))
euans_reviews = clean_train_df.Text.values.tolist()
google_reviews = clean_test_df[:n].Text.values.tolist()
euans_labels = clean_train_df.Aspect.values.tolist()


In [None]:
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('clf', VotingClassifier([
        ('nb', MultinomialNB()),
        ('lr', LogisticRegression())
    ]))
])

In [None]:
parameters = {
    'vectorizer__max_df': (0.5, 0.75, 1.0),
    'vectorizer__ngram_range': ((1, 1), (1, 2)),
    'clf__voting': ('soft', 'hard'),
    'clf__nb__alpha': (0.5, 1),
    'clf__lr__C': (0.1, 1, 10),
}

In [None]:
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=5, n_jobs=5, verbose=1)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(euans_reviews, euans_labels, test_size=0.2, random_state=42)

In [None]:
import random

google_sample = random.sample(google_reviews, len(y_val))

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
y_pred = grid_search.predict(X_val)
print(len(y_pred))

In [None]:
google_pred = grid_search.predict(google_sample)

In [None]:
pos_labels = ['Toilets', 'Transport & Parking']

### Evaluation Metrics

In [None]:
valuation_report = classification_report(y_val, y_pred, labels=pos_labels)
evaluation_report = classification_report(y_val, google_pred, labels=pos_labels)
print("Euan's Guide Evaluation Report\n",valuation_report)
print("Google Reviews Evaluation Report\n",evaluation_report)
# save report as a text file
with open('../Results/google_aspect_classification_report.txt', 'w') as f:
    f.write(evaluation_report)
    
with open('../Results/euans_aspect_classification_report.txt', 'w') as f:
    f.write(valuation_report)

# # convert text file to PNG image
img = Image.new('RGB', (800, 800), color='white')
font = ImageFont.truetype('../media/Fonts/Roboto/Roboto-Black.ttf', 20)
draw = ImageDraw.Draw(img)

with open('../Results/google_aspect_classification_report.txt', 'r') as f:
    y = 0
    for line in f.readlines():
        draw.text((10, y), line, fill='black', font=font)
        y += 20

img.save('../Results/google_aspect_classification_report.png')

with open('../Results/euans_aspect_classification_report.txt', 'r') as f:
    y = 0
    for line in f.readlines():
        draw.text((10, y), line, fill='black', font=font)
        y += 20

img.save('../Results/euans_aspect_classification_report.png')


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cm = confusion_matrix(y_val, google_pred)

display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=pos_labels)

display.plot()

## Sentiment Analysis

In [None]:
sentiment_labels = clean_train_df.Sentiment.values.tolist()
X2_train, X2_val, y2_train, y2_val = train_test_split(euans_reviews, sentiment_labels, test_size=0.2, random_state=42)

pipeline2 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('clf', VotingClassifier([
        ('nb', MultinomialNB()),
        ('svm', SVC())
    ]))
])

In [None]:
sentiment_params = {
    'vectorizer__max_features': [1000, 5000],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'clf__voting': ['hard', 'soft'],
    'clf__weights': [[0.5, 0.5], [0.7, 0.3]],
    'clf__estimators': [
        [('nb', MultinomialNB(alpha=0.5)), ('svm', SVC(kernel='linear', C=1.0))],
        [('nb', MultinomialNB(alpha=1.0)), ('svm', SVC(kernel='rbf', C=10.0, gamma=0.1))]
    ]
}

In [None]:
grid_search2 = GridSearchCV(estimator=pipeline2, param_grid=sentiment_params, cv=5, n_jobs=5, verbose=1)
grid_search2.fit(X2_train, y2_train)
y2_pred = grid_search2.predict(X2_val)
google2_pred = grid_search2.predict(google_sample)
euans_report = classification_report(y2_val, y2_pred)
google_report = classification_report(google2_pred, y2_pred)

print('Euans Report\n', euans_report)
print('Google Report\n', google_report)

In [None]:
# save report as a text file
with open('../Results/google_sentiment_analysis_report.txt', 'w') as f:
    f.write(google_report)
    
with open('../Results/euans_sentiment_analysis_report.txt', 'w') as f:
    f.write(euans_report)

# # convert text file to PNG image optimise this as you repeat this code.
img = Image.new('RGB', (800, 800), color='white')
font = ImageFont.truetype('../media/Fonts/Roboto/Roboto-Black.ttf', 20)
draw = ImageDraw.Draw(img)

with open('../Results/euans_sentiment_analysis_report.txt', 'r') as f:
    y = 0
    for line in f.readlines():
        draw.text((10, y), line, fill='black', font=font)
        y += 20

img.save('../Results/euans_sentiment_analysis_report.png')

with open('../Results/google_sentiment_analysis_report.txt', 'r') as f:
    y = 0
    for line in f.readlines():
        draw.text((10, y), line, fill='black', font=font)
        y += 20

img.save('../Results/google_sentiment_analysis_report.png')


In [None]:
cm2 = confusion_matrix(y2_val, google2_pred)

display2 = ConfusionMatrixDisplay(confusion_matrix=cm2, display_labels=pos_labels)

display2.plot()

## Opinion Summarisation

Moved this part to Google Colab https://colab.research.google.com/drive/1NVzQ3vS6oaQ7EPFzzij1XbjDQT0QpOBO?usp=sharing

In [None]:
#clean_test_df.to_csv('/Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/datasets/google_reviews.csv', index=False)
#clean_train_df.to_csv("/Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/datasets/euans_reviews.csv", index=False)

In [None]:
# from transformers import pipeline
# from textblob import TextBlob
# summariser = pipeline('summarization', model='distilbert-base-uncased')

# """
# We want the review text per venue, aspect and sentiment.
# """
# # Possibly vader could look at identifying the sentiment correctly.
# # [['Venue', 'Aspect', 'Sentiment', 'Text']]


# summaries = summariser(euans_reviews, max_length=50, min_length=10)

# reviews_per_venue = clean_train_df['summary'] = [summary['Summarised Review'] for summary in summaries]
# reviews_per_venue.head()


# # Group all reviews per venue
# # generate a summary of the sentiment and aspects for that venue.

## Lex Rank implementation

In [None]:
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

long_reviews = clean_train_df[clean_train_df['SentenceCount'] > 1]
long_reviews


In [None]:
summaries = []
i = 0
for review in long_reviews.Text.values.tolist():
    parser = PlaintextParser.from_string(review,Tokenizer('english'))
    i+=1
    lex_rank_summarizer = LexRankSummarizer()
    summaries.append(lex_rank_summarizer(parser.document, sentences_count=1)) 
print(i)

In [None]:

def find_empty_summary(summary):
    if len(summary) == 0:
        return "summary processing error"
    else:
        return str(summary[0])


In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
long_reviews['Lex Rank Summary'] = summaries
long_reviews['Lex Rank Summary'] = long_reviews['Lex Rank Summary'].apply(lambda x: find_empty_summary(x))


In [None]:
long_reviews['Lex Rank Summary']

## Latent Semantic Analysis Approach

In [None]:
from sumy.summarizers.lsa import LsaSummarizer

lsa_summarizer=LsaSummarizer()
lsa_summaries = []
for review in long_reviews.Text.values.tolist():
    lsa_parser=PlaintextParser.from_string(review,Tokenizer('english'))
    lsa_summaries.append(lsa_summarizer(lsa_parser.document,1))

In [None]:
long_reviews['LSA Summaries'] = lsa_summaries
long_reviews['LSA Summaries'] = long_reviews['LSA Summaries'].apply(lambda x: find_empty_summary(x))

In [None]:
long_reviews.head()

In [None]:
lex_rank_summaries = long_reviews['Lex Rank Summary'].values.tolist()


In [None]:
lsa_summaries = long_reviews['LSA Summaries'].values.tolist()

In [None]:
lex_processed = [dc.preprocess(review) for review in lex_rank_summaries]

lsa_processed = [dc.preprocess(review) for review in lsa_summaries]

In [None]:

topic_dict = gensim.corpora.Dictionary(lsa_processed)
count = 0
for k, v in topic_dict.iteritems():
    print(k, v)
    count +=1

In [None]:
topic_dict.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [None]:
lsa_corpus = [topic_dict.doc2bow(doc) for doc in lsa_processed]

In [None]:
from gensim import corpora, models
tfidf = models.TfidfModel(lsa_corpus)
lsa_corpus_tfidf = tfidf[lsa_corpus]

In [None]:
lda_model = gensim.models.LdaMulticore(lsa_corpus, num_topics=10, id2word=topic_dict, passes=2, workers=2)

In [None]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

In [None]:
lda_model_tfidf = gensim.models.LdaMulticore(lsa_corpus_tfidf, num_topics=10, id2word=topic_dict, passes=2, workers=4)

In [None]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Predict on google reviews

In [None]:
clean_test_df['Sentence Count'] = clean_test_df['Text'].apply(lambda x: dc.count_sentences(x))
long_google_reviews = clean_test_df[clean_test_df['Sentence Count'] > 1]
long_google_reviews = long_google_reviews[:len(y_val)]
long_google_reviews.head()

In [None]:
def create_lsa_summary(review):
    google_lsa_parser=PlaintextParser.from_string(review,Tokenizer('english'))
    return lsa_summarizer(google_lsa_parser.document,1)

In [None]:
long_google_reviews['LSA Summary'] = long_google_reviews['Text'].apply(lambda x: create_lsa_summary(x))


In [None]:
long_google_reviews['LSA Summary'] = long_google_reviews['LSA Summary'].apply(lambda x: find_empty_summary(x))

In [None]:
google_lsa = long_google_reviews['LSA Summary'].values.tolist()

In [None]:
print(len(google_lsa))

In [None]:
google_lsa_processed = [dc.preprocess(review) for review in google_lsa]

In [None]:
google_dict = gensim.corpora.Dictionary(google_lsa_processed)

In [None]:
google_dict.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [None]:
google_bow_corpus = [google_dict.doc2bow(doc) for doc in google_lsa_processed]

In [None]:
for index, score in sorted(lda_model[google_bow_corpus[10]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

In [None]:
for index, score in sorted(lda_model_tfidf[google_bow_corpus[10]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

In [None]:
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
vis_train = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=lsa_corpus, dictionary=topic_dict)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis_train)

In [None]:
vis_test = pyLDAvis.gensim.prepare(topic_model=lda_model_tfidf, corpus=google_bow_corpus, dictionary=google_dict)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis_test)

1. Do a bag of words model or something else
2. Do this for each summary and normal text
3. Generate a topic distribution with LDA on both generated summary and normal text
4. Evaluate with Kullback-Leibler and Jensen-Shannon divergence or cosine similarity

5. create ground truth from BERT model 
6. compare this to the LSA and Lex Rank summaries with cosine similarity.

## Evaluate Lex and LSA summaries for the google reviews

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
count_vect = CountVectorizer(stop_words="english")
not_summarised = long_google_reviews.Text.values.tolist()
cosine_scores = []
for review_a, review_b in zip(google_lsa, not_summarised):

    lsa_goole_vec = vectorizer.fit_transform([review_a])
    normal_google_vec = vectorizer.transform([review_b])
    cosine_sim = cosine_similarity(lsa_goole_vec, normal_google_vec)[0][0]
    cosine_scores.append(cosine_sim)

In [None]:
print(cosine_scores)

In [None]:
length_differences = []
for i in range(len(not_summarised)):
    length_difference = len(not_summarised[i]) - len(google_lsa[i])
    length_differences.append(length_difference)

In [None]:
import matplotlib.pyplot as plt

# Create a scatter plot
plt.scatter(cosine_scores, length_differences)
fig = plt.figure(num='Summarisation stats')

# Add labels and title
plt.xlabel('Cosine Similarity')
plt.ylabel('Difference in Length')
plt.title('Relationship between Cosine Similarity and Difference in Length')


In [None]:
plt.show()