In [1]:
!git clone https://github.com/nlp-with-transformers/notebooks.git
%cd notebooks
from install import *
install_requirements(is_chapter6=True)

Cloning into 'notebooks'...
remote: Enumerating objects: 502, done.[K
remote: Counting objects: 100% (124/124), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 502 (delta 98), reused 93 (delta 86), pack-reused 378[K
Receiving objects: 100% (502/502), 29.34 MiB | 18.17 MiB/s, done.
Resolving deltas: 100% (234/234), done.
/content/notebooks
⏳ Installing base requirements ...
✅ Base requirements installed!
⏳ Installing Git LFS ...
✅ Git LFS installed!


In [34]:
#hide_output
import nltk
from nltk.tokenize import sent_tokenize
import numpy as np

# Fetch the "punkt" data package; presumably required by the `sent_tokenize`
# function imported above (see the NLTK tokenizer docs to confirm).
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [44]:
# Read the article that will be summarised. The encoding is passed explicitly
# so the text is decoded the same way regardless of the machine's locale.
with open('/content/news.txt', encoding='utf-8') as f:
    contents = f.read()
sample_text = contents

In [45]:
# Generated summaries, keyed by model name; each model section below stores its result here.
summaries = {}

## Define the evaluation method

In [None]:
!pip install -U sentence-transformers

In [10]:
from sentence_transformers import SentenceTransformer
# Embedding model used by `evaluate`, which calls model.encode(...) on lists of sentences.
model = SentenceTransformer('all-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [35]:
def cosine_similarity(a, b):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        a: First vector (NumPy array or any array-like of numbers).
        b: Second vector, with the same length as ``a``.

    Returns:
        The dot product of ``a`` and ``b`` divided by the product of their
        Euclidean norms. When either vector has zero norm the ratio is
        undefined, and 0.0 is returned instead of nan.
    """
    # Accept plain lists/tuples as well as NumPy arrays (`**` is not defined on lists).
    a = np.asarray(a)
    b = np.asarray(b)

    numerator = np.dot(a, b)

    a_norm = np.sqrt(np.sum(a**2))
    b_norm = np.sqrt(np.sum(b**2))
    denominator = a_norm * b_norm

    # A zero vector has no direction; avoid 0/0, which would yield nan plus a RuntimeWarning.
    if denominator == 0:
        return 0.0

    return numerator / denominator

In [41]:
def evaluate(paragraph, summary):
  """Score how well `summary` is supported by `paragraph` via sentence embeddings.

  Both texts are split into sentences and embedded with the notebook-level
  `model`. For every summary sentence, the highest cosine similarity to any
  paragraph sentence is taken; the score is the mean of these maxima.

  Args:
    paragraph: The source text the summary was generated from.
    summary: The summary text to evaluate.

  Returns:
    The mean, over summary sentences, of the best cosine similarity found
    among the paragraph sentences.
  """
  # Embed the texts that were actually passed in, not notebook globals, so the
  # function scores whichever summary the caller provides.
  paragraph_embeddings = model.encode(sent_tokenize(paragraph))
  summary_embeddings = model.encode(sent_tokenize(summary))

  best_scores = []
  for summary_embedding in summary_embeddings:
    # The running best starts at 0, so a sentence whose similarities are all
    # negative contributes 0 to the mean.
    best = 0
    for paragraph_embedding in paragraph_embeddings:
      similarity = cosine_similarity(summary_embedding, paragraph_embedding)
      if similarity > best:
        best = similarity
    best_scores.append(best)

  return np.mean(best_scores)

## Pegasus

In [3]:
#hide_output
from transformers import pipeline, set_seed
#hide_output

# Build a summarization pipeline from the google/pegasus-cnn_dailymail checkpoint.
pipe = pipeline("summarization", model="google/pegasus-cnn_dailymail")

Downloading:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

In [46]:
# Run the pipeline on the whole article; the result is indexed below as a
# list whose first item carries the generated text under "summary_text".
pipe_out = pipe(sample_text)

# Turn each " .<n>" marker in the generated text into ".\n" so that the
# sentences end up on separate lines.
summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .<n>", ".\n")

# Show the stored summary as the cell output.
summaries["pegasus"]

'The Chrysler Building has been sold for $150 million, a source says.\nAbu Dhabi investment fund Mubadala bought the building for $800 million in 2008.\nThe building is famous for its triangle-shaped, vaulted windows worked into the stylized crown .'

In [42]:
# Pass the source article and the Pegasus summary to the scoring function.
evaluate( sample_text, summaries["pegasus"])

0.822669247786204

## BART

In [49]:
#hide_output
from transformers import pipeline, set_seed
#hide_output

# Rebind `pipe` to a summarization pipeline built from the facebook/bart-large-cnn checkpoint.
pipe = pipeline("summarization", model="facebook/bart-large-cnn")

Downloading:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [50]:
# Run the pipeline on the whole article; the result is indexed below as a
# list whose first item carries the generated text under "summary_text".
pipe_out = pipe(sample_text)

# Same " .<n>" -> ".\n" clean-up as in the Pegasus section.
# NOTE(review): the BART output recorded below shows no "<n>" markers, so this
# replace may be a no-op for this model — confirm.
summaries["bart"] = pipe_out[0]["summary_text"].replace(" .<n>", ".\n")

# Show the stored summary as the cell output.
summaries["bart"]

'Mubadala, an Abu Dhabi investment fund, purchased 90% of the building for $800 million in 2008. The incentive to sell the building at such a huge loss was due to the soaring rent the owners pay to Cooper Union, a New York college, for the land under the building.'

In [51]:
# Pass the source article and the BART summary to the scoring function.
evaluate( sample_text, summaries["bart"])

0.8108186920483907