## Download a Summarizer from HuggingFace

In [1]:
import numpy as np
!pip install transformers --quiet

from transformers import pipeline
model_name = "sshleifer/distilbart-cnn-12-6"
summarizer = pipeline("summarization", model=model_name)
summarizer_name = summarizer.model.config.__getattribute__('_name_or_path')
print(summarizer_name)
import warnings
warnings.filterwarnings('ignore')

[K     |████████████████████████████████| 5.8 MB 4.9 MB/s 
[K     |████████████████████████████████| 7.6 MB 51.9 MB/s 
[K     |████████████████████████████████| 182 kB 62.3 MB/s 
[?25h

Downloading:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

sshleifer/distilbart-cnn-12-6


In [2]:
with open('mexico.txt', 'r') as fp:
  input_data = fp.read()

In [3]:
summ = summarizer(input_data[0:2500], min_length=100, max_length=200)[0]['summary_text']
summ

" Mexico, officially the United Mexican States, is a country in the southern portion of North America . With 126,014,024 inhabitants, it is the 10th-most-populous country and has the most Spanish-speakers . Mexico is organized as a federal republic comprising 31 states and Mexico City, its capital . The pre-Columbian Mexico traces its origins to 8,000 BCE and is identified as one of the world's six cradles of civilization . In 1521, the Spanish Empire and its indigenous allies toppled the Aztec Empire from its capital Tenochtitlan ."

## Import the Daily Mail Dataset

In [4]:
import tensorflow_datasets as tfds
import pandas as pd
from tensorflow_datasets.summarization import cnn_dailymail

In [5]:
cnn_builder = tfds.summarization.cnn_dailymail.CnnDailymail()
cnn_info = cnn_builder.info
cnn_builder.download_and_prepare()
datasets = cnn_builder.as_dataset()
train_dataset, test_dataset = datasets["train"], datasets["test"]
reviews = tfds.as_dataframe(train_dataset.take(10))
reviews = pd.DataFrame(reviews)

Downloading and preparing dataset 558.32 MiB (download: 558.32 MiB, generated: 1.28 GiB, total: 1.82 GiB) to ~/tensorflow_datasets/cnn_dailymail/3.3.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/287113 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/cnn_dailymail/3.3.0.incompleteH1V9XZ/cnn_dailymail-train.tfrecord*...:   0%|  …

Generating validation examples...:   0%|          | 0/13368 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/cnn_dailymail/3.3.0.incompleteH1V9XZ/cnn_dailymail-validation.tfrecord*...:   …

Generating test examples...:   0%|          | 0/11490 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/cnn_dailymail/3.3.0.incompleteH1V9XZ/cnn_dailymail-test.tfrecord*...:   0%|   …

Dataset cnn_dailymail downloaded and prepared to ~/tensorflow_datasets/cnn_dailymail/3.3.0. Subsequent calls will reuse this data.


In [6]:
reviews.head(3)

Unnamed: 0,article,highlights,publisher
0,"b""By. Associated Press. PUBLISHED:. 14:11 EST,...","b'Bishop John Folda, of North Dakota, is takin...",b'dm'
1,b'(CNN) -- Ralph Mata was an internal affairs ...,b'Criminal complaint: Cop used his role to hel...,b'cnn'
2,"b""A drunk driver who killed a young woman in a...","b""Craig Eccleston-Todd, 27, had drunk at least...",b'dm'


## Generate our own Summaries

In [8]:
reviews.article = reviews.article.astype('string')

In [9]:
reviews.dtypes

article       string
highlights    object
publisher     object
dtype: object

In [11]:
reviews['summary_text'] = reviews['article'].apply(lambda x : summarizer(x[0:1500], min_length =100, max_length=200)[0]['summary_text'])

In [12]:
reviews

Unnamed: 0,article,highlights,publisher,summary_text
0,"b""By. Associated Press. PUBLISHED:. 14:11 EST,...","b'Bishop John Folda, of North Dakota, is takin...",b'dm',Bishop John Folda of the Fargo Catholic Dioce...
1,b'(CNN) -- Ralph Mata was an internal affairs ...,b'Criminal complaint: Cop used his role to hel...,b'cnn',Ralph Mata was an internal affairs lieutenant...
2,"b""A drunk driver who killed a young woman in a...","b""Craig Eccleston-Todd, 27, had drunk at least...",b'dm',"Craig Eccleston-Todd, 27, was driving home fr..."
3,"b""(CNN) -- With a breezy sweep of his pen Pres...","b""Nina dos Santos says Europe must be ready to...",b'cnn',Russia's President Vladimir Putin signed away...
4,"b""Fleetwood are the only team still to have a ...",b'Fleetwood top of League One after 2-0 win at...,b'dm',Fleetwood only team still to have 100% record...
5,"b""He's been accused of making many a fashion f...",b'Prime Minister and his family are enjoying a...,b'dm',Prime Minister was seen in the same pair of b...
6,"b""By. Daily Mail Reporter. PUBLISHED:. 01:15 E...",b'NBA star calls for black and Hispanic commun...,b'dm',Basketball legend says he was 'devastated' wh...
7,b'By. Daily Mail Reporter. This is the moment ...,"b""London Midland service had been pulling into...",b'dm',London Midland service had been pulling into ...
8,"b""There are a number of job descriptions waiti...",b'Tony Pulis believes Saido Berahino should lo...,b'dm',Darren Fletcher signed for West Brom from Man...
9,"b'Canberra, Australia (CNN) -- At first glance...",b'Black box data from Flight 370 could be anal...,b'cnn',Australian Transport Safety Bureau's accident...


## Evaluate our Summaries using ROUGE-2

Here's an example of how to use ROUGE.

In [18]:
!pip install rouge --quiet

from rouge import Rouge

model_out = "he began by starting a five person war cabinet and included chamberlain as lord president of the council"
reference = "he began his premiership by forming a five-man war cabinet which included chamberlain as lord president of the council"

rouge = Rouge()
rouge.get_scores(model_out, reference)

[{'rouge-1': {'r': 0.7368421052631579,
   'p': 0.7777777777777778,
   'f': 0.7567567517604091},
  'rouge-2': {'r': 0.5, 'p': 0.5294117647058824, 'f': 0.514285709289796},
  'rouge-l': {'r': 0.7368421052631579,
   'p': 0.7777777777777778,
   'f': 0.7567567517604091}}]

## Calculate the Average ROUGE-2 score across the first 20 article/summary pairs

In [21]:
r_list = []
for i in range(len(reviews)):
  r = rouge.get_scores(str(reviews['summary_text'][i]), str(reviews['highlights'][i]))[0]['rouge-2']['r']
  r_list.append(r)

In [22]:
r_list

[0.35294117647058826,
 0.3142857142857143,
 0.28125,
 0.020833333333333332,
 0.14754098360655737,
 0.15,
 0.2857142857142857,
 0.56,
 0.14814814814814814,
 0.038461538461538464]