# Import

In [3]:
# import internal modules
import re
import requests
import time

# import 3rd-party modules
from bs4 import BeautifulSoup as bs
import pandas as pd

# import local modules

# Load book

In [4]:
# Source of the book: the Project Gutenberg HTML edition.
# (Plain-text alternative: "https://www.gutenberg.org/cache/epub/103/pg103.txt")
book_html_link = "https://www.gutenberg.org/files/103/103-h/103-h.htm"

# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection.
response = requests.get(book_html_link, timeout=30)
print(book_html_link, response.status_code)
# Stop here on a 4xx/5xx answer instead of parsing an error page as the book.
response.raise_for_status()

# Keep only the text of the <h4> and <p> elements and join the pieces with
# single spaces into one string.
soup = bs(response.text, 'html.parser')
results = soup.find_all(['h4', 'p'])
text = [result.text for result in results]
book = ' '.join(text)

# Show the first 1000 characters as the cell output.
book[:1000]

https://www.gutenberg.org/files/103/103-h/103-h.htm 200


'\r\nIN WHICH PHILEAS FOGG AND PASSEPARTOUT ACCEPT EACH OTHER, \r\nTHE ONE AS\r\nMASTER, THE OTHER AS MAN\r\n \r\nMr. Phileas Fogg lived, in 1872, at No. 7, Saville Row, Burlington\r\nGardens, the house in which Sheridan died in 1814.  He was one of the\r\nmost noticeable members of the Reform Club, though he seemed always to\r\navoid attracting attention; an enigmatical personage, about whom little\r\nwas known, except that he was a polished man of the world.  People said\r\nthat he resembled Byron—at least that his head was Byronic; but he was\r\na bearded, tranquil Byron, who might live on a thousand years without\r\ngrowing old.\r\n \r\nCertainly an Englishman, it was more doubtful whether Phileas Fogg was\r\na Londoner.  He was never seen on \'Change, nor at the Bank, nor in the\r\ncounting-rooms of the "City"; no ships ever came into London docks of\r\nwhich he was the owner; he had no public employment; he had never been\r\nentered at any of the Inns of Court, either at the Temp

In [4]:
# Print (rather than repr) the first 1000 characters so that line breaks in
# the scraped text are rendered instead of shown as escape sequences.
print(book[0:1000])


IN WHICH PHILEAS FOGG AND PASSEPARTOUT ACCEPT EACH OTHER, 
THE ONE AS
MASTER, THE OTHER AS MAN
 
Mr. Phileas Fogg lived, in 1872, at No. 7, Saville Row, Burlington
Gardens, the house in which Sheridan died in 1814.  He was one of the
most noticeable members of the Reform Club, though he seemed always to
avoid attracting attention; an enigmatical personage, about whom little
was known, except that he was a polished man of the world.  People said
that he resembled Byron—at least that his head was Byronic; but he was
a bearded, tranquil Byron, who might live on a thousand years without
growing old.
 
Certainly an Englishman, it was more doubtful whether Phileas Fogg was
a Londoner.  He was never seen on 'Change, nor at the Bank, nor in the
counting-rooms of the "City"; no ships ever came into London docks of
which he was the owner; he had no public employment; he had never been
entered at any of the Inns of Court, either at the Temple, or Lincoln's
Inn, or Gray's Inn; 


# Prepare book for summarization

In [9]:
# Find the index position where the metadata from the website ends: the last
# "***" marker within the first 1000 characters of the scraped text.
metadata_marker = "***"
metadata_end_idx = book.rfind(metadata_marker, 0, 1000)

if metadata_end_idx == -1:
    # str.rfind returns -1 when the marker is absent. Slicing from
    # -1 + 3 == 2 would silently drop the first two characters of the book,
    # so keep the text unchanged in that case.
    book_text = book
else:
    # Skip everything up to and including the marker itself.
    book_text = book[metadata_end_idx + len(metadata_marker):]

print(book_text[:512])















AROUND THE WORLD IN EIGHTY DAYS



CONTENTS


CHAPTER

      I  IN WHICH PHILEAS FOGG AND PASSEPARTOUT ACCEPT EACH OTHER, THE
         ONE AS MASTER, THE OTHER AS MAN

     II  IN WHICH PASSEPARTOUT IS CONVINCED THAT HE HAS AT LAST FOUND
         HIS IDEAL

    III  IN WHICH A CONVERSATION TAKES PLACE WHICH SEEMS LIKELY TO COST
         PHILEAS FOGG DEAR

     IV  IN WHICH PHILEAS FOGG ASTOUNDS PASSEPARTOUT, HIS SERVANT

      V  IN WHICH A NEW SPECIES OF FUND


# Summarize book
## With BART transformers

In [10]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

# NOTE: `summary` is the extractive summary assigned in the gensim cell further
# down in this notebook (`summary = summarize(book)`). That cell must be run
# before this one, otherwise the tokenizer call below raises NameError.

model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# tokenize without truncation
inputs_no_trunc = tokenizer(summary, max_length=None, return_tensors='pt', truncation=False)

# Split the token ids into consecutive chunks of at most model_max_length
# tokens, each with a leading batch dimension of 1.
# Iterating with range() stops before len(input_ids); the previous
# `while chunk_start <= len(...)` loop appended a trailing EMPTY chunk whenever
# the token count was an exact multiple of the chunk size.
input_ids = inputs_no_trunc['input_ids'][0]
chunk_size = tokenizer.model_max_length  # == 1024 for Bart
inputs_batch_lst = [
    torch.unsqueeze(input_ids[chunk_start:chunk_start + chunk_size], 0)
    for chunk_start in range(0, len(input_ids), chunk_size)
]

# generate a summary on each batch
summary_ids_lst = [model.generate(inputs, num_beams=4, max_length=50, early_stopping=True) for inputs in inputs_batch_lst]

# Decode the first (only) generated sequence of every chunk and join the
# results into one string with one line per chunk summary.
summary_batch_lst = [
    tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    for summary_ids in summary_ids_lst
]
summary_all = '\n'.join(summary_batch_lst)

print(summary_all)

Token indices sequence length is longer than the specified maximum sequence length for this model (24793 > 1024). Running this sequence through the model will result in indexing errors
Phileas Fogg lived, in 1872, at No. 7, Saville Row, Burlington. He was not known to have either wife or children, which may have been fixed, in the same room, at the same
"I'd like to see you do it in eighty days," said Phileas Fogg. "Deal over again, then," said Stuart Sydenham. "I have a strange way of proving that the unforeseen does not exist
Phileas Fogg, the bank robber, was sent to find a steamer to take him to the New World. The journey would take ten years, and would take him from Suez to Aden, at the other end of
Passepartout, who lived a solitary existence in London, was said to be rich, though no wealthier than Phileas Fogg. He was on a pretended tour of the world, pretending to make the tour in eighty days
Phileas Fogg was travelling around the world in eighty days. His whist partner Passepa

In [13]:
# Persist the joined chunk summaries. The encoding is given explicitly so the
# non-ASCII characters of the book text are written the same way on every
# platform instead of depending on the locale's default encoding.
with open("summary_of_summary_around_the_world.txt", "w", encoding="utf-8") as file:
    file.write(summary_all)

In [1]:
from gensim.summarization import summarize

In [6]:
    # With gensim
    # NOTE: `summarize` is imported from `gensim.summarization`, a module that
    # was removed in gensim 4.0, so this cell needs gensim < 4.

    # start timer (perf_counter is a monotonic clock meant for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted)
    start_time = time.perf_counter()

    # pass text corpus to summarizer
    summary = summarize(book)

    # stop timer and compute the execution time
    end_time = time.perf_counter()
    diff_time = (end_time - start_time)
    print(f"\nTime to summarize: {diff_time:.2f} seconds")

    # export to txt file; explicit encoding so the non-ASCII characters of the
    # book text do not depend on the platform's default encoding
    with open("extractive_summary_around_the_world.txt", "w", encoding="utf-8") as file:
        file.write(summary)


Time to summarize: 34.14 seconds
