In [15]:
import requests
from bs4 import BeautifulSoup
import difflib

# Function to search for a book by name and return the best match URL
def search_book_by_name(book_name):
    base_url = "https://www.gutenberg.org/"
    search_url = base_url + "ebooks/search/?query=" + book_name.replace(" ", "+") + "&submit_search=Go%21"

    response = requests.get(search_url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the best match link based on similarity ratio
    best_match_ratio = 0
    best_match_url = ""

    for link in soup.find_all("li", class_="booklink"):
        link_title = link.find("span", class_="title").get_text()
        similarity_ratio = difflib.SequenceMatcher(None, book_name.lower(), link_title.lower()).ratio()
        if similarity_ratio > best_match_ratio:
            best_match_ratio = similarity_ratio
            best_match_url = base_url + link.find("a").get("href")

    return best_match_url

# Function to get the "Plain Text UTF-8" download link from the book page
def get_plain_text_link(book_url):
    response = requests.get(book_url)
    soup = BeautifulSoup(response.content, "html.parser")

    plain_text_link = ""

    for row in soup.find_all("tr"):
        format_cell = row.find("td", class_="unpadded icon_save")
        if format_cell and "Plain Text UTF-8" in format_cell.get_text():
            plain_text_link = format_cell.find("a").get("href")
            break

    return plain_text_link


# Function to get the content of the "Plain Text UTF-8" link
def get_plain_text_content(plain_text_link):
    response = requests.get(plain_text_link)
    content = response.text
    return content

book_name = book_name =input("Book name")
best_match_url = search_book_by_name(book_name)


if best_match_url:
    plain_text_link = get_plain_text_link(best_match_url)
    if plain_text_link:
        full_plain_text_link = "https://www.gutenberg.org" + plain_text_link
        FileContent = get_plain_text_content(full_plain_text_link)
        # unstring "plain_text_content" to print the whole book
        print("Plain Text UTF-8 content:", FileContent)
    else:
        print("No Plain Text UTF-8 link found.")
else:
    print("No matching book found.")



# Tested book names
#     -The changed brides
#     -The bride's fate

Plain Text UTF-8 content: ﻿The Project Gutenberg eBook of Jane Eyre: An Autobiography
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: Jane Eyre: An Autobiography


Author: Charlotte Brontë

Illustrator: F. H. Townsend

Release date: March 1, 1998 [eBook #1260]
                Most recently updated: May 2, 2023

Language: English

Credits: David Price


*** START OF THE PROJECT GUTENBERG EBOOK JANE EYRE: AN AUTOBIOGRAPHY ***



JANE EYRE
AN AUTOBIOGRAPHY

by Charlotte Brontë

_ILLUSTRATED BY F. H. TOWNSEND_

London
SERVICE & PATON
5 HENRIETTA STREET

In [16]:
text_split =  FileContent.split("*** START OF THE PROJECT GUTENBERG EBOOK")[1]
if len(text_split)==2:
  FileContent = text_split[1]
FileContent = FileContent.split("*** END OF THE PROJECT GUTENBERG EBOOK")[0]

In [17]:
# extract the sentences from the document
import nltk
nltk.download('punkt')
sentences = nltk.tokenize.sent_tokenize(FileContent)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
pip install transformers



In [19]:
from transformers import AutoTokenizer

# Define the model and tokenizer names
model_name = "pszemraj/led-large-book-summary"
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [20]:
from transformers import pipeline
import torch

summarizer = pipeline("summarization", model=model_name, device=0 if torch.cuda.is_available() else -1)

In [21]:
# initialize

chunks = []
def chunkText(sentence):
  length = 0
  chunk = ""
  count = -1
  for sentence in sentences:
    count += 1
    combined_length = len(tokenizer.tokenize(sentence)) + length # add the no. of sentence tokens to the length counter

    if combined_length  <= tokenizer.max_len_single_sentence: # if it doesn't exceed
      chunk += sentence + " " # add the sentence to the chunk
      length = combined_length # update the length counter

      # if it is the last sentence
      if count == len(sentences) - 1:
        chunks.append(chunk.strip()) # save the chunk

    else:
      chunks.append(chunk.strip()) # save the chunk

      # reset
      length = 0
      chunk = ""

      # take care of the overflow sentence
      chunk += sentence + " "
      length = len(tokenizer.tokenize(sentence))


In [22]:
chunkText(sentences)

In [23]:
len(tokenizer.tokenize(FileContent))

Token indices sequence length is longer than the specified maximum sequence length for this model (303151 > 16384). Running this sequence through the model will result in indexing errors


303151

In [24]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

In [25]:
#Prediction
def give_summaries(chunks):
  summary_pipe = ""

  print("\nModel Summary:")


  for input in chunks:
    summary = summarizer(input, max_length=200, min_length=30,
                        no_repeat_ngram_size=3,
                        encoder_no_repeat_ngram_size=3,
                        repetition_penalty=3.5,
                        num_beams=4,
                        early_stopping=False,
                        do_sample=False)[0].get('summary_text')
    summary_pipe+= summary


  return summary_pipe


In [26]:
summary_pipe = give_summaries(chunks)


Model Summary:




In [29]:
#If generated summary also too long cannot handle by the model at once this loop will
while len(tokenizer.tokenize(summary_pipe))> tokenizer.max_len_single_sentence:   # IF generated summary size > maximum token length model can handle
  chunks = []
  chunkText(nltk.tokenize.sent_tokenize(summary_pipe))   #split summary into chunks
  summary_pipe = give_summaries(chunks)    # generate summary of of summary

#To generate summaries betwwen word limit(250 - 300)
while True:
  if(len(summary_pipe.split())>300):
      summary_pipe = summarizer(summary_pipe, max_length=300, min_length=270,
                       no_repeat_ngram_size=3,
                       encoder_no_repeat_ngram_size=3,
                       repetition_penalty=3.5,
                       num_beams=4,
                       do_sample=False)[0].get('summary_text')
  else:
    break





In [30]:
# Post-process the generated summary to remove if the last sentence is not complete
sentences = nltk.tokenize.sent_tokenize(summary_pipe)
if not sentences[-1].endswith("."):
  sentence_to_remove = sentences[-1]
  # Remove the specific sentence from the paragraph
  summary = summary_pipe.replace(sentence_to_remove, '')

print(summary)


The novel is available as an ebook from project Gutenberg, and the narrator explains why it is better than previous editions. The chapter title comes from the text of the book itself. This edition contains a lot more content than previous versions. It's also totally free to read anywhere. Jane graduates from Gateshead on January 15th. She attends Lowood school for girls who are orphans. There she meets Helen in the school and they talk about religion and eventually get married. They have a few months of happiness together before Jane leaves for Lowood with her new friends. Rochester tries to make Jane feel guilty about the way she treated Helen when she was sick, so he sends her back to the orphanage to be with her family. At Lowood there are lots of lonely nights where Jane feels completely stranded, but at least she gets to draw portraits of Mrs. Rochester and write letters to her father telling him how wonderful Thornfield is. Then one day she hears Rochester laughing at her drawing