In [None]:
# Install dependencies with the %pip magic throughout so every package lands in
# the environment of the running kernel (a `!pip` subshell may resolve to a
# different Python installation).
%pip install transformers --quiet
%pip install langchain
%pip install -U sentence-transformers
%pip install gradio

# Import libraries

In [2]:
import urllib
import warnings
from pathlib import Path as p
import pandas as pd
import requests
from bs4 import BeautifulSoup
import difflib
import re
from langchain import HuggingFaceHub
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.summarize import load_summarize_chain
import numpy as np
from sklearn.cluster import KMeans
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import warnings
from warnings import simplefilter


# Globally suppress Python warnings from this point on (applies to every category).
warnings.filterwarnings("ignore")

# Import the model

In [None]:
# Tokenizer of the long-T5 book-summary checkpoint; it is also handed to the
# summarization pipeline below.
tokenizer = AutoTokenizer.from_pretrained(
    "pszemraj/long-t5-tglobal-base-16384-book-summary"
)

# Summarization pipeline shared by the rest of the notebook. max_length=300 is
# the default passed here; individual calls may pass their own value.
map_chain = pipeline(
    task="summarization",
    tokenizer=tokenizer,
    model="pszemraj/long-t5-tglobal-base-16384-book-summary",
    max_length=300,
)


In [4]:
def getbook(bookname):
    """Download the plain-text edition of a Project Gutenberg book by title.

    Searches gutenberg.org for ``bookname``, picks the search result whose
    title is most similar to it (``difflib`` ratio), and downloads the file
    behind that book's "Plain Text UTF-8" link.

    Args:
        bookname: Title (or partial title) of the book to look up.

    Returns:
        The full text of the best-matching book as a string.

    Raises:
        ValueError: If ``bookname`` is empty, no search result resembles it,
            or the matched book page offers no "Plain Text UTF-8" download.
        requests.RequestException: If a request fails, times out, or returns
            an HTTP error status.
    """
    # Function-scope import: `import urllib` alone does not guarantee that the
    # `urllib.parse` submodule is loaded.
    from urllib.parse import quote_plus, urljoin

    base_url = "https://www.gutenberg.org/"
    timeout_seconds = 30

    def fetch(url):
        # Fail fast on network stalls and HTTP errors instead of parsing an error page.
        response = requests.get(url, timeout=timeout_seconds)
        response.raise_for_status()
        return response

    # Function to search for a book by name and return the best match URL
    def search_book_by_name(book_name):
        # quote_plus encodes spaces as "+" and escapes characters such as "&" or "#"
        # that would otherwise corrupt the query string.
        search_url = urljoin(
            base_url,
            "ebooks/search/?query=" + quote_plus(book_name) + "&submit_search=Go%21",
        )
        soup = BeautifulSoup(fetch(search_url).content, "html.parser")

        # Find the best match link based on similarity ratio
        best_match_ratio = 0
        best_match_url = ""
        wanted_title = book_name.lower()

        for link in soup.find_all("li", class_="booklink"):
            title_tag = link.find("span", class_="title")
            anchor = link.find("a")
            # Skip malformed entries rather than crashing on a missing tag.
            if title_tag is None or anchor is None or not anchor.get("href"):
                continue
            similarity_ratio = difflib.SequenceMatcher(
                None, wanted_title, title_tag.get_text().lower()
            ).ratio()
            if similarity_ratio > best_match_ratio:
                best_match_ratio = similarity_ratio
                # urljoin copes with relative and absolute hrefs and avoids "//".
                best_match_url = urljoin(base_url, anchor.get("href"))

        return best_match_url

    # Function to get the "Plain Text UTF-8" download link from the book page
    def get_plain_text_link(book_url):
        soup = BeautifulSoup(fetch(book_url).content, "html.parser")

        for row in soup.find_all("tr"):
            format_cell = row.find("td", class_="unpadded icon_save")
            if format_cell and "Plain Text UTF-8" in format_cell.get_text():
                anchor = format_cell.find("a")
                if anchor is not None and anchor.get("href"):
                    return anchor.get("href")

        return ""

    # Function to get the content of the "Plain Text UTF-8" link
    def get_plain_text_content(plain_text_link):
        response = fetch(plain_text_link)
        # The link is labelled UTF-8; when the server declares no charset, decode
        # as UTF-8 instead of relying on the HTTP client's fallback encoding.
        if "charset" not in response.headers.get("Content-Type", "").lower():
            response.encoding = "utf-8"
        return response.text

    book_name = (bookname or "").strip()
    if not book_name:
        raise ValueError("Please provide the name of the book to search for.")

    best_match_url = search_book_by_name(book_name)
    if not best_match_url:
        raise ValueError("No matching book found.")

    plain_text_link = get_plain_text_link(best_match_url)
    if not plain_text_link:
        raise ValueError("No Plain Text UTF-8 link found.")

    full_plain_text_link = urljoin(base_url, plain_text_link)
    return get_plain_text_content(full_plain_text_link)


# Tested book names
#     -The changed brides
#     -The bride's fate
#     -Jane Eyre: An Autobiography by Charlotte Brontë

In [5]:
def cleanText(book_text, num_clusters=10):
    """Clean a book's text and pick representative chunks via K-means.

    The text is normalized, split into ~1000-character chunks, embedded with
    the "all-MiniLM-L12-v2" sentence-transformer, and clustered. For every
    cluster the chunk whose embedding lies closest to the cluster center is
    selected.

    Args:
        book_text: Raw text of the book.
        num_clusters: Maximum number of clusters (and therefore of returned
            chunks). Lowered automatically when the book yields fewer chunks.

    Returns:
        A list of the selected chunk documents in their original reading
        order; an empty list when ``book_text`` contains no usable text.

    Raises:
        ValueError: If ``num_clusters`` is smaller than 1.
    """
    if num_clusters < 1:
        raise ValueError("num_clusters must be at least 1.")
    if not book_text:
        return []

    # Drop literal "\uXXXX" escape sequences left in the text.
    cleaned_text = re.sub(r'\\u[a-fA-F0-9]+', '', book_text)
    # Collapse line breaks and other whitespace runs into single spaces; deleting
    # the newlines outright would glue the last word of a line to the first
    # word of the next one.
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    if not cleaned_text:
        return []

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    chunks = text_splitter.create_documents([cleaned_text])
    if not chunks:
        return []

    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L12-v2")
    vectors = np.array(embeddings.embed_documents([chunk.page_content for chunk in chunks]))

    # K-means cannot build more clusters than there are samples.
    cluster_count = min(num_clusters, len(chunks))

    # Install the FutureWarning filter before the fit so it covers the clustering call.
    simplefilter(action='ignore', category=FutureWarning)
    kmeans = KMeans(n_clusters=cluster_count, random_state=42).fit(vectors)

    # For each cluster center keep the index of the nearest chunk embedding.
    # A set removes duplicates should two centers share the same nearest chunk.
    closest_indices = set()
    for center in kmeans.cluster_centers_:
        distances = np.linalg.norm(vectors - center, axis=1)
        closest_indices.add(int(np.argmin(distances)))

    # Sorting restores the order in which the chunks appear in the book.
    return [chunks[index] for index in sorted(closest_indices)]

def genSummary(docs):
    """Summarize documents one by one, then summarize the combined result.

    Each document's ``page_content`` is summarized with ``map_chain``; the
    per-document summaries are joined with newlines and summarized once more
    to produce the final text, which is printed and returned.

    Args:
        docs: Sequence of documents exposing a ``page_content`` string.

    Returns:
        The final summary text, or an empty string when ``docs`` is empty.
    """
    if not docs:
        return ""

    chunk_summaries = []
    # Loop through the selected docs and summarize each chunk on its own
    for doc in docs:
        chunk_result = map_chain(
            doc.page_content,
            min_length=100,
            max_length=200,
            no_repeat_ngram_size=3,
            encoder_no_repeat_ngram_size=3,
            repetition_penalty=3.5,
            num_beams=4,
            early_stopping=True,
        )
        # The pipeline returns a list with one dict per input text.
        chunk_summaries.append(chunk_result[0]["summary_text"])

    combined_summaries = "\n".join(chunk_summaries)

    # Second pass: condense the per-chunk summaries into a single summary.
    output = map_chain(
        combined_summaries,
        min_length=100,
        max_length=200,
    )
    final_summary = output[0]["summary_text"]

    print(final_summary)
    return final_summary

In [None]:
# End-to-end run on one of the tested titles: download, pick representative
# chunks, summarize. The result is stored under a dedicated name so the
# builtin `sum` stays usable in later cells.
booktxt = getbook("The changed brides")
selec_Docs = cleanText(booktxt)
demo_summary = genSummary(selec_Docs)

In [7]:
# !pip install gradio

In [8]:
import gradio as gr

In [9]:
# Base title for the Gradio UI; the summary interface appends " - Summary" to it.
title = "GutenbergChat Summarization"

# Summary
def get_summary(book_name):
    """Gradio callback: fetch a book by name and return its summary text.

    Args:
        book_name: Title typed by the user.

    Returns:
        A prompt asking for a title when ``book_name`` is empty or blank, a
        notice when no summary text was produced, otherwise the summary
        prefixed with "Book Summary: ".
    """
    if not book_name or not book_name.strip():
        return "Please enter the name of the book."

    # load book
    booktxt = getbook(book_name.strip())
    selected_docs = cleanText(booktxt)
    book_summary = genSummary(selected_docs)

    # Avoid rendering "None" or an empty summary in the UI.
    if not book_summary:
        return "No summary could be generated for this book."

    return f"Book Summary: {book_summary}\n"

# Wrap get_summary in a text-in/text-out interface, expose it as the only tab
# of a tabbed app, and start serving it.
summaryBot = gr.Interface(
    fn=get_summary,
    inputs="text",
    outputs="text",
    title=f"{title} - Summary",
)
demo2 = gr.TabbedInterface(
    [summaryBot],
    ["Summary"],
)
demo2.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://76e57fc59a83bca90a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


