In [1]:
# %pip install langchain

Collecting langchain
  Downloading langchain-0.0.310-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langsmith<0.1.0,>=0.0.40 (from langchain)
  Downloading langsmith-0.0.43-py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jsonpatch, langsmith, langchain
  Attempting uninstall: jsonpatch
    Found existing installation: jsonpatch 1.32
    Uninstalling jsonpatch-1.32:
      Successfully uninstalled jsonpatch-1.32
Successfully installed jsonpatch-1.33 langchain-0.0.310 langsmith-0.0.43


In [2]:
from langchain.document_loaders import GutenbergLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from typing import List

MODEL = "pszemraj/led-large-book-summary"




In [3]:
def get_summary(book_id: str):
    print("GPU: ", torch.cuda.is_available())
    url = get_url(book_id)
    docs = get_chunks(url)
    summary = generate_summary(docs)
    return {"summary": summary}

def get_url(book_id):
    return f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt"

In [4]:
# fetch ebook and split into chunks (docs)
def get_chunks(url):
    loader = GutenbergLoader(url)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=300,
        separators=["\r\n\n\n\r\n\n\n", "\r\n\n\n", "."],
    )

    text = loader.load()[0].page_content

    # remove PROJECT GUTENBERG header and footer sections
    start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK"
    end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"
    start_index = text.find(start_marker)
    start_end_index = text.find("***", start_index + len(start_marker))
    end_index = text.find(end_marker)
    text = text[start_end_index + 3 : end_index]

    # splitting
    docs = text_splitter.create_documents([text])
    for i in range(len(docs)):
        docs[i].page_content = docs[i].page_content.replace("\r\n\n\n", " ")
    print("created %d chunks." % len(docs))
    return docs

In [5]:
def generate_summary(docs):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        MODEL,
        # load_in_8bit=True,
        # low_cpu_mem_usage=True,
    ).to(device)
    summarizer = pipeline(
        task="summarization",
        model=model,
        tokenizer=tokenizer,
        pad_token_id=tokenizer.eos_token_id,
        # temperature = 0.2,
        device=device,
    )
    n_tokens = len(tokenizer.encode(docs[0].page_content) * len(docs))
    max_chunk_sum_length = 15000 // len(docs)
    chunk_sums = []
    print("Starting chunk summarization...")
    # chunk_summaries = {}
    initial_summary = ""
    for i in range(len(docs)):
        if len(tokenizer.encode(docs[i].page_content)) < max_chunk_sum_length:
            initial_summary += docs[i].page_content + "\n"
            chunk_sums.append(docs[i].page_content)
            continue
        chunk_summary = summarizer(
            docs[i].page_content,
            min_length=100,
            max_length=max_chunk_sum_length,
#             no_repeat_ngram_size=3,
#             encoder_no_repeat_ngram_size=3,
#             repetition_penalty=3.5,
#             num_beams=4,
#             early_stopping=True,
        )[0]["summary_text"]
        print("chunk %d summerized." % i)
        # chunk_summaries[i]= chunk_summary
        chunk_sums.append(chunk_summary)
        initial_summary += chunk_summary + "\n"
        # for now just send the first summary
        # return initial_summary
    
    chunks_token_size = len(tokenizer.encode(initial_summary))
    print("Chunk summarization completed. With a tokensize of ", chunks_token_size)
    mid_sum_len = chunks_token_size // 2
    print("Generating mid summary of length, " mid_sum_len)
    mid_summary1 = summarizer(
        initial_summary,
        min_length=100,
        max_length=mid_sum_len,
#         no_repeat_ngram_size=3,
#         encoder_no_repeat_ngram_size=3,
#         repetition_penalty=3.5,
        num_beams=4,
#         early_stopping=True,
    )[0]["summary_text"]
    chunk_sums.append(mid_summary1)
    print("Generating final summary")
    summary = summarizer(
        mid_summary1,
        min_length=100,
        max_length=300,
#         no_repeat_ngram_size=3,
#         encoder_no_repeat_ngram_size=3,
#         repetition_penalty=3.5,
        num_beams=4,
#         early_stopping=True,
    )[0]["summary_text"]
    print("Final summary generated.")
    # summary = summarizer(
    #     docs[1].page_content,
    #     # final_summary,
    #     min_length=200,
    #     max_length=400,
    #     no_repeat_ngram_size=3,
    #     encoder_no_repeat_ngram_size=3,
    #     repetition_penalty=3.5,
    #     num_beams=4,
    #     early_stopping=True,
    # )[0]['summary_text']
    return summary, chunk_sums

In [None]:
def generate_summary_rec(docs):
    

In [10]:
url = get_url(12)

In [11]:
docs = get_chunks(url)

created 19 chunks.


[2945,
 2574,
 2828,
 2609,
 2868,
 2766,
 2948,
 2872,
 2769,
 2723,
 3036,
 2666,
 2783,
 2688,
 2805,
 2871,
 2656,
 2570,
 334]

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [12]:
len(tokenizer.encode(docs[0].page_content))

2945

### Average token length of a chunk

In [32]:
chunk_token_lens = []
for i in range(10,50):
    try:
        temp_docs = get_chunks(get_url(i))
        cur_lens = [len(tokenizer.encode(doc.page_content)) for doc in temp_docs]
        chunk_token_lens.extend(cur_lens)
    catch

created 479 chunks.
created 16 chunks.
created 19 chunks.
created 4 chunks.
created 214 chunks.
created 136 chunks.
created 29 chunks.
created 161 chunks.
created 144 chunks.
created 22 chunks.
created 58 chunks.
created 25 chunks.
created 177 chunks.
created 29 chunks.
created 33 chunks.
created 247 chunks.
created 70 chunks.
created 86 chunks.
created 9 chunks.
created 50 chunks.
created 500 chunks.
created 26 chunks.
created 33 chunks.
created 55 chunks.
created 18 chunks.
created 21 chunks.
created 38 chunks.
created 52 chunks.
created 122 chunks.
created 6 chunks.


HTTPError: HTTP Error 406: Not Acceptable

In [28]:
import numpy as np        

In [29]:
len(chunk_token_lens)

1710

In [30]:
np.mean(chunk_token_lens)

2647.9473684210525

In [8]:
summary, chunk_sums = generate_summary(docs)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

Starting chunk summarization...
chunk 0 summerized.
chunk 1 summerized.
chunk 2 summerized.
chunk 3 summerized.
chunk 4 summerized.
chunk 5 summerized.
chunk 6 summerized.
chunk 7 summerized.
chunk 8 summerized.
chunk 9 summerized.




chunk 10 summerized.
chunk 11 summerized.
chunk 12 summerized.
chunk 13 summerized.
chunk 14 summerized.
chunk 15 summerized.
Chunk summarization completed. With a tokensize of  5016
Generating final summary
Final summary generated.


In [9]:
summary

'In Chapter 1 of Lewis Carroll\'s Multimedia edition of Alice\'s Adventures in Wonderland, presented in three chapters titled "The Millenium Funcrum EDITION 3.0," Alice finds herself at the bottom of a very deep well. After following a White Rabbit down the rabbit hole, she comes across doors all around her but all of them are locked. She comes across a bottle labeled Drink Me, but on closer inspection discovers that it does not contain any poison. Having read stories about children who had drunk from bottles marked "poison," Alice tastes the alcohol and decides that it is not as bad as she had originally thought. She then proceeds to drink more of it. The first chapter is entitled Alice\'s Falling down a rabbit hole. In this chapter, Carroll introduces many of the themes that would later be introduced in the later volumes of the novel. A Caucus-Race and a Long Tale The Dodo proposes a Caucus-race, i.e., a race in which everyone must run around a circular course in order to dry themsel

In [24]:
chunk_sums[11]

'The Duchess finds morals in everything, including the flamingoes and mustard. She tries to teach them to Alice by explaining that everything has a moral and squeezing herself close to Alice\'s side as she speaks. The conversation goes on for a while, until the Duchess digresses into a long speech about how love makes the world go round. This is followed by a Gryphon, whom the Queen sends to take Alice to see the Mock Turtle. In the croquet-ground, the King, the Queen, and Alice are all playing the game of Pick-A-Prayer. At the end of the first half hour, all the arches have been set free, and all the players except Alice, who is under sentence of execution, have been taken into custody by the soldiers. The Queen leaves off quarreling with the other players and asks Alice if she has ever seen the Mock Turtles Soup. Alice says no, and the Queen says that she will show Alice his history. As they walk off together, the king says, "You are all pardoned" , which angers the Queen because she

In [None]:
docs[0]

In [17]:
summarizer = pipeline("summarization")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [19]:
tokenizer = AutoTokenizer.from_pretrained("pszemraj/led-large-book-summary")

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [52]:
n_tokens = len(tokenizer.encode(docs[0].page_content) * len(docs))
chunk_sum_length = 15000 // len(docs)

In [53]:
chunk_sum_length

3750

In [89]:
!pip install textsum

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting textsum
  Downloading textsum-0.2.0-py3-none-any.whl (29 kB)
Collecting clean-text (from textsum)
  Downloading clean_text-0.6.0-py3-none-any.whl (11 kB)
Collecting fire (from textsum)
  Downloading fire-0.5.0.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.3/88.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting natsort (from textsum)
  Downloading natsort-8.4.0-py3-none-any.whl (38 kB)
Collecting emoji<2.0.0,>=1.0.0 (from clean-text->textsum)
  Downloading emoji-1.7.0.tar.gz (175 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:0

In [90]:
from textsum.summarize import Summarizer

model_name = "pszemraj/led-large-book-summary"
summarizer = Summarizer(
    model_name_or_path=model_name,  # you can use any Seq2Seq model on the Hub
    token_batch_length=15000,  # tokens to batch summarize at a time, up to 16384
)
long_string = "\n".join([doc.page_content for doc in docs])
out_str = summarizer.summarize_string(long_string)
print(f"summary: {out_str}")

Generating Summaries:   0%|          | 0/3 [00:00<?, ?it/s]

summary: Alice is sitting on the bank of a river with her older sister and is bored. Suddenly, a White Rabbit runs by her and stops to take out a watch from its pocket. Having never seen one, Alice follows it and falls down a rabbit hole. After falling for a while, she finds herself in a hallway lined with doors. The doors are all locked, so Alice tries and fails to open them. On the second try, she does manage to open one door, which leads to a garden. In the garden, she meets lots of animals, including a dog, a puppy, a mouse, and a bunch of pebbles. She decides to try and figure out how to get rid of some of them, but then realizes that they are all too big. Finally, she comes across a pile of little cakes, which she eats one of. She shrinks down to the size she used to be, and then goes on to explore the rest of the garden. All of the cakes turn out to be pieces of the dead end of a long rope ladder that has been strung across the room. Alice finally gets back to the top-down part 