# Document Summarization with Huggingface LLMs

- https://python.langchain.com/docs/integrations/llms/huggingface_pipelines
- https://huggingface.co/models?pipeline_tag=summarization

**Models:**
- https://huggingface.co/facebook/bart-large-cnn
- https://huggingface.co/google/pegasus-cnn_dailymail
- https://huggingface.co/Falconsai/text_summarization

In [1]:
import os
import pandas as pd
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.document_loaders import UnstructuredHTMLLoader, UnstructuredPDFLoader
from langchain.chains.summarize import load_summarize_chain
from functools import partial
from langchain_core.prompts import format_document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import collapse_docs, split_list_of_docs
from langchain.chat_models import ChatOpenAI
from langchain.document_transformers.openai_functions import create_metadata_tagger
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.schema import Document, StrOutputParser
from langchain_core.prompts import format_document
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

In [2]:
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

In [3]:
documents = []
for file in os.listdir('data/test_documents'):
    if file.endswith('.html'):
        doc_path = './data/test_documents/' + file
        print(f'Loading {doc_path}')
        loader = UnstructuredHTMLLoader(doc_path)
        documents.extend(loader.load())
    #elif file.endswith('.pdf'):
    #    pdf_path = './data/test_documents/' + file
    #    print(f'Loading {pdf_path}')
    #    loader = UnstructuredPDFLoader(pdf_path)
    #    documents.extend(loader.load())

Loading ./data/test_documents/Master Thesis Information.html
Loading ./data/test_documents/General Questions MMDS.html
Loading ./data/test_documents/Learning Agreements MMDS.html
Loading ./data/test_documents/Recognition of Coursework and Examinations Master Business Informatics.html
Loading ./data/test_documents/General Questions Master Business Informatics.html
Loading ./data/test_documents/MMDS info start page.html
Loading ./data/test_documents/Recognition of Coursework and Examinations MMDS.html
Loading ./data/test_documents/Master Business Informatics info start page.html
Loading ./data/test_documents/Extension of Deadlines MMDS.html
Loading ./data/test_documents/Learning Agreements Master Business Informatics.html
Loading ./data/test_documents/Extension of Deadlines Master Business Infromatics.html


In [4]:
bart = HuggingFacePipeline.from_model_id(
    model_id="facebook/bart-large-cnn",
    task="summarization",
    device_map="auto",
)

#pegasus = HuggingFacePipeline.from_model_id(
#    model_id="google/pegasus-cnn_dailymail",
#    task="summarization",
#    device_map="auto",
#)

t5 = HuggingFacePipeline.from_model_id(
    model_id="Falconsai/text_summarization",
    task="summarization",
    device_map="auto",
)

gpt = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0125")

2024-02-13 11:44:59.726176: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-13 11:44:59.767753: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-13 11:44:59.767792: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-13 11:44:59.767817: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-13 11:44:59.776517: I tensorflow/core/platform/cpu_feature_g

In [6]:
def get_summary(docs, llm, max_tokens):
    # Prompt and method for converting Document -> str.
    document_prompt = PromptTemplate.from_template("{page_content}")
    partial_format_document = partial(format_document, prompt=document_prompt)

    # A text splitter that recursively splits a document into multiple chunks until
    # the maximum chunck size is below a predefined value (without overlap).
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = max_tokens,
        chunk_overlap  = 0,
        length_function = llm.get_num_tokens,
        is_separator_regex = False,
    )

    # The chain we'll apply to each individual document.
    # Returns a summary of the document.
    map_chain = (
        {"context": partial_format_document}
        | PromptTemplate.from_template("Summarize this content:\n\n{context}")
        | llm
        | StrOutputParser()
    )

    # A wrapper chain to keep the original Document metadata
    map_as_doc_chain = (
        RunnableParallel({"doc": RunnablePassthrough(), "content": map_chain})
        | (lambda x: Document(page_content=x["content"], metadata=x["doc"].metadata))
    ).with_config(run_name="Summarize (return doc)")


    # The chain we'll repeatedly apply to collapse subsets of the documents
    # into a consolidate document until the total token size of our
    # documents is below some max size.
    def format_docs(docs):
        return "\n\n".join(partial_format_document(doc) for doc in docs)

    collapse_chain = (
        {"context": format_docs}
        | PromptTemplate.from_template("Collapse this content:\n\n{context}")
        | llm
        | StrOutputParser()
    )

    def get_num_tokens(docs):
        return llm.get_num_tokens(format_docs(docs))

    def collapse(
        docs,
        config,
        token_max=max_tokens,
    ):
        collapse_ct = 1
        while get_num_tokens(docs) > token_max:
            config["run_name"] = f"Collapse {collapse_ct}"
            invoke = partial(collapse_chain.invoke, config=config)
            split_docs = split_list_of_docs(docs, get_num_tokens, token_max)
            docs = [collapse_docs(_docs, invoke) for _docs in split_docs]
            collapse_ct += 1
        return docs

    # The chain we'll use to combine our individual document summaries
    # (or summaries over subset of documents if we had to collapse the map results)
    # into a final summary.

    reduce_chain = (
        {"context": format_docs}
        | PromptTemplate.from_template("Combine these summaries:\n\n{context}")
        | llm
        | StrOutputParser()
    ).with_config(run_name="Reduce")


    # The final full chain for summarizing documents
    map_reduce_summarizer = (text_splitter.split_documents | map_as_doc_chain.map() | collapse | reduce_chain).with_config(
        run_name="Map reduce"
    )

    docs_summarized = []
    for i, doc in enumerate(docs):
        print(f"Processing document {i}/{len(docs)-1}")
        summary = map_reduce_summarizer.invoke([doc])
        summary_doc = Document(page_content=summary, metadata=doc.metadata)
        docs_summarized.append(summary_doc)
    print("done!")

    return docs_summarized

In [7]:
doc_summary_bart = get_summary(documents, llm=bart, max_tokens=1000)

Processing document 0/10


Your max_length is set to 142, but you input_length is only 72. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)
Token indices sequence length is longer than the specified maximum sequence length for this model (2646 > 1024). Running this sequence through the model will result in indexing errors


Processing document 1/10


Your max_length is set to 142, but you input_length is only 72. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)
Your max_length is set to 142, but you input_length is only 57. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your max_length is set to 142, but you input_length is only 17. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)


Processing document 2/10
Processing document 3/10
Processing document 4/10
Processing document 5/10
Processing document 6/10


Your max_length is set to 142, but you input_length is only 72. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)


Processing document 7/10
Processing document 8/10
Processing document 9/10
Processing document 10/10
done!


In [8]:
#doc_summary_pegasus = get_summary(documents, llm=pegasus, max_tokens=1000)

In [9]:
doc_summary_t5 = get_summary(documents, llm=t5, max_tokens=1000)

Processing document 0/10


Your max_length is set to 200, but you input_length is only 100. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 200, but you input_length is only 100. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 200, but you input_length is only 54. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Your max_length is set to 200, but you input_length is only 104. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Your max_length is set to 200, but you input_length is only 54. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Your max_length is set to 200, but you input_length is only 82. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)
Your max_length is set to 200, but you input_length is only 178. You might cons

Processing document 1/10


Token indices sequence length is longer than the specified maximum sequence length for this model (912 > 512). Running this sequence through the model will result in indexing errors
Your max_length is set to 200, but you input_length is only 150. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=75)
Your max_length is set to 200, but you input_length is only 63. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=31)
Your max_length is set to 200, but you input_length is only 24. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 200, but you input_length is only 17. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)
Your max_length is set to 200, but you input_length is only 70. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is set to 200, but you input_length i

Processing document 2/10


Your max_length is set to 200, but you input_length is only 125. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=62)


Processing document 3/10
Processing document 4/10


Your max_length is set to 200, but you input_length is only 99. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
Your max_length is set to 200, but you input_length is only 104. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Your max_length is set to 200, but you input_length is only 54. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Your max_length is set to 200, but you input_length is only 54. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Your max_length is set to 200, but you input_length is only 82. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)
Your max_length is set to 200, but you input_length is only 178. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=89)
Your max_length is set to 200, but you input_length is only 125. You might consi

Processing document 5/10


Your max_length is set to 200, but you input_length is only 94. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)
Your max_length is set to 200, but you input_length is only 114. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 200, but you input_length is only 80. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)
Your max_length is set to 200, but you input_length is only 54. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Your max_length is set to 200, but you input_length is only 104. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Your max_length is set to 200, but you input_length is only 54. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Your max_length is set to 200, but you input_length is only 82. You might consid

Processing document 6/10


Your max_length is set to 200, but you input_length is only 142. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=71)


Processing document 7/10


Your max_length is set to 200, but you input_length is only 192. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=96)
Your max_length is set to 200, but you input_length is only 31. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)
Your max_length is set to 200, but you input_length is only 54. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Your max_length is set to 200, but you input_length is only 104. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Your max_length is set to 200, but you input_length is only 54. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Your max_length is set to 200, but you input_length is only 82. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)
Your max_length is set to 200, but you input_length is only 178. You might consi

Processing document 8/10


Your max_length is set to 200, but you input_length is only 54. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Your max_length is set to 200, but you input_length is only 168. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=84)


Processing document 9/10
Processing document 10/10
done!


In [11]:
doc_summary_gpt = get_summary(documents, llm=gpt, max_tokens=16000)

Processing document 0/10
Processing document 1/10
Processing document 2/10
Processing document 3/10
Processing document 4/10
Processing document 5/10
Processing document 6/10
Processing document 7/10
Processing document 8/10
Processing document 9/10
Processing document 10/10
done!


In [12]:
summaries_df = pd.DataFrame({
    "doc": [d.metadata["source"] for d in doc_summary_gpt],
    "bart_summary": [d.page_content for d in doc_summary_bart],
    #"pegasus_summary": [d.page_content for d in doc_summary_pegasus],
    "t5_summary": [d.page_content for d in doc_summary_t5],
    "gpt_summary": [d.page_content for d in doc_summary_gpt]
})

summaries_df.head()

Unnamed: 0,doc,bart_summary,t5_summary,gpt_summary
0,./data/test_documents/Master Thesis Informatio...,You are generally required to demonstrate spec...,Combine these summaries: You are generally req...,This content outlines the requirements and pro...
1,./data/test_documents/General Questions MMDS.html,The Mannheim Master in Data Science program eq...,Mannheim Master in Data Science covers six maj...,The content covers the Mannheim Master in Data...
2,./data/test_documents/Learning Agreements MMDS...,Mannheim Master in Data Science program equips...,Mannheim Master in Data Science Learning Agree...,The Mannheim Master in Data Science program of...
3,./data/test_documents/Recognition of Coursewor...,The master’s program in Business Informatics i...,Combine these summaries: Please verify that th...,This content covers the recognition of coursew...
4,./data/test_documents/General Questions Master...,The master’s program in Business Informatics i...,Combine these summaries: The examination regul...,This content provides comprehensive informatio...


In [13]:
summaries_df.to_csv("data/summaries.csv", index=False)