# Document Summarization with Hugging Face LLMs

- https://python.langchain.com/docs/integrations/llms/huggingface_pipelines
- https://huggingface.co/models?pipeline_tag=summarization

In [None]:
import os
import pandas as pd
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.document_loaders import UnstructuredHTMLLoader, UnstructuredPDFLoader
from langchain.chains.summarize import load_summarize_chain
from functools import partial
from langchain_core.prompts import format_document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import collapse_docs, split_list_of_docs
from langchain.chat_models import ChatOpenAI
from langchain.document_transformers.openai_functions import create_metadata_tagger
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.schema import Document, StrOutputParser
from langchain_core.prompts import format_document
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

In [None]:
# Load every PDF and HTML file from the test-documents folder into `documents`.
# The listing is sorted because os.listdir() returns entries in arbitrary
# order; sorting keeps the document order (and everything derived from it)
# reproducible between runs and machines.
docs_dir = './data/test_documents'
documents = []
for file in sorted(os.listdir(docs_dir)):
    # Pick the loader that matches the file extension; skip anything else.
    if file.endswith('.pdf'):
        loader_cls = UnstructuredPDFLoader
    elif file.endswith('.html'):
        loader_cls = UnstructuredHTMLLoader
    else:
        continue
    doc_path = os.path.join(docs_dir, file)
    print(f'Loading {doc_path}')
    loader = loader_cls(doc_path)
    documents.extend(loader.load())

In [None]:
# Summarization model used for the first round of summaries: the
# facebook/bart-large-cnn checkpoint wrapped as a LangChain LLM.
# NOTE(review): presumably fetches the model weights on first use — confirm.
llm = HuggingFacePipeline.from_model_id(
    model_id="facebook/bart-large-cnn",
    task="summarization",
    # Passed through as pipeline keyword arguments; max_new_tokens=100 limits
    # how many tokens each generated summary may contain.
    pipeline_kwargs={"max_new_tokens": 100},
)

In [None]:
#hf = HuggingFacePipeline.from_model_id(
#    model_id="Falconsai/text_summarization",
#    task="summarization",
#    pipeline_kwargs={"max_new_tokens": 10},
#)

In [None]:
# Turn a Document into plain text: the template contains nothing but the
# document's page content.
document_prompt = PromptTemplate.from_template("{page_content}")
partial_format_document = partial(format_document, prompt=document_prompt)


def get_num_tokens_single_doc(doc):
    """Return the token count that `llm` reports for one document's text."""
    rendered_text = partial_format_document(doc)
    return llm.get_num_tokens(rendered_text)


# Show the token count of every loaded document, one number per line.
for doc in documents:
    print(get_num_tokens_single_doc(doc))

In [None]:
def _build_map_reduce_summarizer(llm, max_tokens):
    """Assemble the split -> map -> collapse -> reduce summarization chain.

    Args:
        llm: LangChain language model used for every summarization call and,
            through ``llm.get_num_tokens``, for all token counting.
        max_tokens: Token budget used both as the splitter's chunk size and
            as the limit the mapped summaries must fit in before the final
            reduce step.

    Returns:
        A runnable that takes a list of Documents and returns the final
        summary as a string.
    """
    # Prompt and method for converting Document -> str.
    document_prompt = PromptTemplate.from_template("{page_content}")
    partial_format_document = partial(format_document, prompt=document_prompt)

    def format_docs(docs):
        # Join the rendered documents with a blank line between them.
        return "\n\n".join(partial_format_document(doc) for doc in docs)

    def get_num_tokens(docs):
        # Token count of the documents once joined into a single context.
        return llm.get_num_tokens(format_docs(docs))

    # A text splitter that recursively splits a document into multiple chunks
    # until the maximum chunk size is below a predefined value (no overlap).
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_tokens,
        chunk_overlap=0,
        length_function=llm.get_num_tokens,
        is_separator_regex=False,
    )

    # The chain we'll apply to each individual chunk.
    # Returns a summary of that chunk.
    map_chain = (
        {"context": partial_format_document}
        | PromptTemplate.from_template("Summarize this content:\n\n{context}")
        | llm
        | StrOutputParser()
    )

    # A wrapper chain to keep the original Document metadata.
    map_as_doc_chain = (
        RunnableParallel({"doc": RunnablePassthrough(), "content": map_chain})
        | (lambda x: Document(page_content=x["content"], metadata=x["doc"].metadata))
    ).with_config(run_name="Summarize (return doc)")

    # The chain we'll repeatedly apply to collapse subsets of the documents
    # into a consolidated document until the total token size of our
    # documents is below some max size.
    collapse_chain = (
        {"context": format_docs}
        | PromptTemplate.from_template("Collapse this content:\n\n{context}")
        | llm
        | StrOutputParser()
    )

    def collapse(
        docs,
        config,
        token_max=max_tokens,
    ):
        # Keep merging groups of documents until everything fits in token_max.
        collapse_ct = 1
        while get_num_tokens(docs) > token_max:
            # Label each pass so it can be told apart in run traces.
            config["run_name"] = f"Collapse {collapse_ct}"
            invoke = partial(collapse_chain.invoke, config=config)
            split_docs = split_list_of_docs(docs, get_num_tokens, token_max)
            docs = [collapse_docs(_docs, invoke) for _docs in split_docs]
            collapse_ct += 1
        return docs

    # The chain we'll use to combine our individual summaries (or summaries
    # over subsets of documents if we had to collapse the map results) into a
    # final summary.
    reduce_chain = (
        {"context": format_docs}
        | PromptTemplate.from_template("Combine these summaries:\n\n{context}")
        | llm
        | StrOutputParser()
    ).with_config(run_name="Reduce")

    # The final full chain for summarizing documents.
    return (
        text_splitter.split_documents
        | map_as_doc_chain.map()
        | collapse
        | reduce_chain
    ).with_config(run_name="Map reduce")


def get_summary(docs, llm, max_tokens):
    """Summarize each document independently with a map-reduce chain.

    Every document is split into chunks of at most ``max_tokens`` tokens,
    each chunk is summarized, the chunk summaries are collapsed until they
    fit in ``max_tokens``, and a final reduce step combines them.

    Progress is printed to stdout, one line per document plus a closing
    "done!".

    Args:
        docs: Sized sequence of Documents to summarize.
        llm: LangChain language model used for summarizing and token counting.
        max_tokens: Token budget for chunks and for the collapsed summaries.

    Returns:
        A list with one Document per input document, in the same order. Each
        holds the summary as ``page_content`` and the input's ``metadata``.
    """
    docs_summarized = []
    if not docs:
        # Nothing to summarize: skip building the chain altogether.
        print("done!")
        return docs_summarized

    map_reduce_summarizer = _build_map_reduce_summarizer(llm, max_tokens)

    total = len(docs)
    # Count from 1 so the progress line reads "1/N" ... "N/N".
    for position, doc in enumerate(docs, start=1):
        print(f"Processing document {position}/{total}")
        summary = map_reduce_summarizer.invoke([doc])
        docs_summarized.append(Document(page_content=summary, metadata=doc.metadata))
    print("done!")

    return docs_summarized

In [None]:
# Summarize every loaded document with the BART pipeline (`llm`), using a
# 1000-token budget for chunks and collapsed summaries.
doc_summary_bart = get_summary(documents, llm=llm, max_tokens=1000)

In [None]:
# Display the BART summaries (one Document per input document).
doc_summary_bart

In [None]:
# Second model for comparison: OpenAI's gpt-3.5-turbo-0125 with temperature 0.
# NOTE(review): presumably needs an OpenAI API key in the environment — confirm.
llm2 = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0125")

In [None]:
# Summarize the same documents with the OpenAI chat model (`llm2`), using the
# same 1000-token budget as the BART run so the two results are comparable.
doc_summary_gpt_3_5 = get_summary(documents, llm=llm2, max_tokens=1000)

In [None]:
# Display the GPT-3.5 summaries (one Document per input document).
doc_summary_gpt_3_5

In [None]:
# Inspect the "source" metadata entry of the first summary; the same key is
# used as the "doc" column when the summaries are exported.
doc_summary_gpt_3_5[0].metadata["source"]

In [None]:
# Put both models' summaries side by side, one row per document. The "doc"
# label is read from the GPT-3.5 results' "source" metadata.
summaries_df = pd.DataFrame(
    data={
        "doc": [gpt_doc.metadata["source"] for gpt_doc in doc_summary_gpt_3_5],
        "bart_summary": [bart_doc.page_content for bart_doc in doc_summary_bart],
        "gpt_3_5_summary": [gpt_doc.page_content for gpt_doc in doc_summary_gpt_3_5],
    },
)

# Write the comparison table to disk without the index column.
summaries_df.to_csv("data/summaries.csv", index=False)