## StuffDocumentsChain

In [125]:
from langchain.chains import StuffDocumentsChain, LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI

# Name of the prompt variable that will receive the formatted documents.
document_variable_name = "context"
# Template used to render each individual document; it is handed to
# `format_document`, so see that function for the details.
document_prompt = PromptTemplate.from_template("Page {page}\n{page_content}")
# The LLM prompt must accept `document_variable_name` as an input variable.
prompt = PromptTemplate.from_template("Summarize this content: {context}")
llm = OpenAI()
llm_chain = LLMChain(prompt=prompt, llm=llm)
chain = StuffDocumentsChain(
    verbose=True,
    document_variable_name=document_variable_name,
    document_prompt=document_prompt,
    llm_chain=llm_chain,
)

In [182]:
# Generate langchain Document with dummy text.
# Load the speech and drop blank (whitespace-only) lines; every kept line
# retains its trailing newline.
# The text contains non-ASCII punctuation, so the encoding is stated
# explicitly instead of relying on the platform default.
# NOTE(review): assumes the data file is UTF-8 encoded - confirm.
with open("../data/state_of_the_union.txt", "r", encoding="utf-8") as f:
    DUMMY_TXT = "".join(line for line in f if line.strip())

import numpy as np
from langchain.schema import Document


def lc_doc_generator(n, txt, ws=2000):
    """Yield up to ``n`` Documents holding consecutive slices of ``txt``.

    Each Document contains at most ``ws`` characters and stores its
    zero-based chunk index under the ``"page"`` metadata key. Generation
    stops once ``n`` chunks were produced or the text is exhausted. No empty
    trailing Document is produced when ``len(txt)`` is an exact multiple of
    ``ws``, and an empty ``txt`` yields nothing.

    Args:
        n: Maximum number of Documents to yield. An infinite value such as
            ``float("inf")`` means "no limit".
        txt: Source text to slice.
        ws: Window size, i.e. the number of characters per chunk.

    Yields:
        Document: The next chunk of ``txt``.

    Raises:
        ValueError: If ``ws`` is not positive, since the window would never
            advance through the text.
    """
    if ws <= 0:
        raise ValueError("ws must be a positive integer")

    # range() stops strictly before len(txt), so a chunk is only started
    # where at least one character remains.
    for page, start in enumerate(range(0, len(txt), ws)):
        # Compare explicitly instead of slicing the iterator so that a
        # non-integer limit such as infinity keeps working.
        if page >= n:
            break
        yield Document(page_content=txt[start : start + ws], metadata={"page": page})


# Materialize every chunk of the dummy text; `n=np.inf` places no cap on the
# number of documents. The generator is consumed inline so that the name
# `lc_doc_generator` keeps referring to the function and the cell can be
# called again.
docs = list(lc_doc_generator(n=np.inf, txt=DUMMY_TXT))

In [128]:
from typing import List, Any
from langchain.schema import format_document

## --------------------- ##
## How BaseCombineDocumentsChain formats documents
print("## --------------------- ##")
# input_keys of BaseCombineDocumentsChain
print(chain.input_keys)

# Render only the first document with the per-document prompt to show the
# text that `format_document` produces for a single page.
doc = docs[0]
document_prompt = PromptTemplate.from_template("Page {page}\n{page_content}")
print(format_document(doc, prompt=document_prompt))

print("## --------------------- ##")


## How StuffDocumentsChain gets its inputs and formats the final prompt
# _get_inputs of StuffDocumentsChain
def _get_inputs(chain, docs: List[Document], **kwargs: Any) -> dict:
    """Build the input mapping for the chain's LLMChain.

    Every document is rendered with ``chain.document_prompt``; the rendered
    strings are then joined with ``chain.document_separator`` and stored
    under the key ``chain.document_variable_name``. Keyword arguments are
    copied over only when the LLMChain prompt lists them as input variables.

    Args:
        chain: Object providing ``document_prompt``, ``document_separator``,
            ``document_variable_name`` and ``llm_chain.prompt``.
        docs: Documents to format and join into a single input.
        **kwargs: Candidate extra inputs; those the prompt does not declare
            are dropped.

    Returns:
        Dictionary of inputs for the LLMChain.
    """
    # Render each document on its own before joining.
    rendered = []
    for document in docs:
        rendered.append(format_document(document, chain.document_prompt))

    # Keep only the extra variables the LLM prompt actually declares.
    inputs = {}
    for name, value in kwargs.items():
        if name in chain.llm_chain.prompt.input_variables:
            inputs[name] = value

    joined = chain.document_separator.join(rendered)
    inputs[chain.document_variable_name] = joined
    return inputs


# Show which keys `_get_inputs` produces, then the joined document text that
# it stores under the "context" key.
print(f"Keys: {_get_inputs(chain, docs).keys()}", end="\n\n")
print(_get_inputs(chain, docs)["context"])

print("## --------------------- ##")
# Print the chain's output and input key names, then run it on all documents.
print(chain.output_keys, chain.input_keys)
print(chain.run({"input_documents": docs}))

## --------------------- ##
['input_documents']
Page 0
Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  
Last year COVID-19 kept us apart. This year we are finally together again. 
Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. 
With a duty to one another to the American people to the Constitution. 
And with an unwavering resolve that freedom will always triumph over tyranny. 
Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. 
He thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. 
He met the Ukrainian people. 
From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. 
Groups of ci

## ReduceDocumentsChain

- What if `prompt` exceeds the token_max at the final llm_chain?
- `token_max` sets the maximum number of tokens to group documents into. For example, if it is set to 3000, documents will be grouped into chunks of no more than 3000 tokens before trying to combine them into a smaller chunk.
    
    - `_collapse` & `_split_list_of_docs` provide hints for dealing with this situation.
    
    - `collapse_documents_chain` is used if there are too many documents to pass to `combine_documents_chain` in one go. In this case, `collapse_documents_chain` is called recursively on groups of documents that are as large as allowed.
    
    - Premise:
        - A single `Document` should not exceed the token_max.
        - `llm` must be supported by langchain and have a `get_num_tokens` method to count tokens.
        - Token counting is accessed through the `prompt_length` method of `BaseCombineDocumentsChain`.
    

In [209]:
from langchain.chains import StuffDocumentsChain, LLMChain, ReduceDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI

# Name of the prompt variable that will receive the formatted documents.
document_variable_name = "context"
# Template used to render each individual document; it is handed to
# `format_document`, so see that function for the details.
document_prompt = PromptTemplate.from_template("Page {page}\n{page_content}")
# The LLM prompt must accept `document_variable_name` as an input variable.
prompt = PromptTemplate.from_template("Summarize this content: {context}")
llm = OpenAI(temperature=0)
llm_chain = LLMChain(prompt=prompt, llm=llm)
combine_documents_chain = StuffDocumentsChain(
    document_variable_name=document_variable_name,
    document_prompt=document_prompt,
    llm_chain=llm_chain,
)
# Version 1: no collapse chain is specified.
# Without a collapse chain, `combine_documents_chain` is used instead.
chain = ReduceDocumentsChain(combine_documents_chain=combine_documents_chain)

In [196]:
# Version 2: a dedicated collapse chain is specified.
# It reuses the document prompt and variable name but has its own LLM prompt.
prompt = PromptTemplate.from_template("Collapse this content: {context}")
llm_chain = LLMChain(prompt=prompt, llm=llm)
collapse_documents_chain = StuffDocumentsChain(
    document_variable_name=document_variable_name,
    document_prompt=document_prompt,
    llm_chain=llm_chain,
)
chain = ReduceDocumentsChain(
    collapse_documents_chain=collapse_documents_chain,
    combine_documents_chain=combine_documents_chain,
)

In [197]:
from typing import List, Optional, Any, Tuple, Callable, Protocol
from langchain.callbacks.manager import Callbacks


def _split_list_of_docs(
    docs: List[Document], length_func: Callable, token_max: int, **kwargs: Any
) -> List[List[Document]]:
    """Greedily partition ``docs`` into consecutive groups within ``token_max``.

    Documents are added to the current group one at a time. As soon as
    ``length_func`` reports that the group exceeds ``token_max``, the group
    without its newest document is closed and that document starts the next
    group.

    Args:
        docs: Documents to partition, in order.
        length_func: Callable receiving a list of documents (plus ``kwargs``)
            and returning the value compared against ``token_max``.
        token_max: Upper bound for the length of a single group.
        **kwargs: Forwarded unchanged to ``length_func``.

    Returns:
        The groups in their original order. The last group is always
        appended, so empty ``docs`` produce a single empty group.

    Raises:
        ValueError: If one document on its own already exceeds ``token_max``.
    """
    groups = []
    pending = []
    for document in docs:
        pending.append(document)
        too_long = length_func(pending, **kwargs) > token_max
        if not too_long:
            continue
        if len(pending) == 1:
            raise ValueError(
                "A single document was longer than the context length,"
                " we cannot handle this."
            )
        # Close the group without the newest document, which becomes the
        # first member of the following group.
        *fitting, newest = pending
        groups.append(fitting)
        pending = [newest]
    groups.append(pending)
    return groups


## prepare parameters
# Only the last three documents are used for this demonstration.
_docs = docs[-3:]
# `prompt_length` of the combine chain serves as the length function that
# `_split_list_of_docs` compares against `token_max`.
length_func = chain.combine_documents_chain.prompt_length

## make use of the token_max of the model as efficiently as possible
new_docs = _split_list_of_docs(_docs, length_func, token_max=600)
# Number of input documents vs. number of resulting groups.
print(len(_docs), len(new_docs))

3 2


In [210]:
class CombineDocsProtocol(Protocol):
    """Structural type for callables usable as a ``combine_docs`` function.

    Any callable that accepts a list of documents plus keyword arguments and
    returns a string satisfies this protocol.
    """

    def __call__(self, docs: List[Document], **kwargs: Any) -> str:
        """Combine ``docs`` into a single string.

        Args:
            docs: Documents to combine.
            **kwargs: Additional keyword arguments for the implementation.

        Returns:
            The combined text.
        """


def _collapse_docs(
    docs: List[Document],
    combine_document_func: CombineDocsProtocol,
    **kwargs: Any,
) -> Document:
    """Collapse several documents into one Document with merged metadata.

    The page content of the result is whatever ``combine_document_func``
    returns for ``docs``. Metadata values are converted to strings; when a
    key occurs in more than one document, the values are concatenated in
    document order, separated by ``", "``.

    Args:
        docs: Documents to collapse; the first one seeds the metadata.
        combine_document_func: Callable producing the combined text.
        **kwargs: Forwarded unchanged to ``combine_document_func``.

    Returns:
        A new Document holding the combined text and the merged metadata.
    """
    combined_text = combine_document_func(docs, **kwargs)

    first_doc, later_docs = docs[0], docs[1:]
    merged = {key: str(value) for key, value in first_doc.metadata.items()}
    for later_doc in later_docs:
        for key, value in later_doc.metadata.items():
            if key not in merged:
                merged[key] = str(value)
                continue
            # Repeated key: extend the comma-separated list of values.
            merged[key] = f"{merged[key]}, {value}"

    return Document(page_content=combined_text, metadata=merged)


def _collapse(
    chain,
    docs: List[Document],
    token_max: Optional[int] = None,
    callbacks: Callbacks = None,
    **kwargs: Any,
) -> Tuple[List[Document], dict]:
    """Repeatedly collapse ``docs`` until their prompt length fits the limit.

    The length is measured with ``chain.combine_documents_chain.prompt_length``.
    While it exceeds the limit, the documents are partitioned with
    ``_split_list_of_docs`` and each group is collapsed into one Document by
    running ``chain._collapse_chain``. Each group is printed before it is
    collapsed.

    Args:
        chain: Object providing ``combine_documents_chain``,
            ``_collapse_chain`` and ``token_max``.
        docs: Documents to collapse.
        token_max: Token limit; when falsy, ``chain.token_max`` is used.
        callbacks: Callbacks handed to the collapse chain's ``run``.
        **kwargs: Forwarded to the length function and the collapse chain.

    Returns:
        A tuple of the resulting documents and an empty dict.
    """
    current_docs = docs
    count_tokens = chain.combine_documents_chain.prompt_length
    total_tokens = count_tokens(current_docs, **kwargs)

    def _run_collapse_chain(docs: List[Document], **kwargs: Any) -> str:
        # Adapter giving the collapse chain the `combine_docs` call shape.
        return chain._collapse_chain.run(
            input_documents=docs, callbacks=callbacks, **kwargs
        )

    limit = token_max or chain.token_max
    while total_tokens is not None and total_tokens > limit:
        groups = _split_list_of_docs(current_docs, count_tokens, limit, **kwargs)
        collapsed = []
        for group in groups:
            print(group)
            collapsed.append(_collapse_docs(group, _run_collapse_chain, **kwargs))
        current_docs = collapsed
        # Re-measure; another round follows if the result is still too long.
        total_tokens = count_tokens(current_docs, **kwargs)
    return current_docs, {}


# Collapse the last three documents with a limit of 600 tokens; the second
# element of the returned tuple is discarded.
_docs = docs[-3:]
new_docs, _ = _collapse(chain, _docs, token_max=600)

[Document(page_content='ngers. \nOne was stationed at bases and breathing in toxic smoke from “burn pits” that incinerated wastes of war—medical and hazard material, jet fuel, and more. \nWhen they came home, many of the world’s fittest and best trained warriors were never the same. \nHeadaches. Numbness. Dizziness. \nA cancer that would put them in a flag-draped coffin. \nI know. \nOne of those soldiers was my son Major Beau Biden. \nWe don’t know for sure if a burn pit was the cause of his brain cancer, or the diseases of so many of our troops. \nBut I’m committed to finding out everything we can. \nCommitted to military families like Danielle Robinson from Ohio. \nThe widow of Sergeant First Class Heath Robinson.  \nHe was born a soldier. Army National Guard. Combat medic in Kosovo and Iraq. \nStationed near Baghdad, just yards from burn pits the size of football fields. \nHeath’s widow Danielle is here with us tonight. They loved going to Ohio State football games. He loved buildin

In [213]:
## Finally, put new_docs into the combine_documents_chain
# `new_docs` are the collapsed documents returned by `_collapse`.
chain.combine_documents_chain.combine_docs(docs=new_docs)

("\n\nJoe Biden is discussing the effects of burn pits on military personnel and the story of Danielle Robinson, whose husband Heath was exposed to burn pits. He is calling on Congress to pass a law to provide veterans affected by toxic exposures in Iraq and Afghanistan with the benefits and health care they deserve. He is also announcing a $10 billion commitment to the Cancer Moonshot to end cancer as well as funding for ARPA-H, an Advanced Research Projects Agency for Health, to drive breakthroughs in cancer, Alzheimer's, diabetes, and more. He believes that the United States of America is the only nation that can turn every crisis into an opportunity and that the State of the Union is strong.",
 {})

## MapReduceDocumentsChain

In [None]:
from langchain.chains import (
    StuffDocumentsChain,
    LLMChain,
    ReduceDocumentsChain,
    MapReduceDocumentsChain,
)
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI

# Name of the prompt variable that will receive the formatted documents.
document_variable_name = "context"
# Template used to render each individual document; it is handed to
# `format_document`, so see that function for the details.
document_prompt = PromptTemplate.from_template("Page {page}\n{page_content}")
# The LLM prompt must accept `document_variable_name` as an input variable.
prompt = PromptTemplate.from_template("Summarize this content: {context}")
# Separate prompt that defines how the summaries are combined.
reduce_prompt = PromptTemplate.from_template("Combine these summaries: {context}")
llm = OpenAI()
llm_chain = LLMChain(prompt=prompt, llm=llm)
reduce_llm_chain = LLMChain(prompt=reduce_prompt, llm=llm)
combine_documents_chain = StuffDocumentsChain(
    document_variable_name=document_variable_name,
    document_prompt=document_prompt,
    llm_chain=reduce_llm_chain,
)
reduce_documents_chain = ReduceDocumentsChain(
    combine_documents_chain=combine_documents_chain
)
chain = MapReduceDocumentsChain(
    reduce_documents_chain=reduce_documents_chain,
    llm_chain=llm_chain,
)