# Combine Docs

PermChain is a great choice for implementing workflows that operate over longer documents because of its recursive nature.

In [1]:
from langchain.chat_models.openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import Runnable, RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.document import Document
from langchain.schema import format_document

from permchain import Channel, Pregel, PregelRead
from permchain.channels import LastValue, Inbox

## Stuff Documents

Stuffing documents is simple: every document is formatted and placed into a single prompt, so it is just a chain.

In [2]:
from langchain.schema.runnable import RunnableLambda

In [3]:
# Prompt used to render one document: only its page content is emitted.
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")

# Takes a list of documents: each one is rendered with DEFAULT_DOCUMENT_PROMPT
# (`.map()` runs the lambda once per list element), then the rendered strings
# are joined with a blank line into a single context string.
_combine_documents = RunnableLambda(
    lambda x: format_document(x, DEFAULT_DOCUMENT_PROMPT)
).map() | (lambda x: "\n\n".join(x))

In [4]:
# Two small sample documents reused by the examples below.
docs = [
    Document(page_content=text)
    for text in (
        "Harrison used to work at Kensho",
        "Ankush worked at Facebook",
    )
]

In [5]:
# Stuff chain: render all documents into one prompt and query the model once.
# The input is a mapping with a "question" entry and a "docs" entry.
stuff_chain = (
    {
        # Forward the user's question unchanged.
        "question": lambda x: x["question"],
        # Render the list of documents into a single context string.
        "context": (lambda x: x["docs"]) | _combine_documents,
    }
    | ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "Answer user questions based on the following documents:\n\n{context}",
            ),
            ("human", "{question}"),
        ]
    )
    | ChatOpenAI()
    # Parse the chat model's reply into a plain string.
    | StrOutputParser()
)

In [6]:
# Ask a question over the two sample documents; the answer is the cell output.
stuff_chain.invoke({"question": "where did harrison work", "docs": docs})

'Harrison used to work at Kensho.'

## Reduce Documents

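Reduce documents merges documents recursively: groups of documents are collapsed into shorter documents until they fit in a single group, which is then used to produce the final answer.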

In [7]:
# Repeat the sample documents five times so they no longer fit in one group.
many_docs = [doc for _ in range(5) for doc in docs]

In [8]:
def _split_list_of_docs(docs, max_length=70):
    new_result_doc_list = []
    _sub_result_docs = []
    for doc in docs:
        _sub_result_docs.append(doc)
        _num_tokens = sum([len(d.page_content) for d in _sub_result_docs])
        if _num_tokens > max_length:
            if len(_sub_result_docs) == 1:
                raise ValueError(
                    "A single document was longer than the context length,"
                    " we cannot handle this."
                )
            new_result_doc_list.append(_sub_result_docs[:-1])
            _sub_result_docs = _sub_result_docs[-1:]
    new_result_doc_list.append(_sub_result_docs)
    return new_result_doc_list

In [9]:
# Just to show what the documents look like once they are split into groups
split_docs = _split_list_of_docs(many_docs)
split_docs

[[Document(page_content='Harrison used to work at Kensho'),
  Document(page_content='Ankush worked at Facebook')],
 [Document(page_content='Harrison used to work at Kensho'),
  Document(page_content='Ankush worked at Facebook')],
 [Document(page_content='Harrison used to work at Kensho'),
  Document(page_content='Ankush worked at Facebook')],
 [Document(page_content='Harrison used to work at Kensho'),
  Document(page_content='Ankush worked at Facebook')],
 [Document(page_content='Harrison used to work at Kensho'),
  Document(page_content='Ankush worked at Facebook')]]

In [10]:
# Channels used by the reduce workflow, grouped by role.
channels = dict(
    # input
    question=LastValue(str),
    docs=Inbox(Document),
    # intermediate
    docs_to_finalize=Inbox(Document),
    # output
    answer=LastValue(str),
)

In [11]:
def decide(docs: list[Document]) -> Runnable:
    """Pick the channel that the freshly collapsed documents are written to.

    If the documents still split into more than one group, they go back to
    "docs" for another collapse round. Otherwise they are written to
    "docs_to_finalize" so the final answer can be produced.
    """
    needs_another_pass = len(_split_list_of_docs(docs)) > 1
    target_channel = "docs" if needs_another_pass else "docs_to_finalize"
    return Channel.write_to(target_channel)


# Collapse step: subscribes to the "docs" channel, splits the documents into
# groups, answers the question over each group with the stuff chain, and hands
# the resulting documents to `decide`, which picks the channel to write to.
collapse = (
    Channel.subscribe_to("docs")
    | _split_list_of_docs
    | {"docs_list": RunnablePassthrough(), "question": PregelRead("question")}
    # {docs_list: list[list[Doc]], question: str} -> list[{docs: list[Doc], question: str}]
    | (lambda x: [{"docs": docs, "question": x["question"]} for docs in x["docs_list"]])
    | stuff_chain.map()  # Collapse each list of docs to a single string
    | (lambda x: [Document(page_content=s) for s in x])  # A new (smaller) list of docs
    | decide
)

# Convert final set of docs to an answer.
# The subscription exposes "docs_to_finalize" under the key "docs" and joins in
# "question" -- presumably so the input has the {"question", "docs"} shape that
# stuff_chain reads; TODO confirm against the permchain API.
finalize = (
    Channel.subscribe_to("docs_to_finalize", key="docs").join(["question"])
    | stuff_chain
    | Channel.write_to("answer")
)

In [12]:
# Assemble the reduce workflow from the two chains and the channel definitions.
reduce_chain = Pregel(
    chains={
        "collapse": collapse,
        "finalize": finalize,
    },
    channels=channels,
    # Keys of the invoke() payload; they match the "input" channels above.
    input=["question", "docs"],
    # The value returned by invoke() is read from the "answer" channel.
    output="answer",
    # NOTE(review): presumably what prints the [pregel/step] log lines -- confirm.
    debug=True,
)

In [13]:
# Run the reduce workflow over the repeated documents; the answer is the cell output.
reduce_chain.invoke({"question": "where did harrison work", "docs": many_docs})

[36;1m[1;3m[pregel/step][0m [1mStarting step 0 with 1 task. Next tasks:
[0m- collapse((Document(page_content='Harrison used to work at Kensho'),
 Document(page_content='Ankush worked at Facebook'),
 Document(page_content='Harrison used to work at Kensho'),
 Document(page_content='Ankush worked at Facebook'),
 Document(page_content='Harrison used to work at Kensho'),
 Document(page_content='Ankush worked at Facebook'),
 Document(page_content='Harrison used to work at Kensho'),
 Document(page_content='Ankush worked at Facebook'),
 Document(page_content='Harrison used to work at Kensho'),
 Document(page_content='Ankush worked at Facebook')))
[36;1m[1;3m[pregel/checkpoint][0m [1mFinishing step 0. Channel values:
[0m{'docs': (...), 'question': 'where did harrison work'}
[36;1m[1;3m[pregel/step][0m [1mStarting step 1 with 1 task. Next tasks:
[0m- collapse((Document(page_content='Harrison used to work at Kensho.'),
 Document(page_content='Harrison used to work at Kensho.'),
 Do

'Harrison worked at Kensho.'