# Combine Docs

PermChain is a great choice for implementing workflows that operate over longer documents, because of its recursive nature.

In [11]:
from operator import itemgetter

from langchain.chat_models.openai import ChatOpenAI
from langchain.prompts import (
    SystemMessagePromptTemplate,
    ChatPromptTemplate,
    PromptTemplate,
)
from langchain.schema.output_parser import StrOutputParser
from langchain.runnables.openai_functions import OpenAIFunctionsRouter
from langchain.schema.runnable import RunnableMap, RunnablePassthrough
from langchain.schema.document import Document
from langchain.schema import format_document

from permchain import Pregel, channels

## Stuff Documents

Stuff documents is simple - it is just a chain that joins all of the documents into a single prompt.

In [2]:
from langchain.schema.runnable import RunnableLambda

In [3]:
# Prompt used to render one document: only its raw page content.
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")


def _render_document(document):
    """Render a single document through DEFAULT_DOCUMENT_PROMPT."""
    return format_document(document, DEFAULT_DOCUMENT_PROMPT)


def _join_rendered(rendered):
    """Join the rendered document strings, separated by a blank line."""
    return "\n\n".join(rendered)


# Render every document in the incoming list, then merge them into one string.
_combine_documents = RunnableLambda(_render_document).map() | _join_rendered

In [4]:
# Two small sample documents reused by the examples in this notebook.
docs = [
    Document(page_content=text)
    for text in (
        "Harrison used to work at Kensho",
        "Ankush worked at Facebook",
    )
]

In [5]:
# "Stuff" chain: place all documents in the system prompt, then ask the question.
stuff_chain = (
    {
        # Forward the caller's question unchanged.
        "question": itemgetter("question"),
        # Take the documents and collapse them into one context string.
        "context": itemgetter("docs") | _combine_documents,
    }
    | ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "Answer user questions based on the following documents:\n\n{context}",
            ),
            ("human", "{question}"),
        ]
    )
    | ChatOpenAI()
)

In [6]:
# Ask a question over the two sample documents.
stuff_chain.invoke(dict(question="where did harrison work", docs=docs))

AIMessage(content='Harrison used to work at Kensho.')

## Reduce Documents

Reduce documents tries to merge documents recursively.

In [7]:
# Repeat the sample documents five times to get a longer input list.
many_docs = [doc for _ in range(5) for doc in docs]

In [8]:
def _split_list_of_docs(docs, max_length=70):
    new_result_doc_list = []
    _sub_result_docs = []
    for doc in docs:
        _sub_result_docs.append(doc)
        _num_tokens = sum([len(d.page_content) for d in _sub_result_docs])
        if _num_tokens > max_length:
            if len(_sub_result_docs) == 1:
                raise ValueError(
                    "A single document was longer than the context length,"
                    " we cannot handle this."
                )
            new_result_doc_list.append(_sub_result_docs[:-1])
            _sub_result_docs = _sub_result_docs[-1:]
    new_result_doc_list.append(_sub_result_docs)
    return new_result_doc_list

In [9]:
# Just to show what the documents look like once split into groups.
# The bare `split_docs` on the last line makes the notebook display the result.
split_docs = _split_list_of_docs(many_docs)
split_docs

[[Document(page_content='Harrison used to work at Kensho'),
  Document(page_content='Ankush worked at Facebook')],
 [Document(page_content='Harrison used to work at Kensho'),
  Document(page_content='Ankush worked at Facebook')],
 [Document(page_content='Harrison used to work at Kensho'),
  Document(page_content='Ankush worked at Facebook')],
 [Document(page_content='Harrison used to work at Kensho'),
  Document(page_content='Ankush worked at Facebook')],
 [Document(page_content='Harrison used to work at Kensho'),
  Document(page_content='Ankush worked at Facebook')]]

In [10]:
# Declare one LastValue[str] channel per stage of the reduce workflow.
# NOTE(review): the recorded output of this cell is
# "TypeError: LastValue() takes no arguments", so passing the channel name to
# the constructor does not match the installed permchain API. Confirm how
# channels are meant to be constructed and named; the cells below depend on
# these variables.
input_inbox = channels.LastValue[str]("input_inbox")
reduce_inbox = channels.LastValue[str]("reduce_inbox")
collapse_inbox = channels.LastValue[str]("collapse_inbox")
output_inbox = channels.LastValue[str]("output_inbox")

TypeError: LastValue() takes no arguments

In [22]:
def decide_end(plan):
    """Decide whether to finish or to reduce one more step.

    Args:
        plan: Mapping with a "docs" entry whose length is inspected.

    Returns:
        ``Pregel.send_to("reduce_inbox")`` when ``plan["docs"]`` has more than
        one entry; otherwise ``stuff_chain`` piped into
        ``Pregel.send_to("output_inbox")``.
    """
    needs_another_pass = len(plan["docs"]) > 1
    if not needs_another_pass:
        # Single (or no) group left: answer with the stuff chain and emit the output.
        return stuff_chain | Pregel.send_to("output_inbox")
    # Several groups remain: hand off to the reduce inbox.
    return Pregel.send_to("reduce_inbox")


# Chain that collapses documents then chooses end.
# Steps: subscribe with `docs` bound to collapse_inbox and `question` bound to
# the string "question", replace `docs` with the groups produced by
# _split_list_of_docs, then let decide_end pick the next step.
# NOTE(review): `question="question"` passes a plain string while `docs` is
# given a channel object - confirm that Pregel.subscribe_to accepts both forms.
collapse_chain = (
    Pregel.subscribe_to(docs=collapse_inbox, question="question")
    | RunnablePassthrough.assign(docs=lambda x: _split_list_of_docs(x["docs"]))
    | decide_end
)


# Chain that answers the question over each element of x["docs"] separately
# and forwards the answers as new documents.
# Steps: subscribe to input_inbox, build one {"docs", "question"} payload per
# element of x["docs"], run stuff_chain over every payload via .map(), then
# send to "collapse_inbox" a `docs` list wrapping each message's content in a
# Document.
# NOTE(review): the subscription keyword is `input`, yet the lambda reads
# x["question"] and x["docs"] directly - confirm the shape Pregel delivers.
# NOTE(review): this chain subscribes to input_inbox although decide_end sends
# to "reduce_inbox", and only `docs` (no question) is sent on - verify wiring.
reduce_chain = (
    Pregel.subscribe_to(input=input_inbox)
    | (lambda x: [{"docs": d, "question": x["question"]} for d in x["docs"]])
    | stuff_chain.map()
    | Pregel.send_to(
        {
            "collapse_inbox": {
                # Turn each chat message returned by stuff_chain into a Document.
                "docs": lambda x: [Document(page_content=m.content) for m in x],
            }
        }
    )
)

In [23]:
# Assemble the Pregel application with input_inbox as input and output_inbox
# as output.
# NOTE(review): the recorded output of this cell is a ValidationError
# ("processes -> N: value is not a valid dict"). The positional arguments are
# the three inbox channels; collapse_chain and reduce_chain are never passed
# in. Confirm what Pregel expects as positional arguments.
pubsub = Pregel(
    input_inbox, reduce_inbox, collapse_inbox, input=input_inbox, output=output_inbox
)

ValidationError: 6 validation errors for Pregel
processes -> 0
  value is not a valid dict (type=type_error.dict)
processes -> 0
  value is not a valid dict (type=type_error.dict)
processes -> 1
  value is not a valid dict (type=type_error.dict)
processes -> 1
  value is not a valid dict (type=type_error.dict)
processes -> 2
  value is not a valid dict (type=type_error.dict)
processes -> 2
  value is not a valid dict (type=type_error.dict)

In [101]:
# Run the reduce workflow over the repeated sample documents.
# NOTE(review): `reduce_agent` is not defined anywhere in the visible notebook
# (the Pregel object above is named `pubsub`), and this cell's execution count
# is out of sequence - the recorded output presumably comes from stale kernel
# state. Confirm the intended object before re-running.
reduce_agent.invoke({"question": "where did harrison work", "docs": many_docs})

[AIMessage(content='Harrison used to work at Kensho.', additional_kwargs={}, example=False)]