In [None]:
!pip install openai tiktoken chromadb langchain

In [2]:
# https://python.langchain.com/docs/modules/chains/document/map_reduce

In [4]:
# Ask for the OpenAI API key interactively (input is not echoed) and store it
# in the OPENAI_API_KEY environment variable.
# Keys can be created at https://platform.openai.com/account/api-keys
import os
from getpass import getpass

OPENAI_API_KEY = getpass()
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

··········


In [5]:
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader

# Page to summarize. Alternative source kept for reference:
# https://lilianweng.github.io/posts/2023-06-23-agent/
loader = WebBaseLoader("https://en.wikipedia.org/wiki/Ethos")
docs = loader.load()

# Chat model instance used by the chains defined in this notebook.
llm = ChatOpenAI(model_name="gpt-4-1106-preview", temperature=0)

In [52]:
# testing splitter if needed for chunking a document into smaller parts
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
# text_splitter = CharacterTextSplitter(
#     separator = "\n\n",
#     chunk_size = 1000,
#     chunk_overlap  = 200,
#     length_function = len,
#     is_separator_regex = False,
# )

# Splitter used to chunk a document into smaller parts.
# No length_function is passed, so chunk_size / chunk_overlap are measured with
# the splitter's default length function.
# NOTE(review): the separators are listed as ".", ",", then newline — per the
# LangChain docs they are tried in list order; confirm this order is intended.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 100,
    separators=[".", ",", "\n"]
)

In [62]:
from functools import partial

from langchain.chains.combine_documents import collapse_docs, split_list_of_docs
from langchain.prompts import PromptTemplate
from langchain.schema import StrOutputParser
from langchain_core.prompts import format_document
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# Prompt and method for converting Document -> str.
# The template references only {page_content}, so no metadata field is needed
# to render a document.
document_prompt = PromptTemplate.from_template("{page_content}")
# Pre-bind the prompt so callers only have to pass the document itself.
partial_format_document = partial(format_document, prompt=document_prompt)

# The chain we'll apply to each individual document.
# Returns a summary of the document.
# Steps: render the Document to text under the "context" key, fill the summary
# prompt, call the chat model, and parse the reply into a plain string.
# NOTE(review): the prompt literal spans several source lines, so the
# indentation before "TEXT:" and "SUMMARY:" is part of the prompt text.
map_chain = (
    {"context": partial_format_document}
    | PromptTemplate.from_template("""Provide a summary of the following text.
                  TEXT: {context}
                  SUMMARY:""") # alternative prompt: "Summarize this content:\n\n{context}"
    | llm
    | StrOutputParser()
)

# A wrapper chain to keep the original Document metadata.
# It runs map_chain on the input Document while passing the same Document
# through unchanged, then builds a new Document whose page_content is the
# summary and whose metadata is taken from the input.
# Document is imported right here so this cell does not depend on an import
# that only happens in a later cell of the notebook.
from langchain.schema import Document

map_as_doc_chain = (
    RunnableParallel({"doc": RunnablePassthrough(), "content": map_chain})
    | (lambda x: Document(page_content=x["content"], metadata=x["doc"].metadata))
).with_config(run_name="Summarize (return doc)")

# The chain we'll repeatedly apply to collapse subsets of the documents
# into a consolidated document until the total token size of our
# documents is below some max size.
def format_docs(docs):
    """Render each document with partial_format_document and join the
    resulting strings, separated by a blank line."""
    rendered = [partial_format_document(doc) for doc in docs]
    return "\n\n".join(rendered)


# Chain that condenses a group of documents into one piece of text:
# join the documents with format_docs under the "context" key, fill the
# "Collapse this content" prompt, call the chat model, and parse the reply
# into a string. It is invoked from collapse().
collapse_chain = (
    {"context": format_docs}
    | PromptTemplate.from_template("Collapse this content:\n\n{context}")
    | llm
    | StrOutputParser()
)


def get_num_tokens(docs):
    """Return llm.get_num_tokens for the text produced by format_docs(docs)."""
    combined_text = format_docs(docs)
    return llm.get_num_tokens(combined_text)


def collapse(
    docs,
    config,
    token_max=8000,
):
    """Condense docs repeatedly until their combined token count fits token_max.

    Args:
        docs: list of Documents to condense.
        config: runnable config dict forwarded to collapse_chain.invoke.
            It is only read; each pass works on its own copy.
        token_max: upper bound for get_num_tokens(docs) on the returned list.

    Returns:
        docs unchanged when it is already within token_max, otherwise the
        list of collapsed Documents produced by the last pass.
    """
    collapse_ct = 1
    while get_num_tokens(docs) > token_max:
        # Build a per-pass config instead of writing run_name into the
        # caller's dict, so the argument is never mutated.
        pass_config = {**config, "run_name": f"Collapse {collapse_ct}"}
        invoke = partial(collapse_chain.invoke, config=pass_config)
        # Group the documents into batches sized against token_max, then
        # condense every batch into a single Document.
        split_docs = split_list_of_docs(docs, get_num_tokens, token_max)
        docs = [collapse_docs(_docs, invoke) for _docs in split_docs]
        collapse_ct += 1
    return docs

# The chain we'll use to combine our individual document summaries
# (or summaries over subsets of documents if we had to collapse the map results)
# into a final summary.
# Steps: join the documents with format_docs under the "context" key, fill the
# prompt, call the chat model, and parse the reply into a plain string.
# NOTE(review): the prompt literal spans several source lines, so the
# indentation before the backquoted context and "SUMMARY:" is part of the prompt.

reduce_chain = (
    {"context": format_docs}
    | PromptTemplate.from_template('''Write a concise summary of the following text delimited by triple backquotes. Focus on the main content of the paragraph, not on the style or the structure.
                      ```{context}```
                      SUMMARY:''') # alternative prompt: "Combine these summaries:\n\n{context}"
    | llm
    | StrOutputParser()
).with_config(run_name="Reduce")

# # The final full chain as shown in the documentation
# map_reduce_with_collapse = (map_as_doc_chain.map() | collapse | reduce_chain).with_config(
#     run_name="Map reduce"
# )

# The final full chain without collapse, I do not want to collapse.
# map_as_doc_chain.map() is applied to the list of input Documents and its
# result is handed directly to reduce_chain.
# NOTE(review): with the collapse step left out, nothing in this chain limits
# the size of the text that reaches the reduce prompt — confirm this is
# acceptable for the inputs used.
map_reduce = (map_as_doc_chain.map() | reduce_chain).with_config(
    run_name="Map reduce"
)

In [21]:
# Document is referenced by the map_as_doc_chain wrapper defined in an earlier cell.
from langchain.schema import Document
# NOTE(review): the two lines below repeat the loader/docs assignments made in
# an earlier cell for the same URL, so running this cell loads the page again.
loader = WebBaseLoader('https://en.wikipedia.org/wiki/Ethos') #("https://lilianweng.github.io/posts/2023-06-23-agent/")
docs = loader.load()

In [55]:
# Chunk the loaded document(s) into smaller Documents for the map step.
# split_documents splits in a single pass and carries each source Document's
# metadata onto its chunks. Calling create_documents on already-split strings
# would split them a second time and leave the chunk metadata empty.
# It also covers every entry of docs rather than only docs[0].
# References:
#   https://stackoverflow.com/a/77464710
#   https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/language/use-cases/document-summarization/summarization_large_documents_langchain.ipynb
new_docs = text_splitter.split_documents(docs)
# Plain-text view of the chunks, kept available under the name `splits`.
splits = [chunk.page_content for chunk in new_docs]
len(new_docs)

43

In [63]:
# Single-document variant, kept for quick checks:
# print(map_reduce.invoke(docs[0:1], config={"max_concurrency": 5}))

# Run the map-reduce chain over all chunks and print the final summary.
# NOTE(review): max_concurrency is passed through the runnable config —
# presumably it caps how many map calls run in parallel; confirm against the
# LangChain RunnableConfig documentation.
print(map_reduce.invoke(new_docs, config={"max_concurrency": 5}))

The text is a detailed description of a Wikipedia page about "Ethos," a Greek term meaning 'character' that refers to the guiding beliefs or ideals of a community, nation, or ideology. The page includes navigation options, sections for contributing to Wikipedia, and language links. The content covers the etymology and origin of "ethos," its use in rhetoric as one of Aristotle's three modes of persuasion, and its significance in various contexts such as music, politics, and public life. The page also discusses the portrayal of character in Greek tragedy, the concept of ethos in visual arts, and modern interpretations of ethos in rhetoric, including feminist perspectives and the influence of cultural values. Additionally, the text lists notable rhetoricians, influential works on rhetoric, and references to academic works on related topics. The page may require structural improvements and has metadata, categories, and maintenance tags indicating its status and content licensing.
