# LangChain Chatbot - with Additional LLM-generated Metadata

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from functools import partial
from copy import deepcopy

from langchain.document_loaders import UnstructuredHTMLLoader, UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chains.combine_documents import collapse_docs, split_list_of_docs
from langchain.chat_models import ChatOpenAI
from langchain.document_transformers.openai_functions import create_metadata_tagger
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.schema import Document, StrOutputParser
from langchain_core.prompts import format_document
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

from openai.error import InvalidRequestError

# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

# Load environment variables from the .env file located by find_dotenv()
# (presumably where the OpenAI API key is kept — confirm).
_ = load_dotenv(find_dotenv())

## Load Data

In [None]:
# Read every PDF and HTML file from the test-document folder into one flat
# list of documents; files with any other extension are skipped.
loaders_by_suffix = {
    '.pdf': UnstructuredPDFLoader,
    '.html': UnstructuredHTMLLoader,
}

documents = []
for file_name in os.listdir('data/test_documents'):
    # Pick the loader class whose suffix matches this file (None if no match).
    loader_cls = next(
        (cls for suffix, cls in loaders_by_suffix.items() if file_name.endswith(suffix)),
        None,
    )
    if loader_cls is None:
        continue
    file_path = './data/test_documents/' + file_name
    print(f'Loading {file_path}')
    documents.extend(loader_cls(file_path).load())

In [None]:
# Notebook display: inspect the list of loaded documents.
documents

## Add Metadata

- source
- tag every document with either **general** or the relevant **study programs**
- (summary for documents)

In [None]:
# Chat model with temperature 0. It is used for metadata tagging and, further
# below, for token counting and summarization.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# Instruction given to the LLM when tagging a document. The {input}
# placeholder is presumably filled with the document text by the metadata
# tagger (confirm against create_metadata_tagger).
# A trailing backslash inside the string joins the next line without a newline.
metadata_prompt = ChatPromptTemplate.from_template(
    """Extract relevant information from the following document.
The document is related to the University of Mannheim. Some documents are only relevant to a specific \
study program, while others provide general information about the University or several study \
programs at once (use the "general" tag). If you think the document is relevant to a specific study \
program which is not in the list use the "other" tag.

{input}
"""
)

# Allowed values for the "study_program" tag: the known study programs plus
# the two catch-all tags "general" and "other".
_study_program_tags = [
    "B.Sc. Business Informatics",
    "M.Sc. Business Informatics",
    "B.Sc. Mathematics in Business and Economics",
    "M.Sc. Mathematics in Business and Economics",
    "Mannheim Master in Data Science",
    "general",
    "other",
]

# Description of the metadata fields to extract for each document.
# Both fields are required.
_study_program_field = {
    "type": "string",
    "enum": _study_program_tags,
    "description": "The study program this document is relevant to",
}
_short_description_field = {
    "type": "string",
    "description": "A short summary that describes what information can be found in this document in at most 3 sentences",
}

schema = {
    "properties": {
        "study_program": _study_program_field,
        "short_description": _short_description_field,
    },
    "required": ["study_program", "short_description"],
}

# Metadata tagger built from the schema, the LLM and the tagging prompt.
# Its transform_documents() output is read via .metadata further below.
document_transformer = create_metadata_tagger(
    metadata_schema=schema,
    llm=llm,
    prompt=metadata_prompt,
)

In [None]:
# Prompt and helper for converting a Document -> str.
# The template consists solely of the {page_content} placeholder.
# partial_format_document(doc) is format_document(doc, prompt=document_prompt).
document_prompt = PromptTemplate.from_template("{page_content}")
partial_format_document = partial(format_document, prompt=document_prompt)


def get_num_tokens_single_doc(doc):
    """Return the token count, as reported by ``llm``, of one rendered document."""
    rendered = partial_format_document(doc)
    return llm.get_num_tokens(rendered)


# A text splitter that recursively splits a document into multiple chunks,
# targeting a maximum chunk size of 3600 as measured by llm.get_num_tokens
# (i.e. in tokens), without overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3600,
    chunk_overlap=0,
    length_function=llm.get_num_tokens,
    is_separator_regex=False,
)


# Per-document summarization chain: render the document to text, insert it
# into the summarization prompt, call the LLM and return the reply as a string.
summarize_prompt = PromptTemplate.from_template("Summarize this content:\n\n{context}")

map_chain = {"context": partial_format_document} | summarize_prompt | llm | StrOutputParser()

# Wrapper around map_chain: runs the summarization while passing the input
# document through unchanged, then returns the summary as a new Document that
# carries the input document's metadata.
_doc_and_summary = RunnableParallel({"doc": RunnablePassthrough(), "content": map_chain})

map_as_doc_chain = (
    _doc_and_summary
    | (lambda pair: Document(page_content=pair["content"], metadata=pair["doc"].metadata))
).with_config(run_name="Summarize (return doc)")


def format_docs(docs):
    """Render every document to text and join the results with blank lines."""
    rendered = [partial_format_document(doc) for doc in docs]
    return "\n\n".join(rendered)


# Chain that condenses a group of documents into one piece of text.
# It is applied repeatedly (see ``collapse``) to subsets of the documents
# until their total token size is below some maximum.
collapse_prompt = PromptTemplate.from_template("Collapse this content:\n\n{context}")

collapse_chain = {"context": format_docs} | collapse_prompt | llm | StrOutputParser()

def get_num_tokens(docs):
    """Return the token count, as reported by ``llm``, of ``docs`` rendered via ``format_docs``."""
    combined_text = format_docs(docs)
    return llm.get_num_tokens(combined_text)

def collapse(
    docs,
    config,
    token_max=3600,
):
    """Condense ``docs`` repeatedly until their combined token count fits ``token_max``.

    Each round splits the documents into groups that fit within ``token_max``
    (via ``split_list_of_docs``) and merges every group into one document by
    running ``collapse_chain`` on it (via ``collapse_docs``).

    Args:
        docs: List of documents to condense.
        config: Config dict forwarded to every ``collapse_chain.invoke`` call.
            It is not modified; each round uses its own copy.
        token_max: Upper bound on the combined token count, as measured by
            ``get_num_tokens``.

    Returns:
        The list of documents once ``get_num_tokens(docs) <= token_max``.
        The input list is returned unchanged if it already fits.
    """
    collapse_ct = 1
    while get_num_tokens(docs) > token_max:
        # Build a per-round copy so the caller's config dict is never mutated;
        # the run name only labels this collapse round.
        round_config = {**config, "run_name": f"Collapse {collapse_ct}"}
        invoke = partial(collapse_chain.invoke, config=round_config)
        split_docs = split_list_of_docs(docs, get_num_tokens, token_max)
        docs = [collapse_docs(_docs, invoke) for _docs in split_docs]
        collapse_ct += 1
    return docs


# Final reduction chain: combines the individual document summaries (or the
# collapsed summaries, if the map results had to be collapsed) into one
# final summary string.
reduce_prompt = PromptTemplate.from_template("Combine these summaries:\n\n{context}")

reduce_chain = (
    {"context": format_docs} | reduce_prompt | llm | StrOutputParser()
).with_config(run_name="Reduce")


# Complete summarization pipeline:
#   split into chunks -> summarize each chunk -> collapse -> reduce.
map_reduce_summarizer = (
    text_splitter.split_documents
    | map_as_doc_chain.map()
    | collapse
    | reduce_chain
).with_config(run_name="Map reduce")


def get_metadata(docs, metadata_tagger, summarizer, llm, max_tokens):
    """Return copies of ``docs`` enriched with metadata from ``metadata_tagger``.

    A document whose token count (per ``llm.get_num_tokens``) is at most
    ``max_tokens`` is tagged directly. A longer document is first summarized
    with ``summarizer``; the summary is tagged, and the resulting metadata is
    attached to a new Document holding the original, full page content.

    Args:
        docs: Sequence of documents to process.
        metadata_tagger: Object whose ``transform_documents`` takes a list of
            documents and returns the tagged documents.
        summarizer: Object whose ``invoke`` takes a list of documents and
            returns the summary text.
        llm: Model used only for counting tokens.
        max_tokens: Token threshold above which a document is summarized first.

    Returns:
        List with one tagged document per input document, in input order.
    """
    to_text = partial(format_document, prompt=PromptTemplate.from_template("{page_content}"))
    last_index = len(docs) - 1
    tagged_docs = []
    for index, original in enumerate(docs):
        print(f"Processing document {index}/{last_index}")
        if llm.get_num_tokens(to_text(original)) > max_tokens:
            # Too long to tag directly: tag a summary instead, then carry the
            # resulting metadata over to the original content.
            print("\tSummarize document...")
            condensed = Document(
                page_content=summarizer.invoke([original]),
                metadata=original.metadata,
            )
            print("\tApply metadata tagger...")
            tagged_summary = metadata_tagger.transform_documents([condensed])[0]
            tagged = Document(page_content=original.page_content, metadata=tagged_summary.metadata)
        else:
            # Short enough: tag the document as it is.
            print("\tApply metadata tagger...")
            tagged = metadata_tagger.transform_documents([original])[0]
        tagged_docs.append(tagged)
    print("done!")
    return tagged_docs

In [None]:
# Tag all loaded documents; documents above 3600 tokens are summarized first.
documents_w_metadata = get_metadata(
    documents,
    metadata_tagger=document_transformer,
    summarizer=map_reduce_summarizer,
    llm=llm,
    max_tokens=3600,
)

In [None]:
# Print the metadata of every tagged document, one "key: value" line per
# field, with a blank line after each document.
for doc_index, tagged_doc in enumerate(documents_w_metadata):
    print(f"### document {doc_index} ###")
    for field_name, field_value in tagged_doc.metadata.items():
        print(f"{field_name}: {field_value}")
    print()

## Split Documents and Create Embeddings

In [None]:
# Split the tagged documents into overlapping text chunks for embedding
# (chunk size 1000, overlap 200). No length_function is given, so the
# splitter's default length measure applies.
# NOTE: this rebinds `text_splitter`, which earlier named the token-based
# splitter used by the summarizer.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunked_documents = text_splitter.split_documents(documents_w_metadata)

In [None]:
# Build a Chroma vector store from the chunks using OpenAIEmbeddings, stored
# under ./storage_langchain_w_metadata, and persist it.
vectordb = Chroma.from_documents(
    chunked_documents,
    embedding=OpenAIEmbeddings(),
    persist_directory='./storage_langchain_w_metadata',
)
vectordb.persist()

In [None]:
# Retrieval QA chain over the vector store: the retriever is configured with
# k=5 and the chain also returns its source documents.
# It uses its own ChatOpenAI instance with no explicit temperature, unlike the
# temperature-0 `llm` used for tagging.
answer_llm = ChatOpenAI(model='gpt-3.5-turbo')
chunk_retriever = vectordb.as_retriever(search_kwargs={'k': 5})

qa_chain = RetrievalQA.from_chain_type(
    llm=answer_llm,
    retriever=chunk_retriever,
    return_source_documents=True,
)

## Read Questions and Answer

In [None]:
# Read the test questions from a semicolon-separated file. Column names are
# supplied explicitly here.
# NOTE(review): with explicit `names`, a header row in the file (if there is
# one) would be read as data — confirm the file has no header.
df_questions = pd.read_csv('TestQuestions.csv', delimiter=";", names=["Question", "Response"])
questions = df_questions["Question"]

# Ask every question and collect (question, answer) pairs. The prints mark
# the start and end of each question as simple progress output.
responses = []
for counter, q in enumerate(questions):
    print(f'q{counter} start')
    r = qa_chain({'query': q})['result']
    responses.append((q, r))
    print(f'q{counter} end')

df_responses = pd.DataFrame(responses, columns=["Question", "Response"])

In [None]:
# Notebook display: show the collected questions and responses.
df_responses

### Save Responses

In [None]:
# Write the question/response table to a semicolon-separated CSV file.
df_responses.to_csv(
    "test_responses_by_langchain_w_metadata.csv",
    sep=";",
)