# LangChain Chatbot - with Additional LLM-generated Metadata

In [1]:
import os
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from functools import partial
from copy import deepcopy
import time

from langchain.document_loaders import UnstructuredHTMLLoader, UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chains.combine_documents import collapse_docs, split_list_of_docs
from langchain.chat_models import ChatOpenAI
from langchain.document_transformers.openai_functions import create_metadata_tagger
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.schema import Document, StrOutputParser
from langchain_core.prompts import format_document
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

from openai.error import InvalidRequestError

pd.set_option('display.max_colwidth', None)

_ = load_dotenv(find_dotenv())

## 1. Load Data

In [2]:
# Load every PDF and HTML file found in the test-document folder.
# Each supported file extension maps to the Unstructured loader class used for it;
# files with any other extension are skipped.
loaders_by_extension = {
    '.pdf': UnstructuredPDFLoader,
    '.html': UnstructuredHTMLLoader,
}

documents = []
for filename in os.listdir('data/test_documents'):
    for extension, loader_cls in loaders_by_extension.items():
        if not filename.endswith(extension):
            continue
        path = './data/test_documents/' + filename
        print(f'Loading {path}')
        documents.extend(loader_cls(path).load())
        # A file matches at most one extension, so stop looking.
        break

Loading ./data/test_documents/Extension of Deadlines MMDS.html
Loading ./data/test_documents/Extension of Deadlines Master Business Infromatics.html
Loading ./data/test_documents/General Questions MMDS.html
Loading ./data/test_documents/General Questions Master Business Informatics.html
Loading ./data/test_documents/Learning Agreements MMDS.html
Loading ./data/test_documents/Learning Agreements Master Business Informatics.html
Loading ./data/test_documents/MMDS info start page.html
Loading ./data/test_documents/Master Business Informatics info start page.html
Loading ./data/test_documents/Master Thesis Information.html
Loading ./data/test_documents/Modue_Catalog_MSc_Wifo_23_24.pdf
Loading ./data/test_documents/Module_Catalog_Appendix_MMDS_23_24.pdf
Loading ./data/test_documents/Module_Catalog_MMDS_23_24.pdf
Loading ./data/test_documents/PO_MMDS_20.pdf
Loading ./data/test_documents/PO_MSc_Wifo_18.pdf
Loading ./data/test_documents/Recognition of Coursework and Examinations MMDS.html
Load

In [3]:
# Display the loaded documents as the cell output.
documents

[Document(page_content="Foto: Anna Logue\n\nWirtschafts\xadinformatik und Wirtschafts\xadmathematik\n\nStudium\n\nStudien\xadorganisation\n\nMannheim Master in Data\n\t\t\t\t\t\t\t\t\tScience\n\nExtension\n\t\t\t\t\t\t\t\t\tof Deadlines\n\nMannheim Master in Data Science\n\nGeneral\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tQuestions\n\nExtension of\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tDeadlines\n\nLearning\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tAgreements\n\nRecognition\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tof Coursework and Examinations\n\nExtension of deadlines\n\nIn which cases can I request a deadline extension?\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\tThe\n\t\t\t\t\t\t\t\t\t\t\t\t\t\texamination committee may decide to extend certain deadlines,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tprovided\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tthere is a valid reason why you are unable to meet the deadline\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tin\n\t\t\t\t\t\t\t\t\t

## 2. Add Metadata

- source
- tag every document either with **general** or with the relevant **study programs**
- (summary for documents)

### 2.1 OpenAI Metadata Tagger on Each Document

In [4]:
# Chat model shared by the metadata tagger and the summarization chains.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# Prompt used by the metadata tagger. It explains when to use the "general" and
# "other" tags; {input} is the placeholder for the document content.
# The trailing backslashes join the physical lines into a single paragraph.
metadata_prompt = ChatPromptTemplate.from_template(
    """Extract relevant information from the following document.
The document is related to the University of Mannheim. Some documents are only relevant to a specific \
study program, while others provide general information about the University or several study \
programs at once (use the "general" tag). If you think the document is relevant to a specific study \
program which is not in the list use the "other" tag.

{input}
"""
)

# Metadata schema handed to the tagger: both fields are mandatory strings.

# Classification field, restricted to a closed list of tags.
study_program_field = {
    "type": "string",
    "enum": [
        "B.Sc. Business Informatics",
        "M.Sc. Business Informatics",
        "B.Sc. Mathematics in Business and Economics",
        "M.Sc. Mathematics in Business and Economics",
        "Mannheim Master in Data Science",
        "general",
        "other",
    ],
    "description": "The study program this document is relevant to",
}

# Free-text field holding a brief summary of the document.
short_description_field = {
    "type": "string",
    "description": "A short summary that describes what information can be found in this document in at most 3 sentences",
}

schema = {
    "properties": {
        "study_program": study_program_field,
        "short_description": short_description_field,
    },
    "required": ["study_program", "short_description"],
}

# Document transformer that uses the chat model, the schema and the prompt
# above to generate metadata for documents.
document_transformer = create_metadata_tagger(
    metadata_schema=schema,
    llm=llm,
    prompt=metadata_prompt,
)

  warn_deprecated(


In [5]:
# Run the metadata tagger on one document at a time so that a rejected
# request for one document does not abort the whole run.
for idx, doc in enumerate(documents):
    print(f"### document {idx} ###")
    try:
        tagged = document_transformer.transform_documents([doc])[0]
    except InvalidRequestError as err:
        print(f"Invalid Request Error ({err})")
        # Pause after a rejected request to lower the number of tokens sent
        # to the API per minute; this prevents RateLimitErrors.
        time.sleep(15)
    else:
        for key, value in tagged.metadata.items():
            print(f"{key}: {value}")

    print()

### document 0 ###
study_program: Mannheim Master in Data Science
short_description: The document provides information about deadline extensions, degree plans and course schedules, examination regulations and module catalog, examination committee, advisory service, and master's thesis for the Mannheim Master in Data Science program.
source: ./data/test_documents/Extension of Deadlines MMDS.html

### document 1 ###
study_program: M.Sc. Business Informatics
short_description: Information about deadline extensions and the master's program in Business Informatics.
source: ./data/test_documents/Extension of Deadlines Master Business Infromatics.html

### document 2 ###
study_program: Mannheim Master in Data Science
short_description: Information about module changes, team projects, studying abroad, degree plans, course schedules, examination regulations, examination committee, advisory service, and master's thesis.
source: ./data/test_documents/General Questions MMDS.html

### document 3 ##

Many documents are too long to be passed to the metadata tagger. In the next section, long documents are therefore summarized first, and the summary is passed to the metadata tagger instead.

### 2.2 OpenAI Metadata Tagger with Prior Document Summaries (for large documents)

If a document has more than a predefined number of tokens, it is summarized first. The summaries and the shorter, non-summarized documents are then passed to the metadata tagger to generate metadata. The metadata generated from a summary is then attached to the original (un-summarized) document so that the original text is preserved.

In [6]:
# Prompt and method for converting Document -> str.
# The template consists only of {page_content}, so no metadata is included
# in the rendered text.
document_prompt = PromptTemplate.from_template("{page_content}")
# Pre-bind the prompt so the helper can be called with just a Document.
partial_format_document = partial(format_document, prompt=document_prompt)


# A text splitter that recursively splits a document into multiple chunks until
# the maximum chunk size is below a predefined value (without overlap).

def get_num_tokens_single_doc(doc):
    """Return the number of tokens the chat model counts for one document.

    The document is first rendered to text with ``partial_format_document``.
    """
    rendered = partial_format_document(doc)
    return llm.get_num_tokens(rendered)

# Splitter whose chunk length is measured in tokens (via the chat model's
# token counter) rather than characters: target chunk size 3600, no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3600,
    chunk_overlap=0,
    length_function=llm.get_num_tokens,
    is_separator_regex=False,
)


# Per-document summarization chain ("map" step).
# Takes a single Document, renders it to text, asks the model for a summary
# and returns that summary as a plain string.
summarize_prompt = PromptTemplate.from_template("Summarize this content:\n\n{context}")

map_chain = {"context": partial_format_document} | summarize_prompt | llm | StrOutputParser()

# Wraps map_chain so that the result is again a Document: the summary becomes
# the page content and the metadata of the input document is carried over.
summary_with_source = RunnableParallel({"doc": RunnablePassthrough(), "content": map_chain})

map_as_doc_chain = (
    summary_with_source
    | (lambda parts: Document(page_content=parts["content"], metadata=parts["doc"].metadata))
).with_config(run_name="Summarize (return doc)")


# The chain we'll repeatedly apply to collapse subsets of the documents
# into a consolidated document until the total token size of our
# documents is below some max size.

def format_docs(docs):
    """Render every document to text and join the texts with a blank line."""
    rendered = [partial_format_document(doc) for doc in docs]
    return "\n\n".join(rendered)

# Chain that condenses a list of documents into one string: the documents are
# joined by format_docs and the model is asked to collapse the combined text.
collapse_prompt = PromptTemplate.from_template("Collapse this content:\n\n{context}")

collapse_chain = {"context": format_docs} | collapse_prompt | llm | StrOutputParser()

def get_num_tokens(docs):
    """Return the chat model's token count for the joined text of ``docs``."""
    combined_text = format_docs(docs)
    return llm.get_num_tokens(combined_text)

def collapse(
    docs,
    config,
    token_max=3600,
):
    """Collapse ``docs`` until their combined token count is at most ``token_max``.

    While the joined documents are too long, they are partitioned with
    ``split_list_of_docs`` and every partition is merged into one document
    with ``collapse_docs``, using ``collapse_chain`` to produce the merged text.

    Args:
        docs: List of documents to collapse.
        config: Run configuration mapping forwarded to ``collapse_chain.invoke``.
            It is not modified; each round works on its own copy.
        token_max: Maximum combined token count allowed for the result.

    Returns:
        The list of documents; the input list itself if it already fits.
    """
    collapse_ct = 1
    while get_num_tokens(docs) > token_max:
        # Copy the config instead of writing into the caller's mapping, so the
        # per-round run name does not leak out of this function.
        round_config = {**config, "run_name": f"Collapse {collapse_ct}"}
        invoke = partial(collapse_chain.invoke, config=round_config)
        split_docs = split_list_of_docs(docs, get_num_tokens, token_max)
        docs = [collapse_docs(_docs, invoke) for _docs in split_docs]
        collapse_ct += 1
    return docs


# Final "reduce" step: merges the individual document summaries (or the
# collapsed summaries, if the map results had to be collapsed) into one
# overall summary string.
reduce_prompt = PromptTemplate.from_template("Combine these summaries:\n\n{context}")

reduce_chain = (
    {"context": format_docs} | reduce_prompt | llm | StrOutputParser()
).with_config(run_name="Reduce")


# Complete summarization pipeline, applied to a list of documents:
#   1. split the documents into chunks,
#   2. summarize every chunk,
#   3. collapse the summaries while they are too long,
#   4. combine what is left into a single summary.
map_reduce_summarizer = (
    text_splitter.split_documents
    | map_as_doc_chain.map()
    | collapse
    | reduce_chain
).with_config(run_name="Map reduce")


def get_metadata(docs, metadata_tagger, summarizer, llm, max_tokens):
    """Generate metadata for every document, summarizing long documents first.

    Documents with at most ``max_tokens`` tokens are passed to the tagger
    directly. Longer documents are summarized, the summary is tagged, and the
    resulting metadata is combined with the original page content.

    Args:
        docs: List of documents to tag.
        metadata_tagger: Object whose ``transform_documents`` method takes a
            list of documents and returns them with metadata.
        summarizer: Object whose ``invoke`` method takes a list of documents
            and returns the text used as the summary document's page content.
        llm: Model object whose ``get_num_tokens`` counts tokens of a text.
        max_tokens: Token threshold above which a document is summarized.

    Returns:
        List with one tagged document per input document, in input order.
    """
    content_only_prompt = PromptTemplate.from_template("{page_content}")
    render = partial(format_document, prompt=content_only_prompt)
    last_index = len(docs) - 1
    tagged_docs = []
    for position, original in enumerate(docs):
        print(f"Processing document {position}/{last_index}")

        if llm.get_num_tokens(render(original)) <= max_tokens:
            # Within the threshold: tag the document as it is.
            print("\tApply metadata tagger...")
            tagged_docs.append(metadata_tagger.transform_documents([original])[0])
            continue

        # Above the threshold: derive the metadata from a summary instead.
        print("\tSummarize document...")
        condensed = Document(
            page_content=summarizer.invoke([original]),
            metadata=original.metadata,
        )
        print("\tApply metadata tagger...")
        tagged_summary = metadata_tagger.transform_documents([condensed])[0]
        # Keep the full original text; only the metadata comes from the summary.
        tagged_docs.append(
            Document(page_content=original.page_content, metadata=tagged_summary.metadata)
        )
    print("done!")
    return tagged_docs

In [7]:
# Tag all loaded documents; documents above 3600 tokens are summarized first.
documents_w_metadata = get_metadata(
    documents,
    metadata_tagger=document_transformer,
    summarizer=map_reduce_summarizer,
    llm=llm,
    max_tokens=3600,
)

Processing document 0/15
	Apply metadata tagger...
Processing document 1/15
	Apply metadata tagger...
Processing document 2/15
	Summarize document...
	Apply metadata tagger...
Processing document 3/15
	Apply metadata tagger...
Processing document 4/15
	Summarize document...
	Apply metadata tagger...
Processing document 5/15
	Apply metadata tagger...
Processing document 6/15
	Apply metadata tagger...
Processing document 7/15
	Apply metadata tagger...
Processing document 8/15
	Apply metadata tagger...
Processing document 9/15
	Summarize document...
	Apply metadata tagger...
Processing document 10/15
	Apply metadata tagger...
Processing document 11/15
	Summarize document...
	Apply metadata tagger...
Processing document 12/15
	Summarize document...
	Apply metadata tagger...
Processing document 13/15
	Summarize document...
	Apply metadata tagger...
Processing document 14/15
	Summarize document...
	Apply metadata tagger...
Processing document 15/15
	Apply metadata tagger...
done!


In [8]:
# Print the metadata of every tagged document, one "key: value" pair per line.
for idx, tagged_doc in enumerate(documents_w_metadata):
    print(f"### document {idx} ###")
    for key, value in tagged_doc.metadata.items():
        print(f"{key}: {value}")
    print()

### document 0 ###
study_program: Mannheim Master in Data Science
short_description: Information about deadline extensions and degree plans for the Mannheim Master in Data Science program.
source: ./data/test_documents/Extension of Deadlines MMDS.html

### document 1 ###
study_program: M.Sc. Business Informatics
short_description: Information about deadline extensions and the master's program in Business Informatics.
source: ./data/test_documents/Extension of Deadlines Master Business Infromatics.html

### document 2 ###
study_program: Mannheim Master in Data Science
short_description: This document provides information about the Mannheim Master in Data Science program, including study organization, deadlines, learning agreements, recognition of coursework and examinations, module changes, team projects, completing exams before the master's thesis, studying abroad, degree plans and course schedules, contact information for the examination committee and student advisory service, finding

### Does the source metadata influence the predicted study program?

The source metadata could influence the predicted study program because almost all test documents have filenames which indicate the study program they belong to. If that is the case, the metadata tagger might perform worse outside of this test scenario where files might have meaningless names. The following code tests if the metadata tagger still works with meaningless filenames.

In [9]:
# Documents with meaningless source metadata: work on a deep copy and replace
# every source path by an index-based filename that keeps only the extension.
documents_no_source = deepcopy(documents)
for idx, doc in enumerate(documents_no_source):
    extension = doc.metadata["source"].rsplit(".", 1)[-1]
    doc.metadata = {"source": f"./data/test_documents/{idx}.{extension}"}

for doc in documents_no_source:
    print(doc.metadata)

{'source': './data/test_documents/0.html'}
{'source': './data/test_documents/1.html'}
{'source': './data/test_documents/2.html'}
{'source': './data/test_documents/3.html'}
{'source': './data/test_documents/4.html'}
{'source': './data/test_documents/5.html'}
{'source': './data/test_documents/6.html'}
{'source': './data/test_documents/7.html'}
{'source': './data/test_documents/8.html'}
{'source': './data/test_documents/9.pdf'}
{'source': './data/test_documents/10.pdf'}
{'source': './data/test_documents/11.pdf'}
{'source': './data/test_documents/12.pdf'}
{'source': './data/test_documents/13.pdf'}
{'source': './data/test_documents/14.html'}
{'source': './data/test_documents/15.html'}


In [10]:
# Repeat the tagging on the copies whose source paths carry no information.
documents_w_metadata_no_source = get_metadata(
    documents_no_source,
    metadata_tagger=document_transformer,
    summarizer=map_reduce_summarizer,
    llm=llm,
    max_tokens=3600,
)

Processing document 0/15
	Apply metadata tagger...
Processing document 1/15
	Apply metadata tagger...
Processing document 2/15
	Summarize document...
	Apply metadata tagger...
Processing document 3/15
	Apply metadata tagger...
Processing document 4/15
	Summarize document...
	Apply metadata tagger...
Processing document 5/15
	Apply metadata tagger...
Processing document 6/15
	Apply metadata tagger...
Processing document 7/15
	Apply metadata tagger...
Processing document 8/15
	Apply metadata tagger...
Processing document 9/15
	Summarize document...
	Apply metadata tagger...
Processing document 10/15
	Apply metadata tagger...
Processing document 11/15
	Summarize document...
	Apply metadata tagger...
Processing document 12/15
	Summarize document...
	Apply metadata tagger...
Processing document 13/15
	Summarize document...
	Apply metadata tagger...
Processing document 14/15
	Summarize document...
	Apply metadata tagger...
Processing document 15/15
	Apply metadata tagger...
done!


In [11]:
# Print the metadata generated for the documents with anonymized sources.
for idx, tagged_doc in enumerate(documents_w_metadata_no_source):
    print(f"### document {idx} ###")
    for key, value in tagged_doc.metadata.items():
        print(f"{key}: {value}")
    print()

### document 0 ###
study_program: Mannheim Master in Data Science
short_description: The document provides information about deadline extensions, degree plans and course schedules, examination regulations and module catalog, examination committee, advisory service, and master's thesis for the Mannheim Master in Data Science program.
source: ./data/test_documents/0.html

### document 1 ###
study_program: M.Sc. Business Informatics
short_description: Information about deadline extensions and the master's program in Business Informatics.
source: ./data/test_documents/1.html

### document 2 ###
study_program: Mannheim Master in Data Science
short_description: This document provides information about the Mannheim Master in Data Science program, including study organization, deadlines, learning agreements, recognition of coursework and examinations, module changes, team projects, completing exams before the master's thesis, studying abroad, degree plans and course schedules, contact informat

**Result:** The source metadata does not seem to influence the predicted study program, as all predictions are the same as before.