# QA over Documents

In [1]:
# import basic openai keys
import os
import sys

import openai
from dotenv import find_dotenv, load_dotenv

_ = load_dotenv(find_dotenv())

openai.api_base = os.environ["OPENAI_API_BASE"]
openai.api_version = os.environ["OPENAI_API_VERSION"]
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_type = os.environ["OPENAI_API_TYPE"]

In [5]:
# load source markdown material
from langchain.document_loaders import UnstructuredMarkdownLoader

source_file_path = "./document.md"
loader = UnstructuredMarkdownLoader(
    source_file_path,
    mode="elements",
    strategy="fast",
)
raw_markdown_docs = loader.load()

In [6]:
# split docs to token chunks
# config chunk_size & chunk_overlap need to tune in the following debugging
from langchain.text_splitter import TokenTextSplitter

chunk_size = 200
chunk_overlap = 40

token_spliter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
token_chunk_docs = token_spliter.split_documents(raw_markdown_docs)
print(
    f"chunk_size: {chunk_size}\n\
chunk_overlap: {chunk_overlap}\n\n\
result chunks count: {len(token_chunk_docs)}"
)

chunk_size: 200
chunk_overlap: 40

result chunks count: 181


In [7]:
# save chunks as vector into db
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

persist_directory = "./chroma/"
embeddings = OpenAIEmbeddings(deployment="embedding", chunk_size=16)

vectorstore = Chroma.from_documents(
    documents=token_chunk_docs,
    embedding=embeddings,
    persist_directory=persist_directory,
)
print(
    f"Chroma vectordb successfully created, its collection count: {vectorstore._collection.count()}"
)

# test the vectordb query function
test_question = "Does the gcc tenant support co-organizer attendace report?"
res_docs = vectorstore.similarity_search(test_question, k=5)
print(res_docs[0])

# save again, this is not necessary, due to we haven't change db after init
vectorstore.persist()

Chroma vectordb successfully created, its collection count: 607
page_content="] The coorganizer thread property is overrided. workaround: 1. Remove the co-organzier then save 2. Add the co-organzier back then save 3. co-organizer restart the client and check attendance report, tracked by Task 3183235: Standard solution to enhance the support attendance report for coorganizer. [Limitation] Didn't support coorganizer in GCCH/DOD env, due to couldn't set coorganizerid thread property successfully.\n| Customer could always see the entrance of in meeting attendance reports    | inmeetingreport || [ByDesign]Those tenants can always use in-meeting report feature regardless of the policy value, tenant list please see [Reference]   |||\n| Customer couldn't find channel meeting's report download chiclet from its post comments    | channelmeetingchiclet || [Limitation]Channel 2.0 team no longer provides report control messages after" metadata={'category': 'NarrativeText', 'file_directory': '.', '

In [5]:
# load the exist local vectorstore
persist_directory = "./chroma/"
embeddings = OpenAIEmbeddings(deployment="embedding", chunk_size=16)
local_vectordb = Chroma(
    embedding_function=embeddings, persist_directory=persist_directory
)
print(local_vectordb._collection.count())

562


In [14]:
# load llm
from langchain.chat_models import AzureChatOpenAI

llm = AzureChatOpenAI(deployment_name="gpt35-16k", temperature=0.8)
llm.predict("Hello world!")

'Hello there! How can I assist you today?'

In [20]:
# build prompt
from langchain.prompts import PromptTemplate

template = """Use the following pieces of context to answer the question at the end. \
Always say "thanks for asking!" at the end of the answer. 
Context: {context}
Question: {question}
Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"], template=template
)

In [21]:
# run chain
from langchain.chains import RetrievalQA

question = "Does attendance report support GCCH co-organizer?"
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=local_vectordb.as_retriever(
        search_type="similarity", search_kwargs={"k": 5}
    ),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

result = qa_chain({"query": question})
print(result["result"])

Based on the given context, the attendance report does not support GCCH co-organizers. This limitation is due to the inability to set the co-organizer ID thread property successfully. Thanks for asking!


In [17]:
result

{'query': 'Does attendance report support GCCH co-organizer?',
 'result': "Based on the context provided, it appears that attendance report does not support GCCH co-organizers. There is a limitation mentioned that it didn't support coorganizer in GCCH/DOD env. Additionally, there is a workaround mentioned for setting the coorganizerid thread property successfully. Therefore, attendance report does not support GCCH co-organizer. Thanks for asking!",
 'source_documents': [Document(page_content="organizer restart the client and check attendance report, tracked by Task 3183235: Standard solution to enhance the support attendance report for coorganizer. [Limitation] Didn't support coorganizer in GCCH/DOD env, due to", metadata={'source': './customerescalation.md', 'filename': 'customerescalation.md', 'file_directory': '.', 'filetype': 'text/markdown', 'page_number': 1, 'category': 'NarrativeText'}),
  Document(page_content="] The coorganizer thread property is overrided. workaround: 1. Remo

In [18]:
print("retrieved source_documents count:", len(result["source_documents"]))
print('\n\n'.join([doc.page_content for doc in result["source_documents"]]))

retrieved source_documents count: 4
organizer restart the client and check attendance report, tracked by Task 3183235: Standard solution to enhance the support attendance report for coorganizer. [Limitation] Didn't support coorganizer in GCCH/DOD env, due to

] The coorganizer thread property is overrided. workaround: 1. Remove the co-organzier then save 2. Add the co-organzier back then save 3. co-organizer restart the client and check attendance report, tracked by Task 3183235: Standard solution to enhance the support attendance report for coorganizer. [Limitation] Didn't support coorganizer in GCCH/DOD env, due to couldn't set coorganizerid thread property successfully.
| Customer could always see the entrance of in meeting attendance reports    | inmeetingreport || [ByDesign]Those tenants can always use in-meeting report feature regardless of the policy value, tenant list please see [Reference]   |||
| Customer couldn't find channel meeting's report download chiclet from its post c

In [22]:
result = qa_chain({"query": "How many cases or reasons for missing report?"})
print(result["result"])

There are multiple cases or reasons for missing reports. It could be due to technical issues, human error, or delays in data collection. Sometimes, reports may go missing if they were not properly filed or if there were issues with the system used for reporting. Additionally, missing reports could occur if there are delays in receiving information from external sources or if there are issues with data integration. Thanks for asking!


In [23]:
print("retrieved source_documents count:", len(result["source_documents"]))
print('\n\n'.join([doc.page_content for doc in result["source_documents"]]))

retrieved source_documents count: 3
Missing Reports

Missing Reports

Missing Reports


# Private domain

In [24]:
from langchain import PromptTemplate, LLMChain
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

local_path = ("C:/Users/yanfu/Downloads/ggml-gpt4all-j-v1.3-groovy.bin")

In [25]:
# Callbacks support token-wise streaming
callbacks = [StreamingStdOutCallbackHandler()]

# Verbose is required to pass to the callback manager
llm = GPT4All(model=local_path, callbacks=callbacks, verbose=True)

template = """Question: {question}

Answer: Let's think step by step."""

prompt = PromptTemplate(template=template, input_variables=["question"])

llm_chain = LLMChain(prompt=prompt, llm=llm)

Found model file at  C:/Users/yanfu/Downloads/ggml-gpt4all-j-v1.3-groovy.bin


In [27]:
question = "What NFL team won the Super Bowl in the year Justin Bieber was born?"

llm_chain.run(question)

 
1) In 2012, when Justin Bieber was born, there were no current or former NFL teams that had won a Super Bowl championship yet. The Pittsburgh Steelers and New England Patriots are two of the oldest franchises to have achieved this feat in recent years (the Steelers winning their first Super Bowl title in 1968 while the Patriots' last victory came in 2018). 
2) In 2013, when Justin Bieber was born, there were no current or former NFL teams that had won a Super Bowl championship yet. The Pittsburgh Steelers and New England Patriots are two of the oldest franchises to have achieved this feat (the Steelers winning their first Super Bowl title in 1968 while the Patriots' last victory came in 2018). 
3) In 2014, when Justin Bieber was born, there were no current or former NFL teams that had won a Super Bowl championship yet. The Pittsburgh Steelers and New England Patriots are two of the oldest franchises to have achieved this feat (the Steelers winning their first Super Bowl title in 1968

" \n1) In 2012, when Justin Bieber was born, there were no current or former NFL teams that had won a Super Bowl championship yet. The Pittsburgh Steelers and New England Patriots are two of the oldest franchises to have achieved this feat in recent years (the Steelers winning their first Super Bowl title in 1968 while the Patriots' last victory came in 2018). \n2) In 2013, when Justin Bieber was born, there were no current or former NFL teams that had won a Super Bowl championship yet. The Pittsburgh Steelers and New England Patriots are two of the oldest franchises to have achieved this feat (the Steelers winning their first Super Bowl title in 1968 while the Patriots' last victory came in 2018). \n3) In 2014, when Justin Bieber was born, there were no current or former NFL teams that had won a Super Bowl championship yet. The Pittsburgh Steelers and New England Patriots are two of the oldest franchises to have achieved this feat (the Steelers winning their first Super Bowl title in 

In [71]:
template2 = """Use the following pieces of context to answer the question at the end. \
If you don't know the answer, just say that you don't know, don't try to make up an answer. \
Always say "thanks for asking!" at the end of the answer. 
Context: 
organizer restart the client and check attendance report, tracked by Task 3183235: Standard solution to enhance the support attendance report for coorganizer. [Limitation] Didn't support coorganizer in GCCH/DOD env, due to

] The coorganizer thread property is overrided. workaround: 1. Remove the co-organzier then save 2. Add the co-organzier back then save 3. co-organizer restart the client and check attendance report, tracked by Task 3183235: Standard solution to enhance the support attendance report for coorganizer. [Limitation] Didn't support coorganizer in GCCH/DOD env, due to couldn't set coorganizerid thread property successfully.
| Customer could always see the entrance of in meeting attendance reports    | inmeetingreport || [ByDesign]Those tenants can always use in-meeting report feature regardless of the policy value, tenant list please see [Reference]   |||
| Customer couldn't find channel meeting's report download chiclet from its post comments    | channelmeetingchiclet || [Limitation]Channel 2.0 team no longer provides report control messages after

't see entrance for meeting created by outlook/mailbox  |Task 3134411: Enhance support attendance report for co-organizer set through outlook Co-organizer can view/download attendance report same as organizer|
| Meeting policy  

Question: {question}
Helpful Answer:"""

prompt2 = PromptTemplate(template=template, input_variables=["question"])

llm_chain.prompt = prompt2

# llm_chain2 = LLMChain(prompt=prompt, llm=llm)

In [72]:
question = "Does the gcc tenant support co-organizer attendace report?"

llm_chain.run(question)

 
1) The GCC (Gnu Compiler Collection) is a set of open source compilers that are used to develop software for various platforms, including Linux and Unix systems. It includes several different versions such as gcc-4.9, gcc-5.3, etc., each with its own features and capabilities.
2) The GCC compiler supports the C programming language syntax but also has extensions or alternative compilers that can be used to write code in other languages like Fortran, Ada, Java, Python, Perl, Scheme, Go, Rust, Swift, Kotlin, Julia, etc., depending on your needs. 
3) To answer this question, we need to know which version of GCC you are referring to and what specific feature or report is being referred to in the context provided.

' \n1) The GCC (Gnu Compiler Collection) is a set of open source compilers that are used to develop software for various platforms, including Linux and Unix systems. It includes several different versions such as gcc-4.9, gcc-5.3, etc., each with its own features and capabilities.\n2) The GCC compiler supports the C programming language syntax but also has extensions or alternative compilers that can be used to write code in other languages like Fortran, Ada, Java, Python, Perl, Scheme, Go, Rust, Swift, Kotlin, Julia, etc., depending on your needs. \n3) To answer this question, we need to know which version of GCC you are referring to and what specific feature or report is being referred to in the context provided.'