# QA over Documents

In [1]:
# import basic openai keys
import os
import sys

import openai
from dotenv import find_dotenv, load_dotenv

_ = load_dotenv(find_dotenv())

openai.api_base = os.environ["OPENAI_API_BASE"]
openai.api_version = os.environ["OPENAI_API_VERSION"]
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_type = os.environ["OPENAI_API_TYPE"]

In [2]:
# # split docs to token chunks

# from langchain.text_splitter import TokenTextSplitter
# from langchain.document_loaders import UnstructuredMarkdownLoader

# chunk_size = 200
# chunk_overlap = 40

# source_file_path = "./data/document.md"
# loader = UnstructuredMarkdownLoader(
#     source_file_path,
#     mode="elements",
#     strategy="fast",
# )
# raw_markdown_docs = loader.load()


# token_spliter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
# doc_chunks = token_spliter.split_documents(raw_markdown_docs)

# print(
#     f"chunk_size: {chunk_size}\n\
# chunk_overlap: {chunk_overlap}\n\n\
# result chunks count: {len(doc_chunks)}"
# )

In [3]:
from langchain.document_loaders.csv_loader import CSVLoader

csvLoader = CSVLoader(file_path='./data/recentIncidents.csv', encoding='utf8')
raw_incidents_table = csvLoader.load()

In [4]:
# split docs to header chunks
# config chunk_size & chunk_overlap need to tune in the following debugging

from langchain.text_splitter import MarkdownHeaderTextSplitter
import markdown

with open('./data/document.md', 'r') as f:
    text = f.read()

headers_to_split_on = [
    ("#", "Header 1"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
doc_chunks = markdown_splitter.split_text(text)
len(doc_chunks)


8

In [5]:
# append csv into md
doc_chunks_contents = [doc.page_content for doc in doc_chunks]

In [6]:
# save chunks as vector into db
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

persist_directory = "./chroma/"
embeddings = OpenAIEmbeddings(deployment="embedding", chunk_size=16)

vectorstore = Chroma.from_documents(
    documents=raw_incidents_table,
    embedding=embeddings,
    persist_directory=persist_directory,
)

vectorstore.add_texts(texts = doc_chunks_contents)

print(
    f"Chroma vectordb successfully created, its collection count: {vectorstore._collection.count()}"
)

# test the vectordb query function
test_question = "Does the gcc tenant support co-organizer attendace report?"
res_docs = vectorstore.similarity_search(test_question, k=5)
print(res_docs[0])

# save again, this is not necessary, due to we haven't change db after init
vectorstore.persist

Chroma vectordb successfully created, its collection count: 1467
page_content="ID: 381623065\nTitle: Online: GCC High | Leidos | Unable to download Teams Attendance\nScenario-Type: Missing Report\nScenario-SubType: NoReportInTab\nTags: \nScenario-Describe【role-env-meettype-scenario】: Organizer couldn't see any attendance report\nMitigate-Category: Bug(NeedCodeFix)\nMitigate-Rootcause: Ignore tenant id when request report\nAction: FE - Fix the invalid request url format\nMeetingType: \nEnv: GCC\nCodeChange: Yes/Done\nDays: 34.31\nBack forth possible reason: Yes, GCC data is another db. gcc data need access. support provide wrong har/take long time." metadata={'row': 28, 'source': './data/recentIncidents.csv'}


<bound method Chroma.persist of <langchain.vectorstores.chroma.Chroma object at 0x000001FB75B7B890>>

In [7]:
# load the exist local vectorstore
persist_directory = "./chroma/"
embeddings = OpenAIEmbeddings(deployment="embedding", chunk_size=16)
local_vectordb = Chroma(
    embedding_function=embeddings, persist_directory=persist_directory
)
print(local_vectordb._collection.count())

1467


In [8]:
# load llm
from langchain.chat_models import AzureChatOpenAI

llm = AzureChatOpenAI(deployment_name="gpt35-16k", temperature=0.8)
llm.predict("Hello world!")

'Hello! How can I assist you today?'

In [9]:
# build prompt
from langchain.prompts import PromptTemplate

template = """Use the following pieces of context to answer the question at the end. \
Always say "thanks for asking!" at the end of the answer. 
Context: {context}
Question: {question}
Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"], template=template
)

In [10]:
# run chain
from langchain.chains import RetrievalQA

question = "Does attendance report support GCCH co-organizer?"
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=local_vectordb.as_retriever(
        search_type="similarity", search_kwargs={"k": 5}
    ),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

result = qa_chain({"query": question})
print(result["result"])

Yes, the attendance report does support GCCH co-organizers. Thanks for asking!


In [11]:
result

{'query': 'Does attendance report support GCCH co-organizer?',
 'result': 'Yes, the attendance report does support GCCH co-organizers. Thanks for asking!',
 'source_documents': [Document(page_content="'t see entrance for meeting created by outlook/mailbox  |Task 3134411: Enhance support attendance report for co-organizer set through outlook Co-organizer can view/download attendance report same as organizer|\n| Meeting policy  ", metadata={'category': 'NarrativeText', 'file_directory': '.', 'filename': 'customerescalation.md', 'filetype': 'text/markdown', 'page_number': 1, 'source': './customerescalation.md'}),
  Document(page_content="ID: 384019674\nTitle: Online: GCCH | Oshkosh Corporation - oskgovus | Issue: Users are unable to download attendance reports.\nScenario-Type: Missing Report\nScenario-SubType: NoReportInTab\nTags: \nScenario-Describe【role-env-meettype-scenario】: Organizer couldn't see any attendance report\nMitigate-Category: Bug(NeedCodeFix)\nMitigate-Rootcause: Ignore t

In [12]:
print("retrieved source_documents count:", len(result["source_documents"]))
print('\n\n'.join([doc.page_content for doc in result["source_documents"]]))

retrieved source_documents count: 5
't see entrance for meeting created by outlook/mailbox  |Task 3134411: Enhance support attendance report for co-organizer set through outlook Co-organizer can view/download attendance report same as organizer|
| Meeting policy  

ID: 384019674
Title: Online: GCCH | Oshkosh Corporation - oskgovus | Issue: Users are unable to download attendance reports.
Scenario-Type: Missing Report
Scenario-SubType: NoReportInTab
Tags: 
Scenario-Describe【role-env-meettype-scenario】: Organizer couldn't see any attendance report
Mitigate-Category: Bug(NeedCodeFix)
Mitigate-Rootcause: Ignore tenant id when request report
Action: FE - Fix the invalid request url format
MeetingType: 
Env: GCC
CodeChange: Yes/Done
Days: 15.3
Back forth possible reason: 

ID: 390137157
Title: Online: Co-organizer can't access meeting attendance report
Scenario-Type: Missing Report
Scenario-SubType: mailbox
Tags: Coorganizer, mailbox
Scenario-Describe【role-env-meettype-scenario】: Cor-organiz

In [13]:
result = qa_chain({"query": "How many cases or reasons for missing report?"})
print(result["result"])

There are five cases or reasons for a missing report. Thanks for asking!


In [14]:
print("retrieved source_documents count:", len(result["source_documents"]))
print('\n\n'.join([doc.page_content for doc in result["source_documents"]]))

retrieved source_documents count: 5
Missing Reports

Missing Reports

Missing Reports

There are five key scenarios when do the triage of the issue. Missing entrance, Missing Reports, Download failed, Missing Data, Wrong Report Content

Missing entrance, Missing Reports, Download failed, Missing Data, Wrong Report Content


# Private domain

In [15]:
from langchain import PromptTemplate, LLMChain
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

local_path = ("C:/Users/yanfu/Downloads/ggml-gpt4all-j-v1.3-groovy.bin")

In [16]:
# Callbacks support token-wise streaming
callbacks = [StreamingStdOutCallbackHandler()]

# Verbose is required to pass to the callback manager
llm = GPT4All(model=local_path, callbacks=callbacks, verbose=True)

template = """Question: {question}

Answer: Let's think step by step."""

prompt = PromptTemplate(template=template, input_variables=["question"])

llm_chain = LLMChain(prompt=prompt, llm=llm)

ImportError: Could not import gpt4all python package. Please install it with `pip install gpt4all`.

In [None]:
question = "What NFL team won the Super Bowl in the year Justin Bieber was born?"

llm_chain.run(question)

In [None]:
template2 = """Use the following pieces of context to answer the question at the end. \
If you don't know the answer, just say that you don't know, don't try to make up an answer. \
Always say "thanks for asking!" at the end of the answer. 
Context: 
organizer restart the client and check attendance report, tracked by Task 3183235: Standard solution to enhance the support attendance report for coorganizer. [Limitation] Didn't support coorganizer in GCCH/DOD env, due to

] The coorganizer thread property is overrided. workaround: 1. Remove the co-organzier then save 2. Add the co-organzier back then save 3. co-organizer restart the client and check attendance report, tracked by Task 3183235: Standard solution to enhance the support attendance report for coorganizer. [Limitation] Didn't support coorganizer in GCCH/DOD env, due to couldn't set coorganizerid thread property successfully.
| Customer could always see the entrance of in meeting attendance reports    | inmeetingreport || [ByDesign]Those tenants can always use in-meeting report feature regardless of the policy value, tenant list please see [Reference]   |||
| Customer couldn't find channel meeting's report download chiclet from its post comments    | channelmeetingchiclet || [Limitation]Channel 2.0 team no longer provides report control messages after

't see entrance for meeting created by outlook/mailbox  |Task 3134411: Enhance support attendance report for co-organizer set through outlook Co-organizer can view/download attendance report same as organizer|
| Meeting policy  

Question: {question}
Helpful Answer:"""

prompt2 = PromptTemplate(template=template, input_variables=["question"])

llm_chain.prompt = prompt2

# llm_chain2 = LLMChain(prompt=prompt, llm=llm)

In [None]:
question = "Does the gcc tenant support co-organizer attendace report?"

llm_chain.run(question)