# Experimental - QA 
**using Instructor Embedding, GPT4All, LangChain, and Chroma**  


References:
- [Question Answering over Docs](https://python.langchain.com/en/latest/use_cases/question_answering.html)
- [Langchain Integration - GPT4All](https://python.langchain.com/en/latest/modules/models/llms/integrations/gpt4all.html)
- [Retrieval Question/Answering](https://python.langchain.com/en/latest/modules/chains/index_examples/vector_db_qa.html)
- [HKUNLP/instructor-embedding](https://github.com/HKUNLP/instructor-embedding/)

## Import packages

In [None]:
# The Embedding Model
from langchain.embeddings import HuggingFaceInstructEmbeddings

# The Inference LLM 
from langchain.llms import GPT4All

from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA

from langchain.document_loaders import TextLoader


## Setup models

In [None]:
# For embedding
import torch

# NOTE: `model_name` is reassigned later by the LLM setup cell.
model_name = "hkunlp/instructor-large"
# Fall back to the CPU when no CUDA device is available so the notebook still
# runs on machines without a GPU (the device used to be hard-coded to 'cuda').
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
hf_instructor_embedding = HuggingFaceInstructEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs
)

In [None]:
# For LLM
import os

model_local_path = './models/'
model_name = 'ggml-mpt-7b-instruct.bin'

# Build the path with os.path.join instead of `+` so it stays valid even if
# model_local_path is edited to drop its trailing slash.
model_full_local_path = os.path.join(model_local_path, model_name)

# NOTE(review): n_predict (2048) is larger than n_ctx (512) -- confirm that
# both values are intended.
llm = GPT4All(
    model=model_full_local_path,
    verbose=False,
    n_predict=2048,
    n_ctx=512,
    n_threads=6,
)

## Prepare documents and build the QA chain

In [None]:
# Splitter configured with a chunk size of 1024 and an overlap of 128.
text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=128)

# Load the source text file and split it into chunks.
loader = TextLoader("dataset/kxxxxx.txt")
documents = loader.load()
texts = text_splitter.split_documents(documents)

# Index the chunks in a Chroma vector store using the Instructor embeddings.
docsearch = Chroma.from_documents(texts, hf_instructor_embedding)

# Retrieval QA chain: the LLM answers using what the store's retriever returns.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    # return_source_documents=True
)


## Evaluation

In [None]:
%%time


# Ask the chain a question (this cell is timed by the %%time magic above).
query = "why should I rebuild mcpd database?"
result = qa({"query": query})
# Show the generated answer; it was previously computed but never displayed.
print(result["result"])

In [None]:
%%time

# Ask the chain a version-specific question (timed by the %%time magic above).
# The query text previously misspelled "database" as "databse".
query = "How can I rebuild mcpd database on version 16?"
result = qa({"query": query})
# Show the generated answer; it was previously computed but never displayed.
print(result["result"])

In [None]:
# Run one more query through the chain and print the generated answer.
query = "What consist the main territory?"
result = qa(dict(query=query))
print(result["result"])