## Import Packages

Load the various libraries that will be needed in this tutorial, including all the langchain libraries we will use.

In [None]:
# vector DB
import os
import pandas as pd
from getpass import getpass
import kdbai_client as kdbai
import time
import multiprocessing

In [None]:
# langchain packages
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import KDBAI
from langchain_openai import OpenAI
from langchain.chains.question_answering import load_qa_chain

## Set API Keys

To follow this example you will need to request an [Open API key](https://platform.openai.com/apps).

You can create one for free using the link provided. Once you have the credentials you can add them below. 

In [None]:
os.environ["OPENAI_API_KEY"] = (
    os.environ["OPENAI_API_KEY"]
    if "OPENAI_API_KEY" in os.environ
    else getpass("OpenAI API Key:")
)

## Load and split the data into chunks.

In [None]:
# LOAD IN THE TorQ DOCUMENTATION
loader = TextLoader('./TorQ+Conf.txt')
doc = loader.load()
# Chunk the documents into 500 character chunks using langchain's text splitter "RucursiveCharacterTextSplitter"
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)

#split_documents produces a list of all the chunks created, printing out first chunk for example
chunks = [p.page_content for p in text_splitter.split_documents(doc)]

## Store Embeddings in KDB.AI

In [None]:
# Define an OpenAI text embedding model
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

In [None]:
# Establish KDB.AI Server session
session = kdbai.Session(endpoint="http://localhost:8082")

In [None]:
# Define Schema
rag_schema = {
    "columns": [
        {"name": "id", "pytype": "str"},
        {"name": "text", "pytype": "bytes"},
        {
            "name": "embeddings",
            "pytype": "float32",
            "vectorIndex": {"dims": 1536, "metric": "L2", "type": "flat"},
        },
    ]
}

In [None]:
# First ensure the table does not already exist
try:
    session.table("rag_langchain").drop()
    time.sleep(5)
except kdbai.KDBAIException:
    pass
table = session.create_table("rag_langchain", rag_schema)

In [7]:
# use KDBAI as vector store
vecdb_kdbai = KDBAI(table, embeddings)
# Foo function
def foo(n):
    vecdb_kdbai.add_texts(texts=chunks)

if __name__ == '__main__':
    # Start foo as a process
    p = multiprocessing.Process(target=foo, name="Foo", args=(5,))
    p.start()
# Wait for a 5 seconds max for foo
# Usage: join([timeout in seconds])
p.join(5)

# If thread is active
if p.is_alive():
    print("Kill the add_texts Function")

    # Terminate foo
    p.terminate()
    p.join()
vecdb_kdbai.aadd_texts(texts=chunks)

Kill the add_texts Function


<coroutine object VectorStore.aadd_texts at 0x7efdc09920a0>

## Create Q&A RAG bot

Create a Q&A bot using the OpenAI model gpt-3.5-turbo-16k and the vecdb_kdbai vector store containing the embedded TorQ documentation. 

In [8]:
K = 10
qabot = RetrievalQA.from_chain_type(chain_type='stuff',
                                    llm=ChatOpenAI(model='gpt-3.5-turbo-16k', temperature=0.0), 
                                    retriever=vecdb_kdbai.as_retriever(search_kwargs=dict(k=K)),
                                    return_source_documents=True)

In [10]:
query ="What is STP?"
print(f'\n\n{query}\n')
print(qabot.invoke(dict(query=query))['result'])



What is STP?

Based on the given context, there is no mention of STP (Shortest Path) or any other relevant acronym that could be associated with "STP." Therefore, I don't have enough information to answer your question.


In [38]:
session.table("rag_langchain").drop()

True

In [39]:
session.list()

[]