In [None]:
# dependencies -- %pip (rather than !pip) installs into the environment of the running kernel
%pip install pandas pypdf python-dotenv openai langchain-iris langchain tiktoken langchain-community langchain-core

In [20]:
# load OpenAI APIKEY from env
import os
from dotenv import load_dotenv, find_dotenv

# Prefer the webinar's .env file; when it is not present on this machine,
# fall back to the nearest .env that find_dotenv() can locate.
dotenv_path = 'c:\\AIWebinar\\.env'
if not os.path.isfile(dotenv_path):
    dotenv_path = find_dotenv()

# load_dotenv() returns False when it loaded no variables; say so here rather
# than letting the OpenAI client fail later with a less obvious error.
if not load_dotenv(dotenv_path):
    print("Warning: no variables were loaded from a .env file; "
          "OPENAI_API_KEY must already be set in the environment.")

In [21]:
# OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain

# text loading and splitting
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader


# IRIS as vector store
from langchain_iris import IRISVector

# parse response from llm
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

In [22]:
# name of the OpenAI chat model used when the LLM is created
llm_model = "gpt-3.5-turbo"

# load text & split in chunks
pdf_path = "c:\\AIWebinar\\FHIREmails.pdf"
loader = PyPDFLoader(pdf_path)
# Load exactly once: a second loader.load() re-reads and re-parses the whole
# PDF only to produce the same documents again.
documents = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=20)
docs = text_splitter.split_documents(documents)

# function to use to calculate vectors (embeddings) from text
embeddings = OpenAIEmbeddings()


In [None]:
# IRIS connection string
# Each setting can be overridden through an environment variable, so real
# credentials do not have to be edited into the notebook; the defaults are the
# values of the local demo instance.
username = os.environ.get('IRIS_USERNAME', 'superuser')
password = os.environ.get('IRIS_PASSWORD', 'SYS')
hostname = os.environ.get('IRIS_HOSTNAME', 'localhost')
port = os.environ.get('IRIS_PORT', '51787')
namespace = os.environ.get('IRIS_NAMESPACE', 'USER')
CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"
# Show where we are connecting without writing the password into the saved
# notebook output.
print(f"iris://{username}:***@{hostname}:{port}/{namespace}")

In [24]:
# Embed the split chunks and store them as vectors in IRIS.
# Calling from_documents creates the collection.
COLLECTION_NAME = "fhirmemospdf"

db = IRISVector.from_documents(
    documents=docs,
    embedding=embeddings,
    connection_string=CONNECTION_STRING,
    collection_name=COLLECTION_NAME,
)

In [None]:
# use the following if you are connecting to an existing collection
#db = IRISVector(
#    embedding_function=embeddings,
#    collection_name=COLLECTION_NAME,
#    connection_string=CONNECTION_STRING,
#)

In [None]:
# Count the stored entries by their ids.
stored_ids = db.get()['ids']
print(f"Number of docs in vector store: {len(stored_ids)}")

# Questions & Answers using documents as context

In [26]:
# Chat model that answers the questions; temperature is fixed at 0.0.
llm = ChatOpenAI(
    model=llm_model,
    temperature=0.0,
)

In [27]:
# Single field ("rsp") the model is asked to return, so the reply can be parsed afterwards.
rsp_schema = ResponseSchema(name="rsp", description="response to question", type="string")

# Parser built from that schema, plus the formatting instructions that are
# injected into the prompt.
response_schemas = [rsp_schema]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

In [28]:
# Prompt text for the question-answering chain. It contains three placeholders:
#   {format_instructions} - output-format instructions for the model
#   {context}             - the text the answer must be based on
#   {question}            - the employee's question
# The backslash right after the opening quotes keeps the prompt from starting
# with an empty line.
query_template = """\
You are a chat bot assistant at our health organization that helps employees with internal company information.
Using the context, provide a comprehensible and clear response that will answer the employee's question.
Your answer must be in the same language that the question is asked. 

{format_instructions}

Use the following context:
{context}

Question:
{question}

Do not use any other information.
"""

In [29]:
# build prompt
from langchain.prompts import PromptTemplate
QA_CHAIN_PROMPT = PromptTemplate(
    # Must list the placeholders the template actually contains and that are
    # filled at run time: {context} and {question}. There is no {query}
    # placeholder in the template.
    input_variables=["context", "question"],
    # {format_instructions} is fixed up front, so it is bound as a partial.
    partial_variables={"format_instructions": format_instructions},
    template=query_template,
)

In [33]:
from langchain.chains import RetrievalQA

# Retriever over the IRIS vector store, asked for k=3 documents per query.
retriever = db.as_retriever(search_kwargs={"k": 3})

# Options for the underlying chain: our custom prompt, with verbose output enabled.
chain_type_kwargs = {"prompt": QA_CHAIN_PROMPT, "verbose": True}

qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [None]:
# Ask one question. Calling the chain object directly is deprecated in
# LangChain, so go through invoke() with the chain's "query" input key.
result = qa_chain.invoke({"query": "On what date did Moshe and Rivka meet to discuss the InterSystems IRIS FHIR Server's architecture in their organization?"})
#result = qa_chain.invoke({"query": "When was the final deployment of FHIR server complete?"})

# a question written in Hebrew, to check that the answer comes back in the question's language
#result = qa_chain.invoke({"query": "מתי התקיימה הפגישה בין משה ורבקה לטובת השרתים של אינטרסיסטמס?"})

print(result)

In [None]:
# Parse the model's raw answer text into a dict using the structured output parser.
raw_answer = result["result"]
output_dict = output_parser.parse(raw_answer)
output_dict