## Retrieval Agents

In [2]:
# We have seen that conversational agents can struggle with data freshness, domain specific data or accessing confidential or organizational internal documents. This problem can be alleviated by using using Retrieval Augmentation tools.


# On the other sid using naive retrieval augmentation tool without the use of agents means we will retrieve contexts with every query. Again, this isnt always ideal as nto every query requires access to external knowledge

## Building the Knowledge Base

In [3]:
from datasets import load_dataset

In [4]:
data = load_dataset('medical_dialog','processed.en',split='train')

In [5]:
data

Dataset({
    features: ['description', 'utterances'],
    num_rows: 482
})

In [6]:
utterances = []
for i in range(len(data)):
   utterances.append(''.join(data['utterances'][i]))

In [7]:
data1 = data.to_pandas()

In [8]:
data1['utterances'] = utterances

In [9]:
data1.head()

Unnamed: 0,description,utterances
0,throat a bit sore and want to get a good imune...,patient: throat a bit sore and want to get a g...
1,"hey there i have had cold ""symptoms"" for over ...","patient: hey there i have had cold ""symptoms"" ..."
2,i have a tight and painful chest with a dry co...,patient: i have a tight and painful chest with...
3,what will happen after the incubation period f...,patient: what will happen after the incubation...
4,suggest treatment for pneumonia,patient: just found out i was pregnant. yester...


In [10]:
data1 = data1.reset_index()

## Initialize the Embedding Model and Vector DB

In [11]:
import os
from dotenv import load_dotenv
load_dotenv()

from langchain.embeddings.openai import OpenAIEmbeddings

In [12]:
#initialize the embedding object


embed = OpenAIEmbeddings(openai_api_base=os.environ['OPENAI_REVERSE_PROXY'],
                         openai_api_key= os.environ['OPENAI_API_KEY'])

## Initialize the pinecone connection

In [13]:
import pinecone

index_name = 'langchain-retrieval-agent'


pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    environment= 'asia-southeast1-gcp-free'
)


## Create the index if it doesnt exist

In [14]:
if index_name not in pinecone.list_indexes():
    # we create a new index

    pinecone.create_index(
        name = index_name,
        metric = 'dotproduct',
        dimension = 1536
    )

## Then connect to the index

In [15]:
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 21}},
 'total_vector_count': 21}

## Now comes the main part ie Indexing

In [16]:
data1['utterances']

0      patient: throat a bit sore and want to get a g...
1      patient: hey there i have had cold "symptoms" ...
2      patient: i have a tight and painful chest with...
3      patient: what will happen after the incubation...
4      patient: just found out i was pregnant. yester...
                             ...                        
477    patient: my 5 year old son woke up not feeling...
478    patient: i have a dry cough and sore throat- i...
479    patient: how do i know if i have a normal cold...
480    patient: hi- i was diagnosed a month ago with ...
481    patient: i have a aunt that is in the hospital...
Name: utterances, Length: 482, dtype: object

In [17]:
from uuid import uuid4
data1['id'] = [uuid4() for i in range(len(data1))]

In [18]:
data1.head()

Unnamed: 0,index,description,utterances,id
0,0,throat a bit sore and want to get a good imune...,patient: throat a bit sore and want to get a g...,79bb5dee-31b9-4a05-a3b0-49a34da22967
1,1,"hey there i have had cold ""symptoms"" for over ...","patient: hey there i have had cold ""symptoms"" ...",d77cc443-49e5-4792-a56d-68bb0826d3ae
2,2,i have a tight and painful chest with a dry co...,patient: i have a tight and painful chest with...,21b07131-28c7-4606-9055-9b53c9eb144e
3,3,what will happen after the incubation period f...,patient: what will happen after the incubation...,b454da53-3bff-43b1-968c-ae88884293c0
4,4,suggest treatment for pneumonia,patient: just found out i was pregnant. yester...,d8945682-f991-4abb-a6be-c6f40cc0f19d


In [19]:
#We can perform the indexing task using the LangChain vector store object. But for now it is much faster  to do it via the Pinecone python client directly. We will do this in batches of 100 or more.


from tqdm.auto import  tqdm
from uuid import uuid4

batch_size = 100


texts = []
metadatas = []

for i in tqdm(range(0, len(data1), batch_size)):

    #get end of batch
    i_end = min(len(data), i+batch_size)
    batch = data1.iloc[i:i_end]

    # first get metadata fields for this record

    metadatas = [{
        'title': record['description'],
        'text': record['utterances']
    } for j, record in batch.iterrows()]

    # get the list of contexts/ documents
    documents = batch['utterances']
    # create document embeddings
    embeds = embed.embed_documents(documents)
    # get IDs
    ids = str(batch['id'])
    # add everything to pinecone
    index.upsert(vectors=zip(ids, embeds, metadatas))

  0%|          | 0/5 [00:00<?, ?it/s]

In [20]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 21}},
 'total_vector_count': 21}

## Creating a Vector Store and Querying

In [21]:
from langchain.vectorstores import Pinecone
text_field = "text"


# switch back to normal index for langchain
index = pinecone.Index(index_name)

vectorstore = Pinecone(index, embed.embed_query, text_field)




In [22]:
query = "I am having a dry cough, and high head ache, does it mean I have covid?"

vectorstore.similarity_search(query, 10)

# %%


[Document(page_content="patient: i have a dry cough and sore throat- it's been a week now and the cough seems to be getting worse- no runny nose or fever, sometimes a headache, no shortness of breath...should i get tested for covid19?doctor: in brief: covid good guidelines can be found at cdc. gov/coronavirus/2019. you would be considered low risk. symptoms last up to two weeks. high probability many will get the disease and testing leads to no change in action at this time. no obvious therapy (however some encouraging possibilites) and for most people the virus runs its course without incidence. stay put and talk to your provider. would you like to video or text chat with me?", metadata={'title': "i have a dry cough and sore throat- it's been a week now and the cough seems to be getting worse- no runny nose or fever, sometimes a headache, no shortness of breath...should i get tested for covid19?"}),
 Document(page_content='patient: how do i know if i have a normal cold or maybe the co

## Now Lets initialize the conversational agent

In [23]:
from langchain.chat_models import ChatOpenAI
from langchain import OpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA
import os
from dotenv import  load_dotenv
load_dotenv()

True

## Initialize the LLM

In [24]:
# Chat completion llm

llm = OpenAI(
   temperature=0,
   openai_api_base= os.environ['OPENAI_REVERSE_PROXY'],
   openai_api_key=os.environ['OPENAI_API_KEY']
)

## Initialize the memory

In [25]:
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True
)

## Retrieval QA Chain

In [26]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=vectorstore.as_retriever()
)

In [27]:
qa.run("Why am i having dry cough with breathing trouble")

' It is possible that you are having dry cough with breathing trouble due to the coronavirus infection, but it is not the common symptom. It is also possible that the dry cough and breathing trouble could be due to asthma or allergies. It is recommended that you contact your doctor for advice and follow up, and consider getting a chest x-ray if necessary.'

## But this inst yet ready for our covnersational agent.For that we need to convert this retrieval chain into a tool. We do that like so:

In [34]:
from langchain.agents import Tool

tools = [
    Tool(
        name = "knowledge Base",
        func = qa.run,
        description = "use this tool when answering general knowledge queries to get more information about the topic"
    )
]

## Initialize the agent

In [32]:
from langchain.agents import initialize_agent

agent = initialize_agent(
    agent = "chat-conversational-react-description",
    tools = tools,
    llm = llm,
    verbose = True,
    max_iterations = 3,
    early_stopping_method  = 'generate',
    memory = conversational_memory
)

## Using the Conversational Agent

In [35]:
agent("What kind of disease is covid? How can i know that I dont have covid?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m

RESPONSE
--------------------
```json
{
    "action": "knowledge Base",
    "action_input": "COVID-19"
}
```[0m
Observation: [36;1m[1;3m Yes, please call the hotline on 0800 029 999, see the faqs on https://www.gov.za/coronavirus/faq or https://www.who.int/news-room/q-a-detail/q-a-coronaviruses and self-quarantine and monitor yourself for 14 days from return date. If symptoms develop, please contact the NICD for screening to see if you have to get tested.[0m
Thought:[32;1m[1;3m

AI:

RESPONSE
--------------------
```json
{
    "action": "Final Answer",
    "action_input": "If you have recently returned from a trip, it is important to self-quarantine and monitor yourself for 14 days from the return date. If any symptoms develop, contact the National Institute for Communicable Diseases (NICD) for screening and to see if you need to get tested for COVID-19."
}
```[0m

[1m> Finished chain.[0m


{'input': 'What kind of disease is covid? How can i know that I dont have covid?',
 'chat_history': [HumanMessage(content='I am having a dry cough, and high head ache, does it mean I have covid?', additional_kwargs={}, example=False),
  AIMessage(content='Based on the information obtained from the knowledge base, common symptoms of COVID-19 include fever, dry cough, fatigue, and loss of taste or smell. Other symptoms may include shortness of breath, body aches, sore throat, headache, diarrhea, and congestion or runny nose.', additional_kwargs={}, example=False)],
 'output': 'If you have recently returned from a trip, it is important to self-quarantine and monitor yourself for 14 days from the return date. If any symptoms develop, contact the National Institute for Communicable Diseases (NICD) for screening and to see if you need to get tested for COVID-19.'}

In [36]:
pinecone.delete_index(index_name)