## Retrieval Augmented Generation from Medical Documents & Custom Conversational Agent

#### 1. Imports and Class Definitions

In [1]:
import os
import pandas as pd
from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import AzureOpenAI
from ast import literal_eval
import numpy as np


DOCS_PATH = "/lakehouse/default/Files/skin_conditions_articles/"
# Keep the two values below blank in shared copies of this notebook and paste
# them in locally: when either one is empty, DocumentManager builds no client.
API_KEY = ""  # paste your api key here
AZURE_ENDPOINT = ""  # paste your endpoint here
API_VERSION = "2023-05-15"


class DocumentManager:
    """Load .docx articles, split them into chunks, embed and search them.

    The chunk table lives in ``self.df`` with the columns ``doc_title``,
    ``chunk_content``, ``chunk_id`` and (after embedding) ``ada_embedding``.
    ``search_docs`` additionally writes a ``similarities`` column.

    Attributes:
        docs_path: Directory that is scanned for documents.
        client: ``AzureOpenAI`` client, or ``None`` when no API key or
            endpoint was supplied.
        documents: Document chunks produced by ``load_and_split_documents``.
        df: DataFrame holding one row per chunk.
    """

    def __init__(self, docs_path="", api_key=API_KEY, azure_endpoint=AZURE_ENDPOINT, api_version=API_VERSION):
        """Store the document directory and create the API client if possible.

        Args:
            docs_path: Directory containing the documents to load.
            api_key: Azure OpenAI API key; an empty value disables the client.
            azure_endpoint: Azure OpenAI endpoint; an empty value disables the
                client.
            api_version: API version passed to ``AzureOpenAI``.
        """
        self.docs_path = docs_path
        # Both credentials are required; otherwise the manager can still load
        # a saved DataFrame but cannot compute embeddings.
        self.client = (
            AzureOpenAI(api_key=api_key, azure_endpoint=azure_endpoint, api_version=api_version)
            if api_key and azure_endpoint
            else None
        )
        self.documents = []
        self.df = pd.DataFrame()

    def load_and_split_documents(self, chunk_size=1500, chunk_overlap=200):
        """Load every .docx file in ``docs_path`` and split it into chunks.

        Replaces ``self.documents`` with the resulting chunks, so calling the
        method again starts from scratch instead of re-splitting old chunks.

        Args:
            chunk_size: ``chunk_size`` passed to the text splitter.
            chunk_overlap: ``chunk_overlap`` passed to the text splitter.
        """
        print("Loading and splitting documents...")
        loaded = []
        # Sorted for a reproducible chunk order; os.listdir order is arbitrary.
        for filename in sorted(os.listdir(self.docs_path)):
            # Only .docx files can be read by the loader; "~$" files are the
            # lock files Word leaves next to an open document.
            if not filename.lower().endswith(".docx") or filename.startswith("~$"):
                continue
            print(f"Loading... {filename}")
            # os.path.join avoids a doubled separator when docs_path already
            # ends with "/".
            loader = Docx2txtLoader(os.path.join(self.docs_path, filename))
            loaded.extend(loader.load())

        print(f"Loaded a total of {len(loaded)} documents.")
        print("-" * 80)
        print("Splitting the documents into chunks...")

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        self.documents = text_splitter.split_documents(loaded)

        print(f"Documents were split into: {len(self.documents)} chunks.")

    def documents_to_dataframe(self):
        """Build ``self.df`` from ``self.documents``.

        Each chunk becomes one row: the file name taken from the chunk's
        ``metadata['source']``, the chunk text, and a 1-based running number
        of the chunk within its file.
        """
        print("Converting documents to DataFrame...")
        chunks_per_doc = {}
        rows = []

        for doc in self.documents:
            doc_title = os.path.basename(doc.metadata['source'])
            chunks_per_doc[doc_title] = chunks_per_doc.get(doc_title, 0) + 1
            rows.append((doc_title, doc.page_content, chunks_per_doc[doc_title]))

        self.df = pd.DataFrame(rows, columns=['doc_title', 'chunk_content', 'chunk_id'])

    def prepare_embeddings(self, model="text-embedding-ada-002"):
        """Embed every chunk and store the vectors in ``ada_embedding``.

        Does nothing (apart from printing a hint) when ``self.df`` is empty.

        Args:
            model: Model name passed to ``get_embedding``.
        """
        print("Calculating embeddings...")
        if self.df.empty:
            print("Dataframe is empty. Ensure documents are loaded and DataFrame is prepared.")
            return
        embeddings = [self.get_embedding(text, model=model) for text in self.df["chunk_content"]]
        self.df['ada_embedding'] = embeddings

    def process_documents(self, model="text-embedding-ada-002"):
        """Run the full pipeline: load and split, tabulate, then embed.

        Args:
            model: Model name used for the embeddings.
        """
        self.load_and_split_documents()
        self.documents_to_dataframe()
        self.prepare_embeddings(model)

    def save_dataframe(self, path):
        """Write ``self.df`` to a CSV file that ``load_dataframe`` can read.

        Args:
            path: Destination CSV path.
        """
        print(f"Saving DataFrame to {path}")
        frame = self.df
        if "ada_embedding" in frame.columns:
            # Embeddings are numpy arrays after load_dataframe; their text form
            # has no commas (and is abbreviated for long vectors), so it cannot
            # be parsed back. Plain lists serialise to a parseable literal.
            frame = frame.copy()
            frame["ada_embedding"] = frame["ada_embedding"].apply(lambda emb: np.asarray(emb).tolist())
        frame.to_csv(path, index=False)

    def load_dataframe(self, path):
        """Read a CSV written by ``save_dataframe`` into ``self.df``.

        The ``ada_embedding`` column is parsed from its text form into numpy
        arrays.

        Args:
            path: CSV file to read.

        Raises:
            KeyError: If the file has no ``ada_embedding`` column.
            ValueError: If an embedding cell is not a plain Python literal.
            SyntaxError: If an embedding cell is not parseable at all.
        """
        print(f"Loading DataFrame from {path}")
        df = pd.read_csv(path)
        # literal_eval only accepts literals, so a tampered CSV cannot execute
        # code the way eval() would.
        df['ada_embedding'] = df['ada_embedding'].apply(literal_eval).apply(np.array)
        self.df = df
        print("DataFrame loaded successfully.")

    def get_embedding(self, text, model="text-embedding-ada-002"):
        """Return the embedding vector for ``text``.

        Args:
            text: Text to embed.
            model: Model name passed to the embeddings endpoint.

        Returns:
            The ``embedding`` of the first item in the API response.

        Raises:
            RuntimeError: If the client was not created in ``__init__``.
        """
        if not self.client:
            raise RuntimeError("AzureOpenAI client not initialized. Please provide an API key and Azure endpoint.")
        response = self.client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding

    def cosine_similarity(self, a, b):
        """Return the cosine similarity of two vectors.

        Args:
            a: First vector (any array-like of numbers).
            b: Second vector, same length as ``a``.

        Returns:
            ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has
            zero length (instead of NaN from a division by zero).
        """
        a = np.asarray(a, dtype=float)
        b = np.asarray(b, dtype=float)
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return np.dot(a, b) / denominator

    def search_docs(self, user_query, top_n=4, to_print=False, model="text-embedding-ada-002"):
        """Rank the stored chunks by similarity to ``user_query``.

        Writes the score of every chunk into ``self.df['similarities']``.

        Args:
            user_query: Free-text query to embed and compare.
            top_n: Number of best-matching rows to return.
            to_print: When true, print title and score of the returned rows.
            model: Embedding model for the query; it should be the one the
                chunk embeddings were created with.

        Returns:
            The ``top_n`` rows of ``self.df`` with the highest similarity,
            best first.
        """
        # Convert the query once instead of once per row.
        query_vector = np.asarray(self.get_embedding(user_query, model=model), dtype=float)
        self.df["similarities"] = self.df["ada_embedding"].apply(
            lambda emb: self.cosine_similarity(emb, query_vector)
        )
        res = self.df.sort_values("similarities", ascending=False).head(top_n)
        if to_print:
            print(res[['doc_title', 'similarities']])
        return res



class ConversationalAgent:
    """Chat loop that answers questions using chunks found by a DocumentManager.

    The message history is a list of ``{"role", "content"}`` dicts whose first
    entry is always the system message. Before each reply the system message
    is rewritten to contain the chunks retrieved for the latest user query.

    Attributes:
        doc_manager: Object providing ``df``, ``client`` and ``search_docs``.
        base_system_message: Instruction text without any retrieved chunks.
        system_message: Initial system message (same text as the base one).
        max_messages: History length above which old messages are dropped.
        messages: The message history sent to the chat model.
    """

    def __init__(self, document_manager, max_messages=20):
        """Create the agent and seed the history with the system message.

        Args:
            document_manager: A ``DocumentManager`` whose DataFrame is already
                loaded or processed.
            max_messages: History length above which the oldest non-system
                message is removed before a new one is appended.
        """
        # document_manager is an instance of DocumentManager
        self.doc_manager = document_manager
        self.base_system_message = "You are an AI assistant that helps people find information. You have access to medical articles and can come up with answers based on them. Given the conversation history and the retrieved chunks from the articles, please formulate an answer and ask the user at the end if they need any more information. If not, instruct them to type 'exit'."
        self.system_message = self.base_system_message
        self.max_messages = max_messages
        self.messages = [{"role": "system", "content": self.system_message}]

        # Ensure the DataFrame within document_manager is already prepared
        if self.doc_manager.df.empty:
            print("Warning: DocumentManager DataFrame is empty. Make sure to either load or process documents before initiating the conversation.")

    def search_documents_and_update_message(self, user_query, top_n=3):
        """Retrieve chunks for ``user_query`` and put them in the system message.

        Leaves the history untouched when the DataFrame is empty.

        Args:
            user_query: Text used for the similarity search.
            top_n: Number of chunks to include.
        """
        if self.doc_manager.df.empty:
            print("Document DataFrame is not available. Skipping document search.")
            return

        # Perform the search
        top_results = self.doc_manager.search_docs(user_query, top_n=top_n, to_print=True)

        # Always start from the base message so chunks from earlier queries do
        # not pile up in the system prompt.
        top_chunk_contents = "\n".join(
            f"Document {idx+1}: {content}"
            for idx, content in enumerate(top_results["chunk_content"].values)
        )
        updated_system_message = f"{self.base_system_message} Based on your query, here are the top document contents:\n{top_chunk_contents}"

        # Update the first message (system message) in history with the new content
        self.messages[0] = {"role": "system", "content": updated_system_message}

    def add_message_to_history(self, role, content):
        """Append a message, dropping the oldest ones when the history is full.

        Args:
            role: Message role, e.g. ``"user"`` or ``"assistant"``.
            content: Message text.
        """
        # Index 0 is the system message and is never removed. The second
        # condition keeps pop(1) valid even for max_messages < 1; the loop
        # also catches up if max_messages was lowered after messages were added.
        while len(self.messages) > self.max_messages and len(self.messages) > 1:
            self.messages.pop(1)
        self.messages.append({"role": role, "content": content})

    def get_reply(self, model="gpt-35-turbo-16k", temperature=0.7):
        """Send the history to the chat model and record its answer.

        Args:
            model: Model name passed to the chat completions endpoint.
            temperature: Sampling temperature.

        Returns:
            The text of the first choice in the response.

        Raises:
            RuntimeError: If the document manager has no API client.
        """
        client = self.doc_manager.client
        if client is None:
            raise RuntimeError("AzureOpenAI client not initialized. Please provide an API key and Azure endpoint.")
        response = client.chat.completions.create(
            model=model,
            messages=self.messages,
            temperature=temperature,
            max_tokens=800,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None
        )
        # Guard against a missing content field so the history never holds
        # None as message text.
        reply = response.choices[0].message.content or ""
        self.add_message_to_history("assistant", reply)
        return reply

    def _answer(self, user_input):
        """Run one turn: record the input, refresh the context, get a reply."""
        self.add_message_to_history("user", user_input)
        self.search_documents_and_update_message(user_input)  # Update system message with search results
        return self.get_reply()

    def initiate_conversation(self):
        """Run an interactive loop on stdin until the user types ``exit``."""
        while True:
            user_input = input("User: ")
            if user_input.strip().lower() == "exit":
                print("Conversation ended.")
                break
            if not user_input.strip():
                # Nothing to search for or answer; ask again.
                continue
            reply = self._answer(user_input)
            print("AI: ", reply)

    def process_messages(self, user_messages):
        """Answer a fixed list of user messages in order, printing each turn.

        Args:
            user_messages: Iterable of user message strings.
        """
        for user_input in user_messages:
            print("User: ", user_input)
            reply = self._answer(user_input)
            print("AI: ", reply)

StatementMeta(, b110f5ed-4953-434d-8c5e-248121d827af, 5, Finished, Available)

#### 2. Load, preprocess and embed documents

In [2]:
# First run: build the chunk DataFrame from the files under DOCS_PATH
doc_manager = DocumentManager(docs_path=DOCS_PATH)

# Load, split and embed the documents (the CSV is written in a later cell)
doc_manager.process_documents()

StatementMeta(, b110f5ed-4953-434d-8c5e-248121d827af, 6, Finished, Available)

Loading and splitting documents...
Loading... Acne KB.docx
Loading... Atopic dermatitis KB.docx
Loading... Cellulitis KB.docx
Loading... Impetigo KB.docx
Loading... Melanoma KB.docx
Loading... Moles KB.docx
Loading... Rosacea KB.docx
Loaded a total of 7 documents.
--------------------------------------------------------------------------------
Splitting the documents into chunks...
Documents were split into: 63 chunks.
Converting documents to DataFrame...
Calculating embeddings...


In [3]:
# Display the chunk DataFrame (one row per chunk, including its embedding)
doc_manager.df

StatementMeta(, b110f5ed-4953-434d-8c5e-248121d827af, 7, Finished, Available)

Unnamed: 0,doc_title,chunk_content,chunk_id,ada_embedding
0,Acne KB.docx,Acne\n\nOverview\n\nAcne is a skin condition t...,1,"[0.02543225884437561, -0.012690848670899868, 0..."
1,Acne KB.docx,"For many women, acne can persist for decades, ...",2,"[0.017862524837255478, -0.014667601324617863, ..."
2,Acne KB.docx,Pimples are raised red spots with a white cent...,3,"[0.021826721727848053, -0.00015674378664698452..."
3,Acne KB.docx,Hygiene. Acne isn't caused by dirty skin. In f...,4,"[0.0308134313672781, 0.00526579050347209, 0.04..."
4,Acne KB.docx,Friction or pressure on your skin. This can be...,5,"[0.017383400350809097, 0.009510344825685024, 0..."
...,...,...,...,...
58,Rosacea KB.docx,"If your symptoms involve your eyes, your docto...",3,"[-0.010694081895053387, 0.0058588567189872265,..."
59,Rosacea KB.docx,Other topical products help control the pimple...,4,"[-0.006975547876209021, -0.008463837206363678,..."
60,Rosacea KB.docx,The full effect of the treatment might not be ...,5,"[-0.013953074812889099, 0.01979568414390087, 0..."
61,Rosacea KB.docx,Reduce visible flushing with makeup. Some make...,6,"[-0.010468767955899239, 0.01875840127468109, 0..."


In [None]:
# Display only the first ten rows
doc_manager.df[:10]

In [4]:
# Inspect the column dtypes of the chunk DataFrame
doc_manager.df.dtypes

StatementMeta(, b110f5ed-4953-434d-8c5e-248121d827af, 8, Finished, Available)

doc_title        object
chunk_content    object
chunk_id          int64
ada_embedding    object
dtype: object

In [5]:
# Try a retrieval query; with the default top_n this returns the 4 best-matching chunks
doc_manager.search_docs(user_query="need advise on dermatitis, how do I treat it effectively?", to_print=False)

StatementMeta(, b110f5ed-4953-434d-8c5e-248121d827af, 9, Finished, Available)

Unnamed: 0,doc_title,chunk_content,chunk_id,ada_embedding,similarities
20,Atopic dermatitis KB.docx,Taking care of sensitive skin is the first ste...,11,"[-0.0005007982836104929, 0.04521632939577103, ...",0.873171
23,Atopic dermatitis KB.docx,"Manuka honey. When applied to the skin, manuka...",14,"[0.007145472336560488, 0.028045019134879112, 0...",0.86993
22,Atopic dermatitis KB.docx,Soak from the neck down or just the affected a...,13,"[0.0030105002224445343, 0.027614915743470192, ...",0.862889
16,Atopic dermatitis KB.docx,Atopic dermatitis can be persistent. You may n...,7,"[0.011043604463338852, 0.018386323004961014, 0...",0.862648


In [6]:
# Save the DataFrame (including embeddings) as CSV so it can be reloaded without re-embedding
doc_manager.save_dataframe("/lakehouse/default/Files/datasets/embedded_articles.csv")

# Alternatively, load an existing DataFrame instead of processing the documents:
# doc_manager.load_dataframe("/your/path/existing_dataframe.csv")

StatementMeta(, b110f5ed-4953-434d-8c5e-248121d827af, 10, Finished, Available)

Saving DataFrame to /lakehouse/default/Files/datasets/embedded_articles.csv


#### 3. Custom Conversational Agent from embedded documents

In [7]:
# Create a fresh DocumentManager; no docs_path is needed when loading from CSV
doc_manager = DocumentManager()

# Load the existing DataFrame with embeddings from the CSV saved earlier
doc_manager.load_dataframe("/lakehouse/default/Files/datasets/embedded_articles.csv")

# Create the agent; the conversation itself is run in a later cell
agent = ConversationalAgent(document_manager=doc_manager)

StatementMeta(, b110f5ed-4953-434d-8c5e-248121d827af, 11, Finished, Available)

Loading DataFrame from /lakehouse/default/Files/datasets/embedded_articles.csv
DataFrame loaded successfully.


In [8]:
# Check the initial history: it holds only the base system message
agent.messages

StatementMeta(, b110f5ed-4953-434d-8c5e-248121d827af, 12, Finished, Available)

[{'role': 'system',
  'content': "You are an AI assistant that helps people find information. You have access to medical articles and can come up with answers based on them. Given the conversation history and the retrieved chunks from the articles, please formulate an answer and ask the user at the end if they need any more information. If not, instruct them to type 'exit'."}]

In [9]:
# Questions to run through the agent one after another
user_messages = [
    "How should I treat my acne?",
    "What are the symptoms of dermatitis?",
    # "Tell me more about skin care.",
]

# Non-interactive conversation: each question is searched, answered and printed
agent.process_messages(user_messages)

StatementMeta(, b110f5ed-4953-434d-8c5e-248121d827af, 13, Finished, Available)

User:  How should I treat my acne?
      doc_title  similarities
4  Acne KB.docx      0.864865
5  Acne KB.docx      0.852255
8  Acne KB.docx      0.847002
AI:  If you have tried over-the-counter acne products and they haven't helped, it is recommended to consult with a dermatologist for further evaluation and treatment options. A dermatologist can help you control your acne, avoid scarring or other damage to your skin, and make scars less noticeable. Prescription-strength medications may be prescribed, which can include topical medications and drugs taken orally. Topical medications commonly used for acne include retinoids (such as tretinoin, adapalene, and tazarotene) and antibiotics (often combined with benzoyl peroxide). It is important to discuss the risks and benefits of medications and other treatments with your doctor. Additionally, therapies such as light therapy, chemical peels, and drainage/extraction procedures may be considered. Regular follow-up appointments with your doct

In [10]:
# Check the history after the conversation; the system message now carries the
# chunks retrieved for the most recent question
agent.messages

StatementMeta(, b110f5ed-4953-434d-8c5e-248121d827af, 14, Finished, Available)

[{'role': 'system',
  'content': "You are an AI assistant that helps people find information. You have access to medical articles and can come up with answers based on them. Given the conversation history and the retrieved chunks from the articles, please formulate an answer and ask the user at the end if they need any more information. If not, instruct them to type 'exit'. Based on your query, here are the top document contents:\nDocument 1: Asthma and hay fever.\xa0Many people with atopic dermatitis develop asthma and hay fever. This can happen before or after developing atopic dermatitis.\n\nFood allergies.\xa0People with atopic dermatitis often develop food allergies. One of the main symptoms of this condition is hives (urticaria).\n\nChronic itchy, scaly skin.\xa0A skin condition called neurodermatitis (lichen simplex chronicus) starts with a patch of itchy skin. You scratch the area, which provides only temporary relief. Scratching actually makes the skin itchier because it activ

In [None]:
# Start the interactive conversation (reads from input()).
# Author's note: this does not work in Fabric.
agent = ConversationalAgent(document_manager=doc_manager)
agent.initiate_conversation()