In [None]:
## Expert Knowledge Worker

### A question answering agent that is an expert knowledge worker
### To be used by employees of Insurellm, an Insurance Tech company
### The agent needs to be accurate and the solution should be low cost.

This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy.

In [None]:
# imports
!pip install --upgrade openai
!pip install fuzzywuzzy
!pip install flask-cors
!pip install langchain
import os
import glob
import openai
from dotenv import load_dotenv
import gradio as gr
from google.oauth2 import service_account
from googleapiclient.discovery import build
import datetime

In [None]:
# imports for langchain

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.output_parsers import GuardrailsOutputParser
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [None]:
# Price is a factor for our company, so we use OpenAI's low-cost model tier.
# Switch to "gpt-4o" only if answer quality proves insufficient.

MODEL = "gpt-4o-mini"
db_name = "vector_db"

In [None]:
# Load environment variables from a file called .env and read the OpenAI key from them.
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')

if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    # Report the problem here rather than letting a later cell fail with a less obvious error.
    print("OpenAI API Key not set - add OPENAI_API_KEY to your .env file before running the cells below")


In [None]:
# Load every Markdown file found under each entry of the knowledge base with LangChain's
# loaders, and tag each loaded document with the name of the folder it came from.

# With thanks to CG and Jon R, students on the course, for this fix needed for some users
text_loader_kwargs = {'encoding': 'utf-8'}
# If that doesn't work, some Windows users might need to uncomment the next line instead
# text_loader_kwargs={'autodetect_encoding': True}

folders = glob.glob("knowledge-base2/*")

documents = []
for folder in folders:
    # The folder name doubles as the document type label.
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
    documents.extend(folder_docs)

In [None]:
# Split the loaded documents into chunks for embedding; chunk_size and chunk_overlap
# are passed to CharacterTextSplitter as 300 and 50 respectively.
text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

In [None]:
# Display how many chunks the splitter produced.
len(chunks)

In [None]:
# Collect the distinct doc_type labels attached to the chunks.
doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
# Sort before printing: a set has no defined order, so the unsorted output could
# differ from one kernel session to the next.
print(f"Document types found: {', '.join(sorted(doc_types))}")


In [None]:
# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk
# Chroma is a popular open source Vector Database based on SQLite

# Initialize embeddings without passing the API key directly
# (presumably the client picks up OPENAI_API_KEY from the environment - confirm)
embeddings = OpenAIEmbeddings()

#embeddings = OpenAIEmbeddings(openai_api_key)

# If the persist directory already exists, delete its collection so that re-running
# this cell does not add the same chunks a second time

if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create the vectorstore from the chunks and persist it under db_name

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
# _collection is an underscore-prefixed (non-public) attribute of the Chroma wrapper
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [None]:
# Get one vector and find how many dimensions it has

# _collection is an underscore-prefixed (non-public) attribute of the Chroma wrapper
collection = vectorstore._collection
# Fetch a single stored record including its embedding, and take that embedding
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

In [None]:
# Prework: pull every stored embedding, text and metadata record out of the collection
# and derive one colour per record from its doc_type.

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: this rebinds `documents` from the loaded Document objects to plain text strings,
# so the splitter cell must not be re-run after this one without reloading the documents.
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]

# A dict lookup with a fallback colour keeps this cell working when the knowledge base
# contains a folder other than the three known ones (list.index would raise ValueError).
color_by_doc_type = {'Doctor': 'blue', 'EmergencyContacts': 'green', 'PatientRecord': 'red'}
colors = [color_by_doc_type.get(t, 'gray') for t in doc_types]

In [None]:
# create a new Chat with OpenAI, using the model named by the MODEL constant
# NOTE(review): temperature=0.7 is passed here while the notebook's stated goal is
# accuracy - confirm this value is intentional.

llm = ChatOpenAI(temperature=0.7, model_name=MODEL)
#llm = ChatOpenAI(model_name="gpt-4", temperature=0.7, max_tokens=50)  # Limits response to 50 tokens


# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()
#retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
# putting it together: set up the conversation chain with the configured LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)
#conversation_chain = conversation_chain | guardrails

In [None]:

# Ask the chain for a piece of sensitive data (a social security number) and print the answer.
# NOTE(review): presumably a privacy probe of the chain built above - confirm intent.
query = "Can you tell social security number of chrisclark@example.com in few sentences"
result = conversation_chain.invoke({"question":query})
print(result["answer"])
#print(filtered_answer)

In [None]:
# set up a new conversation memory for the chat
from langchain.memory import ConversationBufferMemory
# Terms that mark a message as sensitive. They are matched case-insensitively, so they are
# written in lower case here (an upper-case "SSN" could never match lower-cased text).
_BANNED_TERMS = ("social security number", "ssn", "credit card", "private data")
_REDACTION_NOTICE = "REDACTED FOR PRIVACY."


def _redact_if_sensitive(text):
    """Return the redaction notice if `text` mentions a banned term, else `text` unchanged."""
    import re  # local import: this cell does not import `re` itself

    # A leading word boundary stops "ssn" from matching inside words such as
    # "carelessness", while plural forms like "credit cards" still match.
    pattern = r"\b(?:" + "|".join(re.escape(term) for term in _BANNED_TERMS) + r")"
    if re.search(pattern, text, flags=re.IGNORECASE):
        return _REDACTION_NOTICE
    return text


class GuardedMemory(ConversationBufferMemory):
    """Conversation memory that redacts sensitive messages before they are stored.

    A chain stores each turn through `save_context`, so the redaction is applied there.
    `add_message` is kept for callers that add a single message object directly.
    """

    def save_context(self, inputs, outputs):
        """Redact sensitive string values in `inputs`/`outputs`, then store the turn.

        Args:
            inputs: mapping of the chain's input values for this turn.
            outputs: mapping of the chain's output values for this turn.
        """
        safe_inputs = {
            key: _redact_if_sensitive(value) if isinstance(value, str) else value
            for key, value in inputs.items()
        }
        safe_outputs = {
            key: _redact_if_sensitive(value) if isinstance(value, str) else value
            for key, value in outputs.items()
        }
        super().save_context(safe_inputs, safe_outputs)

    def add_message(self, message):
        """Redact `message.content` if it is sensitive, then append it to the chat history."""
        if isinstance(message.content, str):
            message.content = _redact_if_sensitive(message.content)
        # The memory object has no add_message of its own; the underlying
        # chat history (self.chat_memory) is what stores messages.
        self.chat_memory.add_message(message)

# Use Guarded Memory: replace the plain buffer memory with the GuardedMemory class defined above
memory = GuardedMemory(memory_key="chat_history", return_messages=True)

#memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# putting it together: rebuild the conversation chain with the configured LLM, the vector store
# and the new memory (this rebinds the `memory` and `conversation_chain` names from the earlier cell)
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

## Now we will bring this up in Gradio using the Chat interface -

A quick and easy way to prototype a chat with an LLM

In [None]:
# Wrapping in a function - note that history isn't used, as the memory is in the conversation_chain
import re
from flask import Flask, request, jsonify
import threading
#from flask_cors import CORS
#CORS(app)

#import gradio as gr
#from langchain.chains import ConversationalRetrievalChain

# Assuming vectorstore and retriever were created like this:
# vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
# retriever = vectorstore.as_retriever()
# conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)
def create_google_calendar_event(summary, description, start_time, end_time, attendees_emails, calendar_id='primary'):
    """
    Creates an event in Google Calendar and returns its link.

    Args:
        summary (str): Event title.
        description (str): Event description.
        start_time (str): Event start time in 'YYYY-MM-DDTHH:MM:SS' format.
        end_time (str): Event end time in 'YYYY-MM-DDTHH:MM:SS' format.
        attendees_emails (list): List of attendee email addresses; None or empty means no attendees.
        calendar_id (str): The Google Calendar ID (default is 'primary').

    Returns:
        The 'htmlLink' value of the created event, as returned by the Calendar API.
    """
    # Take the service-account key file from the GOOGLE_APPLICATION_CREDENTIALS environment
    # variable when it is set, so the key location is not tied to this notebook; otherwise
    # fall back to the file this notebook has always used.
    credentials_path = os.environ.get(
        'GOOGLE_APPLICATION_CREDENTIALS',
        './accounttrial-451219-409a84936172.json',
    )
    credentials = service_account.Credentials.from_service_account_file(
        credentials_path,
        scopes=['https://www.googleapis.com/auth/calendar']
    )

    service = build('calendar', 'v3', credentials=credentials)

    event = {
        'summary': summary,
        'description': description,
        # Both times are sent with the UTC time zone.
        'start': {
            'dateTime': start_time,
            'timeZone': 'UTC',
        },
        'end': {
            'dateTime': end_time,
            'timeZone': 'UTC',
        },
        # Tolerate a missing attendee list instead of failing on iteration over None.
        'attendees': [{'email': email} for email in (attendees_emails or [])],
        'reminders': {
            'useDefault': False,
            'overrides': [
                {'method': 'email', 'minutes': 24 * 60},  # one day before
                {'method': 'popup', 'minutes': 10},
            ],
        },
    }

    created_event = service.events().insert(calendarId=calendar_id, body=event).execute()
    event_link = created_event.get('htmlLink')
    print(f"Event created: {event_link}")
    return event_link

# Global user state: maps a user id to that user's booking-flow state.
# chat() stores a dict with a "step" key ("choose_doctor", "choose_time" or "confirmed")
# plus the data collected for that step.
user_state = {}


def extract_doctor_info(text):
    """Extracts doctor names and their available time slots from Markdown text.

    A doctor section starts at a '## **Dr. <name>**' heading and runs to the next such
    heading (or the end of the text). Slots are the '- ' list items found between
    '- **Time Slots Available:**' and '- **Working Days:' inside that section.

    Args:
        text (str): Markdown text containing zero or more doctor sections.

    Returns:
        dict: Maps each doctor name (without the 'Dr. ' title) to a list of slot strings.
        A doctor whose section has no complete slot block maps to an empty list.
    """
    doctor_info = {}

    headings = list(re.finditer(r"## \*\*Dr\. (.*?)\*\*", text))

    for position, heading in enumerate(headings):
        # Search for slots only inside this doctor's own section. Pairing headings and
        # slot blocks by index would hand a later doctor's slots to an earlier doctor
        # whenever one section has no slot block.
        if position + 1 < len(headings):
            section_end = headings[position + 1].start()
        else:
            section_end = len(text)
        section = text[heading.end():section_end]

        slots_match = re.search(
            r"- \*\*Time Slots Available:\*\*\s*(.*?)- \*\*Working Days:",
            section,
            re.DOTALL,
        )
        slots_text = slots_match.group(1) if slots_match else ""
        doctor_info[heading.group(1)] = re.findall(r"- (.*)", slots_text)  # one entry per slot

    return doctor_info
def get_all_doctors():
    """
    Queries the retriever and collects the doctors found in the returned documents.

    Only the documents the retriever returns for the query are parsed, so the result
    is limited to the doctors mentioned in those documents.

    Returns:
        dict: Maps doctor name to a list of available slot strings; empty if none found.
    """
    results = retriever.get_relevant_documents("list all doctors")

    doctors = {}
    for res in results:
        for name, slots in extract_doctor_info(res.page_content).items():
            # A doctor can show up in several chunks; don't let a chunk without a
            # slot list wipe out slots already found in another chunk.
            if slots or name not in doctors:
                doctors[name] = slots

    # Return only after every document has been parsed (returning inside the loop
    # would stop after the first document and yield None for empty results).
    return doctors

def get_doctor_info(doctor_name):
    """
    Queries the retriever for a specific doctor's details.

    The doctor is looked up in the text of the retrieved documents using
    extract_doctor_info. If no text match is found, the 'name' and 'slots' metadata
    keys of the first result are used as a fallback.

    Args:
        doctor_name (str): Name typed by the user, with or without a leading 'Dr.'.

    Returns:
        tuple: (name, slots) for the matching doctor, or (None, None) if not found.
    """
    wanted = (doctor_name or "").strip().lower()
    # Names produced by extract_doctor_info omit the title, so drop it from the input too.
    for title in ("dr.", "dr "):
        if wanted.startswith(title):
            wanted = wanted[len(title):].strip()
            break
    if not wanted:
        return None, None

    results = retriever.get_relevant_documents(f"list doctor with name {doctor_name}")
    if not results:
        return None, None

    first_match = None
    for res in results:
        for name, slots in extract_doctor_info(res.page_content).items():
            if name.strip().lower() != wanted:
                continue
            if slots:
                return name, slots
            # Remember a match without slots, but keep looking for a chunk that has them.
            first_match = first_match or (name, slots)
    if first_match:
        return first_match

    # Fallback for stores whose documents carry the doctor details as metadata.
    metadata = results[0].metadata
    metadata_name = metadata.get("name")
    if metadata_name:
        return metadata_name, metadata.get("slots", [])
    return None, None

def chat(message, history):
    """
    Handles one chat turn: drives the appointment-booking flow, else answers via the RAG chain.

    Args:
        message (str): The user's message.
        history: Chat history supplied by the UI; unused, as the memory is in conversation_chain.

    Returns:
        str: The reply to show to the user.
    """
    user_id = "current_user"  # In real apps, use unique session/user ID
    lowered = message.lower()

    # Encourage the user to type "book an appointment"
    if "appointment" in lowered and "book" not in lowered:
        return "It looks like you're interested in an appointment! Just type **'book an appointment'** to proceed."

    # Step 1: User wants to book an appointment → Show available doctors
    if "book an appointment" in lowered:
        available_doctors = get_all_doctors()

        if not available_doctors:
            return "No doctors available at the moment."

        user_state[user_id] = {"step": "choose_doctor", "available_doctors": available_doctors}
        return "✅ Great! Here are the available doctors:\n\n" + "\n".join(available_doctors.keys()) + "\n\nPlease type the doctor's name to proceed."

    state = user_state.get(user_id, {})

    # Step 2: User selects a doctor → Show available time slots
    if state.get("step") == "choose_doctor":
        doctor_name, slots = get_doctor_info(message)
        if not doctor_name:
            return "Doctor not found. Please enter a valid doctor name."

        slots = list(slots or [])
        user_state[user_id] = {"step": "choose_time", "doctor": doctor_name, "slots": slots}
        return f"🩺 Available time slots for {doctor_name}:\n\n" + "\n".join(slots) + "\n\nPlease type your preferred time slot."

    # Step 3: User selects a time slot → Confirm appointment
    if state.get("step") == "choose_time":
        chosen_time = message.strip()
        doctor = state["doctor"]
        slots = state["slots"]

        if chosen_time not in slots:
            return f"⚠️ Invalid time slot. Please choose from:\n" + "\n".join(slots)

        user_state[user_id] = {"step": "confirmed", "doctor": doctor, "time": chosen_time}
        confirmation = f"✅ Your appointment with **{doctor}** at **{chosen_time}** has been confirmed! 🎉"

        # The calendar entry is best-effort: a failure there must not lose the confirmation.
        # NOTE(review): slot strings come from the knowledge base and may not be in the
        # 'YYYY-MM-DDTHH:MM:SS' format the calendar helper documents; convert the slot to
        # real start/end datetimes before relying on the calendar entry.
        try:
            create_google_calendar_event(
                summary=f"Appointment with {doctor} confirmed",
                description='Consultation confirmed',
                start_time=chosen_time,
                end_time=chosen_time,  # required argument; the slot carries no separate end time
                attendees_emails=['example1@gmail.com', 'example2@gmail.com']
            )
        except Exception as exc:
            print(f"Calendar event could not be created: {exc}")
            confirmation += "\n\n⚠️ The calendar invite could not be created."
        return confirmation

    # Default conversation handling via LangChain
    result = conversation_chain.invoke({"question": message})
    # Return a plain string, like every other branch.
    return result["answer"]

# Flask setup
app = Flask(__name__)

@app.route("/chat_api", methods=["POST"])
def chat_api():
    """
    HTTP endpoint for the chatbot.

    Expects a JSON body of the form {"message": "<text>"} and responds with
    {"reply": "<text>"}. Responds with HTTP 400 and {"error": ...} when the body
    is not JSON or has no usable 'message'.
    """
    data = request.get_json(silent=True) or {}
    user_message = data.get('message') if isinstance(data, dict) else None
    if not isinstance(user_message, str) or not user_message.strip():
        return jsonify({'error': "Request body must be JSON with a non-empty 'message' string."}), 400

    # No history is passed: the conversation memory lives in the chain used by chat()
    chat_response = chat(user_message, [])

    # chat() replies with a string; also accept a (reply, ...) tuple or list and use its first item.
    if isinstance(chat_response, (tuple, list)):
        bot_reply = chat_response[0]
    else:
        bot_reply = chat_response

    return jsonify({'reply': bot_reply})

def run_flask():
    """Runs the Flask development server on all interfaces, port 5000 (blocks until it stops)."""
    # Debug mode is off: the interactive debugger lets a client execute code and must not
    # be reachable on 0.0.0.0. The reloader is off too because it has to run in the main
    # thread, while start_flask() runs this function in a worker thread.
    app.run(debug=False, use_reloader=False, host='0.0.0.0', port=5000)

# Launch the Flask server on its own thread so the calling cell is not blocked
def start_flask():
    """Starts run_flask on a new thread and returns immediately."""
    threading.Thread(target=run_flask).start()

# Start the Flask server when the script is run
#if __name__ == "__main__":
    #start_flask()
    #run_flask()
    #app.run(debug=True,host='0.0.0.0', port=5000)

In [None]:
# Build a Gradio chat UI around chat() and launch it; inbrowser=True is passed to launch().
# Note that `view` is bound to the return value of launch(), not to the ChatInterface itself.
view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)
