In [1]:
import pandas as pd

SEED = 0
pd.set_option('display.max_colwidth', None)

In [2]:
path = 'data/preprocessed-ai-medical-chatbot.csv'
df = pd.read_csv(path)
df = df.dropna()

In [3]:
df.sample(5, random_state=SEED)

Unnamed: 0,description,question,answer
152573,causes red spots around eyes child,hi 18month old suddenli becom faddi eat ok diet eat shreddi pot yogurt odd bit toast poorli around 15th april sick diarea week diet chang becam better develop littl red spot chin spread around face one disappear anoth appear somewher els prescrib caneston cream didnt realli anyth seem clear eat yogurt anyth moment manag get eat spot appear around eye could eat someth els,i donot think so its related to yogurt she is fond of eating as it does not spread and non itchy do you have mosquito at home apply some soothing cream moisturizer and that s all needed to your baby now
247419,skin colored itchy bumps elbow lower back temporary relief hydro cortisone cream allergy pills history seasonal allergies done,skin color itchi bump elbow lower back itch like crazi notic last spring took long time heal month got spring get season allergi start take allergi pill itch went away rash clear longer season allergi need take alleri pill loratadin 10 mg rash back start take allergi pill help littl itch still bump still itch time bump red scratch like crazi tri hydrocortison cream well help also tri skinceut hydra balm well help littl itch live new mexico dri use oil lotion daili skin dri bump smooth touch blister,hi dear user thanks for choosing hcm u have seasonal allergy called allergic dermatitis so evaluate it consult good dermatologist thanq
71438,vomiting one day pain appendix bladder,vomit sinc yesterday eat lunch mouth esophagu hurt stomach feel full addit sinc morn area appendix bladder hurt could strain muscl due vomit appendix,constipation when was your last bowel movement if you are clogged up it will have no where else to go but up chances are the pain in your belly by your bladder is a strained muscle so is your question about the strained muscle or the vomiting chances are you have a 24 hour flu or you ate something bad for breakfast
201206,liver cirrhosis enlarged spleen veins treated,hi name tammi multipl gallston hep c chorrosi liver enlarg spleen enlarg vein one live want touch want send bigger town better dr also want liver doner list hare get,hi and welcome to healthcaremagic thank you for your query i am dr rommstein i understand your concerns and i will try to help you as much as i can this is serious disease and quitting alcohol is the most important thing that should be done also medications which are hematotoxic should be avoided this is necessary to prevent disease progression which is lethal in most cases at early stages it can be treated with these measures but but in case of progression only liver transplantation may help diet should be balanced and healthy and get regular exercise limit high carb foods such as bread grits rice potatoes and corn and cut down on drinks with lots of sugar like sports drinks and juice if there is viral hepatitis as udnelying cause then antiviral medications are required i hope i have answered you query if you have any further questions you can contact us in every time kindly regards wish you a good health
40162,unable control agitation hallucinations restlessness remedy,51 yr old femal w glioblastoma end stage morphine30mg sq q 4hr atc w btd 15 mg q 1hr prn decadron 4mg sq bid haldol 5mg sq q4hr atc ativan 1 mg q 4 hr atc still unabl control agit hallucin restless suggest,hithanks for using healthcare magici think she has hallucination due to metastasis in that case just carry on with haloperidol or you can try quetiapine or olanzapine organic psychosis is very difficult to treat and some time does not get improved with medication better to try any of these sedative antipsychotic at least with it she would remain calm thanks


**Naive Bayes Classifier**

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [5]:
X = df['question']  # The patient's question
y = df['answer']  # The doctor's response (simplified for classification)

In [25]:
# Downsample the dataset for testing
df_small = df.sample(n=5000, random_state=42)
X = df_small['question']
y = df_small['answer']

# Vectorize with reduced features
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X_vec = vectorizer.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# Try Logistic Regression
from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train, y_train)

# Example query to classify
query = "vomiting one day pain appendix bladder"
query_vec = vectorizer.transform([query])
predicted_answer = lr_classifier.predict(query_vec)
print(predicted_answer)


['hello  for pcod  polycystic ovarian disease   powder kalachikai has a good effect  but the duration of treatment is long  more than three months   for proper evaluation of disease  please go for usg whole abdomen and other investigations  if the size and number of cysts are more  then i suggest consulting a gynecologist as you may require surgery  if the number and size of cysts are less  you may continue treatment along with syrup evecare 10 ml bd and kanchnar guggulu two bd  for more information consult an ayurveda specialist online    ']


In [8]:
%pip install hmmlearn

Note: you may need to restart the kernel to use updated packages.


**Hidden Markov Model**

In [7]:
import numpy as np
from hmmlearn.hmm import GaussianHMM # type: ignore
from sklearn.decomposition import TruncatedSVD

In [8]:
# Vectorizing the text data using TfidfVectorizer (with dimensionality reduction)
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X_vec = vectorizer.fit_transform(X)

# Reduce dimensionality using TruncatedSVD (helps with performance)
svd = TruncatedSVD(n_components=100)  # Reduce to 100 components
X_svd = svd.fit_transform(X_vec)

# Example labels: Here we assume you are working with a small set of hidden states (just for example)
# In practice, labels should be related to a real task, like disease progression or other states
y = np.random.choice([0, 1, 2], size=len(X))  # Just an example (3 hidden states)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_svd, y, test_size=0.2, random_state=42)

# Define the Hidden Markov Model (GaussianHMM for continuous data)
model = GaussianHMM(n_components=3, covariance_type="full", random_state=42)

# Fit the model to the training data
model.fit(X_train)

# Predict hidden states for the test data
hidden_states = model.predict(X_test)

# Print the predicted hidden states
print("Predicted Hidden States (Disease Progression):")
print(hidden_states)

Predicted Hidden States (Disease Progression):
[2 0 2 0 2 2 0 2 0 2 1 0 2 0 0 2 1 2 1 2 0 1 0 2 0 1 1 2 1 2 0 0 2 1 1 1 0
 2 2 2 0 2 1 2 2 2 2 2 0 0 0 2 2 1 2 2 0 2 2 2 0 2 0 1 0 0 0 1 0 0 2 0 0 2
 2 1 2 2 0 1 1 1 2 1 0 2 0 1 2 1 2 2 2 0 2 0 1 2 2 2 1 2 2 0 0 1 1 2 0 2 2
 2 2 0 0 2 2 0 0 0 2 1 2 0 0 2 0 2 2 2 0 2 2 0 2 2 2 0 0 2 0 0 2 2 0 2 0 1
 2 2 0 0 2 2 1 2 2 0 0 2 2 0 0 2 0 0 1 0 2 0 2 1 0 0 2 1 1 2 2 0 1 2 2 2 2
 2 2 0 0 2 0 2 2 0 1 2 0 2 1 2 2 0 2 1 2 0 0 2 2 2 1 2 2 1 2 0 2 0 2 1 1 0
 0 0 0 0 0 2 2 0 2 2 1 0 2 2 0 0 2 0 2 1 0 0 1 2 2 2 2 0 0 2 0 2 2 2 0 2 0
 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 0 2 0 0 2 2 0 0 0 0 0 0 0 2 2 2 0 1 0 0 2 2
 2 2 0 0 2 2 2 0 0 0 2 2 0 0 1 2 1 0 0 0 1 0 2 0 0 0 1 0 0 0 2 2 2 2 2 0 2
 0 1 1 1 2 2 1 2 2 0 0 2 2 2 2 0 2 0 0 0 2 2 0 2 0 0 0 0 0 2 2 0 2 2 1 0 1
 0 0 2 2 0 2 2 2 2 2 1 0 2 2 2 0 0 2 2 2 2 2 2 2 2 2 2 2 2 0 2 0 2 2 0 1 0
 1 2 2 2 2 0 2 1 2 1 2 0 0 0 2 0 0 0 2 1 2 2 0 2 2 2 2 0 2 2 0 1 2 2 2 2 0
 2 2 2 1 2 0 0 2 2 0 0 2 2 2 2 2 2 2 0 1 1 0 0 0 0 1 

**Reinforcement Learning**

In [9]:
# Define the Q-learning parameters
alpha = 0.1   # Learning rate
gamma = 0.9   # Discount factor
epsilon = 0.1 # Exploration factor

# Define the state space (e.g., patient's condition: 'mild', 'moderate', 'critical')
state_space = ['mild', 'moderate', 'critical']

# Define the action space (e.g., treatments: 'rest', 'medicine', 'hospital')
action_space = ['rest', 'medicine', 'hospital']

# Initialize the Q-table with zeros
Q = np.zeros((len(state_space), len(action_space)))

# Define the reward matrix (based on expert knowledge or trial)
reward_matrix = {
    ('mild', 'rest'): 10,
    ('mild', 'medicine'): 5,
    ('mild', 'hospital'): -10,
    ('moderate', 'rest'): 5,
    ('moderate', 'medicine'): 10,
    ('moderate', 'hospital'): 0,
    ('critical', 'rest'): -10,
    ('critical', 'medicine'): 0,
    ('critical', 'hospital'): 20
}

# Function to get the action based on epsilon-greedy strategy
def choose_action(state_index):
    if np.random.uniform(0, 1) < epsilon:
        return np.random.choice(len(action_space))  # Explore
    else:
        return np.argmax(Q[state_index])  # Exploit

# Q-learning algorithm to train the agent
for episode in range(1000):  # Number of episodes
    state = np.random.choice(len(state_space))  # Randomly pick an initial state
    action = choose_action(state)  # Choose an action based on the state
    next_state = np.random.choice(len(state_space))  # Randomly pick the next state
    reward = reward_matrix[(state_space[state], action_space[action])]  # Get the reward

    # Update the Q-table using the Q-learning update rule
    Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])

# Output the learned Q-table
print("Learned Q-table:")
print(Q)


Learned Q-table:
[[50.46909529 28.18618378 20.25618338]
 [45.87153367 33.11919138 15.53655679]
 [16.63633824 42.34596323 36.66732246]]


**Bayesian Network**

In [12]:
%pip install pgmpy

Note: you may need to restart the kernel to use updated packages.


In [10]:
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination

In [26]:
# Define the structure of the Bayesian Network
# This is a simple model with nodes: Cough, Wheezing, Asthma

model = BayesianNetwork([('Cough', 'Asthma'), 
                         ('Wheezing', 'Asthma')])

# Define the Conditional Probability Distributions (CPDs)
# CPD for 'Cough' (no cause, just observed evidence)
cpd_cough = TabularCPD(variable='Cough', variable_card=2, 
                       values=[[0.7], [0.3]])  # 70% chance of not coughing, 30% coughing

# CPD for 'Wheezing' (no cause, just observed evidence)
cpd_wheezing = TabularCPD(variable='Wheezing', variable_card=2, 
                          values=[[0.8], [0.2]])  # 80% chance of not wheezing, 20% wheezing

# CPD for 'Asthma' (depends on Cough and Wheezing)
cpd_asthma = TabularCPD(variable='Asthma', variable_card=2, 
                        values=[[0.9, 0.6, 0.7, 0.1], 
                                [0.1, 0.4, 0.3, 0.9]], 
                        evidence=['Cough', 'Wheezing'], 
                        evidence_card=[2, 2])

# Add CPDs to the model
model.add_cpds(cpd_cough, cpd_wheezing, cpd_asthma)

# Check the model for correctness
model.check_model()

# Perform inference to predict asthma given observed symptoms (Cough and Wheezing)
inference = VariableElimination(model)

# Query the network: "What is the probability of Asthma given that the person is coughing and wheezing?"
query_result = inference.query(variables=['Asthma'], 
                               evidence={'Cough': 1, 'Wheezing': 1})  # 1 = observed (True)

# Print the result
print(query_result)

+-----------+---------------+
| Asthma    |   phi(Asthma) |
| Asthma(0) |        0.1000 |
+-----------+---------------+
| Asthma(1) |        0.9000 |
+-----------+---------------+


**Final Model: Cosine similarity**
**(Simulated Reinforcement Learning from patient feedback)**

In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

# Load the dataset (preprocessed CSV)
path = 'data/preprocessed-ai-medical-chatbot.csv'
df = pd.read_csv(path)
df = df.dropna()

# Extract questions and answers
questions = df['question'].values
answers = df['answer'].values

# Vectorize the questions using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
question_vectors = vectorizer.fit_transform(questions)

# Function to calculate cosine similarity and retrieve the top 3 responses
def get_top_responses(patient_query, top_n=3):
    query_vector = vectorizer.transform([patient_query])
    similarity_scores = cosine_similarity(query_vector, question_vectors)
    
    # Get indices of the top N responses based on cosine similarity
    top_indices = np.argsort(similarity_scores[0])[-top_n:][::-1]
    
    # Return the top N responses and the corresponding indices
    return [answers[i] for i in top_indices], top_indices

# Reinforcement Learning (Simulating User Feedback for Optimization)
def simulate_user_feedback(response):
    # Simulate user feedback (1-5 scale, where 5 is most helpful)
    return random.randint(1, 5)

def rl_optimization(responses):
    best_response = None
    best_feedback = 0
    for response in responses:
        feedback = simulate_user_feedback(response)
        if feedback > best_feedback:
            best_feedback = feedback
            best_response = response
    return best_response

# Generalized function to get the top responses considering both HMM and RL
def get_best_responses(patient_query, top_n=3):
    # Retrieve top responses based on cosine similarity
    top_responses, top_indices = get_top_responses(patient_query, top_n)
    
    # Optimize the responses using RL (based on simulated feedback)
    optimized_response = rl_optimization(top_responses)
    
    return top_responses, optimized_response, top_indices  # Return top responses, optimized response, and indices

# Example patient query
patient_query = "vomiting one day pain appendix bladder"

# Get the top 3 responses
top_responses, optimized_response, top_indices = get_best_responses(patient_query, top_n=3)

# Print the results
print(f"Top 3 Responses:")
for i, response in enumerate(top_responses, start=1):
    print(f"{i}. {response}")

print(f"\nOptimized Response (based on RL feedback): {optimized_response}")


Top 3 Responses:
1. if it is acute appendicitis  operation might be necessary  it dependon the investigation findings  sometimes it can be controlled with medications also 
2. constipation  when was your last bowel movement   if you are clogged up it will have no where else to go but up  chances are the pain in your belly by your bladder is a strained muscle  so is your question about the strained muscle or the vomiting   chances are you have a 24 hour flu or you ate something bad for breakfast
3. hi there thanks for using hcm the symptoms of appendicitis are right lower abdominal pain  nausea  vomiting and fever when you press over the right lower abdomen   there would be severe tenderness the white cell count would be high as in your daughter s case the diagnosis can be confirmed with an ultrasound scan or a ct scan of the abdomen milder forms would resolve with medications severe forms may need removal of appendix  is this answer helpful 

Optimized Response (based on RL feedback): 

**Model Evaluation**

In [24]:
# Evaluation functions
def evaluate_model(patient_query, correct_answer, top_n=3):
    # Get the top responses and indices
    top_responses, optimized_response, top_indices = get_best_responses(patient_query, top_n)
    
    # Check if the correct answer is in the top N responses
    correct_in_top = correct_answer in top_responses
    precision_at_k = 1 if correct_in_top else 0
    
    # Simulate user feedback for each response
    feedback_scores = [simulate_user_feedback(response) for response in top_responses]
    average_feedback = np.mean(feedback_scores)
    
    # Output evaluation metrics
    print(f"Top {top_n} Responses: {top_responses}")
    print(f"Optimized Response (RL): {optimized_response}")
    print(f"Precision at {top_n}: {precision_at_k}")
    print(f"Average Simulated User Feedback: {average_feedback}")
    
    return precision_at_k, average_feedback

# Example patient query and correct answer
patient_query = "I have a sore throat, what should I do?"
correct_answer = "constipation when was your last bowel movement if you are clogged up it will have no where else to go but up chances are the pain in your belly by your bladder is a strained muscle so is your question about the strained muscle or the vomiting chances are you have a 24 hour flu or you ate something bad for breakfast"

# Evaluate the model on this query
evaluate_model(patient_query, correct_answer, top_n=3)


Top 3 Responses: ['hi  i have gone through the attachment  attachment removed to protect patient identity   it is oral stomatitis with pharyngitis  it can be due to smoking and alcohol infection  certain investigations like hb  hemoglobin   tlc  total leucocyte count   dlc  differential leucocyte count   and esr  erythrocyte sedimentation rate  can be done  it can be treated by taking oral vitamin b complex tablets with chlorhexidine mouthwash  avoid spicy foods and smoking  for more information consult an ent otolaryngologist online    ', 'hi  most of the times  a sore throat is due to a viral infection  this occurs with or without some amount of acid reflux into the throat from within the food pipe  this condition is called laryngopharyngeal reflux disease  lprd  and this may present simply with a sore throat and without any other symptoms of gastritis   i would  therefore  recommend antiseptic mouth gargles with chlorhexidine or dilute betadine  three times a day before and after me

(0, 2.6666666666666665)