In [2]:
import pandas as pd

SEED = 0
pd.set_option('display.max_colwidth', None)

In [3]:
# Load Data

In [4]:
path = 'data/preprocessed-ai-medical-chatbot.csv'
df = pd.read_csv(path)
df = df.dropna()

In [5]:
df.sample(5, random_state=SEED)

Unnamed: 0,description,question,answer
152573,caus red spot around eye child,hi 18month old suddenli becom faddi eat ok diet eat shreddi pot yogurt odd bit toast poorli around 15th april sick diarea week diet chang becam better develop littl red spot chin spread around face one disappear anoth appear somewher els prescrib caneston cream didnt realli anyth seem clear eat yogurt anyth moment manag get eat spot appear around eye could eat someth els,i donot think so its related to yogurt she is fond of eating as it does not spread and non itchy do you have mosquito at home apply some soothing cream moisturizer and that s all needed to your baby now
247419,skin color itchi bump elbow lower back temporari relief hydro cortison cream allergi pill histori season allergi done,skin color itchi bump elbow lower back itch like crazi notic last spring took long time heal month got spring get season allergi start take allergi pill itch went away rash clear longer season allergi need take alleri pill loratadin 10 mg rash back start take allergi pill help littl itch still bump still itch time bump red scratch like crazi tri hydrocortison cream well help also tri skinceut hydra balm well help littl itch live new mexico dri use oil lotion daili skin dri bump smooth touch blister,hi dear user thanks for choosing hcm u have seasonal allergy called allergic dermatitis so evaluate it consult good dermatologist thanq
71438,vomit one day pain appendix bladder,vomit sinc yesterday eat lunch mouth esophagu hurt stomach feel full addit sinc morn area appendix bladder hurt could strain muscl due vomit appendix,constipation when was your last bowel movement if you are clogged up it will have no where else to go but up chances are the pain in your belly by your bladder is a strained muscle so is your question about the strained muscle or the vomiting chances are you have a 24 hour flu or you ate something bad for breakfast
201206,liver cirrhosi enlarg spleen vein treat,hi name tammi multipl gallston hep c chorrosi liver enlarg spleen enlarg vein one live want touch want send bigger town better dr also want liver doner list hare get,hi and welcome to healthcaremagic thank you for your query i am dr rommstein i understand your concerns and i will try to help you as much as i can this is serious disease and quitting alcohol is the most important thing that should be done also medications which are hematotoxic should be avoided this is necessary to prevent disease progression which is lethal in most cases at early stages it can be treated with these measures but but in case of progression only liver transplantation may help diet should be balanced and healthy and get regular exercise limit high carb foods such as bread grits rice potatoes and corn and cut down on drinks with lots of sugar like sports drinks and juice if there is viral hepatitis as udnelying cause then antiviral medications are required i hope i have answered you query if you have any further questions you can contact us in every time kindly regards wish you a good health
40162,unabl control agit hallucin restless remedi,51 yr old femal w glioblastoma end stage morphine30mg sq q 4hr atc w btd 15 mg q 1hr prn decadron 4mg sq bid haldol 5mg sq q4hr atc ativan 1 mg q 4 hr atc still unabl control agit hallucin restless suggest,hithanks for using healthcare magici think she has hallucination due to metastasis in that case just carry on with haloperidol or you can try quetiapine or olanzapine organic psychosis is very difficult to treat and some time does not get improved with medication better to try any of these sedative antipsychotic at least with it she would remain calm thanks


# Building and training model

#### Split the data into train, test, and validation

In [6]:
from sklearn.model_selection import train_test_split

X = df[['description', 'question']]
y = df['answer']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [7]:
print(f"Number of samples in train set: {X_train.shape[0]}")
print(f"Number of samples in test set: {X_test.shape[0]}")

Number of samples in train set: 231219
Number of samples in test set: 25691


#### Define the evalutaion function

In [15]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def eval_fn(preds, y):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    pred_embeddings = model.encode(preds)
    y_embeddings = model.encode(y)
    
    similarities = []
    for pred, ans in zip(pred_embeddings, y_embeddings):
        similarities.append(cosine_similarity([pred], [ans])[0][0])
    return sum(similarities) / len(similarities)

  from .autonotebook import tqdm as notebook_tqdm


### Random Guessing Model

In [64]:
# Pick a subsample of 25 for testing due to compute constraints
import random
random.seed(SEED)
preds = []
sample = 25
response_lst = list(y_train)
for x in X_test['description'][:sample]:
    random_answer = random.choice(response_lst)
    preds.append(random_answer)
y = list(y_test[:sample])

In [65]:
print(eval_fn(preds, y))

0.18797363623976707


#### Training model based on Probability

In [66]:
import math

class ProbabilityBasedAgent:
    
	def __init__(self, questions, responses):
		self.questions = questions
		self.responses = responses
		self.question_sets = []
		self.vocab = None

	def get_vocab(self):
		vocab = set()
		for question in self.questions:
			for word in question.split():
				vocab.add(word)
		return list(vocab)

	def prob_query_given_sentence(self, query, sentence_lst, alpha=1):
		query_lst = query.split()
		match = 0
		
		for token in query_lst:
			if token in sentence_lst:
				match += 1
		
		# Apply Laplace smoothing
		numerator = match + alpha
		denominator = len(query_lst) + alpha
		
		p = numerator / denominator
		return p

	def train(self):
		self.vocab = self.get_vocab()

		for question in self.questions:
			self.question_sets.append(set(question.split()))

	def find_closest_answer(self, query, k):
		
		probabilities_match = []
		for i in range(len(self.questions)):
			prob = self.prob_query_given_sentence(query, self.question_sets[i])
			probabilities_match.append((prob, self.questions[i], self.responses[i]))

		probabilities_match.sort(reverse=True)

		return probabilities_match[:k]

# Constrain size of train data to 20000 cause of compute restriction	
training_questions = list(X_train['description'])[:20000]
training_responses = list(y_train)[:20000]

# Constrain size of train data to 20000 cause of compute restriction	
training_questions = list(X_train['description'])[:20000]
training_responses = list(y_train)[:20000]

model = ProbabilityBasedAgent(training_questions, training_responses)
model.train()

In [67]:
user_query = "I have back pain, what should i do?"
responses = model.find_closest_answer(user_query, 3)

print("The following are some of the closest responses we could find:")

for sim, q, a in responses:
    
	print(f"Probability: {sim}")
	print(f"Question: {q}")
	print(f"Answer: {a}")
	print()

The following are some of the closest responses we could find:
Probability: 0.2222222222222222
Question: zit lower back pain pop stringi blood boxer caus
Answer: hello the blood was from the boil boil zit  is highly vascular due to the presence of inflammation 

Probability: 0.2222222222222222
Question: wysolon 20 intak back pain caus side effect
Answer: wysolone  prednisone  is a common short term treatment for inflammatory conditions such as a pinched nerve  if your father is otherwise healthy then this should be no problem  there can be an elevation in blood sugar while on the medication  but since it is only a few days  it is usually safe in otherwise relatively healthy people  if it helps  but then wears off fully  then a lumbar epidural steroid nerve block injection may well help 

Probability: 0.2222222222222222
Question: would painless lump back
Answer: hi   good evening  i am dr shareef answering your query  although a personal physical examination of the lump would be helpful

##### Evaluation on the test set

In [70]:
# Pick a subsample of 25 for testing due to compute constraints
preds = []
sample = 25
for x in X_test['description'][:sample]:
    sim, q, a = model.find_closest_answer(x, 1)[0]
    preds.append(a)
y = list(y_test[:sample])

In [71]:
print(eval_fn(preds, y))

0.3824522036314011


##### Evaluation on the train set

In [74]:
# Pick a subsample of 25 from training due to compute constraints
preds = []
sample = 25
for x in X_train['description'][:sample]:
    sim, q, a = model.find_closest_answer(x, 1)[0]
    preds.append(a)
y = list(y_train[:sample])

In [75]:
print(eval_fn(preds, y))

0.9646251106262207


#### Training Model Based On Cosine Similarity

In [1]:
import math

def cosine_similarity(user, ques):
    dot_product = sum(x*y for x, y in zip(user, ques))
    magnitude_user = sum(x*x for x in user)**0.5
    magnitude_ques = sum(y*y for y in ques)**0.5
    return dot_product / (magnitude_user * magnitude_ques)


class SimilarityBasedAgent:
    
	def __init__(self, questions, responses):
		self.questions = questions
		self.responses = responses
		self.vocab = None
		self.questions_vectors = None

	def get_vocab(self):
		vocab = set()
		for question in self.questions:
			for word in question.split():
				vocab.add(word)
		return list(vocab)
	
	def bag_of_words(self, question):
		vec = []
		for token in self.vocab:
			vec.append(question.count(token))
		return vec

	def train(self):
		self.vocab = self.get_vocab()
		vectors = []
		for question in self.questions:
			vectors.append(self.bag_of_words(question))
		self.questions_vectors = vectors

	def find_closest_answer(self, query, k):
		user_query_vector = self.bag_of_words(query)
		similarities = []
		for i in range(len(self.questions)):
			print(user_query_vector)
			print( self.questions_vectors[i])
			sim = cosine_similarity(user_query_vector, self.questions_vectors[i])
			similarities.append((sim, self.questions[i], self.responses[i]))

		similarities.sort(reverse=True)

		return similarities[:k]

# Constrain size of train data to 20000 cause of compute restriction	
training_questions = list(X_train['description'])[:20000]
training_responses = list(y_train)[:20000]

model = SimilarityBasedAgent(training_questions, training_responses)
model.train()

NameError: name 'X_train' is not defined

In [94]:
user_query = "I have back pain, what should i do?"
responses = model.find_closest_answer(user_query, 3)

print("The following are some of the closest responses we could find:")

for sim, q, a in responses:
    
	print(f"Similarity: {sim}")
	print(f"Question: {q}")
	print(f"Answer: {a}")
	print()

The following are some of the closest responses we could find:
Similarity: 0.7211102550927979
Question: back pain heal
Answer: hi and thanks for the query there are numerous causes of back pain  the treatment depends on the cause  trauma  infection   inflammation and tumors  arthrosis could also cause pain  the treatment could be simple pain killers  anti inflammatory drugs   steroids and at times surgery  i suggest you get a proper review from your doctor for proper blood work up and x rays to detect the exact cause and extent of the illness  kind regards

Similarity: 0.7139306476801298
Question: tight chest back neck shoulder pain indic
Answer: hello thanks for writing to us  i have studied your case with diligence as per your history there is possibility of rib contusion or costochondritis rib contusion is visible on x ray   i will advise to consult to orthopaedic surgeon medication like muscle relaxant and analgesic will reduce pain along with it use rib support belt you may consul

##### Evaluation on the test set

In [95]:
# Pick a subsample of 25 for testing due to compute constraints
preds = []
sample = 25
for x in X_test['description'][:sample]:
    sim, q, a = model.find_closest_answer(x, 1)[0]
    preds.append(a)
y = list(y_test[:sample])

In [107]:
print(eval_fn(preds, y))

0.35789135545492173


##### Evaluation on the train set

In [None]:
# Pick a subsample of 25 from training due to compute constraints
preds = []
sample = 25
for x in X_train['description'][:sample]:
    print(x)
    print(model.find_closest_answer(x, 1))
    sim, q, a = model.find_closest_answer(x, 1)[0]
    preds.append(a)
y = list(y_train[:sample])

In [None]:
y = y_train[:sample]

print(eval_fn(preds, y))

1.0


### Training model based on Naive Bayes

In [10]:
X_train_np = np.array(X_train['description'])
X_test_np = np.array(X_test['description'])

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()  
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_np)
X_test_tfidf = tfidf_vectorizer.transform(X_test_np)

In [12]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train_tfidf[:50000], y_train[:50000])

In [13]:
predictions = naive_bayes_model.predict(X_test_tfidf[:25])


In [16]:
eval_fn(predictions, list(y_test[:25]))

0.2671047766506672