In [1]:
import pandas as pd

SEED = 0
pd.set_option('display.max_colwidth', None)

In [2]:
# Load Data

In [3]:
path = 'data/preprocessed-ai-medical-chatbot.csv'
df = pd.read_csv(path)
df = df.dropna()

In [4]:
df.sample(5, random_state=SEED)

ValueError: Cannot take a larger sample than population when 'replace=False'

# Building and training model

#### Split the data into train, test, and validation

In [5]:
from sklearn.model_selection import train_test_split

X = df[['description', 'question']]
y = df['answer']

X_train_and_val, X_test, y_train_and_val, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train_and_val, y_train_and_val, test_size=0.1, random_state=42)


KeyError: "None of [Index(['description', 'question'], dtype='object')] are in the [columns]"

In [79]:
print(f"Number of samples in train set: {X_train.shape[0]}")
print(f"Number of samples in validation set: {X_val.shape[0]}")
print(f"Number of samples in test set: {X_test.shape[0]}")

Number of samples in train set: 208097
Number of samples in validation set: 23122
Number of samples in test set: 25691


#### Training model based on Probability

In [181]:
import math

class ProbabilityBasedAgent:
    
	def __init__(self, questions, responses):
		self.questions = questions
		self.responses = responses
		self.question_sets = []
		self.vocab = None

	def get_vocab(self):
		vocab = set()
		for question in self.questions:
			for word in question.split():
				vocab.add(word)
		return list(vocab)

	def prob_query_given_sentence(self, query, sentence_lst, alpha=1):
		query_lst = query.split()
		match = 0
		
		for token in query_lst:
			if token in sentence_lst:
				match += 1
		
		# Apply Laplace smoothing
		numerator = match + alpha
		denominator = len(query_lst) + alpha
		
		p = numerator / denominator
		return p

	def train(self):
		self.vocab = self.get_vocab()

		for question in self.questions:
			self.question_sets.append(set(question.split()))

	def find_closest_answer(self, query, k):
		
		probabilities_match = []
		for i in range(len(self.questions)):
			prob = self.prob_query_given_sentence(query, self.question_sets[i])
			probabilities_match.append((prob, self.questions[i], self.responses[i]))

		probabilities_match.sort(reverse=True)

		return probabilities_match[:k]

# Constrain size of train data to 20000 cause of compute restriction	
training_questions = list(X_train['description'])[:20000]
training_responses = list(y_train)[:20000]

# Constrain size of train data to 20000 cause of compute restriction	
training_questions = list(X_train['description'])[:20000]
training_responses = list(y_train)[:20000]

model = ProbabilityBasedAgent(training_questions, training_responses)
model.train()

In [182]:
user_query = "I have back pain, what should i do?"
responses = model.find_closest_answer(user_query, 3)

print("The following are some of the closest responses we could find:")

for sim, q, a in responses:
    
	print(f"Probability: {sim}")
	print(f"Question: {q}")
	print(f"Answer: {a}")
	print()

The following are some of the closest responses we could find:
Probability: 0.2222222222222222
Question: x ray result back indicate
Answer: hi showing loss of curvature and straightening of spine in elderly might be due to degenerative and osteo artheritis in spines but as you are young this might be due to faulty posture of sitting this can be due to long time sitting in front of computer or table work correct your posture sit straight and avoid bending from back while doing work go for back extension exercises daily ok and take care 

Probability: 0.2222222222222222
Question: would painless lump back
Answer: hi   good evening  i am dr shareef answering your query  although a personal physical examination of the lump would be helpful in reaching a nearby diagnosis  most probably with a history of trauma  it could be an organised hematoma on the left side of your back  to confirm the diagnosis  if i were your doctor  i would go for an fnac  fine needle aspiration cytology  followed pos

In [None]:
def eval_fn(preds, y):
    def overlap(pred, answer):
        pred_words = set(pred.lower().split())
        answer_words = set(answer.lower().split())
        overlap = pred_words.intersection(answer_words)
        return len(overlap) / len(answer_words) if answer_words else 0

    scores = [overlap(pred, answer) for pred, answer in zip(preds, y)]
    scores.sort()
    return scores[len(scores) // 2]

##### Evaluation on the test set

In [183]:
# Pick a subsample of 25 for testing due to compute constraints
preds = []
sample = 25
for x in X_test['description'][:sample]:
    sim, q, a = model.find_closest_answer(x, 1)[0]
    preds.append(a)
y = y_test[:sample]

In [184]:
print(eval_fn(preds, y))

0.22535211267605634


##### Evaluation on the train set

In [186]:
# Pick a subsample of 25 from training due to compute constraints
preds = []
sample = 25
for x in X_train['description'][:sample]:
    sim, q, a = model.find_closest_answer(x, 1)[0]
    preds.append(a)
y = y_train[:sample]

In [187]:
print(eval_fn(preds, y))

1.0


#### Training Model Based On Cosine Similarity

In [140]:
import math

def cosine_similarity(user, ques):
    dot_product = sum(x*y for x, y in zip(user, ques))
    magnitude_user = sum(x*x for x in user)**0.5
    magnitude_ques = sum(y*y for y in ques)**0.5
    return dot_product / (magnitude_user * magnitude_ques)


class SimilarityBasedAgent:
    
	def __init__(self, questions, responses):
		self.questions = questions
		self.responses = responses
		self.vocab = None
		self.questions_vectors = None

	def get_vocab(self):
		vocab = set()
		for question in self.questions:
			for word in question.split():
				vocab.add(word)
		return list(vocab)
	
	def bag_of_words(self, question):
		vec = []
		for token in self.vocab:
			vec.append(question.count(token))
		return vec

	def train(self):
		self.vocab = self.get_vocab()
		vectors = []
		for question in self.questions:
			vectors.append(self.bag_of_words(question))
		self.questions_vectors = vectors

	def find_closest_answer(self, query, k):
		user_query_vector = self.bag_of_words(query)
		
		similarities = []
		for i in range(len(self.questions)):
			sim = cosine_similarity(user_query_vector, self.questions_vectors[i])
			similarities.append((sim, self.questions[i], self.responses[i]))

		similarities.sort(reverse=True)

		return similarities[:k]

# Constrain size of train data to 20000 cause of compute restriction	
training_questions = list(X_train['description'])[:20000]
training_responses = list(y_train)[:20000]

model = SimilarityBasedAgent(training_questions, training_responses)
model.train()

In [None]:
user_query = "I have back pain, what should i do?"
responses = model.find_closest_answer(user_query, 3)

print("The following are some of the closest responses we could find:")

for sim, q, a in responses:
    
	print(f"Similarity: {sim}")
	print(f"Question: {q}")
	print(f"Answer: {a}")
	print()

The following are some of the closest responses we could find:
Similarity: 0.7456785876210714
Question: causes sharp pain shoulders back neck thighs
Answer: hello and welcome to  ask a doctor  service i have reviewed your query and here is my advice in my opinion  the symptoms point towards disc prolapse which needs to be evaluated with x ray and mri  hope i have answered your query  let me know if i can assist you further regards  dr  fahim sheik

Similarity: 0.7172191381865586
Question: causes sharp shooting pain back head
Answer: hi thanks for writing in to us there are many causes of a shooting pain in the back of the head  commonly a neuralgia of the trigeminal nerve can cause such a condition  it has branches which supplies the back of the head  this can trigger sharp pains to the area  however a clinical examination is required to confirm the condition other causes of such pains can be from nerve pinching in the cervical spine  the nerves leave the spinal cord through small fora

##### Evaluation on the test set

In [168]:
# Pick a subsample of 25 for testing due to compute constraints
preds = []
sample = 25
for x in X_test['description'][:sample]:
    sim, q, a = model.find_closest_answer(x, 1)[0]
    preds.append(a)
y = y_test[:sample]

In [169]:
print(eval_fn(preds, y))

0.25

##### Evaluation on the train set

In [None]:
# Pick a subsample of 25 from training due to compute constraints
preds = []
sample = 25
for x in X_train['description'][:sample]:
    sim, q, a = model.find_closest_answer(x, 1)[0]
    preds.append(a)
y = y_train[:sample]

In [173]:
y = y_train[:sample]

print(eval_fn(preds, y))

1.0
