## ChatBot - MedBot

- Problem Statement: Develop a chatbot that answers patients' medical queries.

In [1]:
# import Libraries
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import json
import pickle
import string

import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
import random

from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
# Load the intents definition and initialise the containers filled by later cells.
words = []        # every token seen in the patterns (becomes the vocabulary)
classes = []      # distinct intent tags
documents = []    # (tokenized pattern, tag) pairs
ignore_words = ['?', '!', ',', '.']   # punctuation tokens dropped from the vocabulary

# Use a context manager so the file handle is closed as soon as the text is read;
# the previous `open(...).read()` left the handle open until garbage collection.
with open('intents.json') as intents_file:
    data_file = intents_file.read()
intents = json.loads(data_file)

In [3]:
# Display the parsed intent definitions (tag, patterns, responses) for inspection.
intents['intents']

[{'tag': 'greetings',
  'patterns': ['hello',
   'hey',
   'hi',
   'good day',
   'greetings',
   "what's up?",
   'how is it going'],
  'responses': ['hello', 'hey!', 'what can i do for you?']},
 {'tag': 'goodbye',
  'patterns': ['cya',
   'see you later',
   'goodbye',
   'have a good day',
   'bye',
   'cao',
   'see ya'],
  'responses': ['have a nice day', 'goodbye']},
 {'tag': 'age',
  'patterns': ['how old',
   'how old are you?',
   'what is your age',
   'how old are you',
   'age?'],
  'responses': ['I get reborn after every compilation',
   'hey!',
   'my owners are averagely 20 years!']},
 {'tag': 'name',
  'patterns': ['what is your name',
   'what should i call you',
   "what's your name?",
   'who are you?',
   'can you tell me your name'],
  'responses': ['you can call me Medbot!',
   'i am Medbot!',
   'i am Medbot your medical assistant']},
 {'tag': 'common cold symptoms',
  'patterns': ['Runny or stuffy nose',
   'Sore throat',
   'Cough',
   'Congestion',
   'Slight

### Data Cleaning

In [4]:
# Walk every pattern of every intent and collect three things:
#   words     - flat list of all tokens (duplicates included at this stage)
#   documents - one (token list, tag) pair per pattern
#   classes   - each tag, recorded the first time one of its patterns is seen
for intent in intents['intents']:
    for pattern in intent['patterns']:
        tag = intent['tag']
        tokens = nltk.word_tokenize(pattern)

        words += tokens
        documents.append((tokens, tag))

        if tag not in classes:
            classes.append(tag)

In [5]:
# Display the intent tags in the order they were first encountered.
classes

['greetings',
 'goodbye',
 'age',
 'name',
 'common cold symptoms',
 'fever symptoms',
 'Diabetes symptoms',
 'Depression symptoms',
 'Asthma symptoms',
 'common cold prevention',
 'fever prevention',
 'diabetes prevention',
 'depression prevention',
 'asthma prevention',
 'Consultation']

### Data Modelling

In [6]:
# Lemmatize and lower-case every token, drop the ignored punctuation, and de-duplicate.
vocabulary = {
    lemmatizer.lemmatize(token.lower())
    for token in words
    if token not in ignore_words
}
# The vocabulary and the tag list are both kept sorted so that index positions are stable.
words = sorted(vocabulary)
classes = sorted(set(classes))

# Summary: number of (pattern, tag) pairs, number of intents, vocabulary size.
print(len(documents), "documents")
print(len(classes), "classes", classes)
print(len(words), "unique lemmatized words", words)


84 documents
15 classes ['Asthma symptoms', 'Consultation', 'Depression symptoms', 'Diabetes symptoms', 'age', 'asthma prevention', 'common cold prevention', 'common cold symptoms', 'depression prevention', 'diabetes prevention', 'fever prevention', 'fever symptoms', 'goodbye', 'greetings', 'name']
134 unique lemmatized words ["'s", '(', ')', 'a', 'ache', 'age', 'am', 'and', 'anxiety', 'any', 'appetite', 'are', 'asthma', 'at', 'available', 'blurry', 'body', 'breath', 'buy', 'bye', 'call', 'can', 'cao', 'change', 'chest', 'chill', 'cold', 'common', 'congestion', 'consultation', 'contact', 'cough', 'coughing', 'cya', 'day', 'death', 'dehydration', 'depression', 'diabetes', 'difficulty', 'doctor', 'drink', 'eat', 'extreme', 'fatigue', 'feeling', 'fever', 'flu', 'for', 'frequent', 'from', 'general', 'generally', 'getting', 'give', 'going', 'good', 'goodbye', 'greeting', 'have', 'headache', 'hello', 'help', 'hey', 'hi', 'hopeless', 'how', 'hunger', 'i', 'if', 'in', 'increased', 'interest', 

In [7]:
# Vocabulary size; this is also the length of each bag-of-words vector built below.
len(words)

134

#### Train and Test Split

In [8]:
# Build one [bag-of-words vector, one-hot label] pair per document.
df = []
# Template for the one-hot label: one slot per intent tag.
output_empty = [0] * len(classes)
for doc in documents:
    # doc[0] holds the tokenized pattern; normalise it the same way the vocabulary was built.
    pattern_words = {lemmatizer.lemmatize(word.lower()) for word in doc[0]}
    # 1 where the vocabulary word occurs in this pattern, 0 elsewhere.
    bag = [1 if w in pattern_words else 0 for w in words]

    # Copy the template and set the position of this document's tag (doc[1]).
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    df.append([bag, output_row])

# Shuffle with a private, seeded generator: an unseeded shuffle here would make the
# split below differ from run to run even though train_test_split has random_state=1.
random.Random(1).shuffle(df)

# Separate features and labels while `df` is still a plain list. The bag and the label
# have different lengths, so np.array(df) is a ragged construction, which raises
# ValueError on NumPy >= 1.24 unless dtype=object is given.
features = [pair[0] for pair in df]
labels = [pair[1] for pair in df]
df = np.array(df, dtype=object)  # kept as an array for any code that still indexes df[:, k]

# Create train and test lists. X - patterns, y - intents (30% held out for testing).
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=.3, random_state=1)
print("Training data created")


Training data created


### Model Development - Feed-forward (fully connected) neural network

In [9]:
# Fully connected classifier: 128-unit and 64-unit ReLU layers, each followed by 50% dropout,
# then a softmax layer with one unit per intent. The input width is the bag-of-words length.
layer_stack = [
    Dense(128, input_shape=(len(X_train[0]),), activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(y_train[0]), activation='softmax'),
]
model = Sequential(layer_stack)

# Stochastic gradient descent with Nesterov momentum.
# NOTE(review): `lr` and `decay` are legacy argument names; newer Keras releases expect
# `learning_rate` and a learning-rate schedule instead - confirm against the installed version.
sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

# Fit on the training split only. The model is kept in memory; nothing is written to disk here.
train_features = np.array(X_train)
train_labels = np.array(y_train)
model.fit(train_features, train_labels, epochs=200, batch_size=3, verbose=1)


print("model created")

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155

Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
model created


In [23]:
# Predicted class probabilities for the training split (one row per training pattern).
res = model.predict(np.array(X_train))

In [24]:
def Accuracy(actual,pred):
    """Return the percentage of rows whose predicted class matches the true class.

    Parameters
    ----------
    actual : sequence of one-hot rows (lists or arrays), one per sample.
    pred   : sequence of score rows (e.g. softmax output), one per sample.

    Returns
    -------
    float
        Accuracy in percent (0-100). Returns 0.0 when there are no samples,
        instead of raising ZeroDivisionError.

    Notes
    -----
    Exactly one class index is taken per row (the first maximum). Collecting every
    index equal to the row maximum would yield several labels for a tied prediction
    and shift all later predictions out of alignment with the ground truth.
    If the two inputs differ in length, only the overlapping rows are compared.
    """
    # Position of the hot entry in each label row.
    true_idx = [int(np.argmax(row)) for row in actual]
    # Position of the highest score in each prediction row (first one on ties).
    pred_idx = [int(np.argmax(row)) for row in pred]

    hits = [1 if t == p else 0 for t, p in zip(true_idx, pred_idx)]
    if not hits:
        return 0.0

    return sum(hits) * 100 / len(hits)
        
    

In [25]:
# Accuracy (in percent) on the training split, using the predictions held in `res`.
Accuracy(y_train,res)

100.0

In [26]:
# Predicted class probabilities for the test split; note this overwrites the
# training predictions previously stored in `res`.
res = model.predict(np.array(X_test))

In [27]:
# Accuracy (in percent) on the held-out test split, using the predictions held in `res`.
Accuracy(y_test,res)

26.923076923076923

In [28]:
# functions for chat message for Data cleaning, modelling, labeling, and prediction and chat response

def bow(sentence, words, show_details=True):
    """Encode `sentence` as a binary bag-of-words vector over the vocabulary `words`.

    The sentence is tokenized with nltk, then each token is lower-cased and lemmatized.
    Position i of the result is 1 when words[i] equals one of those tokens, else 0.
    When `show_details` is true, every match is printed as it is found.

    Returns a numpy array with len(words) entries.
    """
    tokens = [
        lemmatizer.lemmatize(token.lower())
        for token in nltk.word_tokenize(sentence)
    ]

    bag = [0] * len(words)
    for token in tokens:
        for position, vocab_word in enumerate(words):
            if vocab_word != token:
                continue
            # Mark the vocabulary slot that corresponds to this token.
            bag[position] = 1
            if show_details:
                print("found in bag: %s" % vocab_word)
    return np.array(bag)

def predict_class(sentence, model):
    """Return the intents predicted for `sentence`, most probable first.

    The sentence is encoded with `bow` against the global vocabulary `words` and
    passed to `model.predict`. Only classes scoring above ERROR_THRESHOLD are kept.

    Returns a list of {"intent": <tag>, "probability": <score as str>} dicts; the
    list is empty when no class exceeds the threshold.
    """
    ERROR_THRESHOLD = 0.25

    features = bow(sentence, words, show_details=False)
    scores = model.predict(np.array([features]))[0]

    # Keep (class index, score) pairs above the threshold, strongest first.
    candidates = [[idx, score] for idx, score in enumerate(scores) if score > ERROR_THRESHOLD]
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    return [
        {"intent": classes[idx], "probability": str(score)}
        for idx, score in ranked
    ]


def chatbot_response(msg):
    """Return the bot's reply to the user message `msg`.

    The top-ranked intent from `predict_class` is looked up in the global `intents`
    definition and one of its responses is chosen at random.

    The default prompt 'How can I help you?' is returned when
      * no intent clears the prediction threshold,
      * the predicted tag has no entry in `intents`, or
      * the matching entry has no responses.
    The last two cases previously raised UnboundLocalError / IndexError.
    """
    fallback = 'How can I help you?'

    ints = predict_class(msg, model)
    if not ints:
        return fallback

    # predict_class sorts by probability, so the first entry is the best guess.
    tag = ints[0]['intent']
    for intent in intents['intents']:
        if intent['tag'] == tag:
            responses = intent.get('responses')
            if responses:
                return random.choice(responses)
            break

    return fallback


In [29]:
#Creating GUI with tkinter
import tkinter
from tkinter import *


def send():
    """Handle a click on the Send button.

    Reads and clears the entry box; if the stripped text is non-empty, appends the
    user's line and the bot's reply to the chat log and scrolls to the bottom.
    """
    user_text = EntryBox.get("1.0", 'end-1c').strip()
    EntryBox.delete("0.0", END)

    # Nothing typed: leave the chat log untouched.
    if not user_text:
        return

    # The log is kept read-only, so unlock it for the duration of the update.
    ChatLog.config(state=NORMAL)
    ChatLog.insert(END, "You: " + user_text + '\n\n')
    ChatLog.config(foreground="#442265", font=("Verdana", 12))

    reply = chatbot_response(user_text)
    ChatLog.insert(END, "Bot: " + reply + '\n\n')

    ChatLog.config(state=DISABLED)
    ChatLog.yview(END)
 



In [30]:
# Root window: fixed 400x500 size, not resizable in either direction.
base = Tk()
base.title("Medbot")
base.geometry("400x500")
base.resizable(width=FALSE, height=FALSE)

# Create the chat window (conversation history)
ChatLog = Text(base, bd=0, bg="white", height="8", width="50", font="Arial",)

# Start read-only; send() re-enables the widget only while it inserts text.
ChatLog.config(state=DISABLED)

# Bind a scrollbar to the chat window (each side notifies the other on scroll)
scrollbar = Scrollbar(base, command=ChatLog.yview)
ChatLog['yscrollcommand'] = scrollbar.set

# Create the button that submits the message; clicking it calls send()
SendButton = Button(base, font=("Verdana",12,'bold'), text="Send", width="12", height=5,
                    bd=0, bg="#32de97", activebackground="#3c9d9b",fg='#ffffff',
                    command= send )


In [31]:

# Create the box where the user types a message
EntryBox = Text(base, bd=0, bg="white",width="29", height="5", font="Arial")
# Disabled: send() takes no parameters, while a tkinter binding passes an event
# object to its callback, so this would need a wrapper or an `event=None` parameter.
#EntryBox.bind("<Return>", send)


# Place all components on the screen using absolute pixel coordinates:
# chat log and its scrollbar on top, send button and entry box along the bottom.
scrollbar.place(x=376,y=6, height=386)
ChatLog.place(x=6,y=6, height=386, width=370)
EntryBox.place(x=128, y=401, height=90, width=265)
SendButton.place(x=6, y=401, height=90)



In [32]:
# Start the Tk event loop; this call does not return until the chat window is closed.
base.mainloop()

Conclusion: The chatbot was designed for 15 classes using a small sample of data (84 patterns). The model is currently
            overfitting (100% training accuracy vs. about 27% test accuracy) because of the small amount of data and class imbalance.