## Chatbot from Scratch
A Chatbot for mental health support

Dataset source: https://www.kaggle.com/datasets/elvis23/mental-health-conversational-data

In [1]:
import random
import json
import pickle
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import SGD

In [2]:
with open("intents.json", 'r') as f:
    data = json.load(f)

In [3]:
type(data)

dict

In [4]:
data['intents'][0]

{'tag': 'greeting',
 'patterns': ['Hi',
  'Hey',
  'Is anyone there?',
  'Hi there',
  'Hello',
  'Hey there',
  'Howdy',
  'Hola',
  'Bonjour',
  'Konnichiwa',
  'Guten tag',
  'Ola'],
 'responses': ['Hello there. Tell me how are you feeling today?',
  'Hi there. What brings you here today?',
  'Hi there. How are you feeling today?',
  'Great to see you. How do you feel currently?',
  "Hello there. Glad to see you're back. What's going on in your world right now?"]}

In [5]:
# words_in_all_patterns: the vocabulary
words_in_all_patterns = []
all_tags = []
tokenized_patterns_and_their_tags = []
punctuations_to_remove = [',', '.', '?', '!', ';']

In [6]:
# Creating a list of all patterns tokenized and their tag
# and a list of all unique tags in the data
for dict in data['intents']:
    for pattern in dict['patterns']:
        tokenized_pattern = nltk.word_tokenize(pattern)
        words_in_all_patterns.extend(tokenized_pattern)
        tokenized_patterns_and_their_tags.append((tokenized_pattern, dict['tag']))
        
        if dict['tag'] not in all_tags:
            all_tags.append(dict['tag'])

In [7]:
tokenized_patterns_and_their_tags[100:105]

[(['I', "'m", 'so', 'anxious', 'because', 'of'], 'anxious'),
 (['I', 'do', "n't", 'want', 'to', 'talk', 'about', 'it', '.'], 'not-talking'),
 (['No', 'just', 'stay', 'away', '.'], 'not-talking'),
 (['I', 'ca', "n't", 'bring', 'myself', 'to', 'open', 'up', '.'],
  'not-talking'),
 (['Just', 'shut', 'up'], 'not-talking')]

In [8]:
len(tokenized_patterns_and_their_tags)

238

In [9]:
print(all_tags[:10])
print(len(all_tags))

['greeting', 'morning', 'afternoon', 'evening', 'night', 'goodbye', 'thanks', 'no-response', 'neutral-response', 'about']
80


In [10]:
len(all_tags)

80

In [11]:
words_in_all_patterns[:10]

['Hi', 'Hey', 'Is', 'anyone', 'there', '?', 'Hi', 'there', 'Hello', 'Hey']

In [12]:
len(words_in_all_patterns)

1203

In [13]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\durge\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\durge\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [15]:
lemmatizer = WordNetLemmatizer()

In [16]:
# Converting words to lower case, lemmatization and punctuation removal
words_in_all_patterns = [lemmatizer.lemmatize(word.lower()) 
                         for word in words_in_all_patterns 
                            if word not in punctuations_to_remove
                        ]
words_in_all_patterns[:10]

['hi', 'hey', 'is', 'anyone', 'there', 'hi', 'there', 'hello', 'hey', 'there']

In [17]:
len(words_in_all_patterns)  # Previous was 2406

1086

In [18]:
# Keeping only unique words
unique_words_in_all_patterns = []
for word in words_in_all_patterns:
    if word not in unique_words_in_all_patterns:
        unique_words_in_all_patterns.append(word)

unique_words_in_all_patterns[:10]

['hi',
 'hey',
 'is',
 'anyone',
 'there',
 'hello',
 'howdy',
 'hola',
 'bonjour',
 'konnichiwa']

In [19]:
len(unique_words_in_all_patterns)

296

### Converting Patterns to Vectors

In [20]:
tokenized_patterns_and_their_tags[100:105]

[(['I', "'m", 'so', 'anxious', 'because', 'of'], 'anxious'),
 (['I', 'do', "n't", 'want', 'to', 'talk', 'about', 'it', '.'], 'not-talking'),
 (['No', 'just', 'stay', 'away', '.'], 'not-talking'),
 (['I', 'ca', "n't", 'bring', 'myself', 'to', 'open', 'up', '.'],
  'not-talking'),
 (['Just', 'shut', 'up'], 'not-talking')]

In [21]:
unique_words_in_all_patterns[100:105]

['it', 'depressed', 'think', "'m", 'depression']

In [22]:
all_tags[:10]

['greeting',
 'morning',
 'afternoon',
 'evening',
 'night',
 'goodbye',
 'thanks',
 'no-response',
 'neutral-response',
 'about']

In [23]:
# Creating the training data
# Preprocessing patterns - converting words to lower case, lemmatization and puncuation removal (same as words_in_all_patterns)

training_data = []

for tokenized_pattern_and_tag_pair in tokenized_patterns_and_their_tags:
    bag_of_words_vec = [0] * len(unique_words_in_all_patterns)
    one_hot_tag_vec = [0] * len(all_tags)

    tokenized_pattern = tokenized_pattern_and_tag_pair[0]
    tag_of_pattern = tokenized_pattern_and_tag_pair[1]
    tokenized_pattern = [lemmatizer.lemmatize(word.lower()) 
                            for word in tokenized_pattern 
                                if word not in punctuations_to_remove
                        ]

    for word in tokenized_pattern:
        if word in unique_words_in_all_patterns:
            bag_of_words_vec[unique_words_in_all_patterns.index(word)] += 1
    
    
    # for word in unique_words_in_all_patterns:
    #     bag.append(1) if word in tokenized_pattern_and_tag_pair else bag.append(0)
    
    one_hot_tag_vec[all_tags.index(tag_of_pattern)] = 1
    training_data.append([bag_of_words_vec, one_hot_tag_vec])

In [24]:
len(training_data)

238

In [25]:
print(training_data[12])

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

### Building the Neural Network Model
To generate tag for input pattern

Neural Network

In [26]:
model = Sequential()
model.add(Dense(128, input_shape = (len(bag_of_words_vec), ), activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(len(all_tags), activation = 'softmax'))

In [27]:
sgd_optimizer = SGD(learning_rate = 0.01, decay = 1e-6, momentum = 0.9, nesterov = True)

In [28]:
model.compile(loss = 'categorical_crossentropy', optimizer = sgd_optimizer, metrics = ['accuracy'])

In [29]:
X_train = [vectorized_pattern_tag_pair[0] for vectorized_pattern_tag_pair in training_data]
y_train = [vectorized_pattern_tag_pair[1] for vectorized_pattern_tag_pair in training_data]

In [30]:
model.fit(np.array(X_train), np.array(y_train), epochs = 80, batch_size = 10, verbose = 1)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


<keras.callbacks.History at 0x1b67a445ca0>

### Dealing with User Input

In [31]:
# Preprocessing user input in same way as words_in_all_patterns and tokenized_patterns
def preprocess_user_input(user_input):
    words_in_user_input = nltk.word_tokenize(user_input)
    words_in_user_input = [ lemmatizer.lemmatize(word.lower()) 
                            for word in words_in_user_input 
                               if word not in punctuations_to_remove
                            ]
    return words_in_user_input

In [32]:
prompt = "I am very stressed."
preprocess_user_input(user_input = prompt)

['i', 'am', 'very', 'stressed']

In [33]:
unique_words_in_all_patterns[:10]

['hi',
 'hey',
 'is',
 'anyone',
 'there',
 'hello',
 'howdy',
 'hola',
 'bonjour',
 'konnichiwa']

In [34]:
len(unique_words_in_all_patterns)

296

In [35]:
def make_bag_of_words(user_input):
    words_in_cleaned_user_input = preprocess_user_input(user_input)
    bag_of_words = [0] * len(unique_words_in_all_patterns)
    
    # w: word in user input
    # word: word in the list of all words
    for word in (words_in_cleaned_user_input):
        if word in unique_words_in_all_patterns:
            bag_of_words[unique_words_in_all_patterns.index(word)] += 1

    # for w in words_in_cleaned_user_input:
    #     for i, word in enumerate(words_in_all_patterns):
    #         if word == w:
    #             bag[i] = 1
    
    return np.array(bag_of_words)

In [36]:
vectorized_prompt = make_bag_of_words(user_input = prompt)
vectorized_prompt

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [37]:
model_output = model.predict(np.array([vectorized_prompt]), verbose = 0)
model_output

array([[4.81932511e-05, 1.66534679e-04, 5.06788747e-05, 2.76808623e-05,
        6.19347324e-04, 4.75378192e-06, 9.43166378e-06, 1.89419996e-04,
        2.72493577e-04, 9.25365930e-06, 8.61010540e-06, 2.68264248e-05,
        4.91533428e-03, 2.16025164e-05, 5.10134362e-02, 9.00073230e-01,
        2.34287675e-03, 5.89044951e-03, 1.48207238e-02, 8.40683235e-04,
        4.88341693e-03, 8.11044883e-05, 5.10135083e-04, 2.47568882e-04,
        4.85489145e-05, 2.89342938e-06, 5.46387600e-05, 9.54167219e-04,
        1.44542457e-04, 6.46095759e-06, 1.02226331e-04, 1.56241615e-04,
        7.26857763e-07, 7.69055532e-06, 5.76134084e-07, 2.59512558e-06,
        1.62902623e-04, 4.05896601e-04, 2.12580402e-04, 4.63439705e-04,
        1.03411812e-03, 7.59156886e-04, 1.19172955e-05, 8.67500421e-06,
        1.62269163e-03, 9.45456617e-04, 1.53980695e-03, 1.18852258e-04,
        7.26772560e-05, 7.42820976e-06, 2.42069636e-05, 8.41053625e-05,
        1.21597748e-03, 1.31726638e-05, 1.81982497e-04, 7.880476

In [38]:
len(model_output[0]), len(all_tags)

(80, 80)

In [39]:
np.where(model_output[0] == max(model_output[0]))[0][0]

15

In [40]:
all_tags[15]

'stressed'

In [41]:
def get_tag_for_user_input(user_input):
    vectorized_user_input = make_bag_of_words(user_input)
    model_output = model.predict(np.array([vectorized_user_input]), verbose = 0)

    max_probability = max(model_output[0])
    
    if max_probability > 0.25:
        predicted_tag = all_tags[np.where(model_output[0] == max_probability)[0][0]]
        return predicted_tag
    
    else:
        return None

In [42]:
predicted_tag_for_prompt = get_tag_for_user_input(user_input = prompt)
print(predicted_tag_for_prompt)

stressed


In [43]:
def get_chatbot_response(predicted_tag):
    if predicted_tag == None:
        return "Sorry. I do not understand."
    else:
        for dict in data['intents']:
            if dict["tag"] == predicted_tag:
                return random.choice(dict["responses"])

In [44]:
get_chatbot_response(predicted_tag = predicted_tag_for_prompt)

'Take a deep breath and gather your thoughts. Go take a walk if possible. Stay hydrated'

### Using the Chatbot

In [51]:
print("Chatbot is running!")
while True:
    prompt = input("User:")
    print("User:", prompt)
    predicted_tag_for_prompt = get_tag_for_user_input(prompt)
    response = get_chatbot_response(predicted_tag = predicted_tag_for_prompt)
    print("Chatbot:", response)
    print()

Chatbot is running!
User: Good Morning
Chatbot: Good morning. I hope you had a good night's sleep. How are you feeling today? 

User: I am feeling stressed
Chatbot: I am sorry to hear that. What is the reason behind this?

User: My exams are coming and I feel like I am not prepared
Chatbot: I see. Have you taken any approaches to not feel this way?

User: Yes. I try to do deep breathing to calm myself down but it doesn't seem to work. Can you help me?
Chatbot: I understand how you feel. Don't put yourself down because of it.



KeyboardInterrupt: Interrupted by user