# Chatbot Application
by A4Ayub Notebooks (http://www.a4ayub.me)

## Introduction

In this NLP AI application, we build the core conversational engine for a chatbot. We use the popular NLTK text classification library to achieve this.

## Workbench

In [3]:
# import the necessary libraries
import nltk
import re
import os
import csv
from nltk.stem.snowball import SnowballStemmer
import random
from nltk.classify import SklearnClassifier
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import pandas as pd

In [2]:
# using the nltk gui download and install any of the packages you will need
nltk.download_gui()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [4]:
## Get multiple outputs in the same cell
from IPython.core.interactiveshell import InteractiveShell
# "all" makes the notebook echo every bare expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

## Ignore all warnings
import warnings
warnings.filterwarnings('ignore')
# Redundant with the blanket 'ignore' filter on the previous line
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [5]:
## Display all rows and columns of a dataframe instead of a truncated version
from IPython.display import display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Preprocess

In [6]:
sentence = "The Big brown fox jumped over a lazy dog."
sentence2 = "This is particularly important in today's world where we are swamped with unstructured natural language data on the variety of social media platforms people engage in now-a-days (note -  now-a-days in the decade of 2010-2020)"

In [7]:
#convert sentence to lower case
# String comparison is case-sensitive (this evaluates to False), hence the lower-casing
'This' == 'this'
print('AbcdEFgH'.lower())
# str.lower() returns a new string; sentence and sentence2 are not modified
sentence.lower()
sentence2.lower()

False

abcdefgh


'the big brown fox jumped over a lazy dog.'

"this is particularly important in today's world where we are swamped with unstructured natural language data on the variety of social media platforms people engage in now-a-days (note -  now-a-days in the decade of 2010-2020)"

### Tokenization

In [8]:
# The pattern keeps runs of word characters only: punctuation is dropped and
# "today's" / "now-a-days" are split into separate tokens (see the output below)
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(sentence)
tokens
tokens2 = tokenizer.tokenize(sentence2)
tokens2

['The', 'Big', 'brown', 'fox', 'jumped', 'over', 'a', 'lazy', 'dog']

['This',
 'is',
 'particularly',
 'important',
 'in',
 'today',
 's',
 'world',
 'where',
 'we',
 'are',
 'swamped',
 'with',
 'unstructured',
 'natural',
 'language',
 'data',
 'on',
 'the',
 'variety',
 'of',
 'social',
 'media',
 'platforms',
 'people',
 'engage',
 'in',
 'now',
 'a',
 'days',
 'note',
 'now',
 'a',
 'days',
 'in',
 'the',
 'decade',
 'of',
 '2010',
 '2020']

### Stopwords

In [9]:
# The membership test is case-sensitive and the tokens are not lower-cased here,
# so 'The' survives while 'over' and 'a' are removed (see the output below)
filtered_words = [w for w in tokens if not w in stopwords.words('english')]
filtered_words

['The', 'Big', 'brown', 'fox', 'jumped', 'lazy', 'dog']

In [10]:
# Same filter applied to the longer sentence; the capitalised 'This' is kept
filtered_words = [w for w in tokens2 if not w in stopwords.words('english')]
filtered_words

['This',
 'particularly',
 'important',
 'today',
 'world',
 'swamped',
 'unstructured',
 'natural',
 'language',
 'data',
 'variety',
 'social',
 'media',
 'platforms',
 'people',
 'engage',
 'days',
 'note',
 'days',
 'decade',
 '2010',
 '2020']

In [11]:
# Putting everything together in a function
def preprocess(sentence):
    """Lower-case *sentence*, tokenize it and drop English stopwords.

    Tokens are the maximal runs of word characters, so punctuation is
    discarded and a word such as "today's" is split into "today" and "s".

    Args:
        sentence: The raw text to process.

    Returns:
        A list of lower-cased tokens, in their original order, without the
        tokens that appear in NLTK's English stopword list.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(sentence.lower())
    # Fetch the stopword list once and turn it into a set: the original
    # called stopwords.words() and scanned the resulting list per token.
    english_stopwords = set(stopwords.words('english'))
    return [w for w in tokens if w not in english_stopwords]

In [12]:
preprocessed_sentence = preprocess(sentence)
print(preprocessed_sentence)

['big', 'brown', 'fox', 'jumped', 'lazy', 'dog']


In [13]:
preprocess(sentence2)

['particularly',
 'important',
 'today',
 'world',
 'swamped',
 'unstructured',
 'natural',
 'language',
 'data',
 'variety',
 'social',
 'media',
 'platforms',
 'people',
 'engage',
 'days',
 'note',
 'days',
 'decade',
 '2010',
 '2020']

### POS Tagging

In [14]:
tags = nltk.pos_tag(preprocessed_sentence)
print(tags)

[('big', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'VBD'), ('lazy', 'JJ'), ('dog', 'NN')]


In [15]:
tags = nltk.pos_tag(preprocess(sentence2))
print(tags)

[('particularly', 'RB'), ('important', 'JJ'), ('today', 'NN'), ('world', 'NN'), ('swamped', 'VBD'), ('unstructured', 'JJ'), ('natural', 'JJ'), ('language', 'NN'), ('data', 'NNS'), ('variety', 'NN'), ('social', 'JJ'), ('media', 'NNS'), ('platforms', 'NNS'), ('people', 'NNS'), ('engage', 'VBP'), ('days', 'NNS'), ('note', 'VBP'), ('days', 'NNS'), ('decade', 'NN'), ('2010', 'CD'), ('2020', 'CD')]


#### Extracting Nouns, Verbs, Adverbs, Adjectives and Pronouns

POS tag list:

1. CC coordinating conjunction
2. CD cardinal digit
3. DT determiner
4. EX existential there (like: "there is" ... think of it like "there exists")
5. FW foreign word
6. IN preposition/subordinating conjunction
7. JJ adjective 'big'
8. JJR adjective, comparative 'bigger'
9. JJS adjective, superlative 'biggest'
10. LS list marker 1)
11. MD modal could, will
12. NN noun, singular 'desk'
13. NNS noun plural 'desks'
14. NNP proper noun, singular 'Harrison'
15. NNPS proper noun, plural 'Americans'
16. PDT predeterminer 'all the kids'
17. POS possessive ending parent's
18. PRP personal pronoun I, he, she
19. PRP$ possessive pronoun my, his, hers
20. RB adverb very, silently,
21. RBR adverb, comparative better
22. RBS adverb, superlative best
23. RP particle give up
24. TO to go 'to' the store.
25. UH interjection errrrrrrrm
26. VB verb, base form take
27. VBD verb, past tense took
28. VBG verb, gerund/present participle taking
29. VBN verb, past participle taken
30. VBP verb, sing. present, non-3rd person take
31. VBZ verb, 3rd person sing. present takes
32. WDT wh-determiner which
33. WP wh-pronoun who, what
34. WP$ possessive wh-pronoun whose
35. WRB wh-adverb where, when

In [16]:
def extract_tagged(sentences):
    """Return the words whose POS tag is one of the tags kept as features.

    Args:
        sentences: Iterable of ``(word, tag)`` pairs, e.g. the output of
            ``nltk.pos_tag``.

    Returns:
        List of the words, in input order, that are tagged NN, NNS, VBN,
        VBP, VBZ, VBG, RB, PRP or JJ.
    """
    kept_tags = ('NN', 'VBN', 'NNS', 'VBP', 'RB', 'VBZ', 'VBG', 'PRP', 'JJ')
    return [word for word, tag in sentences if tag in kept_tags]

In [17]:
extract_tagged(tags)

['particularly',
 'important',
 'today',
 'world',
 'unstructured',
 'natural',
 'language',
 'data',
 'variety',
 'social',
 'media',
 'platforms',
 'people',
 'engage',
 'days',
 'note',
 'days',
 'decade']

### Lemmatization

In [18]:
lmtzr = WordNetLemmatizer()
# lemmatize() is called without a part-of-speech argument; irregular noun
# plurals are resolved ('cacti' -> 'cactus', 'feet' -> 'foot') while
# 'willing' and 'stemmed' come back unchanged (see the output below)
print(lmtzr.lemmatize('cacti'))
print(lmtzr.lemmatize('willing'))
print(lmtzr.lemmatize('feet'))
print(lmtzr.lemmatize('stemmed'))

print(lmtzr.lemmatize('cactus'))

cactus
willing
foot
stemmed
cactus


### Stemming

In [19]:
words_for_stemming = ['stem', 'stemming', 'stemmed', 'stemmer', 'stems','feet','willing']

In [20]:
# The stemmer leaves 'feet' as is and reduces 'willing' to 'will', whereas
# the lemmatizer above returned 'foot' and 'willing' for the same words
stemmer = SnowballStemmer("english")
[stemmer.stem(x) for x in words_for_stemming]

['stem', 'stem', 'stem', 'stemmer', 'stem', 'feet', 'will']

### Putting it all together

In [25]:
def extract_feature(text):
    """Turn raw text into a list of normalised feature words.

    The text is run through ``preprocess``, POS-tagged, filtered with
    ``extract_tagged``, stemmed with the module-level ``stemmer`` and
    finally lemmatized with the module-level ``lmtzr``, in that order.

    Args:
        text: The raw sentence.

    Returns:
        List of stemmed-then-lemmatized words for the tokens that survive
        the stopword and POS-tag filters.
    """
    tagged_tokens = nltk.pos_tag(preprocess(text))
    kept_words = extract_tagged(tagged_tokens)
    # Each word is stemmed first and the stem is then lemmatized, which is
    # why e.g. 'lazy' ends up as 'lazi' in the vocabulary.
    return [lmtzr.lemmatize(stemmer.stem(word)) for word in kept_words]

In [23]:
sentence

'The Big brown fox jumped over a lazy dog.'

In [26]:
# Creating the vocabulary
words = extract_feature(sentence)
print(words)

['big', 'brown', 'fox', 'lazi', 'dog']


In [27]:
# Creating the vocabulary
words = extract_feature(sentence2)
print(words)

['particular', 'import', 'today', 'world', 'unstructur', 'natur', 'languag', 'data', 'varieti', 'social', 'medium', 'platform', 'peopl', 'engag', 'day', 'note', 'day', 'decad']


In [28]:
extract_feature("He hurt his right foot while he was wearing white shoes on his feet")

['hurt', 'right', 'foot', 'wear', 'white', 'shoe', 'foot']

## Modelling

### Implementing Bag-Of-Words

A bag-of-words is a collection of words used to represent a sentence, disregarding the order in which the words appear.

In [29]:
# Function to build a feature dictionary from the words of the vocabulary
def word_feats(words):
    """Build a bag-of-words feature dict mapping every word to ``True``.

    Duplicate words collapse into a single key.
    """
    return {word: True for word in words}

In [30]:
word_feats(words)

{'particular': True,
 'import': True,
 'today': True,
 'world': True,
 'unstructur': True,
 'natur': True,
 'languag': True,
 'data': True,
 'varieti': True,
 'social': True,
 'medium': True,
 'platform': True,
 'peopl': True,
 'engag': True,
 'day': True,
 'note': True,
 'decad': True}

In [31]:
# Function to parse an entire document
def extract_feature_from_doc(data):
    """Extract classifier features, the corpus and the answers from a document.

    Args:
        data: Iterable of ``(text, category, answer)`` triples.

    Returns:
        A 3-tuple ``(result, corpus, answers)`` where ``result`` is a list
        of ``(feature_dict, category)`` pairs (one per row), ``corpus`` is
        a flat list of every feature word in row order, and ``answers``
        maps each category to the chat bot's response. When a category
        occurs in several rows, the answer of the last such row is kept.
    """
    result = []
    corpus = []

    # The responses of the chat bot
    answers = {}
    for text, category, answer in data:
        features = extract_feature(text)

        # Grow the flat word list directly. Collecting per-row lists and
        # flattening them with sum(lists, []) re-copies the accumulated
        # list for every row, which is quadratic in the corpus size.
        corpus.extend(features)
        result.append((word_feats(features), category))
        answers[category] = answer

    return (result, corpus, answers)

In [32]:
# test
extract_feature_from_doc([['this is the input text from the user','category','answer to give']])

([({'input': True, 'user': True}, 'category')],
 ['input', 'user'],
 {'category': 'answer to give'})

In [33]:
# Open a text file and read the content
def get_content(filename):
    """Read a pipe-delimited text file and return its well-formed rows.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of rows, each a list of exactly three strings. Rows with any
        other number of fields (blank lines included) are skipped.
    """
    # newline='' leaves newline handling to the csv module, as the csv
    # documentation requires for file objects handed to csv.reader.
    with open(filename, 'r', newline='') as content_file:
        rows = csv.reader(content_file, delimiter='|')
        return [row for row in rows if len(row) == 3]

In [34]:
filename = './datasets/leaves.txt'
data = get_content(filename)

In [35]:
data

[['Hello',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi hello',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi ',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi', 'Greetings', 'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi', 'Greetings', 'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hey',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hello, hi',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hey',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hey, hi',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hey, hello',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['Good morning',
  'Morning',
  'Good Morning. I am Dexter. I will serve your leave enquiries.'],
 ['Good afternoon',
  'Afternoon',
  

In [36]:
features_data, corpus, answers = extract_feature_from_doc(data)

In [37]:
print(features_data[50])

({'mani': True, 'option': True, 'leav': True}, 'Utilized-Optional-Leaves')


In [38]:
corpus

['hello',
 'hi',
 'hello',
 'hi',
 'hi',
 'hi',
 'hey',
 'hello',
 'hi',
 'hey',
 'hey',
 'hi',
 'hey',
 'hello',
 'good',
 'morn',
 'good',
 'afternoon',
 'good',
 'even',
 'good',
 'night',
 'today',
 'want',
 'help',
 'need',
 'help',
 'help',
 'want',
 'help',
 'want',
 'assist',
 'help',
 'great',
 'talk',
 'great',
 'thank',
 'help',
 'thank',
 'thank',
 'much',
 'thank',
 'thank',
 'much',
 'mani',
 'type',
 'leav',
 'type',
 'leav',
 'type',
 'leav',
 'type',
 'leav',
 'type',
 'mani',
 'leav',
 'taken',
 'mani',
 'leav',
 'alreadi',
 'taken',
 'mani',
 'annual',
 'leav',
 'mani',
 'annual',
 'leav',
 'taken',
 'mani',
 'annual',
 'leav',
 'alreadi',
 'taken',
 'annual',
 'leav',
 'count',
 'taken',
 'mani',
 'annual',
 'leav',
 'taken',
 'number',
 'annual',
 'leav',
 'taken',
 'annual',
 'leav',
 'taken',
 'number',
 'annual',
 'leav',
 'alreadi',
 'taken',
 'annual',
 'leav',
 'taken',
 'annual',
 'leav',
 'alreadi',
 'taken',
 'number',
 'annual',
 'leav',
 'taken',
 'numbe

In [39]:
answers

{'Greetings': 'Hello. I am Dexter. I will serve your leave enquiries.',
 'Morning': 'Good Morning. I am Dexter. I will serve your leave enquiries.',
 'Afternoon': 'Good afternoon. I am Dexter. I will serve your leave enquiries.',
 'Evening': 'Good evening. I am Dexter. I will serve your leave enquiries.',
 'Goodbye': 'Good night. Take care.',
 'Opening': "I'm fine! Thank you. How can I help you?",
 'Help': 'How can I help you?',
 'No-Help': 'Ok sir/madam. No problem. Have a nice day.',
 'Closing': "It's glad to know that I have been helpful. Have a good day!",
 'Leaves-Type': 'Currently I know about two: annual and optional leaves.',
 'Default-Utilized-Annual-Leaves': 'You have used 12 annual leaves.',
 'Utilized-Annual-Leaves': 'You have taken 12 annual leaves.',
 'Utilized-Optional-Leaves': 'You have taken 1 optional leaves.',
 'Default-Balance-Annual-Leaves': 'You have 25 annual leaves left.',
 'Balance-Annual-Leaves': 'You have 25 annual leaves remaining.',
 'Balance-Optional-Leave

## Training

In [40]:
## split data into train and test sets
split_ratio = 0.8

In [41]:
def split_dataset(data, split_ratio):
    """Randomly split *data* into a training part and a test part.

    Args:
        data: Sequence of samples. It is not modified: a shuffled copy is
            what gets split.
        split_ratio: Fraction of the samples that goes into the first
            (training) part.

    Returns:
        A ``(train, test)`` tuple of lists. ``train`` holds the first
        ``int(len(data) * split_ratio)`` shuffled samples and ``test``
        holds the remainder.
    """
    # Shuffle a copy so the caller's list keeps its original order;
    # random.shuffle works in place.
    shuffled = list(data)
    random.shuffle(shuffled)
    train_split = int(len(shuffled) * split_ratio)
    return shuffled[:train_split], shuffled[train_split:]

In [42]:
training_data, test_data = split_dataset(features_data, split_ratio)

In [43]:
training_data

[({'good': True, 'night': True}, 'Goodbye'),
 ({'option': True, 'leav': True, 'count': True, 'taken': True},
  'Utilized-Optional-Leaves'),
 ({'hey': True}, 'Greetings'),
 ({'option': True, 'leav': True, 'take': True}, 'Balance-Optional-Leaves'),
 ({'previous': True,
   'year': True,
   'carri': True,
   'forward': True,
   'leav': True},
  'CF'),
 ({'option': True, 'leav': True, 'count': True, 'taken': True},
  'Utilized-Optional-Leaves'),
 ({'option': True}, 'Balance-Optional-Leaves'),
 ({'annual': True, 'leav': True, 'remain': True}, 'Balance-Annual-Leaves'),
 ({'type': True, 'leav': True}, 'Leaves-Type'),
 ({'mani': True, 'annual': True, 'leav': True}, 'Balance-Annual-Leaves'),
 ({'annual': True, 'leav': True, 'count': True, 'remain': True},
  'Balance-Annual-Leaves'),
 ({'today': True}, 'Opening'),
 ({'remain': True, 'annual': True, 'leav': True}, 'Balance-Annual-Leaves'),
 ({'tell': True, 'carri': True, 'forward': True, 'leav': True, 'count': True},
  'CF'),
 ({'option': True, 'l

In [44]:
# save the data
# np.save appends the '.npy' extension to these paths; the samples are
# (feature_dict, label) pairs, so they are stored as pickled Python objects
np.save('./split-dataset/training_data', training_data)
np.save('./split-dataset/test_data', test_data)

### Decision Tree Classifier

In [48]:
## Load the data
# allow_pickle=True is needed because the saved arrays hold Python objects.
# NOTE(review): unpickling can execute arbitrary code - only load .npy files
# that were produced by this notebook or another trusted source.
training_data = np.load('./split-dataset/training_data.npy', allow_pickle=True)
test_data = np.load('./split-dataset/test_data.npy', allow_pickle=True)

In [49]:
def train_using_decision_tree(training_data, test_data):
    """Train an NLTK decision tree classifier and measure its accuracy.

    The training-set and test-set accuracy are printed as they are
    computed.

    Args:
        training_data: Labelled feature sets used to fit the tree.
        test_data: Labelled feature sets used only for evaluation.

    Returns:
        ``(classifier, classifier_name, test_set_accuracy,
        training_set_accuracy)``; note that the test accuracy comes first.
    """
    tree = nltk.classify.DecisionTreeClassifier.train(
        training_data,
        entropy_cutoff=0.6,
        support_cutoff=6,
    )
    tree_name = type(tree).__name__

    train_score = nltk.classify.accuracy(tree, training_data)
    print('training set accuracy: ', train_score)

    test_score = nltk.classify.accuracy(tree, test_data)
    print('test set accuracy: ', test_score)

    return tree, tree_name, test_score, train_score

In [50]:
dtclassifier, classifier_name, test_set_accuracy, training_set_accuracy = train_using_decision_tree(training_data, test_data)

training set accuracy:  0.9035087719298246
test set accuracy:  0.7931034482758621


### Naive Bayes Classifier

In [51]:
def train_using_naive_bayes(training_data, test_data):
    """Train an NLTK Naive Bayes classifier and measure its accuracy.

    Args:
        training_data: Labelled feature sets used to fit the model.
        test_data: Labelled feature sets used only for evaluation.

    Returns:
        ``(classifier, classifier_name, test_set_accuracy,
        training_set_accuracy)``; note that the test accuracy comes first.
    """
    model = nltk.NaiveBayesClassifier.train(training_data)
    model_name = type(model).__name__
    # Evaluate on the training set first, then on the held-out set.
    train_score, test_score = (
        nltk.classify.accuracy(model, dataset)
        for dataset in (training_data, test_data)
    )
    return model, model_name, test_score, train_score

In [54]:
# Rebinds classifier_name, test_set_accuracy and training_set_accuracy, which
# previously held the decision tree results; the tree itself stays in dtclassifier
classifier, classifier_name, test_set_accuracy, training_set_accuracy = train_using_naive_bayes(training_data, test_data)
print(training_set_accuracy)
print(test_set_accuracy)
print("Count of most informative features :: ",len(classifier.most_informative_features()))
classifier.show_most_informative_features()

0.8596491228070176
0.7931034482758621
Count of most informative features ::  70
Most Informative Features
                    leav = None           Greeti : Balanc =     11.9 : 1.0
                   taken = None           Balanc : Utiliz =      5.1 : 1.0
                   count = True           Utiliz : Balanc =      3.9 : 1.0
                    help = True             Help : Closin =      3.9 : 1.0
                      hi = None           Utiliz : Greeti =      3.1 : 1.0
                   carri = None           Utiliz : CF     =      3.0 : 1.0
                  remain = None           Utiliz : Balanc =      3.0 : 1.0
                   thank = None           Utiliz : Closin =      2.7 : 1.0
                    mani = True           Defaul : Balanc =      2.6 : 1.0
                    take = True           Defaul : Balanc =      2.1 : 1.0


In [56]:
# The kind of inputs accepted by the classifiers
classifier.classify(({'mani': True, 'option': True, 'leav': True}))

'Utilized-Optional-Leaves'

In [57]:
# Passing normal user input to the classifier
# Raw text is first reduced to feature words (extract_feature) and then turned
# into the {word: True} dict form that the classifier accepts (word_feats)
input_sentence = "how many balanced leaves do I have?"
classifier.classify(word_feats(extract_feature(input_sentence)))

'Utilized-Optional-Leaves'

In [58]:
## Creating a reply function
def reply(input_sentence):
    """Return the stored answer for the category predicted for the input.

    The sentence is converted to bag-of-words features, classified with
    the module-level ``dtclassifier`` and the predicted category is looked
    up in the module-level ``answers`` dict.

    Args:
        input_sentence: The raw text typed by the user.

    Returns:
        The answer string registered for the predicted category.
    """
    features = word_feats(extract_feature(input_sentence))
    predicted_category = dtclassifier.classify(features)
    return answers[predicted_category]

In [59]:
reply('Hi')

'Hello. I am Dexter. I will serve your leave enquiries.'

In [60]:
reply('How many annual leaves do I have left?')

'You have 25 annual leaves remaining.'

In [61]:
reply('How many leaves have I taken?')

'You have 25 annual leaves left.'

In [62]:
reply('Thanks!')

"It's glad to know that I have been helpful. Have a good day!"

# End 

# Follow me for the next episodes, where I will cover

1. Bundle the basic chatbot in a REST API
2. Make it accessible from a REST API
3. Make it Accessible from a mobile app
4. Build another chatbot using sci-kit learn algorithms and not NLTK
5. Build another chatbot using Deep Learning Techniques
6. Expose the two chatbots using REST API End-points
7. Make both chatbots accessible from a REST API
8. Make both chatbots accessible from mobile app