# Core engine of a chat bot

Human language is astoundingly complex and diverse. When we write, we often misspell or abbreviate words, or omit punctuation, so much of the text around us is unstructured data. Natural language processing (NLP) helps computers communicate with humans in their own language and scales other language-related tasks. For example, NLP makes it possible for computers to read text, interpret it, measure sentiment and determine which parts are important. Understanding this will enable you to build the core component — the core engine — of any conversational chatbot.<br/>

Detecting patterns is a central part of Natural Language Processing. Words ending in *-ed* tend to be past tense verbs. Frequent use of *will* is indicative of news text (3). These observable patterns — word structure and word frequency — happen to correlate with particular aspects of meaning, such as tense and topic.<br/>

Here I will be creating the core engine of the chat bot.

### Import required libraries

In [1]:
import nltk

In [2]:
import re
import os
import csv
from nltk.stem.snowball import SnowballStemmer
import random
from nltk.classify import SklearnClassifier
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import pandas as pd

In [3]:
## Get multiple outputs in the same cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Ignore all warnings
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [4]:
## Display all rows and columns of a dataframe instead of a truncated version
from IPython.display import display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### 1. Pre-processing

In [5]:
sentence = "This is a CHAT BOT Engine POC"
sentence2 = "This is particularly important in today's world where we are swamped with unstructured natural language data on the variety of social media platforms people engage in now-a-days (note -  now-a-days in the decade of 2010-2020)"

In [6]:
sentence

'This is a CHAT BOT Engine POC'

In [7]:
## Lower the text that is input for pre-processing
sentence.lower()
sentence2.lower()

'this is a chat bot engine poc'

"this is particularly important in today's world where we are swamped with unstructured natural language data on the variety of social media platforms people engage in now-a-days (note -  now-a-days in the decade of 2010-2020)"

#### Tokenize the text - extracting individual words

In [8]:
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(sentence)
tokens 

tokens2 = tokenizer.tokenize(sentence2)
tokens2

['This', 'is', 'a', 'CHAT', 'BOT', 'Engine', 'POC']

['This',
 'is',
 'particularly',
 'important',
 'in',
 'today',
 's',
 'world',
 'where',
 'we',
 'are',
 'swamped',
 'with',
 'unstructured',
 'natural',
 'language',
 'data',
 'on',
 'the',
 'variety',
 'of',
 'social',
 'media',
 'platforms',
 'people',
 'engage',
 'in',
 'now',
 'a',
 'days',
 'note',
 'now',
 'a',
 'days',
 'in',
 'the',
 'decade',
 'of',
 '2010',
 '2020']

#### Stopwords: filter out the non-useful words

In [9]:
filtered_words = [word for word in tokens if not word in stopwords.words('english')]
filtered_words

['This', 'CHAT', 'BOT', 'Engine', 'POC']

In [10]:
filtered_words2 = [word for word in tokens2 if not word in stopwords.words('english')]
filtered_words2

['This',
 'particularly',
 'important',
 'today',
 'world',
 'swamped',
 'unstructured',
 'natural',
 'language',
 'data',
 'variety',
 'social',
 'media',
 'platforms',
 'people',
 'engage',
 'days',
 'note',
 'days',
 'decade',
 '2010',
 '2020']

#### Putting all pre-processing steps together

In [11]:
def preprocess(sentence):
    """Lowercase, tokenize and stop-word-filter a sentence.

    Args:
        sentence: Raw input text.

    Returns:
        A list of lowercase word tokens (runs of word characters), in
        their original order, excluding every token that appears in
        NLTK's English stop-word list.
    """
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    # Fetch the stop-word list once and keep it as a set. The earlier
    # comprehension called stopwords.words('english') for every token and
    # then scanned the resulting list linearly.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in tokens if word not in english_stopwords]

In [12]:
preprocessed_sentence = preprocess(sentence)
print(preprocessed_sentence)

['chat', 'bot', 'engine', 'poc']


### 2. Tagging

In [13]:
tags = nltk.pos_tag(preprocessed_sentence)
tags

[('chat', 'RB'), ('bot', 'VBZ'), ('engine', 'NN'), ('poc', 'NN')]

In [14]:
tags = nltk.pos_tag(preprocess(sentence2))
print(tags)

[('particularly', 'RB'), ('important', 'JJ'), ('today', 'NN'), ('world', 'NN'), ('swamped', 'VBD'), ('unstructured', 'JJ'), ('natural', 'JJ'), ('language', 'NN'), ('data', 'NNS'), ('variety', 'NN'), ('social', 'JJ'), ('media', 'NNS'), ('platforms', 'NNS'), ('people', 'NNS'), ('engage', 'VBP'), ('days', 'NNS'), ('note', 'VBP'), ('days', 'NNS'), ('decade', 'NN'), ('2010', 'CD'), ('2020', 'CD')]


#### Extracting only nouns, verbs, adverbs, adjectives and pronouns

In [15]:
def extract_tagged_words(sentences):
    """Keep the words whose part-of-speech tag is on the allow-list.

    Args:
        sentences: Iterable of ``(word, tag)`` pairs.

    Returns:
        The words, in input order, whose tag is one of NN, NNS, VBN, VBP,
        VBZ, VBG, RB, PRP or JJ. Pairs with any other tag are dropped.
    """
    wanted_tags = ('NN', 'VBN', 'NNS', 'VBP', 'RB', 'VBZ', 'VBG', 'PRP', 'JJ')
    return [token for token, pos in sentences if pos in wanted_tags]


In [16]:
extract_tagged_words(tags)

['particularly',
 'important',
 'today',
 'world',
 'unstructured',
 'natural',
 'language',
 'data',
 'variety',
 'social',
 'media',
 'platforms',
 'people',
 'engage',
 'days',
 'note',
 'days',
 'decade']

#### Lemmatize words

In [19]:
lmtzr = WordNetLemmatizer()

print(lmtzr.lemmatize('cacti'))
print(lmtzr.lemmatize('willing'))
print(lmtzr.lemmatize('feet'))
print(lmtzr.lemmatize('stemmed'))

print(lmtzr.lemmatize('cactus'))

cactus
willing
foot
stemmed
cactus


#### Stem the words 

In [20]:
words_for_stemming = ['stem', 'stemming', 'stemmed', 'stemmer', 'stems','feet','willing']

In [22]:
stemmer = SnowballStemmer("english")
[stemmer.stem(x) for x in words_for_stemming]

['stem', 'stem', 'stem', 'stemmer', 'stem', 'feet', 'will']

#### Putting it all together

In [31]:
def extract_features(text):
    """Turn raw text into a list of normalized feature words.

    The text is run through ``preprocess``, tagged with ``nltk.pos_tag``,
    filtered by ``extract_tagged_words``, and each surviving word is then
    stemmed with the module-level ``stemmer`` and lemmatized with the
    module-level ``lmtzr``.

    Args:
        text: Raw input sentence.

    Returns:
        A list of stemmed-then-lemmatized words, in sentence order.
    """
    tagged = nltk.pos_tag(preprocess(text))
    kept = extract_tagged_words(tagged)
    # Stemming happens first; the lemmatizer is applied to the stem.
    return [lmtzr.lemmatize(stemmer.stem(word)) for word in kept]

In [32]:
sentence

'This is a CHAT BOT Engine POC'

In [34]:
words = extract_features(sentence)
print(words)

['chat', 'bot', 'engin', 'poc']


In [35]:
sentence2

"This is particularly important in today's world where we are swamped with unstructured natural language data on the variety of social media platforms people engage in now-a-days (note -  now-a-days in the decade of 2010-2020)"

In [36]:
words = extract_features(sentence2)
print(words)

['particular', 'import', 'today', 'world', 'unstructur', 'natur', 'languag', 'data', 'varieti', 'social', 'medium', 'platform', 'peopl', 'engag', 'day', 'note', 'day', 'decad']


### 3. Implementing Bag of Words
In simple terms, it’s a collection of words to represent a sentence, disregarding the order in which they appear.

In [37]:
def word_features(words):
    """Build a bag-of-words feature dict.

    Args:
        words: Iterable of (hashable) words.

    Returns:
        A dict mapping every distinct word to ``True``; duplicates collapse
        into a single key and word order carries no meaning.
    """
    return {token: True for token in words}

In [38]:
word_features(words)

{'particular': True,
 'import': True,
 'today': True,
 'world': True,
 'unstructur': True,
 'natur': True,
 'languag': True,
 'data': True,
 'varieti': True,
 'social': True,
 'medium': True,
 'platform': True,
 'peopl': True,
 'engag': True,
 'day': True,
 'note': True,
 'decad': True}

#### Parsing the whole document

In [39]:
def extract_features_from_doc(data):
    """Convert labelled rows into classifier features, a corpus and answers.

    Args:
        data: Iterable of ``(text, category, answer)`` triples.

    Returns:
        A 3-tuple ``(result, corpus, answers)``:

        * ``result``  - list of ``(feature_dict, category)`` pairs, one per
          input row, where ``feature_dict`` comes from ``word_features``.
        * ``corpus``  - flat list of every extracted feature word, in row
          order (duplicates kept).
        * ``answers`` - dict mapping each category to its answer, i.e. the
          chatbot's response; if a category occurs on several rows the
          last row's answer is the one stored.
    """
    result = []
    corpus = []
    answers = {}

    for text, category, answer in data:
        features = extract_features(text)

        # Grow the flat corpus directly. Collecting per-row lists and
        # flattening them afterwards with sum(lists, []) copies the
        # accumulated list once per row, which is quadratic in corpus size.
        corpus.extend(features)
        result.append((word_features(features), category))
        answers[category] = answer
    return (result, corpus, answers)

In [41]:
extract_features_from_doc([['this is the input text from the user','category','answer to give']])

([({'input': True, 'user': True}, 'category')],
 ['input', 'user'],
 {'category': 'answer to give'})

In [42]:
def get_content(filename):
    """Read a pipe-delimited file and return its well-formed rows.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of rows, each a list of exactly three strings. Lines that do
        not split into exactly three ``|``-separated fields (including
        blank lines) are skipped.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' hands newline handling to the csv module, as the csv
    # documentation requires for file objects given to csv.reader; without
    # it, newlines embedded inside quoted fields are altered on read.
    with open(filename, 'r', newline='') as content_file:
        rows = csv.reader(content_file, delimiter='|')
        return [row for row in rows if len(row) == 3]

In [46]:
filename = 'leaves.txt'
data = get_content(filename)

In [47]:
data

[['Hello',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi hello',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi ',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi', 'Greetings', 'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi', 'Greetings', 'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hey',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hello, hi',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hey',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hey, hi',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hey, hello',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['Good morning',
  'Morning',
  'Good Morning. I am Dexter. I will serve your leave enquiries.'],
 ['Good afternoon',
  'Afternoon',
  

In [48]:
features_data, corpus, answers = extract_features_from_doc(data)

In [51]:
print(features_data[100])

({'annual': True, 'leav': True, 'balanc': True}, 'Balance-Annual-Leaves')


In [52]:
corpus

['hello',
 'hi',
 'hello',
 'hi',
 'hi',
 'hi',
 'hey',
 'hello',
 'hi',
 'hey',
 'hey',
 'hi',
 'hey',
 'hello',
 'good',
 'morn',
 'good',
 'afternoon',
 'good',
 'even',
 'good',
 'night',
 'today',
 'want',
 'help',
 'need',
 'help',
 'help',
 'want',
 'help',
 'want',
 'assist',
 'help',
 'great',
 'talk',
 'great',
 'thank',
 'help',
 'thank',
 'thank',
 'much',
 'thank',
 'thank',
 'much',
 'mani',
 'type',
 'leav',
 'type',
 'leav',
 'type',
 'leav',
 'type',
 'leav',
 'type',
 'mani',
 'leav',
 'taken',
 'mani',
 'leav',
 'alreadi',
 'taken',
 'mani',
 'annual',
 'leav',
 'mani',
 'annual',
 'leav',
 'taken',
 'mani',
 'annual',
 'leav',
 'alreadi',
 'taken',
 'annual',
 'leav',
 'count',
 'taken',
 'mani',
 'annual',
 'leav',
 'taken',
 'number',
 'annual',
 'leav',
 'taken',
 'annual',
 'leav',
 'taken',
 'number',
 'annual',
 'leav',
 'alreadi',
 'taken',
 'annual',
 'leav',
 'taken',
 'annual',
 'leav',
 'alreadi',
 'taken',
 'number',
 'annual',
 'leav',
 'taken',
 'numbe

In [53]:
answers

{'Greetings': 'Hello. I am Dexter. I will serve your leave enquiries.',
 'Morning': 'Good Morning. I am Dexter. I will serve your leave enquiries.',
 'Afternoon': 'Good afternoon. I am Dexter. I will serve your leave enquiries.',
 'Evening': 'Good evening. I am Dexter. I will serve your leave enquiries.',
 'Goodbye': 'Good night. Take care.',
 'Opening': "I'm fine! Thank you. How can I help you?",
 'Help': 'How can I help you?',
 'No-Help': 'Ok sir/madam. No problem. Have a nice day.',
 'Closing': "It's glad to know that I have been helpful. Have a good day!",
 'Leaves-Type': 'Currently I know about two: annual and optional leaves.',
 'Default-Utilized-Annual-Leaves': 'You have used 12 annual leaves.',
 'Utilized-Annual-Leaves': 'You have taken 12 annual leaves.',
 'Utilized-Optional-Leaves': 'You have taken 1 optional leaves.',
 'Default-Balance-Annual-Leaves': 'You have 25 annual leaves left.',
 'Balance-Annual-Leaves': 'You have 25 annual leaves remaining.',
 'Balance-Optional-Leave

### 4. Training the model using above features

In [54]:
## split data into train and test sets
split_ratio = 0.8

In [55]:
def split_dataset(data, split_ratio, seed=None):
    """Shuffle a dataset and split it into training and test portions.

    The caller's sequence is left untouched: a shallow copy is shuffled,
    so the input keeps its original order.

    Args:
        data: Sequence of samples.
        split_ratio: Fraction of the samples that goes into the first
            (training) portion; the cut index is
            ``int(len(data) * split_ratio)``.
        seed: Optional seed for a private random generator, giving a
            reproducible split. When ``None`` (the default) the
            module-level ``random`` generator is used.

    Returns:
        A ``(train, test)`` tuple of lists that together contain every
        sample exactly once.
    """
    shuffled = list(data)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)
    train_split = int(len(shuffled) * split_ratio)
    return shuffled[:train_split], shuffled[train_split:]

In [56]:
training_data, test_data = split_dataset(features_data, split_ratio)

In [57]:
training_data

[({'mani': True, 'annual': True, 'leav': True, 'taken': True},
  'Utilized-Annual-Leaves'),
 ({'number': True, 'forward': True, 'leav': True}, 'CF'),
 ({'annual': True, 'leav': True, 'count': True, 'remain': True},
  'Balance-Annual-Leaves'),
 ({'today': True}, 'Opening'),
 ({'want': True, 'help': True}, 'Help'),
 ({'need': True, 'help': True}, 'Help'),
 ({'option': True, 'leav': True, 'count': True, 'use': True},
  'Utilized-Optional-Leaves'),
 ({'option': True, 'leav': True, 'count': True, 'use': True},
  'Utilized-Optional-Leaves'),
 ({'taken': True, 'annual': True, 'leav': True}, 'Utilized-Annual-Leaves'),
 ({'good': True, 'afternoon': True}, 'Afternoon'),
 ({'number': True,
   'annual': True,
   'leav': True,
   'alreadi': True,
   'taken': True},
  'Utilized-Annual-Leaves'),
 ({'option': True, 'leav': True, 'taken': True}, 'Utilized-Optional-Leaves'),
 ({'number': True, 'forward': True, 'leav': True}, 'CF'),
 ({'want': True, 'help': True}, 'No-Help'),
 ({'option': True, 'leav': T

In [58]:
# save the data
np.save('training_data', training_data)
np.save('test_data', test_data)

#### Classification using Decision Tree

In [61]:
training_data = np.load('training_data.npy',allow_pickle=True)
test_data = np.load('test_data.npy', allow_pickle=True)

In [62]:
def train_using_decision_tree(training_data, test_data):
    """Train an NLTK decision-tree classifier and report its accuracy.

    The trainer is called with ``entropy_cutoff=0.6`` and
    ``support_cutoff=6``. Both accuracies are printed as they are computed.

    Args:
        training_data: Labelled feature sets used for training and for the
            training-set accuracy.
        test_data: Labelled feature sets used for the test-set accuracy.

    Returns:
        ``(classifier, classifier_name, test_set_accuracy,
        training_set_accuracy)`` - note that the test accuracy comes
        before the training accuracy.
    """
    tree = nltk.classify.DecisionTreeClassifier.train(
        training_data,
        entropy_cutoff=0.6,
        support_cutoff=6,
    )

    train_acc = nltk.classify.accuracy(tree, training_data)
    print('training set accuracy: ', train_acc)

    test_acc = nltk.classify.accuracy(tree, test_data)
    print('test set accuracy: ', test_acc)

    return tree, type(tree).__name__, test_acc, train_acc

In [63]:
dtclassifier, classifier_name, test_set_accuracy, training_set_accuracy = train_using_decision_tree(training_data, test_data)

training set accuracy:  0.9035087719298246
test set accuracy:  0.8620689655172413


#### Classification using Naive Bayes

In [64]:
def train_using_naive_bayes(training_data, test_data):
    """Train an NLTK Naive Bayes classifier and measure its accuracy.

    Unlike the decision-tree trainer, nothing is printed here.

    Args:
        training_data: Labelled feature sets used for training and for the
            training-set accuracy.
        test_data: Labelled feature sets used for the test-set accuracy.

    Returns:
        ``(classifier, classifier_name, test_set_accuracy,
        training_set_accuracy)`` - note that the test accuracy comes
        before the training accuracy.
    """
    nb_model = nltk.NaiveBayesClassifier.train(training_data)
    train_acc = nltk.classify.accuracy(nb_model, training_data)
    test_acc = nltk.classify.accuracy(nb_model, test_data)
    return nb_model, type(nb_model).__name__, test_acc, train_acc

In [65]:
classifier, classifier_name, test_set_accuracy, training_set_accuracy = train_using_naive_bayes(training_data, test_data)
print(training_set_accuracy)
print(test_set_accuracy)

0.8596491228070176
0.6896551724137931


In [66]:
print(len(classifier.most_informative_features()))

68


In [67]:
classifier.show_most_informative_features()

Most Informative Features
                    leav = None           Greeti : Balanc =     12.7 : 1.0
                    mani = True           Defaul : Balanc =     10.0 : 1.0
                 alreadi = True           Defaul : Utiliz =      4.6 : 1.0
                   taken = None           Balanc : Utiliz =      3.6 : 1.0
                   count = True           Utiliz : Utiliz =      2.8 : 1.0
                      hi = None           Utiliz : Greeti =      2.8 : 1.0
                  remain = None           Utiliz : Balanc =      2.7 : 1.0
                   carri = None           Utiliz : CF     =      2.6 : 1.0
                    take = True           Defaul : Balanc =      2.5 : 1.0
                  remain = True           Balanc : Defaul =      2.0 : 1.0


In [69]:
classifier.classify(({'number': True, 'annual': True, 'leav': True, 'remain': True}))

'Balance-Annual-Leaves'

#### Checking the features

In [71]:
extract_features("hello")

['hello']

In [72]:
extract_features("Leaves")

['leav']

In [74]:
word_features(extract_features("hello"))

{'hello': True}

In [76]:
input_sentence = "how many balanced leaves do I have?"
classifier.classify(word_features(extract_features(input_sentence)))

'Utilized-Optional-Leaves'

In [79]:
def reply(input_sentence):
    """Return the stored answer for the category predicted for a sentence.

    The sentence is converted to a bag-of-words feature dict, classified
    with the module-level ``dtclassifier``, and the predicted category is
    looked up in the module-level ``answers`` dict.

    Args:
        input_sentence: The user's message.

    Returns:
        The answer string registered for the predicted category.

    Raises:
        KeyError: If the predicted category has no entry in ``answers``.
    """
    features = word_features(extract_features(input_sentence))
    predicted = dtclassifier.classify(features)
    return answers[predicted]

In [80]:
reply('Hi')

'Hello. I am Dexter. I will serve your leave enquiries.'

In [81]:
reply('How many annual leaves do I have left?')

'You have 25 annual leaves remaining.'

In [82]:
reply('How many leaves have I taken?')

'You have used 12 annual leaves.'

In [83]:
reply('Thanks!')

"It's glad to know that I have been helpful. Have a good day!"