# Retrieval-Based ChatBot
The most popular chatbot implementation in use today!

Retrieval-Based ChatBots perform **three main tasks**:

1. [**Intent Classification**](#IntentClassification)  
Classify the intent of the message from user input.
    1. [Intent Classification with Bag-of-Words](#ICBoW)
    2. [Intent Classification with Term Frequency-Inverse Document Frequency](#ICTF-IDF)  


2. [**Entity Recognition**](#EntityRecognition)  
Entities are often the proper nouns of a message.
    1. [Entity Recognition with Part-of-Speech tagging](#ERPOS)
    2. [Entity Recognition with Word Embeddings](#ERWE)


3. [**Response Selection**](#ResponseSelection)  
Retrieve the best-fit response from this collection


4. [**Working Chatbot Example**](#ChatBot)  

In [16]:
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

def pos_tagging(word):
    """Tag each word with its part of speech."""
    
    # get the already tagged synonyms of the word 
    probable_pos = wordnet.synsets(word)
    # instantiate Counter()
    pos_counts = Counter()
    
    # count the POS of the word's synonyms
    pos_counts["n"] = len([synonym for synonym in probable_pos if synonym.pos()=="n"])
    pos_counts["v"] = len([synonym for synonym in probable_pos if synonym.pos()=="v"])
    pos_counts["a"] = len([synonym for synonym in probable_pos if synonym.pos()=="a"])
    pos_counts["r"] = len([synonym for synonym in probable_pos if synonym.pos()=="r"])
    
    # find the most common POS of the word's synonyms
    most_likely_pos = pos_counts.most_common(1)[0][0]
    
    return most_likely_pos


def preprocess_text(text):
    """
    1. Strips the text off punctuation.
    2. Lower-case letters
    3. Tokenize letters
    4. Lemmatize letters
    """
    # strip text off punctuation and lower-case letters
    cleaned = re.sub(r'\W+', ' ', text).lower()
    # tokenize text
    tokenized = word_tokenize(cleaned)
    # instantiate WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    # lemmatize text with POS
    normalized = [lemmatizer.lemmatize(token, pos_tagging(token)) for token in tokenized]
    
    return normalized

<a name="IntentClassification"> </a>
## Intent Classification

<a name='ICBoW'> </a>
### Intent Classification with BoW

In [31]:
from collections import Counter

user_message = preprocess_text("""Hello! What is the fit of the 'Elosie' dress? 
                               My shoulders are broad, so I often size up for a comfortable fit. 
                               Do dress sizes run large or small? Especially in the shoulders?""")

response_a = preprocess_text("All of our dresses sare cut from a polyester blend for a strechy fit")

response_b = preprocess_text("""The 'Elosie' dress runs large. I suggest you take your regular size or
                             smaller for the best fit.""")


# create and print BoW dictionaries ('word': word frequency)
bow_user_message = Counter(user_message)
print(bow_user_message, '\n')
bow_response_a = Counter(response_a)
print(bow_response_a, '\n')
bow_response_b = Counter(response_b)
print(bow_response_b, '\n')

def compare_overlap(user_message, possible_response):
    """Count the similar words between two BoW dictionaries."""
    similar_words = 0
    #iterate over tokens in user_message
    for token in user_message:
        # if token exist in response
        if token in possible_response:
            # increase similar words by 1
            similar_words += 1
    # return the number of similar words
    return similar_words

# print the number of similar words between message and responses
print("Number of similar words between user message and response A:")
print(compare_overlap(bow_user_message, bow_response_a))
print("\nNumber of similar words between user message and response B:")
print(compare_overlap(bow_user_message, bow_response_b))

Counter({'the': 3, 'be': 2, 'fit': 2, 'dress': 2, 'shoulder': 2, 'size': 2, 'hello': 1, 'what': 1, 'of': 1, 'elosie': 1, 'my': 1, 'broad': 1, 'so': 1, 'i': 1, 'often': 1, 'up': 1, 'for': 1, 'a': 1, 'comfortable': 1, 'do': 1, 'run': 1, 'large': 1, 'or': 1, 'small': 1, 'especially': 1, 'in': 1}) 

Counter({'a': 2, 'all': 1, 'of': 1, 'our': 1, 'dress': 1, 'sare': 1, 'cut': 1, 'from': 1, 'polyester': 1, 'blend': 1, 'for': 1, 'strechy': 1, 'fit': 1}) 

Counter({'the': 2, 'elosie': 1, 'dress': 1, 'run': 1, 'large': 1, 'i': 1, 'suggest': 1, 'you': 1, 'take': 1, 'your': 1, 'regular': 1, 'size': 1, 'or': 1, 'small': 1, 'for': 1, 'best': 1, 'fit': 1}) 

Number of similar words between user message and response A:
5

Number of similar words between user message and response B:
11


<a name="ICTF-IDF"></a>
### Intent classification with TF-IDF

response_a = "Every dress style is cut from a polyester blend for a strechy fit."

response_b = "The 'Elosie' dress runs large. I suggest you take your regular size or smaller."

response_c = "The 'Elosie' dress comes in green, lavender, and orange."

user_message = "Hello! What is the fit of the 'Elosie' dress? My shoulders are broad, so I often size up for a comfortable fit. Do dress sizes run large or small?"

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# create a list of documents (they are already preprocessed)
processed_docs = ['every dress style be cut from a polyester blend for a strechy fit', 
                  'the elosie dress run large i suggest you take your regular size or small',
                  'the elosie dress come in green lavender and orange',
                  'hello what be the fit of the elosie dress my shoulder be broad so i often'
                  'size up for a comfortable fit do dress size run large or small']

# instantiate tfidf vectorizer:
vectorizer = TfidfVectorizer()

# fit and transform vectorizer on processed docs:
tfidf_vectors = vectorizer.fit_transform(processed_docs)

# compute cosine similarity betweeen the user message tf-idf vector and the different response tf-idf vectors
cosine_similarities = cosine_similarity(tfidf_vectors[-1], tfidf_vectors)

# argsort() is a function from the NumPy package which sorts and array and returns the indices in order
# The user_message itself will always have the highest similarity score, and will always be stored at index -1
print(cosine_similarities.argsort())

# get the index of the most similar response to the user message
similar_response_index = cosine_similarities.argsort()[0][-2]

best_response = documents[similar_response_index]

# print best reponse
print(best_response)


[[2 0 1 3]]
The 'Elosie' dress runs large. I suggest you take your regular size or smaller.


<a name="EntityRecognition"></a>
## Entity Recognition 
After determining the best method for the classification of a user’s intent, there is the task of recognizing entities within a user’s message.

<a name="ERPOS"></a>
### Entity Recognition with POS tagging
POS tagging is commonly used to identify entities within a user message, as most entities are nouns.

In [68]:
from nltk import pos_tag

user_message = ["i", "ordered", "two", "t-shirts", "this", "past",
                "weekend", "when","will", "my", "package", "be", "shipped"]

# POS-tag each word
tagged_user_message = pos_tag(user_message)
print(tagged_user_message, '\n')

def extract_nouns(tagged_message):
    """Return a list with just the nouns from a list of words."""
    message_nouns = []
    # for each word in the list of POS-tagged words
    for token in tagged_message:
        # if the word is tagged as a NOUN
        if 'NN' in token[1]:
            # add the word at the end of the list
            message_nouns.append(token[0])
    # return the list of nouns
    return message_nouns

# extract the nouns from a list of POS-tagged words
user_message_nouns = extract_nouns(tagged_user_message)
print(user_message_nouns)

[('i', 'RB'), ('ordered', 'VBD'), ('two', 'CD'), ('t-shirts', 'NNS'), ('this', 'DT'), ('past', 'JJ'), ('weekend', 'NN'), ('when', 'WRB'), ('will', 'MD'), ('my', 'PRP$'), ('package', 'NN'), ('be', 'VB'), ('shipped', 'VBN')] 

['t-shirts', 'weekend', 'package']


<a name="ERWE"></a>
### Entity Recognition with Word Embeddings
While POS tagging extracts key entities in a user message, it does not provide context that allows a chatbot to believably integrate an entity reference into a predefined response.

In order to produce a coherent response, the chatbot must **insert entities from a user message** into the blank spots.

In [86]:
import spacy

# load a word2vec model
word2vec = spacy.load("en_core_web_lg")

# a list of nouns
message_nouns = ['shirts', 'weekend', 'package']

# a board category (the blank spot)
category = word2vec("clothes")

# join words into a single string with a space for seperator
tokens = word2vec(" ".join(message_nouns))

def compute_similarity(tokens, category):
    """Calculate the similarity between a string and a "blank spot" word."""
    output_list = list()
    # for each word in a string
    for token in tokens:
        # print the word, the "blank spot" word, and their similarity score
        # similarity() defaults to the average of the token vectors
        output_list.append([token.text, category.text, token.similarity(category)])
    return output_list

# print the similarity between each word and "blank_spot"
for i in range(3):
    print(compute_similarity(tokens, category)[i])

# assign the word with the highest similarity to the blank_spot, i.e. shirts
blank_spot = message_nouns[0]

# response to the user
bot_response = f"Hey! I just checked my records, your shipment containing {blank_spot} is en route."
"Expect it within the next two days!"

#print bot_response
print('\n',bot_response)

['shirts', 'clothes', 0.678414398517753]
['weekend', 'clothes', 0.2510121169200076]
['package', 'clothes', 0.16207362417098703]

 Hey! I just checked my records, your shipment containing shirts is en route.


<a name="ResponseSelection"> </a>
## Response Selection

In [87]:
stop_words = set(stopwords.words("english"))

def preprocess(input_sentence):
    """Clean a string."""
    # lower case letters
    input_sentence = input_sentence.lower()
    # remove punctuation and whitespace
    input_sentence = re.sub(r'[^\w\s]','',input_sentence)
    # split string into individual words
    tokens = word_tokenize(input_sentence)
    # remove stopwords
    input_sentence = [i for i in tokens if not i in stop_words]
    return(input_sentence)

  
def extract_nouns(tagged_message):
    """Return a list with just the nouns from a list of words."""
    message_nouns = list()
    for token in tagged_message:
        if token[1].startswith("N"):
            message_nouns.append(token[0])
    return message_nouns

In [91]:
user_message = "Good morning... will it rain in Chicago later this week?"

blank_spot = "illinois city"

# a selection of responses to match to the blank spot
response_a = "The average temperature this weekend in {} will be 88 degrees. Bring your sunglasses!"
response_b = "Forget about your umbrella; there is no rain forecasted in {} this weekend."
response_c = "This weekend, a warm front from the southeast will keep skies near {} clear."

responses= [response_a, response_b, response_c]

# preprocess documents
bow_user_message = Counter(preprocess(user_message))
processed_responses = [Counter(preprocess(response)) for response in responses]

# build BoW model
similarity_list = [compare_overlap(doc, bow_user_message) for doc in processed_responses]

# select response with best intent fit
response_index = similarity_list.index(max(similarity_list))

# extracting entities with word2vec 
tagged_user_message = pos_tag(preprocess(user_message))
message_nouns = extract_nouns(tagged_user_message)

# executing word2vec model
tokens = word2vec(" ".join(message_nouns))
category = word2vec(blank_spot)
word2vec_result = compute_similarity(tokens, category)

# select highest scoring entity
print(word2vec_result,'\n')
entity = word2vec_result[2][0]

# select final response with titlecase
final_response = responses[response_index].format(entity.title())
print(final_response)

[['morning', 'illinois city', 0.26479153177805814], ['rain', 'illinois city', 0.2857365552501409], ['chicago', 'illinois city', 0.7571821357578838], ['week', 'illinois city', 0.2169059489038729]] 

Forget about your umbrella; there is no rain forecasted in Chicago this weekend.


<a name="ChatBot"> </a>
## Working ChatBot Example

In [94]:
class ChatBot:
    
    def find_intent_match(self, responses, user_message):
        
        bow_user_message = Counter(preprocess(user_message))
        
        processed_responses = [Counter(preprocess(response)) for response in responses]
        
        similarity_list = [compare_overlap(response, bow_user_message) for response in processed_responses]
        
        response_index = similarity_list.index(max(similarity_list))
        
        return responses[response_index]

    
    def find_entities(self, user_message):
        tagged_user_message = pos_tag(preprocess(user_message))
        message_nouns = extract_nouns(tagged_user_message)

        # execute word2vec model
        tokens = word2vec(" ".join(message_nouns))
        category = word2vec(blank_spot)
        word2vec_result = compute_similarity(tokens, category)
        word2vec_result.sort(key=lambda x: x[2])
        return word2vec_result[-1][0]

    
    def respond(self, user_message):
        best_response = self.find_intent_match(responses, user_message)
        entity = self.find_entities(user_message)
        print(best_response.format(entity))

        
    def chat(self):
        user_message = input("Hi, I'm Stratus. Ask me about your local weather!\n")
        self.respond(user_message)

# create ChatBot() instance:
chatbot = ChatBot()
# call .chat() method:
chatbot.chat()

Hi, I'm Stratus. Ask me about your local weather!
exit
The average temperature this weekend in exit will be 88 degrees. Bring your sunglasses!
