In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
%cd gdrive/My Drive/

/content/gdrive/My Drive


In [7]:
%cd chatbot-using-tensorflow

/content/gdrive/My Drive/chatbot-using-tensorflow


# **Retrieval Chatbots**


*  Falls under closed domain because chatbot's can't handle open domains.*  Simpler in nature
*  Depends on a predefined database of question-answer pairs
*  Good for FAQ or simple client queries
*  Pipeline <br/>
   1. Encode questions into vectors using a predefined method
   2. Use a predefined similarity measure, find the most similar question in the DB.
   3. Return the answer(s) of the questions 
*  Choosing the Ideal Similarity Measures
   * Vector Similarity Measures <br/>
      * Determines the similarity between two questions(input questions and database)
      * Most common measures for vectors: <br/>
        1. Manhattan similarity (L1) <br/>
           - sum of the differences between the components of two vectors
           - Example: X = [0, 10], Y = [5, 5] <br/>
           L1 = |5-0| + |5-10| = 5 + 5 = 10
        2. Euclidean similarity (L2) <br/>
        - Square root of sum of squared differences of components of the two vectors
        - Example: X = [0, 10], Y = [5, 5] <br/>
        L2 = sqrt[(5-0) ^ 2 + (5-10) ^2] <br/>
           = sqrt[5 ^ 2 + (-5) ^ 2]<br/>
           = sqrt[25 + 25] <br/>
           = 7.07
        3. Cosine similarity (L3)
        - Cosine of the angle between two vectors
        - Example: X = [0, 10], Y = [5, 5] <br/>
        L3 = dot_product(X, Y) /(||X|| * ||Y||)
        = ((0*5) + (10*5)) / (sqrt(0^2 + 10^2) * sqrt(5^2 + 5^2))
        = (50)/(10 * 0.7)
        = 50 / 70.07
        = 0.707
        

# **TF-IDF**

*  A numerical static that is intended to reflect how important a word is to a document in a collection or corpus
*  Term Frequency
   - Determines how important a word is by calculating its frequency in the docment.
   - Is local to a document
*  Document Frequency
   - Calculates a score for each word that determines its importance in helping distinguish a certain document in a corpus
   - In order for a word to help identify a document, it has to be a signature word and shouldn't appear many times in other documents
   

In [9]:
# Loading preprocessing libraries
import pandas as pd
import numpy as np

# Loading vectorizer and similarity measure
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

**Reading the data**

In [10]:
df = pd.read_csv("aws_faq.csv")
df.dropna(inplace=True)

**Training the vectorizer**

In [13]:
vectorizer = TfidfVectorizer()
vectorizer.fit(np.concatenate((df.Question, df.Answer)))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

**Vectorizing questions**

In [14]:
Question_vectors = vectorizer.transform(df.Question)

**Chatting with the user**

In [16]:
print("You can start chatting with me now")
while True:
    # reading user input
    input_question = input()

    # locating the closest question
    input_question_vector = vectorizer.transform([input_question])

    # computing similarities
    similarities = cosine_similarity(input_question_vector, Question_vectors)

    # finding the closest question
    closest = np.argmax(similarities, axis
 =1)

    # displaying the correct answer
    print("Bot: "+df.Answer.iloc[closest].values[0])

You can start chatting with me now
hello bot
Bot: Hello
how are you?
Bot: I’m fine thanks.
can you tell the pricing of ec2?
Bot: EC2 Fleet comes at no additional charge, you only pay for the underlying resources that EC2 Fleet launches.
okay thanks
Bot: Amazon Elastic Compute Cloud (Amazon EC2) is a web service that provides resizable compute capacity in 
the cloud
. It is designed to make web-scale computing easier for developers.
bye
Bot: Bye, glad to have helped.


KeyboardInterrupt: ignored