In [1]:
%matplotlib inline
import pandas as pd
pd.options.mode.chained_assignment = None
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from string import punctuation
from collections import Counter
import re
import numpy as np
import csv

In [2]:
cq = pd.read_csv('test-utterances_clean.csv', header=0, encoding='latin1')
cq.drop('Pass/ Fail', axis=1, inplace=True)
cq.columns = ["TrainingUtterance", "TrainingAnswer", "TestUtterance"]
cq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 3 columns):
TrainingUtterance    145 non-null object
TrainingAnswer       145 non-null object
TestUtterance        145 non-null object
dtypes: object(3)
memory usage: 3.5+ KB


In [3]:
train_corpus = {"Question" : [], "Answer": []}

In [4]:
with open("All Questions.txt", 'r', encoding='latin1') as f:
    for line in f.readlines():
        if line.lower().startswith('q:'):
            ques = line.replace('Q:', '').replace('\n', '').lstrip()
            train_corpus["Question"].append(ques)
        if line.lower().startswith('a:'):
            ans = line.replace('A:', '').replace('\n', '').lstrip()
            train_corpus["Answer"].append(ans)

In [5]:
len(train_corpus["Question"])

661

In [6]:
data = pd.DataFrame.from_dict(train_corpus)
data['qna'] = data[data.columns[[0,1]]].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 661 entries, 0 to 660
Data columns (total 3 columns):
Answer      661 non-null object
Question    661 non-null object
qna         661 non-null object
dtypes: object(3)
memory usage: 15.6+ KB


In [7]:
def tokenizer(text):
    try:
        tokens_ = [word_tokenize(sent) for sent in sent_tokenize(text)]
        
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent

        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        tokens = list(filter(lambda t: t not in punctuation, tokens))
        tokens = list(filter(lambda t: t not in [u"'s", u"n't", u"...", u"''", u'``', 
                                            u'\u2014', u'\u2026', u'\u2013'], tokens))
        filtered_tokens = []
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)

        filtered_tokens = list(map(lambda token: token.lower(), filtered_tokens))

        return filtered_tokens
    except Error as e:
        print(e)

In [8]:
# Read synonyms file into a dictionary.
synonyms = {}
with open("di_synonyms.csv", 'r', encoding='latin1') as sf:
    for row in sf.readlines():
        row = row.replace("\n", '').split(",")
        canonical, rest = row[0], row[1:]
        for term in rest:
            if term:
                synonyms[term.lower()] = canonical.lower()
                synonyms[canonical.lower()] = canonical.lower()

In [9]:
def syn_words(sentence):
    tokens = word_tokenize(sentence)
    for seq in ngrams(tokens, 3):
        rep_str = ' '.join(seq)
        if rep_str.lower() in synonyms:
            sentence = sentence.replace(rep_str, synonyms[rep_str.lower()])
    for seq in ngrams(tokens, 2):
        rep_str = ' '.join(seq)
        if rep_str.lower() in synonyms:
            sentence = sentence.replace(rep_str, synonyms[rep_str.lower()])
    for seq in ngrams(tokens, 1):
        rep_str = ' '.join(seq)
        if rep_str.lower() in synonyms:  
            sentence = sentence.replace(rep_str, synonyms[rep_str.lower()])    
    return sentence

In [10]:
cq['CleanedTestUtterance'] = cq['TestUtterance'].map(syn_words)

In [11]:
row_ids = cq[cq['CleanedTestUtterance'] != cq['TestUtterance']].index
row_ids

Int64Index([  0,   1,   2,   4,   5,   7,   8,   9,  11,  12,  16,  18,  20,
             21,  26,  28,  29,  32,  34,  35,  37,  38,  39,  42,  43,  45,
             46,  50,  53,  59,  61,  67,  68,  69,  71,  73,  74,  75,  81,
             84,  86,  89,  90,  92,  93,  96, 103, 104, 105, 106, 107, 110,
            111, 114, 115, 117, 118, 119, 121, 123, 125, 127, 128, 131, 134,
            135, 136, 137, 141],
           dtype='int64')

In [12]:
data['CleanedQuestion'] = data['Question'].map(syn_words)

In [13]:
def mapped_ans_with_confidence(tq_idx):
    matched_idx = best_match[tq_idx].argmax()
    matched_confidence = best_match[tq_idx][matched_idx]
    matched_ques = data['Question'][matched_idx]
    return matched_ques

def top_n_matches(tq_idx, n=3):
    matched_indices = best_match[tq_idx].argsort()[-n:]
    return [data['Question'][qidx] for qidx in matched_indices] 

In [32]:
vocab = ['qna', 'CleanedQuestion']
vsm1 = ['qna', 'CleanedQuestion']
vsm2 = ['CleanedTestUtterance']
min_df_options = list(range(1,11,1))
max_features = list(range(1000, 11000, 1000))
results = []

In [33]:
vectorizer = TfidfVectorizer(min_df=2, max_features=10000, tokenizer=tokenizer, ngram_range=(1, 2))
for col in vocab:
    vocab_vec = vectorizer.fit_transform(list(data[col]))
    for v1 in vsm1:
        vec1 = vectorizer.transform(list(data[v1]))
        for v2 in vsm2:
            vec2 = vectorizer.transform(list(cq[v2]))
            best_match = cosine_similarity(vec2, vec1)
            
            res_matrix = [mapped_ans_with_confidence(i) for i in range(0, len(cq))]
            cq['matched_ques'] = res_matrix
            
            topn_matrix = [top_n_matches(i) for i in range(0, len(cq))]
            cq['topn'] = topn_matrix
            
            cq['IsBestMatch'] = np.where(cq['TrainingUtterance'] == cq['matched_ques'], 1, 0)
            cq['IsTopNMatch'] = cq.apply(lambda x : 1 if x['TrainingUtterance'] in x['topn'] else 0, axis=1)
            
            best_topn_accuracy = (cq['IsTopNMatch']==1).sum() / (len(cq)-1)
            best_accuracy =      (cq['IsBestMatch']==1).sum() / (len(cq)-1)
            results.append({
                "vocab" : col,
                "vsm1"  : v1,
                "vsm2"  : v2,
                "BestAccuracy" : round(best_accuracy*100,2),
                "TopNAccuracy" : round(best_topn_accuracy*100,2)
            })

In [34]:
results = sorted(results, key=lambda k: k['TopNAccuracy']) 

In [35]:
class ListTable(list):  
    def _repr_html_(self):
        html = ["<table>"]
        for row in self:
            html.append("<tr>")
            
            for col in row:
                html.append("<td>{0}</td>".format(col))
            
            html.append("</tr>")
        html.append("</table>")
        return ''.join(html)

In [37]:
table = ListTable()
table.append(['<b>Vocabulary</b>', '<b>Target Text </b>', '<b>Top N Match Percentage</b>', '<b>Best Match Percentage</b>', '<b>Source Text</b>'])
for d in results:
    table.append(d.values())
table

0,1,2,3,4
Vocabulary,Target Text,Top N Match Percentage,Best Match Percentage,Source Text
CleanedQuestion,qna,54.86,34.03,CleanedTestUtterance
qna,qna,61.11,38.19,CleanedTestUtterance
CleanedQuestion,CleanedQuestion,61.11,47.22,CleanedTestUtterance
qna,CleanedQuestion,69.44,50.0,CleanedTestUtterance


From the above experiment we see that training the bot on question-and-answer pair as a single document gives the best result.

In [19]:
review_df = pd.read_csv('avadhoot_analysis.csv', header=0, encoding='latin1')
review_df.drop(' Pass/Fail', axis=1, inplace=True)
review_df.columns = ["TrainingUtterance", "TrainingAnswer", "TestUtterance", 'Persistent Bot Result','AvadhootComment']
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 5 columns):
TrainingUtterance        145 non-null object
TrainingAnswer           145 non-null object
TestUtterance            145 non-null object
Persistent Bot Result    90 non-null object
AvadhootComment          46 non-null object
dtypes: object(5)
memory usage: 5.7+ KB


In [20]:
model = TfidfVectorizer(min_df=2, max_features=10000, tokenizer=tokenizer, ngram_range=(1, 2))
model.fit(list(data['qna']))
v1 = model.transform(list(cq['CleanedTestUtterance']))
v2 = model.transform(list(data['CleanedQuestion']))
cosine_distances = cosine_similarity(v1, v2)

In [21]:
oss_answers = [ data.Question[row.argmax()] for row in cosine_distances]
review_df['OSS_ANS'] = oss_answers

In [22]:
def top_n_matches_model(tq_idx, n=3):
    matched_indices = cosine_distances[tq_idx].argsort()[-n:]
    return [data['Question'][qidx] for qidx in matched_indices] 

topn_matrix = [top_n_matches_model(i) for i in range(0, len(review_df))]
review_df['topn'] = topn_matrix

In [23]:
review_df = review_df[~review_df['AvadhootComment'].isin(['Enhancement: greeting handling', 'Bug'])]

In [24]:
review_df['IsBestMatch'] = np.where(review_df['TrainingUtterance'] == review_df['OSS_ANS'], 1, 0)
best_accuracy = (review_df['IsBestMatch']==1).sum() / (len(review_df)-1)
print("BestAccuracy {}".format(round(best_accuracy*100,2)))

BestAccuracy 51.09


In [25]:
review_df['IsTopNMatch'] = review_df.apply(lambda x : 1 if x['TrainingUtterance'] in x['topn'] else 0, axis=1)
best_topn_accuracy = (review_df['IsTopNMatch']==1).sum() / (len(review_df)-1)
print("BestTopNAccuracy {}".format(round(best_topn_accuracy*100,2)))

BestTopNAccuracy 70.07


In [26]:
review_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138 entries, 0 to 144
Data columns (total 9 columns):
TrainingUtterance        138 non-null object
TrainingAnswer           138 non-null object
TestUtterance            138 non-null object
Persistent Bot Result    89 non-null object
AvadhootComment          39 non-null object
OSS_ANS                  138 non-null object
topn                     138 non-null object
IsBestMatch              138 non-null int32
IsTopNMatch              138 non-null int64
dtypes: int32(1), int64(1), object(7)
memory usage: 10.2+ KB


In [27]:
review_df.drop(['TrainingAnswer', 'Persistent Bot Result'], inplace=True, axis=1)
review_df.to_csv("DI_bot_results.tsv", sep='\t', quoting=csv.QUOTE_NONE, index=False)