In [1]:
import pandas as pd
import re
import string
import numpy as np
import nltk
import sys
sys.path.append("../data")
from test_data import original, easy, medium, hard
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.neighbors import NearestNeighbors
from tqdm.notebook import tqdm
#nltk.download("punkt")
#nltk.download("wordnet")
#nltk.download("stopwords")

In [2]:
# Shared NLTK helpers used by the preprocessing functions in this notebook.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Stored as a set because the preprocessing functions test membership per token.
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

In [3]:
def lemmatized_set(data):
    """Return a lemmatized, stop-word-free version of every sentence in ``data``.

    Each sentence is lower-cased and tokenized with ``nltk.word_tokenize``.
    Tokens that are not purely alphabetic, or that appear in ``stop_words``,
    are discarded; the remaining tokens are lemmatized and joined with single
    spaces.

    Args:
        data: Iterable of sentence strings.

    Returns:
        list[str]: One processed string per input sentence, in input order.
    """
    def _normalize(sentence):
        # Keep only alphabetic, non-stop-word tokens of the lower-cased text.
        kept = (
            tok
            for tok in nltk.word_tokenize(sentence.lower())
            if tok.isalpha() and tok not in stop_words
        )
        return ' '.join(lemmatizer.lemmatize(tok) for tok in kept)

    return [_normalize(sentence) for sentence in data]

def stemmed_set(data):
    """Return a stemmed, stop-word-free version of every sentence in ``data``.

    Mirrors ``lemmatized_set`` but reduces each surviving token with
    ``stemmer.stem`` instead of lemmatizing it.

    Args:
        data: Iterable of sentence strings.

    Returns:
        list[str]: One processed string per input sentence, in input order.
    """
    processed = []
    for text in data:
        words = nltk.word_tokenize(text.lower())
        # Drop anything that is not purely alphabetic or that is a stop word.
        content_words = filter(
            lambda word: word.isalpha() and word not in stop_words,
            words,
        )
        processed.append(' '.join(map(stemmer.stem, content_words)))
    return processed

def train_knn(train_vectors, metric, n_neighbors=100):
    """Fit a ``NearestNeighbors`` index on the training vectors.

    Args:
        train_vectors: Feature matrix handed unchanged to
            ``NearestNeighbors.fit``.
        metric: Distance metric forwarded to ``NearestNeighbors``.
        n_neighbors: Default number of neighbours the model returns per
            query. Defaults to 100, the value that used to be hard-coded, so
            existing two-argument calls behave as before.

    Returns:
        Whatever ``NearestNeighbors(...).fit(train_vectors)`` returns, i.e.
        the fitted neighbour model.
    """
    return NearestNeighbors(n_neighbors=n_neighbors, metric=metric).fit(train_vectors)

def k_nearest(model, matrix):
    """Query ``model`` for the neighbours of a single vector.

    Args:
        model: Object exposing ``kneighbors`` (e.g. the result of ``train_knn``).
        matrix: Array holding one query; it is reshaped to a single row
            (``reshape(1, -1)``) before being passed to ``model.kneighbors``.

    Returns:
        tuple: ``(distance, indices)`` exactly as produced by
        ``model.kneighbors`` for the one-row query.
    """
    single_row_query = matrix.reshape(1, -1)
    neighbours = model.kneighbors(single_row_query)
    distance, indices = neighbours
    return distance, indices

def ranking(model, test_data, corpus, miss_penalty=200):
    """Average retrieval rank of the original question for each altered query.

    For every element of ``test_data`` the neighbours of the query vector are
    looked up with ``k_nearest`` and the 0-based position of the original
    question among the returned corpus entries is added to a running total.
    A query whose original question is not among the returned neighbours
    contributes ``miss_penalty`` instead.

    Args:
        model: Fitted neighbour model accepted by ``k_nearest``.
        test_data: Sized sequence whose elements hold the query vector at
            index 0 and the expected original question at index 1.
        corpus: Array of corpus questions that can be indexed with the
            neighbour index array returned by the model (e.g. a NumPy array).
        miss_penalty: Rank charged when the original question is not
            retrieved. Defaults to 200, the value previously hard-coded.

    Returns:
        float: Accumulated rank divided by ``len(test_data)``.

    Raises:
        ZeroDivisionError: If ``test_data`` is empty.
    """
    total_rank = 0
    for elem in test_data:
        changed_question, original = elem[0], elem[1]
        _, indices = k_nearest(model, changed_question)
        results = corpus[indices[0]]
        for rank, question in enumerate(results):
            if original == question:
                total_rank += rank
                break
        else:
            # Reached only when the loop finished without a match. Unlike the
            # former `i == 99` check, this does not double-charge a hit found
            # at the last position and does not depend on the neighbour count.
            total_rank += miss_penalty
    return total_rank / len(test_data)

In [4]:
# Load the tab-separated question/answer dataset and preview its first rows.
corpus_train = pd.read_csv(
    '../data/insurance_qna_dataset.csv',
    sep='\t',
)
corpus_train.head()

Unnamed: 0.1,Unnamed: 0,Question,Answer
0,0,What Happens When Term Life Insurance Is Paid ...,Actually term life insurance cannot be paid up...
1,1,What Happens When Term Life Insurance Is Paid ...,Term life insurance is never paid up. Assuming...
2,2,What Happens When Term Life Insurance Is Paid ...,Term Life Insurance does not have the option t...
3,3,What Does Renters Insurance Cover?,A renters insurance policy will typically prov...
4,4,What Does Renters Insurance Cover?,If you apartment was on fire and all your pers...


In [5]:
# Translation table that deletes every character in string.punctuation.
punctuation_remover = str.maketrans("", "", string.punctuation)

# Unique question texts, trimmed of surrounding whitespace and then stripped
# of punctuation.
corpus_questions = corpus_train['Question'].drop_duplicates()
corpus_questions_list = [
    question.strip().translate(punctuation_remover)
    for question in corpus_questions
]
corpus_questions_list[:5]

['What Happens When Term Life Insurance Is Paid Up',
 'What Does Renters Insurance Cover',
 'Does Owning A Pitbull Raise Homeowners Insurance',
 'What Should You Look For In Long Term Care Insurance',
 'Will Medicare Pay For Smoking Cessation']

In [6]:
# The originals are repeated three times to pair with the concatenated
# easy/medium/hard variants (presumably each variant list parallels
# `original` — TODO confirm against test_data).
original_questions = original
changed_questions_test = easy + medium + hard
corpus_test = pd.DataFrame(
    {
        'Original': original_questions * 3,
        'Changed': changed_questions_test,
    }
)
corpus_test.head()

Unnamed: 0,Original,Changed
0,Why Do They Take Blood And Urine For Life Insu...,Why Do They Take Bloods And Urine For Lifes In...
1,What Are Rates For Long Term Care Insurance?,What Are Rate For Long Term Care Insurance?
2,Where Can I Get Weekly Car Insurance?,Where Can I Get Weekly Car Insurances?
3,Who Has The Best Critical Illness Insurance?,Who Has The Best Critical Illness Insurances?
4,Can You Have Car Insurance Without Owning A Car?,Can You Have Cars Insurance Without Owning A Car?


In [7]:
# Translation table that deletes every character in string.punctuation.
punctuation_remover = str.maketrans("", "", string.punctuation)

corpus_test_changed = corpus_test['Changed']
corpus_test_original = corpus_test['Original']

# Same cleaning for both columns: trim whitespace, then drop punctuation.
test_changed_list = [
    text.strip().translate(punctuation_remover)
    for text in corpus_test_changed
]
test_original_list = [
    text.strip().translate(punctuation_remover)
    for text in corpus_test_original
]

In [8]:
# Distance metrics handed to train_knn.
metrics = ['cosine', 'manhattan', 'euclidean']

# (label, vectorizer) pairs. The "Bigram" entries use ngram_range=(1, 2).
vectorizers = [
    ("CountVectorizer Unigram", CountVectorizer(ngram_range=(1, 1))),
    ("CountVectorizer Bigram", CountVectorizer(ngram_range=(1, 2))),
    ("TFIDFVectorizer Unigram", TfidfVectorizer(ngram_range=(1, 1))),
    ("TFIDFVectorizer Bigram", TfidfVectorizer(ngram_range=(1, 2))),
]

# (label, train questions, changed test questions, original test questions)
# for each text-preprocessing variant.
raw_sets = (corpus_questions_list, test_changed_list, test_original_list)
processing_types = [
    ("original", *raw_sets),
    ("lemmatized", *map(lemmatized_set, raw_sets)),
    ("stemmed", *map(stemmed_set, raw_sets)),
]

In [9]:
# One record per (processing, vectorizer, distance) combination.
all_results = []

for name, train_set, test_set, orig_set in processing_types:
    # The corpus array depends only on the processing variant, so build it
    # once per variant rather than once per metric.
    train_corpus = np.array(train_set)
    for vect_name, vectorizer in vectorizers:
        # Fit the vocabulary on the corpus, then project the test queries.
        vectorized_train = vectorizer.fit_transform(train_set)
        vectorized_test = vectorizer.transform(test_set)
        # Pair each dense query vector with the question it should retrieve.
        test_data = list(zip(vectorized_test.toarray(), orig_set))
        for metric in metrics:
            model = train_knn(vectorized_train, metric)
            score = ranking(model, test_data, train_corpus)
            record = {
                "Processing": name,
                "Vectorizer": vect_name,
                "Distance": metric,
                "Avg rank": score,
            }
            all_results.append(record)
            print(f"{name} | {vect_name} | {metric}: {score:.2f}")

original | CountVectorizer Unigram | cosine: 44.47
original | CountVectorizer Unigram | manhattan: 47.30
original | CountVectorizer Unigram | euclidean: 49.77
original | CountVectorizer Bigram | cosine: 43.35
original | CountVectorizer Bigram | manhattan: 54.18
original | CountVectorizer Bigram | euclidean: 54.08
original | TFIDFVectorizer Unigram | cosine: 46.55
original | TFIDFVectorizer Unigram | manhattan: 119.85
original | TFIDFVectorizer Unigram | euclidean: 46.55
original | TFIDFVectorizer Bigram | cosine: 46.38
original | TFIDFVectorizer Bigram | manhattan: 133.27
original | TFIDFVectorizer Bigram | euclidean: 46.38
lemmatized | CountVectorizer Unigram | cosine: 57.20
lemmatized | CountVectorizer Unigram | manhattan: 47.55
lemmatized | CountVectorizer Unigram | euclidean: 50.63
lemmatized | CountVectorizer Bigram | cosine: 54.27
lemmatized | CountVectorizer Bigram | manhattan: 69.37
lemmatized | CountVectorizer Bigram | euclidean: 74.57
lemmatized | TFIDFVectorizer Unigram | co

In [10]:
# Collect the per-combination records and show average rank with one row per
# (Processing, Vectorizer) pair and one column per distance metric.
df_results = pd.DataFrame(all_results)
rank_table = df_results.pivot_table(
    values='Avg rank',
    index=['Processing', 'Vectorizer'],
    columns='Distance',
)
print(rank_table)

Distance                               cosine  euclidean   manhattan
Processing Vectorizer                                               
lemmatized CountVectorizer Bigram   54.266667  74.566667   69.366667
           CountVectorizer Unigram  57.200000  50.633333   47.550000
           TFIDFVectorizer Bigram   69.300000  73.683333  130.400000
           TFIDFVectorizer Unigram  69.650000  70.416667  115.716667
original   CountVectorizer Bigram   43.350000  54.083333   54.183333
           CountVectorizer Unigram  44.466667  49.766667   47.300000
           TFIDFVectorizer Bigram   46.383333  46.383333  133.266667
           TFIDFVectorizer Unigram  46.550000  46.550000  119.850000
stemmed    CountVectorizer Bigram   53.666667  63.150000   71.000000
           CountVectorizer Unigram  40.416667  46.433333   46.916667
           TFIDFVectorizer Bigram   66.716667  70.700000  126.266667
           TFIDFVectorizer Unigram  70.766667  71.250000  113.783333


In [11]:
# Row label of the lowest average rank within each (Processing, Vectorizer) group.
best_row_labels = df_results.groupby(['Processing', 'Vectorizer'])["Avg rank"].idxmin()
best_metrics = df_results.loc[best_row_labels]

report_columns = ["Processing", "Vectorizer", "Distance", "Avg rank"]
print("Best combinations for every kombination:\n")
print(best_metrics[report_columns])

Best combinations for every kombination:

    Processing               Vectorizer   Distance   Avg rank
15  lemmatized   CountVectorizer Bigram     cosine  54.266667
13  lemmatized  CountVectorizer Unigram  manhattan  47.550000
21  lemmatized   TFIDFVectorizer Bigram     cosine  69.300000
18  lemmatized  TFIDFVectorizer Unigram     cosine  69.650000
3     original   CountVectorizer Bigram     cosine  43.350000
0     original  CountVectorizer Unigram     cosine  44.466667
9     original   TFIDFVectorizer Bigram     cosine  46.383333
6     original  TFIDFVectorizer Unigram     cosine  46.550000
27     stemmed   CountVectorizer Bigram     cosine  53.666667
24     stemmed  CountVectorizer Unigram     cosine  40.416667
33     stemmed   TFIDFVectorizer Bigram     cosine  66.716667
30     stemmed  TFIDFVectorizer Unigram     cosine  70.766667


In [12]:
# Row with the lowest average rank across all combinations.
best_row_label = df_results["Avg rank"].idxmin()
final_best = df_results.loc[best_row_label]
print("Best combination: ")
print(final_best)

Best combination: 
Processing                    stemmed
Vectorizer    CountVectorizer Unigram
Distance                       cosine
Avg rank                    40.416667
Name: 24, dtype: object
