In [69]:
import string
import sys
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import spacy
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.neighbors import NearestNeighbors

# Make the local test fixtures importable; must run before the import below.
sys.path.append("../data")
from test_data import original, easy, medium, hard
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

## 1. Data Loading and Preprocessing

In [70]:
# Load the tab-separated training corpus and keep each distinct question once.
corpus_train = pd.read_csv('../data/insurance_qna_dataset.csv', delimiter='\t')
corpus_questions = corpus_train['Question'].drop_duplicates().tolist()

# Build the punctuation-removal table once instead of once per question.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def _strip_punctuation(text):
    """Trim surrounding whitespace and delete every `string.punctuation` character."""
    return text.strip().translate(_PUNCT_TABLE)


# The tokenization cell reads `corpus_questions_list`; define it here so the
# notebook survives Restart & Run All. The training questions get exactly the
# same cleaning as the test questions, because ranking() later compares the
# joined tokens of both as plain strings.
corpus_questions_list = [_strip_punctuation(question) for question in corpus_questions]

# `easy`, `medium` and `hard` are concatenated, and `original` is repeated
# three times to pair with them.
# NOTE(review): assumes each difficulty list has one entry per original
# question, in the same order -- confirm against test_data.
changed_questions_test = easy + medium + hard
corpus_test = pd.DataFrame({'Original': 3 * original, 'Changed': changed_questions_test})

corpus_test_changed = corpus_test['Changed'].tolist()
test_changed_list = [_strip_punctuation(changed) for changed in corpus_test_changed]
original_test = corpus_test['Original'].tolist()
# Loop variable deliberately not named `original`, to avoid shadowing the import.
original_questions = [_strip_punctuation(question) for question in original_test]

## 2. Tokenization, Lemmatization, Stemming

In [71]:
# English stop-word set from NLTK.
# NOTE(review): `stop_words` is not referenced by any code visible in this
# notebook -- confirm whether stop-word filtering was intended.
stop_words = set(stopwords.words('english'))
# Shared normalizer instances used by lemmatize_tokenized() and stem_tokenized().
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocessing(corpus_list):
    """Tokenize sentences and keep only lower-cased, purely alphabetic tokens.

    Args:
        corpus_list: iterable of sentence strings.

    Returns:
        A list with one token list per input sentence, in input order.
    """
    def _clean(sentence):
        # Question marks become spaces before tokenizing.
        tokens = word_tokenize(sentence.replace("?", ' '))
        # `isalpha` drops numbers, punctuation and mixed tokens such as "401k".
        return [token.lower().strip() for token in tokens if token.isalpha()]

    return [_clean(sentence) for sentence in corpus_list]

def lemmatize_tokenized(tokenized):
    """Lemmatize every token of every sentence with the module-level `lemmatizer`.

    Args:
        tokenized: iterable of token lists.

    Returns:
        A new list of token lists with the same nesting as the input.
    """
    lemmatized = []
    for tokens in tokenized:
        lemmatized.append([lemmatizer.lemmatize(token) for token in tokens])
    return lemmatized

def stem_tokenized(tokenized):
    """Stem every token of every sentence with the module-level `stemmer`.

    Args:
        tokenized: iterable of token lists.

    Returns:
        A new list of token lists with the same nesting as the input.
    """
    stemmed = []
    for tokens in tokenized:
        stemmed.append([stemmer.stem(token) for token in tokens])
    return stemmed

def _token_variants(sentences):
    """Return the (tokenized, lemmatized, stemmed) token lists for `sentences`."""
    tokens = preprocessing(sentences)
    return tokens, lemmatize_tokenized(tokens), stem_tokenized(tokens)


# Training questions.
tokenized_questions, lemmatized_questions, stemmed_questions = _token_variants(corpus_questions_list)

# Rewritten (changed) test questions.
tokenized_test_changed, lemmatized_test_changed, stemmed_test_changed = _token_variants(test_changed_list)

# Original questions that the rewritten ones should retrieve.
tokenized_original, lemmatized_original, stemmed_original = _token_variants(original_questions)

## 3. Embedding Functions (sum, avg, idf)

In [72]:
def sum_Word2vec(all_vectors, model):
    """Embed each tokenized sentence as the sum of its in-vocabulary word vectors.

    Args:
        all_vectors: iterable of token lists, one per sentence.
        model: trained Word2Vec-style model exposing `vector_size` and `wv`.

    Returns:
        A list with one array of length `model.vector_size` per sentence.
        Sentences with no in-vocabulary token (including empty ones) map to
        the zero vector.
    """
    # `key_to_index` is a dict, so membership is a hash lookup; testing against
    # the `index_to_key` list would scan the whole vocabulary for every token.
    vocabulary = model.wv.key_to_index
    embedded = []
    for tokens in all_vectors:
        vector = np.zeros(model.vector_size)
        for word in tokens:
            if word in vocabulary:
                vector += model.wv[word]
        embedded.append(vector)
    return embedded

def avg_Word2vec(all_vectors, model):
    """Embed each tokenized sentence as the mean of its in-vocabulary word vectors.

    Args:
        all_vectors: iterable of token lists, one per sentence.
        model: trained Word2Vec-style model exposing `vector_size` and `wv`.

    Returns:
        A list with one array of length `model.vector_size` per sentence.
        The mean is taken over in-vocabulary tokens only; sentences without
        any map to the zero vector.
    """
    # Dict membership instead of scanning the `index_to_key` list per token.
    vocabulary = model.wv.key_to_index
    embedded = []
    for tokens in all_vectors:
        known = [word for word in tokens if word in vocabulary]
        if not known:
            embedded.append(np.zeros(model.vector_size))
            continue
        total = np.zeros(model.vector_size)
        for word in known:
            total += model.wv[word]
        embedded.append(total / len(known))
    return embedded

def IDF_Word2Vec(all_vectors, model, idf_dict):
    """Embed each tokenized sentence as the IDF-weighted mean of its word vectors.

    Only tokens present in both the model vocabulary and `idf_dict` contribute.

    Args:
        all_vectors: iterable of token lists, one per sentence.
        model: trained Word2Vec-style model exposing `vector_size` and `wv`.
        idf_dict: mapping token -> IDF weight.

    Returns:
        A list with one array of length `model.vector_size` per sentence.
        Sentences whose total weight is not positive map to the zero vector.
    """
    # Dict membership instead of scanning the `index_to_key` list per token.
    vocabulary = model.wv.key_to_index
    embedded = []
    for tokens in all_vectors:
        weighted_sum = np.zeros(model.vector_size)
        total_weight = 0
        for word in tokens:
            if word in vocabulary and word in idf_dict:
                weight = idf_dict[word]
                weighted_sum += model.wv[word] * weight
                total_weight += weight
        if total_weight > 0:
            embedded.append(weighted_sum / total_weight)
        else:
            embedded.append(np.zeros(model.vector_size))
    return embedded

def IDF_calculator(tokenized_documents):
    """Compute a smoothed inverse document frequency for every token.

    The weight of a token is ``log((N + 1) / (df + 1)) + 1``, where ``N`` is
    the number of documents and ``df`` the number of documents containing it.

    Args:
        tokenized_documents: sequence of token lists, one per document.

    Returns:
        dict mapping token -> IDF weight.
    """
    total_documents = len(tokenized_documents)
    containing_docs = Counter()
    for tokens in tokenized_documents:
        # A set counts each token at most once per document.
        containing_docs.update(set(tokens))

    idf_dict = {}
    for token, doc_count in containing_docs.items():
        idf_dict[token] = np.log((total_documents + 1) / (doc_count + 1)) + 1
    return idf_dict

## 4. KNN Utility Functions

In [73]:
def train_knn(train_vectors, metric, n_neighbors=100):
    """Fit a brute-force nearest-neighbour index over the training embeddings.

    Args:
        train_vectors: sequence or array of sentence vectors to index.
        metric: distance metric name passed to `NearestNeighbors`.
        n_neighbors: maximum number of neighbours returned per query
            (default 100, the value that was previously hard-coded).

    Returns:
        The fitted `NearestNeighbors` instance.
    """
    # Querying for more neighbours than there are fitted samples raises, so
    # cap the request at the corpus size for small corpora.
    k = min(n_neighbors, len(train_vectors))
    return NearestNeighbors(n_neighbors=k, algorithm='brute', metric=metric).fit(train_vectors)

def k_nearest(model, matrix):
    """Query the fitted neighbour index with a single vector.

    Args:
        model: object exposing `kneighbors`.
        matrix: array holding one query vector; reshaped to a single row.

    Returns:
        The `(distances, indices)` pair produced by `model.kneighbors`.
    """
    single_row_query = matrix.reshape(1, -1)
    neighbor_distances, neighbor_indices = model.kneighbors(single_row_query)
    return neighbor_distances, neighbor_indices

def ranking(model, test_data, corpus, miss_penalty=200):
    """Average retrieval rank of the original question for each rewritten query.

    For every `(query_vector, original_text)` pair the nearest neighbours of
    the query are looked up and the 0-based position of `original_text` among
    them is accumulated. If it is not among the retrieved neighbours,
    `miss_penalty` is accumulated instead.

    Args:
        model: fitted neighbour index accepted by `k_nearest`.
        test_data: sequence of `(query_vector, original_text)` pairs.
        corpus: array of training sentences, indexable by the neighbour
            indices returned by the model.
        miss_penalty: rank charged when the original is not retrieved
            (default 200, the value that was previously hard-coded).

    Returns:
        The mean accumulated rank over `test_data`.

    Raises:
        ValueError: if `test_data` is empty.
    """
    if len(test_data) == 0:
        raise ValueError("test_data must contain at least one (vector, original) pair")

    total_rank = 0
    for elem in test_data:
        changed_question, original = elem[0], elem[1]
        _, indices = k_nearest(model, changed_question)
        retrieved = corpus[indices[0]]
        for position, question in enumerate(retrieved):
            if original == question:
                total_rank += position
                break
        else:
            # Runs only when the loop finished without a match. A hit at the
            # last retrieved position is therefore never penalized, and a miss
            # is always penalized, however many neighbours came back.
            total_rank += miss_penalty
    return total_rank / len(test_data)

## 5. Word2Vec Model Training

In [74]:
def _fit_word2vec(sentences, sg):
    """Train one Word2Vec model with the hyperparameters shared by all six runs.

    `sg=0` selects CBOW and `sg=1` skip-gram.
    """
    # NOTE(review): gensim documents that a fixed `seed` only gives fully
    # reproducible vectors with a single worker thread; with workers=7 the
    # ranks below may vary between runs -- confirm whether that is acceptable.
    return Word2Vec(sentences, vector_size=100, window=5, min_count=1,
                    epochs=400, workers=7, sg=sg, seed=123)


# CBOW models, one per preprocessing variant.
w2v_cbow_orig = _fit_word2vec(tokenized_questions, sg=0)
w2v_cbow_lemm = _fit_word2vec(lemmatized_questions, sg=0)
w2v_cbow_stem = _fit_word2vec(stemmed_questions, sg=0)

# Skip-gram models, one per preprocessing variant.
w2v_skip_orig = _fit_word2vec(tokenized_questions, sg=1)
w2v_skip_lemm = _fit_word2vec(lemmatized_questions, sg=1)
w2v_skip_stem = _fit_word2vec(stemmed_questions, sg=1)

## 6. Embedding Preparation and Evaluation

In [75]:
# Distance metrics tried for the nearest-neighbour search.
metrics = ['cosine', 'manhattan', 'euclidean']
# Sentence-embedding strategies accepted by sentence_embedding().
embeddings = ['sum','avg','idf']
# Collects one dict per (model, embedding, distance) evaluation; reset here so
# a full re-run starts from an empty list.
results = []

def sentence_embedding(emb_type, X, model, idf_dict=None):
    """Build sentence embeddings with the requested aggregation strategy.

    Args:
        emb_type: one of 'sum', 'avg' or 'idf'.
        X: iterable of token lists, one per sentence.
        model: trained Word2Vec-style model passed to the aggregation helper.
        idf_dict: token -> IDF weight mapping; required for 'idf', ignored
            for the other strategies.

    Returns:
        `np.array` built from the per-sentence vectors returned by the helper.

    Raises:
        ValueError: if `emb_type` is unknown, or if 'idf' is requested
            without an `idf_dict`.
    """
    if emb_type == 'sum':
        vectors = sum_Word2vec(X, model)
    elif emb_type == 'avg':
        vectors = avg_Word2vec(X, model)
    elif emb_type == 'idf':
        if idf_dict is None:
            # Fail here with a clear message instead of a TypeError raised by
            # the membership test inside IDF_Word2Vec.
            raise ValueError("idf_dict is required when emb_type is 'idf'")
        vectors = IDF_Word2Vec(X, model, idf_dict)
    else:
        raise ValueError(f"Unknown embedding type: {emb_type!r}")
    return np.array(vectors)

# One IDF table per preprocessing variant, each computed on the training questions.
idf_orig, idf_lemm, idf_stem = (
    IDF_calculator(token_lists)
    for token_lists in (tokenized_questions, lemmatized_questions, stemmed_questions)
)

In [76]:
# Drop rows left by an earlier run of this cell, so re-running it does not
# append duplicate CBOW results.
results = [row for row in results if not row['Model'].startswith('CBOW-')]

# Evaluate every CBOW model with every embedding strategy and distance metric.
for cbow, model, train_X, test_X, orig_X, idf_dict in [
    ('CBOW-original', w2v_cbow_orig, tokenized_questions, tokenized_test_changed, tokenized_original, idf_orig),
    ('CBOW-lemma',    w2v_cbow_lemm,  lemmatized_questions, lemmatized_test_changed, lemmatized_original, idf_lemm),
    ('CBOW-stem',     w2v_cbow_stem, stemmed_questions, stemmed_test_changed, stemmed_original, idf_stem)
]:
    # ranking() compares space-joined token strings. They depend only on the
    # preprocessing variant, so build them once per model instead of once per
    # embedding or metric.
    train_sentences = np.array([' '.join(ws) for ws in train_X])
    orig_for_test = [' '.join(ws) for ws in orig_X]
    for emb_type in embeddings:
        # sentence_embedding() only reads idf_dict for 'idf', so passing it
        # unconditionally replaces the duplicated if/else branches.
        train_vectors = sentence_embedding(emb_type, train_X, model, idf_dict)
        test_vectors = sentence_embedding(emb_type, test_X, model, idf_dict)
        test_data = list(zip(test_vectors, orig_for_test))
        for metric in metrics:
            knn_model = train_knn(train_vectors, metric)
            score = ranking(knn_model, test_data, train_sentences)
            results.append({
                'Model': cbow,
                'Embedding': emb_type,
                'Distance': metric,
                'Avg rank': score
            })

In [77]:
# Drop rows left by an earlier run of this cell, so re-running it does not
# append duplicate Skipgram results.
results = [row for row in results if not row['Model'].startswith('Skipgram-')]

# Evaluate every skip-gram model with every embedding strategy and distance metric.
for skipgram, model, train_X, test_X, orig_X, idf_dict in [
    ('Skipgram-original', w2v_skip_orig,   tokenized_questions, tokenized_test_changed, tokenized_original, idf_orig),
    ('Skipgram-lemma',    w2v_skip_lemm,    lemmatized_questions, lemmatized_test_changed, lemmatized_original, idf_lemm),
    ('Skipgram-stem',     w2v_skip_stem,   stemmed_questions, stemmed_test_changed, stemmed_original, idf_stem)
]:
    # ranking() compares space-joined token strings. They depend only on the
    # preprocessing variant, so build them once per model instead of once per
    # embedding or metric.
    train_sentences = np.array([' '.join(ws) for ws in train_X])
    orig_for_test = [' '.join(ws) for ws in orig_X]
    for emb_type in embeddings:
        # sentence_embedding() only reads idf_dict for 'idf', so passing it
        # unconditionally replaces the duplicated if/else branches.
        train_vectors = sentence_embedding(emb_type, train_X, model, idf_dict)
        test_vectors = sentence_embedding(emb_type, test_X, model, idf_dict)
        test_data = list(zip(test_vectors, orig_for_test))
        for metric in metrics:
            knn_model = train_knn(train_vectors, metric)
            score = ranking(knn_model, test_data, train_sentences)
            results.append({
                'Model': skipgram,
                'Embedding': emb_type,
                'Distance': metric,
                'Avg rank': score
            })

## 7. Results Analysis

In [78]:
# One row per (model, embedding, distance) evaluation.
df_results = pd.DataFrame(results)
# Labels look like "<architecture>-<processing>"; split on the first dash only.
label_parts = df_results['Model'].str.split('-', n=1, expand=True)
df_results['Model'] = label_parts[0]
df_results['Processing'] = label_parts[1]
sort_keys = ['Model', 'Processing', 'Embedding', 'Distance']
df_results = df_results.sort_values(by=sort_keys)
df_results

Unnamed: 0,Model,Embedding,Distance,Avg rank,Processing
12,CBOW,avg,cosine,54.516667,lemma
14,CBOW,avg,euclidean,58.166667,lemma
13,CBOW,avg,manhattan,60.616667,lemma
15,CBOW,idf,cosine,48.983333,lemma
17,CBOW,idf,euclidean,61.483333,lemma
16,CBOW,idf,manhattan,61.383333,lemma
9,CBOW,sum,cosine,54.516667,lemma
11,CBOW,sum,euclidean,57.183333,lemma
10,CBOW,sum,manhattan,60.5,lemma
3,CBOW,avg,cosine,53.833333,original


In [79]:
# Distances become columns, so the three metrics can be compared side by side
# for each (model, processing, embedding) combination.
pivot = pd.pivot_table(
    df_results,
    values='Avg rank',
    index=['Model', 'Processing', 'Embedding'],
    columns='Distance',
)
pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,Distance,cosine,euclidean,manhattan
Model,Processing,Embedding,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CBOW,lemma,avg,54.516667,58.166667,60.616667
CBOW,lemma,idf,48.983333,61.483333,61.383333
CBOW,lemma,sum,54.516667,57.183333,60.5
CBOW,original,avg,53.833333,62.033333,61.983333
CBOW,original,idf,52.333333,64.75,63.983333
CBOW,original,sum,53.833333,57.883333,57.516667
CBOW,stem,avg,43.783333,52.933333,51.183333
CBOW,stem,idf,42.7,57.9,59.483333
CBOW,stem,sum,43.783333,49.766667,49.316667
Skipgram,lemma,avg,41.616667,51.866667,51.55


## 8. Best Result

In [80]:
# A lower average rank is better, so the best configuration is the row with
# the minimum 'Avg rank'.
best_index = df_results['Avg rank'].idxmin()
best_row = df_results.loc[best_index]
print("\nBest result:")
print(best_row)


Best result:
Model          Skipgram
Embedding           sum
Distance      euclidean
Avg rank      32.166667
Processing         stem
Name: 47, dtype: object
