# Library

In [2]:
import numpy as np
from numpy import dot
from numpy.linalg import norm
import gensim
from gensim.parsing.preprocessing import preprocess_string, remove_stopwords, stem_text
from gensim.utils import simple_preprocess
from gensim import corpora

# Code

## Read data

In [3]:
train_data_AD = './Data/train_data.csv'

In [4]:
train_data = []
with open(train_data_AD, 'r') as f:
    train_data = [line.rstrip().split(',') for line in f]

In [5]:
test_data_AD = './Data/test_data.csv'
valid_data_AD = './Data/valid_data.csv'

In [6]:
test_data = []
with open(test_data_AD, 'r') as f:
    test_data = [line.rstrip().split(',') for line in f]

valid_data = []
with open(valid_data_AD, 'r') as f:
    valid_data = [line.rstrip().split(',') for line in f]

## Methods

In [7]:
def TF(document) :
    tokens = [list(gensim.utils.tokenize(doc, lower=True)) for doc in document]
    CUSTOM_FILTERS = [remove_stopwords]
    tokens = [preprocess_string(" ".join(doc), CUSTOM_FILTERS) for doc in tokens]

    for words in tokens :
        word_count = {}
        for word in words:
            if word not in word_count:
                word_count[word] = 1
            elif word in word_count:
                word_count[word] += 1
    
    return word_count

In [8]:
def CF(word_count, occurence) :
    if occurence in word_count.keys() :
        return word_count[occurence]
    return 0

In [9]:
def C(word_count, occurences) :
    c = 0
    for occurence in occurences :
        c += CF(word_count, occurence)
    return c

In [10]:
def find_same_words(test_word, train_word) :
    test_index = []
    train_index= []

    for w in test_word :
        if w in train_word :
            test_index.append(test_word.index(w))
            train_index.append(train_word.index(w))

    return [test_index, train_index]

In [11]:
def cosine_similarity(x, y):
    if len(x) != len(y) :
        return None

    cosine_similarity = dot(x, y)/(norm(x)*norm(y))
    
    return cosine_similarity

In [12]:
def calculate_TF_IDF_Vector(document) :
    tokens = [list(gensim.utils.tokenize(doc, lower=True)) for doc in document]
    CUSTOM_FILTERS = [remove_stopwords]
    tokens = [preprocess_string(" ".join(doc), CUSTOM_FILTERS) for doc in tokens]

    listToStr = []
    for s in tokens :
        listToStr.append(' '.join(map(str, s)))

    g_dict = corpora.Dictionary([simple_preprocess(line) for line in listToStr])
    g_bow = [g_dict.doc2bow(simple_preprocess(line)) for line in listToStr]

    g_tfidf = gensim.models.TfidfModel(g_bow, smartirs='ntc')

    return [g_dict, g_bow, g_tfidf]

In [13]:
def match_words_with_tf_idf(document) :
    g_dict, g_bow, g_tfidf = calculate_TF_IDF_Vector([document])

    tf_idf = []
    words = []
    for item in g_tfidf[g_bow]:
        for id, freq in item :
            words.append(g_dict[id])
            tf_idf.append(np.around(freq, decimals=2))

    return [words, tf_idf]

## Assessment

### MAP

In [14]:
def MAP(precision) :
    if len(precision) == 0 :
        return 0
    
    s = 0
    for pre in precision :
        if len(pre) > 0 :
            x = 0
            for i in pre :
                x += i
            s += ( x / len(pre) )
        
    return s / len(precision)

### P@k

In [15]:
def Precision5(precision) :
    if len(precision) < 5 :
        return 0
    return precision[4]

In [16]:
def Precision10(precision) :
    if len(precision) < 10 :
        return 0
    return precision[9]

### MRR

In [17]:
def MRR(reciprocal_rank) :
    if len(reciprocal_rank) == 0 :
        return 0
    
    s = 0
    for rr in reciprocal_rank :
        s += rr

    return s / len(reciprocal_rank)

In [18]:
def assessment(test_data, train_data, qid1, qid2, vector) :
    precision = []
    reciprocal_rank = []
    for i in range(len(vector)) :
        maxs = np.sort(vector[i])[::-1]
        maxs = maxs[0:10]
    
        pr = []
        sum = 0
        flag = True
        for h in range(len(maxs)) :
            index_max_similarity = np.where(vector[i] == maxs[h])[0][0]
    
            for k in range(len(test_data)) :
                if test_data[k][1] == qid1[i] :
                    test_s = test_data[k][-2]
                    
                    for z in range(len(train_data)) :
                        if train_data[z][2] == qid2[index_max_similarity] :
                            train_s = train_data[z][-2]
                    
                            if train_s == test_s :
                                sum += 1
                                pr.append(sum/(h + 1))
                                if flag :
                                    flag = False
                                    reciprocal_rank.append(sum/(h + 1))
                    
        precision.append(pr)
    
    
    for i in range(len(precision)) :
        if len(precision[i]) > 0 :
            print('Precision@5 for query ' + str(i) + ' = ' + str(Precision5(precision[i])) )
            print('Precision@10 for query ' + str(i) + ' = ' + str(Precision10(precision[i])) )
    
    print('MAP = ' + str(MAP(precision)) )
    print('MRR = ' + str(MRR(reciprocal_rank)) )

## Train questions

In [21]:
qid2 = []
for i in range(1, len(train_data)) :
    qid2.append(train_data[i][2])

unique_res = np.unique(qid2[1:]) 

qid2 = list(unique_res) 

In [None]:
train_data[i-1]

['364931',
 '19408',
 '402550',
 '"Mathematical Puzzles: What is () + () + () = 30 using 1',
 '3',
 '5',
 '7',
 '9',
 '11',
 '13',
 '15?"',
 '"How do I Simplify the following matrices:']

In [20]:
train_data.pop(i)
train_data.pop(i-1)
print('Done')

Done


## Unique Train data

In [22]:
train_qs = []
for i in range(1, len(train_data)) :
    train_qs.append(train_data[i][2])

sentence = []
for id in qid2 :
    index = train_qs.index(id) + 1
    sentence.append(train_data[index][-2])

print(len(sentence))

16661


In [23]:
indexes = [sentence.index(x) for x in set(sentence)]

len(indexes)

16475

In [24]:
qid = []
for i in indexes :
    qid.append(qid2[i])

qid2 = qid

len(qid2)

16475

## Test questions

In [25]:
qid1 = np.loadtxt(test_data_AD, delimiter = ",", usecols = 1, dtype = str)

unique_res = np.unique(qid1[1:]) 

qid1 = list(unique_res) 

## Valid questions

In [28]:
qid3 = np.loadtxt(valid_data_AD, delimiter = ",", usecols = 1, dtype = str)

unique_res = np.unique(qid3[1:]) 

qid3 = list(unique_res) 

## TF × IDF

### Calculate Cosine Similarity

In [29]:
test_qs = []
for i in range(1, len(test_data)) :
    test_qs.append(test_data[i][1])

train_qs = []
for i in range(1, len(train_data)) :
    train_qs.append(train_data[i][2])

cos = []
for q1 in qid1 :
    index1 = test_qs.index(q1) + 1

    test_words, test_tf_idf = match_words_with_tf_idf(test_data[index1][3])
    
    c = []
    for q2 in qid2 :
        index2 = train_qs.index(q2) + 1

        train_words, train_tf_idf = match_words_with_tf_idf(train_data[index2][-2])
        
        test_index, train_index = find_same_words(test_words, train_words)

        te = []
        for index in test_index :
            te.append(test_tf_idf[index])

        tr = []
        for index in train_index :
            tr.append(train_tf_idf[index])

        c.append(cosine_similarity(te, tr))
        
    cos.append(c)

  cosine_similarity = dot(x, y)/(norm(x)*norm(y))


### Assessment of Cosine Similarity

In [30]:
for i in range(len(cos)) :
    for j in range(len(cos[i])) :
        if np.isnan(cos[i][j]) :
            cos[i][j] = np.nan_to_num(cos[i][j])

In [31]:
assessment(test_data, train_data, qid1, qid2, cos)

Precision@5 for query 0 = 1.6666666666666667
Precision@10 for query 0 = 2.0
Precision@5 for query 2 = 0
Precision@10 for query 2 = 0
Precision@5 for query 5 = 1.0
Precision@10 for query 5 = 0
Precision@5 for query 6 = 1.6666666666666667
Precision@10 for query 6 = 2.0
Precision@5 for query 7 = 0
Precision@10 for query 7 = 0
Precision@5 for query 11 = 1.0
Precision@10 for query 11 = 0
Precision@5 for query 13 = 0
Precision@10 for query 13 = 0
Precision@5 for query 15 = 0
Precision@10 for query 15 = 0
Precision@5 for query 17 = 0
Precision@10 for query 17 = 0
Precision@5 for query 21 = 0
Precision@10 for query 21 = 0
Precision@5 for query 26 = 0
Precision@10 for query 26 = 0
Precision@5 for query 29 = 0
Precision@10 for query 29 = 0
Precision@5 for query 32 = 0
Precision@10 for query 32 = 0
Precision@5 for query 33 = 0
Precision@10 for query 33 = 0
Precision@5 for query 34 = 0
Precision@10 for query 34 = 0
Precision@5 for query 46 = 0
Precision@10 for query 46 = 0
Precision@5 for query 53

In [None]:
np.savez('./Result/cosine_similarity.npz', cos)

In [None]:
data = np.load('./Result/cosine_similarity.npz')
cos = data['arr_0']

## Unigram

P(Q|D) = .* P(qi|D)

Dirichlet :

P(w|D) = (TFw,D + u CFw/|c|) / (|D| + u)

|D| : length of D

TFw,D : the number of occurrences of w in D

CFw : the number of occurrences of w in the collection

𝑐 : σ𝑤 𝐶𝐹𝑤 : the total number of tokens in the collection

landa = N / (N + u)

1 - landa = u / (N + u)

### Calculate miu

In [None]:
word_counts = []
for i in range(1, len(train_data)) :
    word_counts.append(TF([train_data[i][-2]]))
    
d = {}
for word_count in word_counts :
    for word in word_count.keys() :
        if word in d :
            d[word] += word_count[word]
        else :
            d[word] = word_count[word]

In [None]:
max_pwd = 0
best_u = 0
for v in np.arange(0.1, 2, 0.1) :
    u = np.around(v , decimals=2)

    valid_qs = []
    for i in range(1, len(valid_data)) :
        valid_qs.append(valid_data[i][1])

    train_qs = []
    for i in range(1, len(train_data)) :
        train_qs.append(train_data[i][2])

    for q3 in qid3 :
        index1 = valid_qs.index(q3) + 1

        valid_word_count = TF([valid_data[index1][3]])

        c = C(d, list(valid_word_count.keys()))

        pw = []
        for q2 in qid2 :
            index2 = train_qs.index(q2) + 1

            train_word_count = TF([train_data[index2][-2]])

            p = 1
            for valid_word in valid_word_count.keys() :
                tf = 0
                if valid_word in train_word_count.keys() :
                    tf = train_word_count[valid_word]
                p *= (tf + u * (CF(d, valid_word) / c) ) / (len(list(train_word_count.keys())) + u)

            pw.append(p)

        if max_pwd < max(pw) :
            max_pwd = max(pw)
            best_u = u

print('Best mui : ' + str(best_u))

### Conditional probability

In [32]:
word_counts = []
for i in range(1, len(train_data)) :
    word_counts.append(TF([train_data[i][-2]]))

d = {}
for word_count in word_counts :
    for word in word_count.keys() :
        if word in d :
            d[word] += word_count[word]
        else :
            d[word] = word_count[word]

In [33]:
u = 0.1

test_qs = []
for i in range(1, len(test_data)) :
    test_qs.append(test_data[i][1])

train_qs = []
for i in range(1, len(train_data)) :
    train_qs.append(train_data[i][2])

P_wd = []
for q1 in qid1 :
    index1 = test_qs.index(q1) + 1

    test_word_count = TF([test_data[index1][3]])
    
    c = C(d, list(test_word_count.keys()))

    pw = []
    for q2 in qid2 :
        index2 = train_qs.index(q2) + 1

        train_word_count = TF([train_data[index2][-2]])
        
        p = 1
        for test_word in test_word_count.keys() :
            tf = 0
            if test_word in train_word_count.keys() :
                tf = train_word_count[test_word]
            p *= (tf + u * (CF(train_word_count, test_word) / c) ) / (len(list(train_word_count.keys())) + u)
        
        pw.append(p)

    P_wd.append(pw)

### Assessment of Unigram

In [34]:
assessment(test_data, train_data, qid1, qid2, P_wd)

Precision@5 for query 0 = 0.8333333333333334
Precision@10 for query 0 = 0
Precision@5 for query 2 = 0
Precision@10 for query 2 = 0
Precision@5 for query 15 = 0
Precision@10 for query 15 = 0
Precision@5 for query 17 = 1.25
Precision@10 for query 17 = 1.6666666666666667
Precision@5 for query 18 = 2.5
Precision@10 for query 18 = 0
Precision@5 for query 20 = 0
Precision@10 for query 20 = 0
Precision@5 for query 29 = 0
Precision@10 for query 29 = 0
Precision@5 for query 33 = 0
Precision@10 for query 33 = 0
Precision@5 for query 34 = 0
Precision@10 for query 34 = 0
Precision@5 for query 44 = 0
Precision@10 for query 44 = 0
Precision@5 for query 59 = 0
Precision@10 for query 59 = 0
Precision@5 for query 65 = 1.6666666666666667
Precision@10 for query 65 = 2.0
Precision@5 for query 76 = 0
Precision@10 for query 76 = 0
Precision@5 for query 79 = 0
Precision@10 for query 79 = 0
Precision@5 for query 84 = 0
Precision@10 for query 84 = 0
Precision@5 for query 85 = 0
Precision@10 for query 85 = 0
Pr

In [None]:
np.savez('./Result/PWD_unigram_01.npz', P_wd)

In [None]:
data = np.load('./Result/PWD_unigram_01.npz')
P_wd = data['arr_0']

## Bigram

P(Q|D) = P(q1|D) .* P(qi|qi-1, D)

### Calculate Lambda

In [None]:
word_counts = []
for i in range(1, len(train_data)) :
    word_counts.append(TF([train_data[i][-2]]))

d = {}
for word_count in word_counts :
    for word in word_count.keys() :
        if word in d :
            d[word] += word_count[word]
        else :
            d[word] = word_count[word]

In [None]:
max_pwd = 0
best_lamda = 0
for v in np.arange(0.1, 2, 0.1) :
    l = np.around(v , decimals=2)

    u = 0.1

    valid_qs = []
    for i in range(1, len(valid_data)) :
        valid_qs.append(valid_data[i][1])

    train_qs = []
    for i in range(1, len(train_data)) :
        train_qs.append(train_data[i][2])

    for q3 in qid3 :
        index1 = valid_qs.index(q3) + 1

        valid_word_count = TF([valid_data[index1][3]])

        valid_keys = list(valid_word_count.keys())

        c = C(d, valid_keys)

        pw = []
        for q2 in qid2 :
            index2 = train_qs.index(q2) + 1

            train_word_count = TF([train_data[index2][-2]])

            if valid_keys[0] in train_word_count.keys() :
                    tf = train_word_count[valid_keys[0]]
            else : tf = 0

            p = (tf + u * (CF(train_word_count, valid_keys[0]) / c) ) / (len(list(train_word_count.keys())) + u)

            for i in range(2, len(valid_keys)) :
                valid_word_j = valid_keys[i-1]
                valid_word_i = valid_keys[i]
                tf = 0
                if valid_word_i in train_word_count.keys() :
                    tf = train_word_count[valid_word_i]

                if valid_word_j in train_word_count.keys() :
                    unigram = (tf + u * (CF(train_word_count, valid_word_i) / c) ) / (len(list(train_word_count.keys())) + u)
                    p *= (l * (C(train_word_count, [valid_word_j, valid_word_i]) / CF(train_word_count, valid_word_j))) + (l * unigram)

            pw.append(p)

        if max_pwd < max(pw) :
                max_pwd = max(pw)
                best_lamda = l

print('Best landa : ' + str(best_lamda))

### Conditional probability

In [35]:
word_counts = []
for i in range(1, len(train_data)) :
    word_counts.append(TF([train_data[i][-2]]))

d = {}
for word_count in word_counts :
    for word in word_count.keys() :
        if word in d :
            d[word] += word_count[word]
        else :
            d[word] = word_count[word]

In [36]:
l = 1.9
u = 0.1

test_qs = []
for i in range(1, len(test_data)) :
    test_qs.append(test_data[i][1])

train_qs = []
for i in range(1, len(train_data)) :
    train_qs.append(train_data[i][2])

P_wd = []
for q1 in qid1 :
    index1 = test_qs.index(q1) + 1

    test_word_count = TF([test_data[index1][3]])
        
    test_keys = list(test_word_count.keys())

    c = C(d, test_keys)

    pw = []
    for q2 in qid2 :
        index2 = train_qs.index(q2) + 1

        train_word_count = TF([train_data[index2][-2]])
        
        if test_keys[0] in train_word_count.keys() :
                tf = train_word_count[test_keys[0]]
        else : tf = 0
        
        p = (tf + u * (CF(train_word_count, test_keys[0]) / c) ) / (len(list(train_word_count.keys())) + u)

        for i in range(2, len(test_keys)) :
            test_word_j = test_keys[i-1]
            test_word_i = test_keys[i]
            tf = 0
            if test_word_i in train_word_count.keys() :
                tf = train_word_count[test_word_i]
            
            if test_word_j in train_word_count.keys() :
                unigram = (tf + u * (CF(train_word_count, test_word_i) / c) ) / (len(list(train_word_count.keys())) + u)
                p *= (l * (C(train_word_count, [test_word_j, test_word_i]) / CF(train_word_count, test_word_j))) + (l * unigram)
        
        pw.append(p)

    P_wd.append(pw)

### Assessment of Bigram

In [37]:
assessment(test_data, train_data, qid1, qid2, P_wd)

Precision@5 for query 0 = 0.8333333333333334
Precision@10 for query 0 = 0
Precision@5 for query 3 = 0
Precision@10 for query 3 = 0
Precision@5 for query 4 = 0
Precision@10 for query 4 = 0
Precision@5 for query 6 = 1.6666666666666667
Precision@10 for query 6 = 3.3333333333333335
Precision@5 for query 8 = 0
Precision@10 for query 8 = 0
Precision@5 for query 9 = 2.5
Precision@10 for query 9 = 2.5
Precision@5 for query 18 = 2.5
Precision@10 for query 18 = 0
Precision@5 for query 20 = 0
Precision@10 for query 20 = 0
Precision@5 for query 21 = 0
Precision@10 for query 21 = 0
Precision@5 for query 24 = 2.5
Precision@10 for query 24 = 3.3333333333333335
Precision@5 for query 26 = 0
Precision@10 for query 26 = 0
Precision@5 for query 29 = 0
Precision@10 for query 29 = 0
Precision@5 for query 30 = 0
Precision@10 for query 30 = 0
Precision@5 for query 31 = 0
Precision@10 for query 31 = 0
Precision@5 for query 33 = 0
Precision@10 for query 33 = 0
Precision@5 for query 34 = 0
Precision@10 for query

In [32]:
np.savez('./Result/PWD_bigram_19_01.npz', P_wd)

In [26]:
data = np.load('./Result/PWD_bigram_19_01.npz')
P_wd = data['arr_0']