# Library

In [1]:
import numpy as np
import csv
from sentence_transformers import SentenceTransformer
from math import sqrt, pow
from gensim.parsing.preprocessing import remove_stopwords

# Code

## Read data

In [2]:
# Paths (relative to the notebook's working directory) of the three CSV splits
# that the "Read data" cells below open.
test_data_AD = './Data/test_data.csv'
train_data_AD = './Data/train_data.csv'
valid_data_AD = './Data/valid_data.csv'

In [3]:
# Test data: keep the first CSV row in `header` and every following row in
# `test_data` (each row is a list of strings as produced by csv.reader).
# The `with` block closes the file even if reading fails part-way, and
# newline='' lets the csv module do its own line-ending handling so quoted
# fields containing newlines are parsed correctly.
with open(test_data_AD, newline='') as file:
    csvreader = csv.reader(file)
    header = next(csvreader)
    test_data = list(csvreader)

In [4]:
# Train data: keep the first CSV row in `header` and every following row in
# `train_data` (each row is a list of strings as produced by csv.reader).
# The `with` block closes the file even if reading fails part-way, and
# newline='' lets the csv module do its own line-ending handling so quoted
# fields containing newlines are parsed correctly.
with open(train_data_AD, newline='') as file:
    csvreader = csv.reader(file)
    header = next(csvreader)
    train_data = list(csvreader)

In [5]:
# Validation data: keep the first CSV row in `header` and every following row
# in `valid_data` (each row is a list of strings as produced by csv.reader).
# The `with` block closes the file even if reading fails part-way, and
# newline='' lets the csv module do its own line-ending handling so quoted
# fields containing newlines are parsed correctly.
with open(valid_data_AD, newline='') as file:
    csvreader = csv.reader(file)
    header = next(csvreader)
    valid_data = list(csvreader)

## Unique Train data

In [6]:
# Column 2 of every training row, in row order (one entry per row).
qid2 = [row[2] for row in train_data]

# np.unique returns the sorted distinct values and, with return_index=True,
# the position of each value's first occurrence. That gives the same rows the
# previous per-id `qid2.index(...)` scan found, without the quadratic cost and
# without shadowing the builtin `id`.
unique_values, first_rows = np.unique(qid2, return_index=True)
unique_res = list(unique_values)

# Lower-cased column 4 of the first row carrying each distinct id, so
# train_sentence[k] belongs to unique_res[k].
train_sentence = [train_data[row_idx][4].lower() for row_idx in first_rows]

print(len(qid2))
print(len(train_sentence))

37250
16663


In [7]:
# Drop duplicate sentences while recording, for every distinct sentence, the
# position of its first occurrence in the list built by the previous cell.
# A single insertion-ordered dict yields both results from one pass, so
# `indexes[k]` is guaranteed to describe `train_sentence[k]`, the order is the
# same on every run (set iteration order depends on the hash seed), and the
# quadratic `list.index` lookups are gone.
# NOTE: `indexes` point into the list as it was BEFORE this cell ran (and hence
# into `unique_res`); re-run the previous cell before re-running this one.
first_position = {}
for position, sentence in enumerate(train_sentence):
    first_position.setdefault(sentence, position)

indexes = list(first_position.values())
train_sentence = list(first_position)

print(len(indexes))
print(len(train_sentence))

16641
16641


In [8]:
# Pick the id stored in unique_res at each retained position, so that
# unique_qid2[k] is the id belonging to train_sentence[k].
unique_qid2 = [unique_res[position] for position in indexes]

len(unique_qid2)

16641

## Unique Test questions

In [9]:
# Column 1 of every test row, in row order (one entry per row).
qid1 = [row[1] for row in test_data]

# np.unique returns the sorted distinct values and, with return_index=True,
# the position of each value's first occurrence. That gives the same rows the
# previous per-id `qid1.index(...)` scan found, without the quadratic cost and
# without shadowing the builtin `id`.
unique_values, first_rows = np.unique(qid1, return_index=True)
unique_qid1 = list(unique_values)

# Lower-cased column 3 of the first row carrying each distinct id, so
# test_sentence[k] belongs to unique_qid1[k].
test_sentence = [test_data[row_idx][3].lower() for row_idx in first_rows]

print(len(qid1))
print(len(test_sentence))

980
146


## Unique Valid questions

In [10]:
# Sorted distinct values of column 1 across all validation rows.
qid3 = list(np.unique([row[1] for row in valid_data]))

print(len(qid3))

173


## Assessment

#### MAP

In [11]:
def MAP(precision) :
    """Average the per-query mean precision over all queries.

    precision: list with one inner list of precision values per query.
    Each non-empty inner list contributes its arithmetic mean; an empty inner
    list contributes nothing to the total but still counts in the divisor.
    Returns 0 when `precision` itself is empty.
    """
    query_count = len(precision)
    if query_count == 0 :
        return 0

    per_query_means = (
        sum(values) / len(values)
        for values in precision
        if len(values) > 0
    )
    return sum(per_query_means) / query_count

#### P@k

In [12]:
def Precision5(precision) :
    """Return the fifth entry of `precision`, or 0 if it has fewer than five.

    NOTE(review): whether this equals precision-at-rank-5 depends on what the
    caller stores in the list - confirm against the caller.
    """
    return precision[4] if len(precision) >= 5 else 0

In [13]:
def Precision10(precision) :
    """Return the tenth entry of `precision`, or 0 if it has fewer than ten.

    NOTE(review): whether this equals precision-at-rank-10 depends on what the
    caller stores in the list - confirm against the caller.
    """
    return precision[9] if len(precision) >= 10 else 0

#### MRR

In [14]:
def MRR(reciprocal_rank, size) :
    """Sum the reciprocal ranks and divide by `size`.

    reciprocal_rank: list of reciprocal-rank values.
    size: divisor for the mean (passed separately by the caller, so it need
        not equal len(reciprocal_rank)).
    Returns 0 when `reciprocal_rank` is empty.
    """
    if not len(reciprocal_rank) :
        return 0

    return sum(reciprocal_rank) / size

#### Calculate

In [15]:
def assessment(vector, top_k=10) :
    """Rank training candidates for each test query and print retrieval metrics.

    vector: one row of scores per entry of `unique_qid1`; `vector[i][j]` is the
        score of candidate `unique_qid2[j]` for query `unique_qid1[i]`.
    top_k: how many highest-scoring candidates are kept per query
        (default 10, the previously hard-coded cut-off).

    Reads the notebook globals qid1, unique_qid1, test_data, qid2, unique_qid2
    and train_data. Prints Precision@5 / Precision@10 per query, then MAP and
    MRR; returns None.

    A training row counts as a hit when its lower-cased second-to-last column
    equals the lower-cased second-to-last column of any test row that shares
    the query's id.

    NOTE(review): the rank counter advances once per matching training ROW
    (a candidate id can own several rows), and the list handed to
    Precision5/Precision10 holds a value only at hits, so they return the
    precision at the 5th/10th hit. This matches the original behaviour;
    confirm it is the intended definition of P@k.
    """
    # Group training row numbers by id once, instead of rescanning every
    # training row for each retrieved candidate. Row order within a group is
    # ascending, exactly as the original scan visited them.
    rows_by_qid = {}
    for row_idx in range(len(train_data)) :
        rows_by_qid.setdefault(qid2[row_idx], []).append(row_idx)

    precision = []
    reciprocal_rank = []
    for i in range(len(vector)) :
        # Indices of the top_k highest scores, best first.
        top_indices = np.argsort(vector[i])[::-1][:top_k]

        # Texts attached to every test row of this query (set: membership only).
        query_id = unique_qid1[i]
        relevant_texts = {
            test_data[j][-2].lower()
            for j in range(len(qid1))
            if qid1[j] == query_id
        }

        pr = []
        hits = 0            # was named `sum`, which shadowed the builtin
        rank = 0
        first_hit_seen = False
        for index in top_indices :
            for row_idx in rows_by_qid.get(unique_qid2[index], ()) :
                if train_data[row_idx][-2].lower() in relevant_texts :
                    hits += 1
                    pr.append(hits / (rank + 1))
                    if not first_hit_seen :
                        # Only the first hit of a query feeds the MRR.
                        first_hit_seen = True
                        reciprocal_rank.append(1 / (rank + 1))
                rank += 1
        precision.append(pr)

    for i in range(len(precision)) :
        print('Precision@5 for query ' + str(i) + ' = ' + str(Precision5(precision[i])) )
        print('Precision@10 for query ' + str(i) + ' = ' + str(Precision10(precision[i])) )

    print('MAP = ' + str(MAP(precision)) )
    print('MRR = ' + str(MRR(reciprocal_rank, len(precision))) )

## Preprocess

In [16]:
def remove_stop_words(sentences) :
    """Lower-case every sentence and pass it through `remove_stopwords`.

    The list is rewritten in place, element by element, and the same list
    object is returned, so the caller's original list is modified too.
    """
    for position, text in enumerate(sentences) :
        sentences[position] = remove_stopwords(text.lower())

    return sentences

## Bert

### Methods

In [17]:
def squared_sum(x):
    """Return the square root of the sum of squares of `x`, rounded to 3 decimals.

    Despite the name, the value is the Euclidean (L2) norm of `x`.
    """
    total = sum(value * value for value in x)
    return round(sqrt(total), 3)
 
def euclidean_distance(x,y):
    """Return the Euclidean distance between two sequences.

    Elements are paired with zip, so extra elements of the longer sequence
    are ignored.
    """
    total = 0
    for a, b in zip(x, y):
        total += pow(a - b, 2)
    return sqrt(total)

In [18]:
def cosine_similarity(x,y):
    """Return the cosine similarity between two sequences, rounded to 3 decimals.

    The norms are computed at full precision. Dividing by norms that were
    themselves rounded to 3 decimals distorted the result for small-magnitude
    vectors and could raise ZeroDivisionError for a non-zero vector whose norm
    rounds to 0.0.

    Returns 0.0 when either vector has zero norm (the similarity is undefined
    there) instead of raising ZeroDivisionError.
    """
    numerator = sum(a*b for a,b in zip(x,y))
    norm_x = sqrt(sum(a*a for a in x))
    norm_y = sqrt(sum(b*b for b in y))
    denominator = norm_x * norm_y
    if denominator == 0:
        return 0.0
    return round(numerator/float(denominator),3)

In [19]:
def bert(sentences, model_name='bert-base-nli-mean-tokens') :
    """Encode `sentences` with a SentenceTransformer model.

    sentences: the sentences passed to the model's `encode` method.
    model_name: identifier handed to SentenceTransformer; defaults to the
        model this notebook always used, so existing calls are unchanged.

    Returns whatever `model.encode(sentences)` returns. A new model object is
    constructed on every call.
    """
    model = SentenceTransformer(model_name)
    return model.encode(sentences)

### Calculate

In [20]:
# remove_stop_words rewrites the list it receives and returns that same list,
# so train_sentences and train_sentence are two names for one list.
train_sentences = remove_stop_words(train_sentence)

# se = sentence embeddings, as returned by bert() for the training sentences
train_se = bert(train_sentences)

In [21]:
# remove_stop_words rewrites the list it receives and returns that same list,
# so test_sentences and test_sentence are two names for one list.
test_sentences = remove_stop_words(test_sentence)

# se = sentence embeddings, as returned by bert() for the test sentences
test_se = bert(test_sentences)

In [22]:
# Similarity table: cos[i][j] is the cosine similarity between test
# embedding i and train embedding j.
cos = []
for test in test_se :
    cos.append([cosine_similarity(test, train) for train in train_se])

In [23]:
# Print Precision@5 / Precision@10 for every test query, then MAP and MRR,
# using the cosine-similarity table as the ranking scores.
assessment(cos)

Precision@5 for query 0 = 0
Precision@10 for query 0 = 0
Precision@5 for query 1 = 0
Precision@10 for query 1 = 0
Precision@5 for query 2 = 0
Precision@10 for query 2 = 0
Precision@5 for query 3 = 0
Precision@10 for query 3 = 0
Precision@5 for query 4 = 0
Precision@10 for query 4 = 0
Precision@5 for query 5 = 0
Precision@10 for query 5 = 0
Precision@5 for query 6 = 0
Precision@10 for query 6 = 0
Precision@5 for query 7 = 0
Precision@10 for query 7 = 0
Precision@5 for query 8 = 0
Precision@10 for query 8 = 0
Precision@5 for query 9 = 0
Precision@10 for query 9 = 0
Precision@5 for query 10 = 0
Precision@10 for query 10 = 0
Precision@5 for query 11 = 0
Precision@10 for query 11 = 0
Precision@5 for query 12 = 0
Precision@10 for query 12 = 0
Precision@5 for query 13 = 0
Precision@10 for query 13 = 0
Precision@5 for query 14 = 0
Precision@10 for query 14 = 0
Precision@5 for query 15 = 0
Precision@10 for query 15 = 0
Precision@5 for query 16 = 0
Precision@10 for query 16 = 0
Precision@5 for q