In [7]:
import sklearn
from sklearn import metrics
import codecs
import numpy as np
import sys

In [2]:
def cos_sim(first, second):
    """Return the cosine similarity of two vectors as a single scalar.

    Both arguments must provide ``reshape`` (e.g. NumPy arrays). Each one is
    viewed as a one-row matrix so that the pairwise routine produces a 1x1
    result, whose only entry is returned.
    """
    first_row = first.reshape(1, -1)
    second_row = second.reshape(1, -1)
    similarity_matrix = metrics.pairwise.cosine_similarity(first_row, second_row)
    return similarity_matrix[0][0]

In [3]:
# Base directory: later cells add it to sys.path and build data-file paths from it.
# NOTE(review): machine-specific absolute path — must be edited before the
# notebook can run on another machine.
path = '/Users/annanesterenko/Desktop/diplom'

In [8]:
# Make modules located under `path` importable. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate sys.path entry.
if path not in sys.path:
    sys.path.append(path)

### ARXIV

In [6]:
def evaluate_quality(embeddings_file, triples_file=f"{path}/data/ARXIV/arxiv_triplets.txt", func=cos_sim):
    """Score embeddings on a triplet task.

    A triplet counts as correct when the query is strictly more similar to the
    "good" item than to the "bad" item according to ``func``.

    Parameters
    ----------
    embeddings_file : str
        Text file with one record per line: an id token, a single space, then
        whitespace-separated float components. The first two characters of the
        id token are dropped before it is used as a lookup key.
    triples_file : str
        Text file with three whitespace-separated references per line, in the
        order (query, good, bad). Only the part after the last ``/pdf/`` of
        each reference is used as the id. The default is built from the global
        ``path`` at the moment this function is defined.
    func : callable
        ``func(a, b)`` returning a comparable similarity score; larger means
        more similar.

    Returns
    -------
    float
        Fraction of covered triplets (all three ids present in
        ``embeddings_file``) that are ranked correctly; ``0.0`` when no
        triplet is covered.

    Raises
    ------
    ValueError
        If a non-blank line of either file does not have the expected layout.
    """
    # id -> raw text of its vector. Vectors are parsed only when a covered
    # triplet needs them. When an id occurs twice, the last record wins.
    raw_vectors = {}
    # Built-in open() is what codecs.open() returns when no encoding is given.
    with open(embeddings_file) as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            id_, embedding = line.split(' ', maxsplit=1)
            # Presumably strips a fixed two-character prefix from the id token —
            # confirm against the embeddings file format.
            raw_vectors[id_[2:]] = embedding

    covered_triplets = 0
    correct_triplets = 0
    with open(triples_file) as fin:
        for line in fin:
            references = line.split()
            if not references:
                continue  # tolerate blank / trailing empty lines
            id1, id2, id3 = (ref.split('/pdf/')[-1] for ref in references)
            if id1 not in raw_vectors or id2 not in raw_vectors or id3 not in raw_vectors:
                continue
            covered_triplets += 1
            query, good_req, bad_req = (
                np.array([float(token) for token in raw_vectors[doc_id].split()])
                for doc_id in (id1, id2, id3)
            )
            # Both sides must be scored with the same measure; the bad
            # candidate used to be scored with the hard-coded cos_sim, which
            # made any custom `func` compare apples with oranges.
            if func(query, good_req) > func(query, bad_req):
                correct_triplets += 1

    if covered_triplets == 0:
        # Nothing to average over; avoid ZeroDivisionError.
        return 0.0
    return correct_triplets / covered_triplets

In [5]:
# Triplet accuracy for the embeddings stored in embeddings_doc2vec.txt.
evaluate_quality(embeddings_file=f"{path}/data/ARXIV/embeddings_doc2vec.txt")

0.7862865072856873

In [67]:
# Triplet accuracy for the embeddings stored in embeddings_fasttext.txt.
# NOTE(review): this path is relative to the current working directory, unlike
# the other data paths in this notebook, which are built from `path` — confirm
# where the file actually lives.
evaluate_quality(embeddings_file="data/embeddings_fasttext.txt")

0.8546900643858099

### MIND

In [9]:
from CapsE.prepare_ import *

In [82]:
def evaluate_quality(lst_embeddings_query,
                     lst_embeddings_doc,
                     test_duplets,
                     test_val_duplets,
                     func=cos_sim,
                     n_candidates=10):
    """Evaluate how well ``func`` ranks each query's candidate documents.

    NOTE: this definition reuses the name of the triplet evaluator defined
    earlier in the notebook, so once this cell has run the earlier function is
    no longer reachable under ``evaluate_quality``.

    For every query the candidates are sorted by descending similarity and the
    top ``R`` positions are inspected, where ``R`` is the sum of that query's
    relevance labels.

    Parameters
    ----------
    lst_embeddings_query : indexable
        Query embeddings, indexed by the query key found in ``test_duplets``.
    lst_embeddings_doc : indexable
        Document embeddings, indexed by the document key found in
        ``test_duplets``.
    test_duplets : indexable
        ``test_duplets[i][j]`` is a (query key, document key) pair; the query
        key is read from the first pair of each entry.
    test_val_duplets : indexable
        ``test_val_duplets[i][j][0]`` is the relevance label of candidate ``j``
        of query ``i``. Labels are summed and used as a slice bound, so they
        must be integers (presumably 0/1 — confirm against the data loader).
    func : callable
        ``func(query, doc)`` returning a similarity score; larger means more
        similar.
    n_candidates : int or None
        Maximum number of candidates considered per query (default 10, the
        value that used to be hard-coded). ``None`` means all of them.

    Returns
    -------
    tuple of (float, float)
        ``(recall, precision)`` as named by the original code:
        the first value is the share of evaluated queries whose top ``R``
        positions are all relevant; the second is the mean, over evaluated
        queries, of the fraction of relevant items within the top ``R``.
        Queries without candidates or without any relevant candidate are
        excluded from both averages (they used to raise ZeroDivisionError).
        ``(0.0, 0.0)`` is returned when no query can be evaluated.
    """
    evaluated_queries = 0
    perfect_rankings = 0
    precision_sum = 0.0

    for i in range(len(test_duplets)):
        pairs = test_duplets[i]
        labels = test_val_duplets[i]

        limit = len(pairs) if n_candidates is None else min(n_candidates, len(pairs))
        if limit == 0:
            continue

        query = lst_embeddings_query[pairs[0][0]]

        scored = []
        relevant_cnt = 0
        for j in range(limit):
            candidate = lst_embeddings_doc[pairs[j][1]]
            label = labels[j][0]
            # `score`, not `cos_sim`: avoid shadowing the module-level function.
            score = func(query, candidate)
            scored.append((score, label))
            relevant_cnt += label

        if relevant_cnt == 0:
            # Precision at R is undefined when R == 0; skip the query.
            continue

        # Stable sort: ties keep their original candidate order.
        scored.sort(key=lambda item: item[0], reverse=True)
        hits = sum(label for _, label in scored[:relevant_cnt])

        evaluated_queries += 1
        precision_sum += hits / relevant_cnt
        if hits == relevant_cnt:
            perfect_rankings += 1

    if evaluated_queries == 0:
        return 0.0, 0.0
    return perfect_rankings / evaluated_queries, precision_sum / evaluated_queries

In [83]:
# Unpack the six values returned by get_data_for_net_MIND for the doc2vec
# embedding files.
(
    lst_embeddings_query,
    lst_embeddings_doc,
    train_duplets,
    test_duplets,
    train_val_duplets,
    test_val_duplets,
) = get_data_for_net_MIND(
    embeddings_file_train=f'{path}/data/MIND/embeddings_doc2vec_train.txt',
    embeddings_file_test=f'{path}/data/MIND/embeddings_doc2vec_test.txt',
    behaviors_train=f'{path}/data/MIND/behaviors_train.tsv',
    behaviors_test=f'{path}/data/MIND/behaviors_test.tsv',
)

In [84]:
# Ranking metrics on the test split for the embeddings loaded in the previous cell.
evaluate_quality(
    lst_embeddings_query=lst_embeddings_query,
    lst_embeddings_doc=lst_embeddings_doc,
    test_duplets=test_duplets,
    test_val_duplets=test_val_duplets,
)

(0.017666666666666667, 0.4494007936507956)

In [85]:
# Unpack the six values returned by get_data_for_net_MIND for the fastText
# embedding files (this overwrites the variables from the previous load).
(
    lst_embeddings_query,
    lst_embeddings_doc,
    train_duplets,
    test_duplets,
    train_val_duplets,
    test_val_duplets,
) = get_data_for_net_MIND(
    embeddings_file_train=f'{path}/data/MIND/embeddings_fasttext_train.txt',
    embeddings_file_test=f'{path}/data/MIND/embeddings_fasttext_test.txt',
    behaviors_train=f'{path}/data/MIND/behaviors_train.tsv',
    behaviors_test=f'{path}/data/MIND/behaviors_test.tsv',
)

In [86]:
# Ranking metrics on the test split for the embeddings loaded in the previous cell.
evaluate_quality(
    lst_embeddings_query=lst_embeddings_query,
    lst_embeddings_doc=lst_embeddings_doc,
    test_duplets=test_duplets,
    test_val_duplets=test_val_duplets,
)

(0.02033333333333333, 0.44723214285714397)

In [88]:
# Unpack the six values returned by get_data_for_net_MIND for the BERT
# embedding files (this overwrites the variables from the previous load).
# SIZE=768 is passed explicitly here, unlike in the earlier loads.
(
    lst_embeddings_query,
    lst_embeddings_doc,
    train_duplets,
    test_duplets,
    train_val_duplets,
    test_val_duplets,
) = get_data_for_net_MIND(
    embeddings_file_train=f'{path}/data/MIND/embeddings_BERT_train.txt',
    embeddings_file_test=f'{path}/data/MIND/embeddings_BERT_test.txt',
    behaviors_train=f'{path}/data/MIND/behaviors_train.tsv',
    behaviors_test=f'{path}/data/MIND/behaviors_test.tsv',
    SIZE=768,
)

In [89]:
# Ranking metrics on the test split for the embeddings loaded in the previous cell.
evaluate_quality(
    lst_embeddings_query=lst_embeddings_query,
    lst_embeddings_doc=lst_embeddings_doc,
    test_duplets=test_duplets,
    test_val_duplets=test_val_duplets,
)

(0.021166666666666667, 0.4227432539682545)