In [20]:
import sklearn
from sklearn import metrics
import codecs
import numpy as np
import sys

In [21]:
def cos_sim(first, second):
    return metrics.pairwise.cosine_similarity(first.reshape(1, -1), second.reshape(1, -1))[0][0]

In [22]:
path = '/Users/annanesterenko/Desktop/diplom_final'

In [23]:
sys.path.append(path)

### ARXIV

In [24]:
def evaluate_quality(embeddings_file, triples_file=f"{path}/data/ARXIV/arxiv_triplets.txt", func=cos_sim):
    all_triplets = 0
    covered_triplets = 0
    correct_triplets = 0
    
    embeddings = []
    id2tag = {}
    with codecs.open(embeddings_file) as f:
        for line in f:
            id_, embedding = line.split(' ', maxsplit=1)
            id2tag[id_[2:]] = len(embeddings)
            embeddings.append(embedding)

    with codecs.open(triples_file) as fin:
        for line in fin:
            id1, id2, id3 = map(lambda x: x.split('/pdf/')[-1], line.split())
            if id1 in id2tag and id2 in id2tag and id3 in id2tag:
                covered_triplets += 1
                t1, t2, t3 = id2tag[id1], id2tag[id2], id2tag[id3]
                query = np.array(list(map(float, embeddings[t1].split())))
                good_req = np.array(list(map(float, embeddings[t2].split())))
                bad_req = np.array(list(map(float, embeddings[t3].split())))
                correct_triplets += func(query, good_req) > cos_sim(query, bad_req)
            all_triplets += 1
    return 1.0 * correct_triplets / covered_triplets

In [5]:
evaluate_quality(f"{path}/data/ARXIV/embeddings_doc2vec.txt")

0.7862865072856873

In [67]:
evaluate_quality("data/embeddings_fasttext.txt")

0.8546900643858099

### MIND

In [25]:
from CapsE.prepare_3 import *

In [35]:
def evaluate_quality(lst_embeddings_query, 
                     lst_embeddings_doc, 
                     test_duplets,
                     test_val_duplets,
                     func=cos_sim):
    
    all_triplets = len(test_duplets)
    recall = 0
    recall1 = 0
    recall2 = 0
    recall3 = 0
    precision = 0
    
    ndcg3 = 0
    ndcg5 = 0
    ndcg10 = 0
    
    for i in range(len(test_duplets)):
        ranked = []
        query = lst_embeddings_query[test_duplets[i][0][0]] 

        relevant_cnt = 0
        for j in range(10):
            ans = lst_embeddings_doc[test_duplets[i][j][1]]
            cos_sim = func(query, ans)
            
            ranked.append([cos_sim, test_val_duplets[i][j][0]])
            relevant_cnt += test_val_duplets[i][j][0]
            
        ranked.sort(key=lambda x: x[0], reverse=True)
        ranked_relevance = [i[1] for i in ranked]
        
        precision += sum(ranked_relevance[:relevant_cnt]) / relevant_cnt
        recall += (sum(ranked_relevance[:relevant_cnt]) == relevant_cnt)
        recall1 += (ranked_relevance[0] == 1)
        recall2 += (sum(ranked_relevance[:2]) == 2)
        recall3 += (sum(ranked_relevance[:3]) == 3)
        
        #ndcg@3
        ndcg3 += sum([ranked_relevance[k-1] / np.log2(k+1) for k in range(1, 3)]) \
            / sum([1/np.log2(k+1) for k in range(1, 6)])
        #ndcg@5
        ndcg5 += sum([ranked_relevance[k-1] / np.log2(k+1) for k in range(1, 6)]) \
            / sum([1/np.log2(k+1) for k in range(1, 6)])
        #ndcg@10
        ndcg10 += sum([ranked_relevance[k-1] / np.log2(k+1) for k in range(1, 11)]) \
            / sum([1/np.log2(k+1) for k in range(1, 6)])
        
    return recall / all_triplets, precision / all_triplets, recall1 / all_triplets, \
            recall2 / all_triplets, recall3 / all_triplets, ndcg3 / all_triplets, \
            ndcg5 / all_triplets, ndcg10 / all_triplets,

In [27]:
lst_embeddings_query, lst_embeddings_doc, \
           train_duplets, test_duplets, \
           train_val_duplets, test_val_duplets = get_data_for_net_MIND(
                                        embeddings_file_train=f'{path}/data/MIND/embeddings_doc2vec_train.txt', 
                                        embeddings_file_test=f'{path}/data/MIND/embeddings_doc2vec_test.txt',
                                        behaviors_train=f'{path}/data/MIND/behaviors_train.tsv',
                                        behaviors_test=f'{path}/data/MIND/behaviors_test.tsv')

In [29]:
lst_embeddings_query.shape

(44908, 200)

In [28]:
test_duplets[0]

array([[34677, 11556],
       [34677,   918],
       [34677, 12260],
       [34677,  3151],
       [34677,  1299],
       [34677, 12518],
       [34677,  2180],
       [34677,  8609],
       [34677,  5096],
       [34677,  4491]])

In [41]:
train_val_duplets

array([[[1],
        [1],
        [1],
        ...,
        [0],
        [0],
        [0]],

       [[1],
        [1],
        [1],
        ...,
        [0],
        [0],
        [0]],

       [[1],
        [1],
        [1],
        ...,
        [0],
        [0],
        [0]],

       ...,

       [[1],
        [1],
        [1],
        ...,
        [0],
        [0],
        [0]],

       [[1],
        [1],
        [1],
        ...,
        [0],
        [0],
        [0]],

       [[1],
        [1],
        [1],
        ...,
        [0],
        [0],
        [0]]])

In [36]:
evaluate_quality(lst_embeddings_query, 
                     lst_embeddings_doc, 
                     test_duplets,
                     test_val_duplets)

(0.016166666666666666,
 0.4498440476190485,
 0.4633333333333333,
 0.201,
 0.08166666666666667,
 0.25621986901554084,
 0.44641964276891666,
 0.6332252710803599)

In [37]:
lst_embeddings_query, lst_embeddings_doc, \
           train_duplets, test_duplets, \
           train_val_duplets, test_val_duplets = get_data_for_net_MIND(
                                        embeddings_file_train=f'{path}/data/MIND/embeddings_fasttext_train.txt', 
                                        embeddings_file_test=f'{path}/data/MIND/embeddings_fasttext_test.txt',
                                        behaviors_train=f'{path}/data/MIND/behaviors_train.tsv',
                                        behaviors_test=f'{path}/data/MIND/behaviors_test.tsv')

In [38]:
evaluate_quality(lst_embeddings_query, 
                     lst_embeddings_doc, 
                     test_duplets,
                     test_val_duplets)

(0.019666666666666666,
 0.4446186507936511,
 0.4525,
 0.19566666666666666,
 0.08133333333333333,
 0.2512617158700025,
 0.4405059394615861,
 0.6299780387146978)

In [39]:
lst_embeddings_query, lst_embeddings_doc, \
           train_duplets, test_duplets, \
           train_val_duplets, test_val_duplets = get_data_for_net_MIND(
                                        embeddings_file_train=f'{path}/data/MIND/embeddings_BERT_train.txt', 
                                        embeddings_file_test=f'{path}/data/MIND/embeddings_BERT_test.txt',
                                        behaviors_train=f'{path}/data/MIND/behaviors_train.tsv',
                                        behaviors_test=f'{path}/data/MIND/behaviors_test.tsv', 
                                        SIZE=768)

In [40]:
evaluate_quality(lst_embeddings_query, 
                     lst_embeddings_doc, 
                     test_duplets,
                     test_val_duplets)

(0.024666666666666667,
 0.42286111111111174,
 0.4698333333333333,
 0.19716666666666666,
 0.08116666666666666,
 0.2509705554615659,
 0.4222677848005323,
 0.5938811566894348)