In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import ast
import os
import pickle
import shutil
import random

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Read the database and the samples.
# Later cells use the 'source', 'list_cim10' and 'observation_blob' columns of
# both frames.
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects - only load
# files that come from a trusted source.
df_database = pd.read_pickle("./df_database.pkl")
df_sample = pd.read_pickle("./df_sample.pkl")

In [None]:
# Source identifiers in row order: position i of each list corresponds to
# row i of the frame it was taken from.
base_sources = df_database['source'].tolist()
sample_sources = df_sample['source'].tolist()

In [None]:
# Number of leading characters of each cim10 code kept for comparison.
nb_letter = 3
# Tags selecting which entries of list_cim10 are kept; they are matched against
# the start of each entry with str.startswith:
#   ('DAS',) -> at least one same DAS (the setting used so far)
#   ('DP',)  -> at least one same DP
#   None     -> keep every entry (at least one same DP/DAS)
cim10_kinds = ('DAS',)
# Dict whose keys are sources and whose values are the cim10 codes of that doc.
dic_source_cim10 = {}
df_base_new = pd.concat([df_database,df_sample]).reset_index()
# Walk the two columns in parallel rather than doing one .loc lookup per cell.
for source, raw_codes in zip(df_base_new['source'], df_base_new['list_cim10']):
    # list_cim10 stores the string form of a Python list; parse it back.
    codes = ast.literal_eval(raw_codes)
    # For every selected entry keep what follows its last ':', cut to
    # nb_letter characters, and de-duplicate.
    dic_source_cim10[source] = list({
        code.split(':')[-1][:nb_letter]
        for code in codes
        if cim10_kinds is None or code.startswith(cim10_kinds)
    })

# BERT
Use BERT to find the top-k candidates in the database for each sample, then check how many of these candidates share at least one DP/DAS code with the sample itself.

In [None]:
# Local directory the sentence encoder is loaded from.
modelPath = "../camembert-large"
# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer(modelPath, device=device)

In [None]:
# Input sequence length:
# A common value for BERT & Co. is 512 word pieces, which corresponds to
# about 300-400 words (for English).
# Texts longer than this are truncated to the first 512 word pieces.
model.max_seq_length = 512

In [None]:
# Encode all sample texts in one batched call instead of one call per text,
# so the model processes several texts per forward pass. encode() shows its
# own progress bar. list() keeps the result a list of 1-D vectors, one per
# row of df_sample, in row order.
sample_embeddings = list(
    model.encode(
        list(df_sample['observation_blob']),
        batch_size=32,
        show_progress_bar=True,
    )
)

In [None]:
# Encode all database texts in one batched call instead of one call per text,
# so the model processes several texts per forward pass. encode() shows its
# own progress bar. list() keeps the result a list of 1-D vectors, one per
# row of df_database, in row order.
base_embeddings = list(
    model.encode(
        list(df_database['observation_blob']),
        batch_size=32,
        show_progress_bar=True,
    )
)

In [None]:
# For each sample, find the top nb candidates from the database.
nb = 100

# Pairwise scores: row i holds the dot product of sample i with every
# database document.
# NOTE(review): no normalisation is applied in this notebook, so unless the
# model normalises its outputs this is a raw dot product rather than a cosine
# similarity - confirm that is intended.
sim = np.matmul(np.array(sample_embeddings),np.array(base_embeddings).T)
# torch.topk raises when k is larger than the ranked dimension, so never ask
# for more candidates than there are database documents.
top_k = min(nb, sim.shape[1])
candidates = torch.topk(torch.tensor(sim), k=top_k, dim=1, sorted=True).indices
# Convert the index tensor to plain ints once, then translate each index into
# the source of the matching database row (best match first).
pred_cands = [[base_sources[idx] for idx in row] for row in candidates.tolist()]

In [None]:
# Map every sample source to its ranked list of candidate sources.
dic_sample_cands = {
    source: pred_cands[position]
    for position, source in enumerate(sample_sources)
}

# Evaluation

In [None]:
# Per-sample hit rate: the share of a sample's retrieved candidates that have
# at least one cim10 code in common with the sample.
accs = []
for sample, cands in tqdm(dic_sample_cands.items()):
    wanted_codes = dic_source_cim10[sample]
    # A candidate is a hit as soon as one of its codes appears among the
    # sample's codes.
    n_hits = sum(
        1
        for cand in cands
        if any(code in wanted_codes for code in dic_source_cim10[cand])
    )
    accs.append(n_hits / len(cands))

In [None]:
# Average over all samples of the per-sample hit rates collected in accs.
np.mean(accs)