In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import ast
import os
import pickle
import shutil
import random

from scipy.spatial import distance

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords

In [None]:
# Load the pickled reference database and the pickled query samples.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df_database, df_sample = (
    pd.read_pickle(pkl_path)
    for pkl_path in ("./df_database.pkl", "./df_sample.pkl")
)

In [None]:
# Values of the 'source' column of each frame, in row order, so that position i
# of a list corresponds to row i of its frame.
base_sources = [*df_database['source']]
sample_sources = [*df_sample['source']]

In [None]:
# Number of leading characters of each CIM-10 code that are kept.
nb_letter = 3
# Database rows followed by sample rows, re-indexed from 0.
df_base_new = pd.concat([df_database, df_sample]).reset_index()
# Maps each document source to the truncated CIM-10 codes of its DAS entries.
# Alternative matching criteria (swap the filter below to use them):
#   - any DP or DAS code: drop the `startswith` filter
#   - DP codes only:      filter on `startswith('DP')`
dic_source_cim10 = {}
for source, raw_codes in zip(df_base_new['source'], df_base_new['list_cim10']):
    # 'list_cim10' is parsed from its literal string form; an entry is kept only
    # if it starts with 'DAS', then reduced to the first `nb_letter` characters
    # of the text following its last ':'.
    das_codes = {
        entry.split(':')[-1][:nb_letter]
        for entry in ast.literal_eval(raw_codes)
        if entry.startswith('DAS')
    }
    dic_source_cim10[source] = list(das_codes)

# doc2vec
Use doc2vec to find the top-k candidates in the database for each sample, then measure how many of these candidates share at least one DP/DAS code with the sample itself.

In [None]:
# Doc2Vec hyper-parameters used when training the models below.
dim = 300  # size of each document vector
dm = 0  # training algorithm: 0 selects DBOW, 1 selects DM
# French stop words from NLTK, stored as a set for fast membership tests.
stop_words = {*stopwords.words('french')}

In [None]:
# Preprocess sample docs: whitespace tokenization, then keep purely alphabetic
# tokens that are not French stop words, lower-cased.
# The stop-word lookup is done on the lower-cased token: NLTK's stop-word list
# is lower-case, so testing the raw token let capitalised stop words
# ("Le", "De", ...) slip through.
# `str(...)` matches the database preprocessing and keeps a non-string cell
# (e.g. NaN) from raising on `.split()`.
sample_docs = [
    [
        raw.lower()
        for raw in str(text).split()
        if raw.isalpha() and raw.lower() not in stop_words
    ]
    for text in df_sample['observation_blob']
]

In [None]:
# Train a Doc2Vec model on the sample documents; each document is tagged with
# its position in `sample_docs`.
tagged_data = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(sample_docs)
]
model = Doc2Vec(
    documents=tagged_data,
    vector_size=dim,
    dm=dm,
    window=5,
    min_count=2,
    epochs=100,
    workers=10,
)

In [None]:
# Infer one vector per sample document with the sample-trained model.
vector_samples = [model.infer_vector(tokens) for tokens in tqdm(sample_docs)]

In [None]:
# Preprocess database docs: whitespace tokenization, then keep purely alphabetic
# tokens that are not French stop words, lower-cased.
# The stop-word lookup is done on the lower-cased token: NLTK's stop-word list
# is lower-case, so testing the raw token let capitalised stop words
# ("Le", "De", ...) slip through.
# `str(...)` keeps a non-string cell (e.g. NaN) from raising on `.split()`.
base_docs = [
    [
        raw.lower()
        for raw in str(text).split()
        if raw.isalpha() and raw.lower() not in stop_words
    ]
    for text in df_database['observation_blob']
]

In [None]:
# Train a Doc2Vec model on the database documents; each document is tagged with
# its position in `base_docs`.
tagged_data_base = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(base_docs)
]
model_base = Doc2Vec(
    documents=tagged_data_base,
    vector_size=dim,
    dm=dm,
    window=5,
    min_count=2,
    epochs=100,
    workers=10,
)

In [None]:
# Infer one vector per database document.
# The vectors are inferred with `model` (the model that also produces the sample
# vectors) rather than `model_base`: two independently trained Doc2Vec models do
# not share an embedding space, so similarities between vectors coming from
# different models carry no meaning.
# NOTE(review): `model_base` is consequently no longer used for retrieval. A
# single model trained on the larger corpus and used for both inferences would
# likely give better vectors -- confirm which corpus should drive training.
vector_base = [model.infer_vector(tokens) for tokens in tqdm(base_docs)]

In [None]:
# For each sample, retrieve the `nb` database documents whose vectors have the
# highest dot product with the sample vector.
# NOTE(review): scores are raw dot products, not cosine similarities, so vector
# norms influence the ranking -- confirm this is intended.
nb = 100
vec_base = np.array(vector_base)
# torch.topk raises when k exceeds the number of scored items, so never ask for
# more candidates than there are database vectors.
top_k = min(nb, len(vec_base))
dic_sample_cands = {}
for i in tqdm(range(len(sample_sources))):
    vec_sample = np.array(vector_samples[i])
    # One score per database document.
    sim = np.matmul(vec_sample, vec_base.T)
    # Positions of the best-scoring documents, best first.
    idx = torch.topk(torch.tensor(sim), k=top_k, sorted=True).indices
    # Convert to plain ints before indexing the Python list.
    cands = [base_sources[x] for x in idx.tolist()]
    dic_sample_cands[sample_sources[i]] = cands

# Evaluation

In [None]:
# Per-sample score: share of retrieved candidates that have at least one
# truncated CIM-10 code in common with the sample.
accs = []
for sample, cands in tqdm(dic_sample_cands.items()):
    sample_codes = set(dic_source_cim10[sample])
    hit = sum(
        1
        for cand in cands
        if not sample_codes.isdisjoint(dic_source_cim10[cand])
    )
    accs.append(hit / len(cands))

In [None]:
# Average of the per-sample scores over all samples.
np.asarray(accs).mean()