In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import ast
import os
import pickle
import shutil
import random

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the complete (already document-filtered) database from CSV and
# preview its first rows.
df_base = pd.read_csv(filepath_or_buffer='../resources/database_doc_type.csv')
df_base.head()

In [None]:
# Keep only the rows whose doc_type is exactly 'CR:CRH-HOSPI',
# then show how many rows remain.
# base 3
df_base = df_base.loc[df_base['doc_type'].eq('CR:CRH-HOSPI')]
df_base.shape[0]

In [None]:
# Build the cleaned table in one chain, without in-place mutation:
#   1. keep the first row for every distinct 'observation_blob',
#   2. drop rows whose 'observation_blob' is missing,
#   3. discard the old row labels and the 'Unnamed: 0' column,
#   4. expose the fresh 0..n-1 row number as a 'source' column, which the
#      rest of the notebook uses as the document id.
df_base_new = (
    df_base
    .drop_duplicates(subset=['observation_blob'])
    .dropna(subset=['observation_blob'])
    .drop(columns=['Unnamed: 0'])
    .reset_index(drop=True)   # old labels are not needed, renumber from 0
    .reset_index()            # materialise the new row number as a column
    .rename(columns={'index': 'source'})
)
df_base_new

In [None]:
# The cleaned table is too large to work with directly: draw a fixed-seed
# random subset of 10000 documents (without replacement) as a smaller database.
df_base_small = df_base_new.sample(
    n=10000,
    random_state=22,
    replace=False,
)

In [None]:
# Draw 100 query samples from the small database (fixed seed, no replacement)
# and display them.
df_sample = df_base_small.sample(
    n=100,
    random_state=2022,
    replace=False,
)
df_sample

In [None]:
# The remaining documents form the retrieval database.
# df_sample was drawn from df_base_small with .sample(), which preserves row
# labels, so removing the sampled rows by index label yields exactly
# "df_base_small minus df_sample" in the original row order, without having
# to compare every column of every row.
df_database = df_base_small.drop(index=df_sample.index)
df_database

In [None]:
# Persist both splits (retrieval database and query samples) as pickles
# in the current working directory.
for frame, pkl_path in (
    (df_database, "./df_database.pkl"),
    (df_sample, "./df_sample.pkl"),
):
    frame.to_pickle(pkl_path)

In [None]:
# Document ids ('source') of the query samples and of the database,
# each kept in the row order of its frame.
sample_sources = df_sample['source'].tolist()
base_sources = df_database['source'].tolist()

In [None]:
# Number of leading characters of each cim10 code that are compared.
nb_letter = 3

# Map every document id ('source') to the de-duplicated list of its truncated
# DP codes. 'list_cim10' is parsed with ast.literal_eval, i.e. each cell holds
# the string form of a Python list; its entries are filtered on their 'DP'
# prefix and the part after the last ':' is kept (presumably entries look like
# '<DP|DAS>:<code>' -- confirm against the data).
#
# Alternative matching criteria:
#   - at least one same DP/DAS: remove the `if code.startswith('DP')` filter
#   - at least one same DAS:    filter on code.startswith('DAS') instead
#
# Iterating over the two columns directly avoids one label-based scalar
# lookup per cell and does not depend on the frame having a 0..n-1 index.
dic_source_cim10 = {}
for source, raw_codes in zip(df_base_new['source'], df_base_new['list_cim10']):
    codes = ast.literal_eval(raw_codes)
    dic_source_cim10[source] = list({
        code.split(':')[-1][:nb_letter]
        for code in codes
        if code.startswith('DP')  # at least one same DP
    })

# TF-IDF
Use TF-IDF to retrieve the top-k candidates from the database for each sample, then measure how many of these candidates share at least one DP/DAS code with the sample itself.

In [None]:
# Fit one TF-IDF model on samples + database together so that both sets are
# embedded with the same vocabulary. Sample texts come first in the corpus,
# so the first n_sample rows of the matrix belong to the samples.
n_sample = len(df_sample)
corpus = df_sample['observation_blob'].tolist() + df_database['observation_blob'].tolist()

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# NOTE: toarray() materialises the whole document-term matrix as a dense
# array, which can be very memory hungry for a large vocabulary.
vectors = X.toarray()
vector_sample, vector_base = vectors[:n_sample], vectors[n_sample:]

In [None]:
# For each sample, retrieve the `nb` most similar documents of the database.
nb = 100

# Dot-product similarity between every sample row and every database row
# (presumably equal to cosine similarity, since TfidfVectorizer L2-normalises
# rows by default -- confirm if the vectorizer settings change).
sim = np.matmul(vector_sample, vector_base.T)

# torch.topk raises when k exceeds the size of the searched dimension, so
# never ask for more candidates than there are database documents.
top_k = min(nb, sim.shape[1])
candidates = torch.topk(torch.tensor(sim), k=top_k, dim=1, sorted=True).indices

# Translate column positions into document ids; .tolist() yields plain Python
# ints, which index the base_sources list directly.
pred_cands = [
    [base_sources[col] for col in ranked_cols]
    for ranked_cols in candidates.tolist()
]

In [None]:
# Associate every sample id with its ranked list of candidate ids
# (sample_sources and pred_cands are aligned position by position).
dic_sample_cands = dict(zip(sample_sources, pred_cands))

# Evaluation

In [None]:
# Per-sample hit rate: the fraction of retrieved candidates that share at
# least one (truncated) cim10 code with the sample.
accs = []
for sample_source, cand_sources in tqdm(dic_sample_cands.items()):
    wanted_codes = dic_source_cim10[sample_source]
    n_hits = sum(
        1
        for cand_source in cand_sources
        if any(code in wanted_codes for code in dic_source_cim10[cand_source])
    )
    accs.append(n_hits / len(cand_sources))

In [None]:
# Average of the per-sample hit rates.
np.asarray(accs).mean()