In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import ast
import os
import pickle
import shutil
import random
import glob

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the document database and move its existing index into a regular
# column, leaving a fresh 0..n-1 index.
# NOTE: unpickling can execute arbitrary code; only read trusted files.
df_database = (
    pd.read_pickle("/export/home/cse200093/Expe_doc2vec/df_database.pkl")
    .reset_index()
)

In [None]:
# Map each document identifier ('source') to its text ('observation_blob').
# Zipping the two columns replaces the per-row `.loc` scalar lookups, which
# are very slow on large frames. As before, a repeated 'source' keeps the
# text of its last occurrence.
dic_doc_text = dict(zip(df_database['source'], df_database['observation_blob']))

In [None]:
# Read samples in brat_data/CRH_VAL_PHENO.
# NOTE: os.chdir changes the kernel's working directory for every later cell;
# it is what makes the relative '*.ann' pattern below resolve inside
# sample_path.
sample_path = '/export/home/cse200093/brat_data/CRH_VAL_PHENO'
os.chdir(sample_path)
# The pattern is relative, so my_files holds bare file names (no directory).
my_files = glob.glob('*.ann')
# Bare last expression: the notebook displays how many .ann files were found.
len(my_files)

In [None]:
# Parse every .ann file into one table with a row per annotation line.
# Lines are tab-separated: the label is the first space-delimited token of
# the second field, and the term is the last field.
sources = []
terms = []
labels = []
for source in my_files:
    # The context manager closes the handle; the bare open() used previously
    # leaked one file descriptor per annotation file.
    with open(os.path.join(sample_path, source), 'r') as ann_file:
        for line in ann_file:
            fields = line.split('\t')
            if len(fields) < 2:
                # Blank or tab-less line: there is no label field, and
                # indexing fields[1] would raise IndexError.
                continue
            sources.append(source)
            terms.append(fields[-1].rstrip('\n'))
            labels.append(fields[1].split(' ')[0])

# Build the frame in one go instead of assigning columns onto an empty frame.
df_sample = pd.DataFrame(
    {'source': sources, 'term': terms, 'label': labels},
    columns=['source', 'term', 'label'],
)

In [None]:
# Save df_sample (columns: source, term, label) to disk as a pickle so that
# it can be reloaded without re-parsing the annotation files.
df_sample.to_pickle('/export/home/cse200093/Expe_doc2vec/df_sample_expe1.pkl')

In [None]:
labels_to_keep = ['PINS_Sclerodermie','osteoporose','nephro_lupus','pneumopathie_infectieuse']

# Sample documents (annotation file names) for each label of interest.
# Every kept label gets an entry up front, so a label that never appears in
# df_sample maps to an empty list instead of raising KeyError further down.
dic_label_sample = {label: [] for label in labels_to_keep}
for label, source in zip(df_sample['label'], df_sample['source']):
    # Dict membership is O(1); no need to rebuild a list of the keys per row.
    if label in dic_label_sample:
        dic_label_sample[label].append(source)

# A file usually carries several annotations with the same label: keep each
# file once. dict.fromkeys preserves first-seen order, so the result is
# reproducible from run to run (the order of list(set(...)) is not).
for label in labels_to_keep:
    dic_label_sample[label] = list(dict.fromkeys(dic_label_sample[label]))

In [None]:
# Load the raw text that accompanies each annotation file, keyed by the
# .ann file name.
dic_doc_text_sample = {}
for source in my_files:
    # Swap only the extension. str.replace('ann', 'txt') would also rewrite
    # any other "ann" in the file name (e.g. 'annexe_1.ann' -> 'txtexe_1.txt').
    txt_name = os.path.splitext(source)[0] + '.txt'
    # The context manager closes the handle once the text has been read.
    with open(os.path.join(sample_path, txt_name), 'r') as txt_file:
        dic_doc_text_sample[source] = txt_file.read()

# Keyword search (baseline for comparison)

In [None]:
# Keyword lists used by the keyword-search baseline.
#
# nephro_lupus: either one phrase from `nephro_lupus`, or one keyword from
# `nephro` together with one keyword from `lupus`.
# Spelling fix: 'néphrotpathie' -> 'néphropathie' (the misspelled form could
# never match correctly spelled text).
nephro_lupus = [
    'néphropathie lupique', 'Néphropathie lupique',
    'glomérulonéphrite lupique', 'Glomérulonéphrite lupique',
    "Lupus avec atteinte rénale", "lupus avec atteinte rénale",
    "atteinte rénale du lupus", "Atteinte rénale du lupus",
    'Insuffisance rénale secondaire au lupus',
    'insuffisance rénale secondaire au lupus',
    'Glomérulopathie lupique', 'glomérulopathie lupique',
    'gn lupique', 'GN lupique',
    'atteinte rénale classe IV', 'Atteinte rénale classe IV',
    'atteinte rénale classe V', 'Atteinte rénale classe V',
    'Atteinte rénale classe III', 'Atteinte rénale classe VI',
    'atteinte rénale classe III', 'atteinte rénale classe VI',
    'glomérulonéphrite extra membraneuse classe V',
    'Glomérulonéphrite extra membraneuse classe V',
]
# Spelling fix: 'Glométulonéphrite' -> 'Glomérulonéphrite'.
nephro = [
    'glomérulonéphrite', 'Glomérulonéphrite',
    'insuffisance rénale chronique', 'Insuffisance rénale chronique',
    'Maladie rénale chronique', 'maladie rénale chronique',
    'GEM', 'HSF',
    'hyalinose segmentaire et focale', 'Hyalinose segmentaire et focale',
    'atteinte rénale', 'Atteinte rénale',
]
lupus = ['lupus', 'Lupus']

osteoporose = ['ostéoporose', 'Ostéoporose', 'ostéoporotique']

pneumopathie_infectieuse = [
    "pneumopathie d'inhalation", "Pneumopathie d'inhalation",
    'pneumopathie à', 'Pneumopathie à',
    'légionellose', 'Légionellose',
    'infection pulmonaire', 'Infection pulmonaire',
    'Pneumopathie infectieuse', 'pneumopathie infectieuse',
    'pneumopathie acquise sous ventilation mécanique',
    'Pneumopathie acquise sous ventilation mécanique',
    'PAVM',
    'pneumonie', 'Pneumonie',
    'pneumopathie bilatérale', 'Pneumopathie bilatérale',
    'pneumopathie basale', "Pneumopathie basale",
    'pneumopathie bi-basale', 'Pneumopathie bi-basale',
    'pneumopathie de la base', 'Pneumopathie de la base',
    'PFLA',
    'pneumopathie franche lobaire aiguë', 'Pneumopathie franche lobaire aiguë',
    'pneumopathie communautaire', 'Pneumopathie communautaire',
    'pneumopathie aiguë',
    'Pneumopathie documentée', 'pneumopathie documentée',
    'pneumopathie acquise sous ventilation mécanique',
    'sepsis à point de départ pulmonaire', 'Sepsis à point de départ pulmonaire',
    'choc septique à point de départ pulmonaire',
    'Choc septique à point de départ pulmonaire',
    'pneumopathie lobaire supérieure', 'Pneumopathie lobaire supérieure',
    'pneumopathie nécrosante', 'Pneumopathie nécrosante',
    'Broncho-pneumopathie', 'broncho-pneumopathie',
    'bronchopneumopathie', "Bronchopneumopathie",
]

# PINS_Sclerodermie: one keyword from `PINS` together with one from `scl`.
PINS = [
    'atteinte pulmonaire interstitielle', 'pneumopathie interstitielle',
    'syndrome interstitiel', 'atteinte pulmonaire', 'PID', 'PINS',
    'fibrose pulmonaire', 'fibrose interstitielle', 'FID',
    'atteinte interstitielle', 'pneumopathie fibrosante',
]
scl = [
    'sclérodermie systémique', 'Sclérodermie', 'Scc',
    'sclérodermie cutanée diffuse', 'sclérodermie cutanée limitée',
    'syndrome CREST', 'CREST', 'sclerodermie',
]

In [None]:
# Keyword groups per pathology label. The position of a group in the list
# matters: it is how the matching code tells the groups apart.
dic_type_kw = dict(
    nephro_lupus=[nephro_lupus, nephro, lupus],
    osteoporose=[osteoporose],
    pneumopathie_infectieuse=[pneumopathie_infectieuse],
    PINS_Sclerodermie=[PINS, scl],
)

In [None]:
def _keyword_in_text(word, raw_text, lowered_text):
    """Return True if the keyword `word` occurs in the document."""
    # All-caps keywords are acronyms ('GEM', 'PID', ...). They are matched
    # case-sensitively against the untouched text so that they do not fire
    # inside ordinary lowercase words.
    if word.isupper():
        return word in raw_text
    # Any other keyword is matched case-insensitively.
    return word.lower() in lowered_text


def jugde_expected(type_name, text):
    """Keyword-search baseline: decide whether `text` is expected for a label.

    (The misspelled function name is kept so that existing callers still work.)

    Parameters
    ----------
    type_name : str
        A key of the module-level `dic_type_kw` dictionary.
    text : object
        Document content; it is converted with `str()` before matching.

    Returns
    -------
    bool
        * 'nephro_lupus': True if any keyword of group 0 is found, or if a
          keyword of group 1 and a keyword of group 2 are both found.
        * 'PINS_Sclerodermie': True if a keyword of group 0 and a keyword of
          group 1 are both found.
        * any other label: True if any keyword of group 0 is found.

    Raises
    ------
    KeyError
        If `type_name` is not a key of `dic_type_kw`.

    Notes
    -----
    Two defects of the previous version are fixed here:

    * the two-group branch tested the name 'PINS_scl', which is not a key of
      `dic_type_kw`, so 'PINS_Sclerodermie' fell through to the single-group
      rule and never required a sclerodermie keyword;
    * the text was lowercased while keywords kept their capitals, so every
      capitalised or acronym keyword could never match.
    """
    raw_text = str(text)
    lowered_text = raw_text.lower()
    groups = dic_type_kw[type_name]

    def any_hit(words):
        return any(_keyword_in_text(word, raw_text, lowered_text) for word in words)

    if type_name == 'nephro_lupus':
        return any_hit(groups[0]) or (any_hit(groups[1]) and any_hit(groups[2]))
    if type_name == 'PINS_Sclerodermie':
        return any_hit(groups[0]) and any_hit(groups[1])
    return any_hit(groups[0])

In [None]:
# Keyword-search baseline: for every label, collect the database documents
# (by source) that the keyword rules accept.
dic_type_expected_doc = {}
for type_name in dic_type_kw:
    expected_docs = [
        doc_source
        for doc_source, doc_text in dic_doc_text.items()
        if jugde_expected(type_name, doc_text)
    ]
    dic_type_expected_doc[type_name] = expected_docs
    print('nb of expected docs for ',type_name,':',len(expected_docs))

# TF-IDF
Use TF-IDF to find the top-k candidates for each sample and compare them with the results of the keyword search.

In [None]:
# Fit one TF-IDF model on the sample documents followed by the database
# documents, so that both sets share the same feature space.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([*dic_doc_text_sample.values(), *dic_doc_text.values()])
# NOTE: toarray() turns the matrix into a dense array of shape
# (n_documents, vocabulary size).
vectors = X.toarray()
# Rows follow the input order: the first len(dic_doc_text_sample) rows are
# the samples, the remaining rows are the database documents.
vector_sample, vector_base = np.split(vectors, [len(dic_doc_text_sample)])

In [None]:
# For each sample, find the `nb` most similar database documents.
nb = 100

# Dot product between every sample row and every database row: one row per
# sample, one column per database document. (If the rows are L2-normalised,
# which is TfidfVectorizer's default, this is the cosine similarity.)
sim = np.matmul(vector_sample, vector_base.T)
# torch.topk raises when k exceeds the size of the dimension, so cap k at
# the number of database documents.
top_k = min(nb, sim.shape[1])
candidates = torch.topk(torch.tensor(sim), k=top_k, dim=1, sorted=True).indices
# Build the key list once. Rebuilding list(dic_doc_text.keys()) for every
# single candidate index made this step needlessly quadratic.
base_sources = list(dic_doc_text.keys())
# .tolist() yields plain Python ints, ordered from most to least similar.
pred_cands = [[base_sources[idx] for idx in row] for row in candidates.tolist()]

In [None]:
# Sample file name -> its ranked candidate sources. pred_cands rows are in
# the iteration order of dic_doc_text_sample, so the keys can be zipped with
# them directly instead of rebuilding the key list twice per iteration.
dic_sample_cands = dict(zip(dic_doc_text_sample.keys(), pred_cands))

# Evaluation

In [None]:
# For every label, average over its samples the fraction of retrieved
# candidates that the keyword search also accepts for that label
# (i.e. a precision-at-k measured against the keyword baseline).
for label in labels_to_keep:
    # A label without any annotated sample yields an empty list, not KeyError.
    samples_label = dic_label_sample.get(label, [])
    # Set membership is O(1); the list was scanned once per candidate before.
    expected_docs = set(dic_type_expected_doc[label])
    accs = []
    for sample in samples_label:
        cands = dic_sample_cands[sample]
        if not cands:
            # No candidate retrieved: the ratio is undefined, skip the sample.
            continue
        hits = sum(1 for cand in cands if cand in expected_docs)
        accs.append(hits / len(cands))
    # np.mean([]) would emit a RuntimeWarning; report nan explicitly instead.
    mean_acc = np.mean(accs) if accs else float('nan')
    print('average accuracy for label ',label,': ',mean_acc)