In [None]:
import pandas as pd
import numpy as np
import tqdm
import torch
import ast
import os
import pickle
import shutil
import random

from nltk.corpus import stopwords

In [None]:
# Load the complete document table (the CSV written after the document-filtering step).
database_csv_path = '../resources/database_doc_type.csv'
df_base = pd.read_csv(database_csv_path)
# preview only the first rows
df_base.head()

In [None]:
# Deduplicate documents on their text and give every remaining document an
# integer id 0..n-1, stored in a leading 'source' column.
# Built as a single chain that returns a new frame, instead of several
# inplace=True calls on the result of drop_duplicates: the cell can be re-run
# safely and does not mutate a frame derived from df_base.
df_base_new = (
    df_base
    .drop_duplicates(subset=['observation_blob'])
    # 'Unnamed: 0' is presumably the index column saved with the CSV;
    # errors='ignore' keeps the cell working when that column is absent.
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .reset_index(drop=True)   # discard the row labels from before deduplication
    .reset_index()            # turn the new 0..n-1 labels into an 'index' column
    .rename(columns={'index': 'source'})
)
# show a preview rather than the whole frame
df_base_new.head()

In [None]:
# set the nb of letters to consider for cim10 code
# (every code read from the 'list_cim10' column is truncated to its first
# `nb_letter` characters when the per-document and per-patient dicts are built)
nb_letter = 4

In [None]:
# Per-document lookup tables, all keyed by the document id of the 'source' column:
#   dic_doc_patient : source -> patient_num
#   dic_doc_text    : source -> normalised text of the doc
#   dic_doc_cim10   : source -> cim10s of the doc, each truncated to its first
#                     `nb_letter` characters
dic_doc_patient = {}
dic_doc_text = {}
dic_doc_cim10 = {}
# Walk the four columns in lockstep instead of doing four df.loc[i, col] lookups
# per row: faster, and no longer dependent on the index labels being exactly
# 0..n-1. .to_numpy() keeps the scalar types that .loc returned.
doc_rows = zip(
    df_base_new['source'].to_numpy(),
    df_base_new['patient_num'].to_numpy(),
    df_base_new['observation_blob'].to_numpy(),
    df_base_new['list_cim10'].to_numpy(),
)
for source, patient, text, raw_cim10 in tqdm.tqdm(doc_rows, total=len(df_base_new)):
    dic_doc_patient[source] = patient
    # put docs in lower case and replace '-',\n,\t by space ' '
    # (str() also turns a missing text into the string 'nan')
    dic_doc_text[source] = str(text).lower().replace('-',' ').replace('\n',' ').replace('\t',' ')
    # 'list_cim10' holds the repr of a Python list of strings; keep what follows
    # the last ':' of each entry, truncated to nb_letter characters
    dic_doc_cim10[source] = [x.split(':')[-1][:nb_letter] for x in ast.literal_eval(raw_cim10)]

In [None]:
# dic where keys are patient_nums and values are the union of the cim10s found
# in all rows of this patient in df_base (each code truncated to its first
# `nb_letter` characters).
# Patients are taken from df_base_new and codes from df_base, as before; a
# single pass over df_base replaces the previous full-frame filter per patient.
patient_code_sets = {patient: set() for patient in set(df_base_new['patient_num'])}
patient_rows = zip(df_base['patient_num'], df_base['list_cim10'])
for patient, raw_cim10 in tqdm.tqdm(patient_rows, total=len(df_base)):
    if patient not in patient_code_sets:
        # patient_num absent from df_base_new: not part of the result
        continue
    patient_code_sets[patient].update(
        x.split(':')[-1][:nb_letter] for x in ast.literal_eval(raw_cim10)
    )
# same shape as before: patient_num -> list of distinct codes (order not significant)
dic_patient_cim10 = {patient: list(codes) for patient, codes in patient_code_sets.items()}

# Keyword extraction (baseline to compare against)

In [None]:
# Keyword lists used as the baseline for each type. They are matched by plain
# substring search (`word in text`) against the normalised document texts.
# Document texts are lower-cased and have '-', '\n', '\t' replaced by spaces
# before matching, so a keyword containing one of those characters only
# matches if it is normalised the same way.
# NOTE(review): because matching is by substring, short keywords such as
# 'GEM', 'PID', 'FID' or 'Scc' also match inside longer words once lower-cased.

# nephro_lupus or nephro+lupus
# (a doc is expected when it contains one nephro_lupus keyword, or one nephro
# keyword together with one lupus keyword)
nephro_lupus = ['néphropathie lupique', 'Néphropathie lupique', 'glomérulonéphrite lupique', 'Glomérulonéphrite lupique',"Lupus avec atteinte rénale", "lupus avec atteinte rénale", 
                "atteinte rénale du lupus", "Atteinte rénale du lupus", 'Insuffisance rénale secondaire au lupus',
                'insuffisance rénale secondaire au lupus','Glomérulopathie lupique', 'glomérulopathie lupique',
                'gn lupique', 'GN lupique', 'atteinte rénale classe IV','Atteinte rénale classe IV', 'atteinte rénale classe V',
                'Atteinte rénale classe V', 'Atteinte rénale classe III', 'Atteinte rénale classe VI',
                'atteinte rénale classe III', 'atteinte rénale classe VI', 'glomérulonéphrite extra membraneuse classe V',
               'Glomérulonéphrite extra membraneuse classe V']
# NOTE(review): 'Glométulonéphrite' looks like a typo for 'Glomérulonéphrite';
# the correctly spelled lower-case form is already in the list.
nephro = ['glomérulonéphrite','Glométulonéphrite', 'insuffisance rénale chronique', 'Insuffisance rénale chronique', 'Maladie rénale chronique', 'maladie rénale chronique', 'GEM', 'HSF', 'hyalinose segmentaire et focale','Hyalinose segmentaire et focale', 'atteinte rénale', 'Atteinte rénale']
lupus = ['lupus', 'Lupus']

osteoporose = ['ostéoporose', 'Ostéoporose', 'ostéoporotique']

pneumopathie_infectieuse = ["pneumopathie d'inhalation","Pneumopathie d'inhalation" ,'pneumopathie à', 'Pneumopathie à','légionellose', 'Légionellose', 'infection pulmonaire','Infection pulmonaire', 'Pneumopathie infectieuse', 'pneumopathie infectieuse', 'pneumopathie acquise sous ventilation mécanique','Pneumopathie acquise sous ventilation mécanique','PAVM', 'pneumonie','Pneumonie', 'pneumopathie bilatérale','Pneumopathie bilatérale', 'pneumopathie basale',"Pneumopathie basale", 'pneumopathie bi-basale', 'Pneumopathie bi-basale', 'pneumopathie de la base', 'Pneumopathie de la base', 'PFLA', 'pneumopathie franche lobaire aiguë', 'Pneumopathie franche lobaire aiguë','pneumopathie communautaire','Pneumopathie communautaire','pneumopathie aiguë','Pneumopathie documentée', 'pneumopathie documentée', 'pneumopathie acquise sous ventilation mécanique', 'sepsis à point de départ pulmonaire','Sepsis à point de départ pulmonaire', 'choc septique à point de départ pulmonaire','Choc septique à point de départ pulmonaire', 'pneumopathie lobaire supérieure','Pneumopathie lobaire supérieure','pneumopathie nécrosante', 'Pneumopathie nécrosante','Broncho-pneumopathie','broncho-pneumopathie', 'bronchopneumopathie', "Bronchopneumopathie"]

# PINS+scl
# (a doc is expected only when it contains one PINS keyword AND one scl keyword)
PINS = ['atteinte pulmonaire interstitielle', 'pneumopathie interstitielle', 'syndrome interstitiel', 'atteinte pulmonaire', 'PID', 'PINS', 'fibrose pulmonaire', 'fibrose interstitielle', 'FID', 'atteinte interstitielle', 'pneumopathie fibrosante']
scl = ['sclérodermie systémique','Sclérodermie', 'Scc', 'sclérodermie cutanée diffuse', 'sclérodermie cutanée limitée', 'syndrome CREST', 'CREST', 'sclerodermie']


Hypertension_pulmonaire = ['Hypertension pulmonaire', 'Hypertension artérielle pulmonaire', "hypertension pulmonaire", "HTAP", "hypertension artérielle pulmonaire", "PAPs élevée"]

# The acronyms below are padded with spaces / newlines so that they are only
# matched as separate words.
RGO = [" RGO ", "\nRGO ", " RGO\n", "reflux gastro oesophagien", "Reflux gastrique", "reflux gastro-oesophagien", "reflux oesophagien", 'reflux gastrique', 'Reflux gastro-oesophagien', 'Reflux oesophagien']

SGS = [" SGS ","\nSGS "," SGS\n" , "Syndrome de Gougerot Sjogren", "Gougerot", "Sjögren", "sjogren",'gougerot', "Syndrome de Gougerot Sjögren"]

EP = [" EP ", "\nEP ", " EP\n", "Embolie pulmonaire", 'embolie pulmonaire']

In [None]:
# Normalise every keyword exactly like the document texts stored in
# dic_doc_text: lower case, with '-', '\n' and '\t' replaced by a space.
# Lower-casing alone left keywords such as 'pneumopathie bi-basale',
# 'broncho-pneumopathie' or '\nrgo ' unable to match, because the texts they
# are searched in no longer contain '-' or '\n'.
def _normalize_keywords(keywords):
    """Return the keywords lower-cased, with '-', newline and tab replaced by spaces."""
    return [kw.lower().replace('-',' ').replace('\n',' ').replace('\t',' ') for kw in keywords]

nephro_lupus = _normalize_keywords(nephro_lupus)
nephro = _normalize_keywords(nephro)
lupus = _normalize_keywords(lupus)
osteoporose = _normalize_keywords(osteoporose)
pneumopathie_infectieuse = _normalize_keywords(pneumopathie_infectieuse)
PINS = _normalize_keywords(PINS)
scl = _normalize_keywords(scl)
Hypertension_pulmonaire = _normalize_keywords(Hypertension_pulmonaire)
RGO = _normalize_keywords(RGO)
SGS = _normalize_keywords(SGS)
EP = _normalize_keywords(EP)

In [None]:
# type name -> list of keyword groups for that type.
# Which groups are read, and how they are combined, is decided in jugde_expected.
dic_type_kw = dict([
    ('nephro_lupus', [nephro_lupus, nephro, lupus]),
    ('osteoporose', [osteoporose]),
    ('pneumopathie_infectieuse', [pneumopathie_infectieuse]),
    ('PINS_scl', [PINS, scl]),
    ('Hypertension_pulmonaire', [Hypertension_pulmonaire]),
    ('RGO', [RGO]),
    ('SGS', [SGS]),
    ('EP', [EP]),
])

In [None]:
def jugde_expected(type_name, text):
    """Tell whether `text` matches the keyword rule of `type_name`.

    The text is converted with str() and lower-cased, then the keyword groups
    of dic_type_kw[type_name] are searched as plain substrings:

    - 'nephro_lupus': one keyword of group 0, or one keyword of group 1
      together with one keyword of group 2;
    - 'PINS_scl': one keyword of group 0 and one keyword of group 1;
    - any other type: one keyword of group 0 (further groups are not read).

    Returns True or False. An unknown `type_name` raises KeyError.
    """
    haystack = str(text).lower()
    groups = dic_type_kw[type_name]

    def contains_any(keywords):
        # True as soon as one keyword occurs somewhere in the text
        return any(keyword in haystack for keyword in keywords)

    if type_name == 'nephro_lupus':
        return contains_any(groups[0]) or (contains_any(groups[1]) and contains_any(groups[2]))
    if type_name == 'PINS_scl':
        return contains_any(groups[0]) and contains_any(groups[1])
    return contains_any(groups[0])

In [None]:
# Keyword baseline: for every type, the ids ('source') of the docs whose text
# satisfies the keyword rule of that type.
dic_type_expected_doc = {}
for type_name in tqdm.tqdm(dic_type_kw):
    matching_sources = [
        doc_id
        for doc_id, doc_text in dic_doc_text.items()
        if jugde_expected(type_name, doc_text)
    ]
    dic_type_expected_doc[type_name] = matching_sources
    print('nb of expected docs for ',type_name,':',len(matching_sources))

In [None]:
# Keyword baseline at patient level: for every type, the distinct patient_nums
# owning at least one expected doc.
dic_type_expected_patients = {}
for type_name, doc_ids in dic_type_expected_doc.items():
    patient_set = {dic_doc_patient[doc_id] for doc_id in doc_ids}
    dic_type_expected_patients[type_name] = list(patient_set)
    print('nb of expected patients for ',type_name,':',len(dic_type_expected_patients[type_name]))

# Word search

In [None]:
# use CODER to find synonyms of all queries
# (the synonyms are read back from a pickle; as used by the word search,
# dic_type_query[type_name] is a list of queries, each a list of synonyms)
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles produced by this project.
# `with` closes the file even when unpickling fails.
with open('dic_type_query_synonym_limit0.8_same_cui.pkl', "rb") as open_file:
    dic_type_query = pickle.load(open_file)
dic_type_query

In [None]:
# use CODER to find synonyms of all queries (phenotype queries)
# This rebinds dic_type_query: whichever of the two loading cells ran last
# decides which queries the word search below uses.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles produced by this project.
# `with` closes the file even when unpickling fails.
with open('dic_type_query_phenotype_synonym_limit0.8_same_cui.pkl', "rb") as open_file:
    dic_type_query = pickle.load(open_file)
dic_type_query

In [None]:
# Word search: for each type, keep the docs whose text contains, for EVERY
# query of the type, at least one synonym of that query as a complete substring.
# dic_type_query[type_name] is a list of queries; each query is a list of
# synonyms whose first entry is used as the display name of the query.
# NOTE(review): synonyms are searched as-is in the normalised texts of
# dic_doc_text (lower case, '-', '\n', '\t' replaced by spaces); confirm the
# pickled synonyms are normalised the same way.
dic_type_cands_term = {}
for type_name in dic_type_query:
    cands_type = []
    for synonyms in dic_type_query[type_name]:
        # docs that contain the whole character string of a synonym;
        # any() stops at the first synonym found
        cands = [x for x in dic_doc_text if any(y in dic_doc_text[x] for y in synonyms)]
        cands_type.append(set(cands))
        # an empty synonym list has no first entry to display
        query_label = synonyms[0] if synonyms else '<empty query>'
        print('nb of docs found for query ',query_label,':',len(cands))
    # a type without any query has no candidate doc
    # (set.intersection() called without argument raises TypeError)
    cands_res = list(set.intersection(*cands_type)) if cands_type else []
    dic_type_cands_term[type_name] = cands_res
    print('nb of docs found for label ', type_name, ':',len(cands_res))

In [None]:
# Patient-level result of the word search: for every type, the distinct
# patient_nums owning at least one candidate doc.
dic_type_patient_res = {}
for type_name, candidate_docs in dic_type_cands_term.items():
    patient_set = {dic_doc_patient[doc_id] for doc_id in candidate_docs}
    dic_type_patient_res[type_name] = list(patient_set)
    print('nb of patients found for ',type_name,':',len(dic_type_patient_res[type_name]))

# Evaluation (only for 8 types)

In [None]:
# eval doc (share of the docs found by word search that the keyword baseline
# also flags, i.e. a precision, printed under the label 'accuracy')
# The two situations that the former bare `except:` hid are now told apart,
# and any other error is no longer swallowed.
for type_name in dic_type_cands_term:
    found_docs = dic_type_cands_term[type_name]
    if not found_docs:
        # nothing retrieved: the ratio is undefined (division by zero)
        print('no doc found for ',type_name)
        continue
    if type_name not in dic_type_expected_doc:
        print('no keyword baseline for ',type_name)
        continue
    # set membership instead of scanning the expected list for every doc
    expected_docs = set(dic_type_expected_doc[type_name])
    acc = sum(doc in expected_docs for doc in found_docs)/len(found_docs)
    print('doc accuracy for ',type_name,':',acc)

In [None]:
# eval patient (share of the patients found by word search that the keyword
# baseline also flags, i.e. a precision, printed under the label 'accuracy')
# The two situations that the former bare `except:` hid are now told apart,
# and any other error is no longer swallowed.
for type_name in dic_type_patient_res:
    found_patients = dic_type_patient_res[type_name]
    if not found_patients:
        # nothing retrieved: the ratio is undefined (division by zero)
        print('no patient found for ',type_name)
        continue
    if type_name not in dic_type_expected_patients:
        print('no keyword baseline for ',type_name)
        continue
    # set membership instead of scanning the expected list for every patient
    expected_patients = set(dic_type_expected_patients[type_name])
    acc = sum(patient in expected_patients for patient in found_patients)/len(found_patients)
    print('patient accuracy for ',type_name,':',acc)

In [None]:
# type name -> list of CIM-10 code groups.
# When expected docs are computed from this table, each code is compared as a
# prefix of the codes of a doc; a doc is kept when, for every group, at least
# one of its codes starts with one code of the group.
# Doc codes are truncated to `nb_letter` characters, so a code longer than
# `nb_letter` in this table could never match.
dic_type_cim10 = {
    'nephro_lupus':[['N03', 'N04', 'N05'],['M320', 'M321', 'M328', 'M329', 'L930', 'L931']],
    'osteoporose':[['M80', 'M81']],
    'pneumopathie_infectieuse':[['J12', 'J13', 'J14', 'J15', 'J17', 'J18']],
    'PINS_scl':[['J84'],['M340', 'M341', 'M348', 'M349']],
    'Hypertension_pulmonaire':[['I270', 'I272']],
    'RGO':[['K21']],
    'SGS':[['M350']],
    'EP':[['I26']]
}

In [None]:
# find all docs containing at least one cim10 for each phenotype
dic_type_expected_cim10 = {}
for type_name in dic_type_cim10:
    cim10_lists = dic_type_cim10[type_name]
    docs = []
    for cim10_list in cim10_lists:
        docs.append([x for x in dic_doc_cim10 if len([y for y in cim10_list if y in [z[:len(y)] for z in dic_doc_cim10[x]]])>0])
    docs_res = list(set.intersection(*map(set,docs)))
    dic_type_expected_cim10[type_name] = docs_res
    print('nb of docs expected for phenotype ',type_name,':',len(docs_res))

In [None]:
# Patient-level CIM-10 reference: for every type, the distinct patient_nums
# owning at least one doc expected from the CIM-10 coding.
dic_type_expected_patients_cim10 = {}
for type_name, expected_docs in dic_type_expected_cim10.items():
    patient_set = {dic_doc_patient[doc_id] for doc_id in expected_docs}
    dic_type_expected_patients_cim10[type_name] = list(patient_set)
    print('nb of patients found for ',type_name,':',len(dic_type_expected_patients_cim10[type_name]))

In [None]:
# eval doc (recall: how many expected docs are found by our method)
# The two situations that the former bare `except:` hid are now told apart,
# and any other error is no longer swallowed.
for type_name in dic_type_expected_cim10:
    expected_docs = dic_type_expected_cim10[type_name]
    if not expected_docs:
        # no reference doc: the ratio is undefined (division by zero)
        print('no expected doc for ',type_name)
        continue
    if type_name not in dic_type_cands_term:
        print('no word search result for ',type_name)
        continue
    # set membership instead of scanning the candidate list for every doc
    found_docs = set(dic_type_cands_term[type_name])
    acc = sum(doc in found_docs for doc in expected_docs)/len(expected_docs)
    print('doc recall for ',type_name,':',acc)

In [None]:
# eval patient (recall: how many expected patients are found by our method)
# The two situations that the former bare `except:` hid are now told apart,
# and any other error is no longer swallowed.
for type_name in dic_type_expected_patients_cim10:
    expected_patients = dic_type_expected_patients_cim10[type_name]
    if not expected_patients:
        # no reference patient: the ratio is undefined (division by zero)
        print('no expected patient for ',type_name)
        continue
    if type_name not in dic_type_patient_res:
        print('no word search result for ',type_name)
        continue
    # set membership instead of scanning the result list for every patient
    found_patients = set(dic_type_patient_res[type_name])
    acc = sum(patient in found_patients for patient in expected_patients)/len(expected_patients)
    print('patient recall for ',type_name,':',acc)

# CIM-10 evaluation (for the 20 phenotypes)

In [None]:
# phenotype id (as a string, '0' to '19') -> list of CIM-10 code groups.
# When expected docs are computed from this table, each code is compared as a
# prefix of the codes of a doc; a doc is kept when, for every group, at least
# one of its codes starts with one code of the group.
# Phenotype '0' has a single empty group, so no doc can match it.
dic_pheno_cim10 = {
    '0':[[]],
    '1':[['M340', 'M341', 'M348', 'M349']],
    '2':[['I64', 'I63', 'Z86']],
    '3':[['I010', 'I092', 'I30', 'I32'],['M320', 'M321', 'M328', 'M329', 'L930', 'L931']],
    '4':[['M314']],
    '5':[['M05','M06']],
    '6':[['I64', 'I63', 'Z86'],['M320', 'M321', 'M328', 'M329', 'L930', 'L931']],
    '7':[['B20','B24','Z21']],
    '8':[['A15', 'A16']],
    '9':[['N10', 'N136', 'N209']],
    '10':[['E10', 'E11', 'E12', 'E13', 'E14']],
    '11':[['Z940']],
    '12':[['K754']],
    '13':[['O03']],
    '14':[['I21']],
    '15':[['I730']],
    '16':[['M330', 'M331']],
    '17':[['C61']],
    '18':[['N17']],
    '19':[['D693']]
}

In [None]:
# find all docs containing at least one cim10 for each phenotype
dic_phenotype_expected = {}
for type_name in dic_pheno_cim10:
    cim10_lists = dic_pheno_cim10[type_name]
    docs = []
    for cim10_list in cim10_lists:
        docs.append([x for x in dic_doc_cim10 if len([y for y in cim10_list if y in [z[:len(y)] for z in dic_doc_cim10[x]]])>0])
    docs_res = list(set.intersection(*map(set,docs)))
    dic_phenotype_expected[type_name] = docs_res
    print('nb of docs expected for phenotype ',type_name,':',len(docs_res))

In [None]:
# Patient-level CIM-10 reference: for every phenotype, the distinct
# patient_nums owning at least one doc expected from the CIM-10 coding.
dic_phenotype_expected_patients = {}
for type_name, expected_docs in dic_phenotype_expected.items():
    patient_set = {dic_doc_patient[doc_id] for doc_id in expected_docs}
    dic_phenotype_expected_patients[type_name] = list(patient_set)
    print('nb of patients found for ',type_name,':',len(dic_phenotype_expected_patients[type_name]))

In [None]:
# eval doc (recall: how many expected docs are found by our method)
for type_name in dic_phenotype_expected:
    try:
        acc = len([x for x in dic_phenotype_expected[type_name] if x in dic_type_cands_term[type_name]])/len(dic_phenotype_expected[type_name])
        print('doc recall for ',type_name,':',acc)
    except:
        print('no expected doc for ',type_name)

In [None]:
# eval patient (recall: how many expected patients are found by our method)
for type_name in dic_phenotype_expected_patients:
    try:
        acc = len([x for x in dic_phenotype_expected_patients[type_name] if x in dic_type_patient_res[type_name]])/len(dic_phenotype_expected_patients[type_name])
        print('patient recall for ',type_name,':',acc)
    except:
        print('no expected patient for ',type_name)

In [None]:
# write docs found by cim10 but not by our method
# One folder per phenotype under ./resources/docs_not_found/, one <source>.txt
# file per missed doc. dic_doc_cim10_not_found collects the cim10s of every
# missed doc (doc id -> codes).
types_to_write = ['6','8','13','17']
dic_doc_cim10_not_found = {}
for type_name in types_to_write:
    type_dir = os.path.join("./resources/docs_not_found", type_name)
    # makedirs(exist_ok=True) creates missing parent folders and lets the cell
    # be re-run; files written by an earlier run are left in place
    os.makedirs(type_dir, exist_ok=True)
    # set membership instead of scanning the candidate list for every doc
    found_docs = set(dic_type_cands_term[type_name])
    docs_to_write = [doc for doc in dic_phenotype_expected[type_name] if doc not in found_docs]

    for doc in docs_to_write:
        dic_doc_cim10_not_found[doc] = dic_doc_cim10[doc]
        # explicit utf-8: the texts contain accented characters and the platform
        # default encoding may not be able to write them
        with open(os.path.join(type_dir, str(doc)+'.txt'), "w", encoding="utf-8") as f:
            f.write(dic_doc_text[doc])

In [None]:
# Save the cim10s of the docs found by cim10 but not by word search
# (doc id -> codes). `with` closes the file even when pickling fails.
with open('dic_doc_cim10_not_found.pkl', "wb") as open_file:
    pickle.dump(dic_doc_cim10_not_found, open_file)