In [1]:
!pip install simple-icd-10-cm



In [2]:
# Directory the notebook reads generated files from and writes its datasets to.
# Paths are built with plain string concatenation, so the trailing "/" is required.
OUTPUT_DIR = "./../output/datasets/"
# Directory holding the input files (ICD dictionary, notes, ICD-10-CM order file).
DATA_DIR = "./../data/"

In [3]:
import simple_icd_10_cm as cm
import pandas as pd
import numpy as np
import json
import random

In [4]:
# Load the ICD diagnosis dictionary and keep only the ICD-10 rows.
diagnoses = pd.read_csv(DATA_DIR + "d_icd_diagnoses.csv.gz", compression="gzip")
diagnoses = diagnoses[diagnoses["icd_version"] == 10]

# Lookup tables built from the dictionary:
#   cod2ents: code       -> set of upper-cased terms anchored to that code
#   desc2cod: long title -> code (if two codes share a title, the later row wins)
#   cod2desc: code       -> long title
cod2ents = {}
desc2cod = {}
cod2desc = {}

# Column-wise iteration yields the same values as DataFrame.iterrows()
# without building a Series for every row.
for icd_code, long_title in zip(diagnoses["icd_code"], diagnoses["long_title"]):
    cod2ents[icd_code] = set()
    desc2cod[long_title] = icd_code
    cod2desc[icd_code] = long_title

# pp:   flat list of (term, code) pairs over all notes
# data: note_id -> list of (term, code) pairs found in that note
pp = []
data = {}
with open(OUTPUT_DIR + "pairs.jsonl", 'r') as f:
    for raw_line in f:
        # A blank line (e.g. a trailing newline) is not a JSON document; skip it.
        if not raw_line.strip():
            continue
        record = json.loads(raw_line)
        # Named `note_id`, not `id`: a module-level `id` would shadow the
        # builtin for every later cell of the notebook.
        note_id = record["note_id"]
        pairsList = []
        for pair in record["pairs"]:
            # Pairs whose label is not a known ICD-10 long title are dropped.
            if pair["label"] in desc2cod:
                matched_code = desc2cod[pair["label"]]
                cod2ents[matched_code].add(pair["term"].upper())
                pp.append((pair["term"], matched_code))
                pairsList.append((pair["term"], matched_code))
        data.setdefault(note_id, []).extend(pairsList)

# Later cells iterate over `pairs` as the flat (term, code) list.
pairs = pp

In [5]:
# Build a pandas DataFrame from the `data` dictionary in one shot (no row-by-row append).
# Each row holds the note id, its (term, code) pairs and the distinct codes among them.
dd = [
    (note_id, note_pairs, list(set([pair[1] for pair in note_pairs])))
    for note_id, note_pairs in data.items()
]

df = pd.DataFrame(dd, columns=["note_id", "pairs", "anchoredCodes"])

In [6]:
# Load the notes table from the feather file; it is joined to `df` on "note_id" below.
notes = pd.read_feather(DATA_DIR + "mimiciv_icd10.feather")

In [7]:
# Attach the columns of `notes` to each row of `df` (left join: every row of `df` is kept).
# NOTE(review): this rebinds `df`, so running the cell a second time merges `notes`
# into the already-merged frame; run it once per kernel session.
df = pd.merge(df, notes, on="note_id", how="left")

28805

In [12]:
# Coverage of each note's "icd10_diag" codes by the codes anchored to a term.
# NOTE(review): rows without a match in `notes` would carry a missing value in
# "icd10_diag" (left join) and fail when iterated; confirm every note matched.
missingCodesPerNote = 0
totalCodes = set()
totalAnchoredCodes = set()
for note_codes, note_anchored in zip(df["icd10_diag"], df["anchoredCodes"]):
    # Codes are compared without the dot separator.
    codes = {x.replace(".", "") for x in note_codes}
    anchoredCodes = set(note_anchored)
    missingCodes = codes - anchoredCodes
    missingCodesPerNote += len(missingCodes)
    # In-place updates: `a = a.union(b)` would copy the accumulated set on every row.
    totalCodes.update(codes)
    totalAnchoredCodes.update(anchoredCodes)

# Average over notes; an empty frame yields 0.0 instead of ZeroDivisionError.
missingCodesPerNote = missingCodesPerNote / len(df) if len(df) else 0.0
totalMissingCodes = totalCodes - totalAnchoredCodes
print("Missing codes per note: ", missingCodesPerNote)
print("Total missing codes: ", len(totalMissingCodes))
print("Total codes: ", len(totalCodes))
print("Total anchored codes: ", len(totalAnchoredCodes))

    

Missing codes per note:  5.587593879568559
Total missing codes:  129
Total codes:  5799
Total anchored codes:  5959


In [17]:
# Long titles of the codes anchored to some term that never occur among the notes' "icd10_diag" codes.
t = [cod2desc[x] for x in (totalAnchoredCodes - totalCodes)]

In [18]:
t

['Diffuse large B-cell lymphoma',
 'Alcoholic liver disease',
 'Emphysema',
 'Diseases of the digestive system complicating pregnancy',
 'Nicotine dependence',
 'Dysphagia',
 'Exotropia',
 'Acute respiratory failure',
 'Diaphragmatic hernia',
 'Myositis',
 'Chronic kidney disease (CKD)',
 'Preterm labor second trimester with preterm delivery second trimester',
 'Osteophyte',
 'Malignant neoplasm of retroperitoneum and peritoneum',
 'Epilepsy, unspecified, intractable',
 'Brucellosis',
 'Major depressive disorder, recurrent',
 'Malignant neoplasm of bronchus and lung',
 'Vitamin A deficiency',
 'Traumatic pneumothorax',
 'Irritable bowel syndrome',
 'Bilateral inguinal hernia, without obstruction or gangrene',
 'Osteonecrosis',
 'Fall from, out of or through roof',
 'Alcoholic cirrhosis of liver',
 'Periprosthetic fracture around internal prosthetic left knee joint, initial encounter',
 'Hypotension',
 'Abdominal tenderness',
 'Scleritis',
 'Presence of artificial knee joint',
 'Hepatic

In [None]:
# Long titles of the "icd10_diag" codes that no term was anchored to.
totalMissingTitles = [cod2desc[x] for x in totalMissingCodes]

In [11]:
totalMissingTitles

['Overweight',
 'Contusion of oral cavity, initial encounter',
 'Ankylosis, left hip',
 'Adverse effect of inhaled anesthetics, initial encounter',
 'von Gierke disease',
 'Cell of prison as the place of occurrence of the external cause',
 'Garden or yard in single-family (private) house as the place of occurrence of the external cause',
 'Underdosing of other antacids and anti-gastric-secretion drugs, initial encounter',
 'Other abnormal findings in urine',
 'Kitchen of single-family (private) house as the place of occurrence of the external cause',
 'Underdosing of rifampicins, initial encounter',
 'Blister (nonthermal), left lower leg, initial encounter',
 'Abrasion of right forearm, initial encounter',
 'Melanocytic nevi of trunk',
 'Dependent relative needing care at home',
 'Periprosthetic fracture around internal prosthetic right hip joint, initial encounter',
 'Straining to void',
 'Coma scale, best verbal response, confused conversation, at arrival to emergency department',
 '

In [31]:
# Every code of the ICD-10 dictionary; later cells draw random negatives from this list.
tot = list(cod2desc.keys())

In [194]:
def get_cousines(code):
    """Return the leaf "cousins" of `code`, without dots and in random order.

    A cousin is a child of a sibling of `code`'s parent, i.e. a grandchild of
    its grandparent that does not share `code`'s parent. Only items for which
    `cm.is_leaf` is true are kept.

    Args:
        code: an ICD-10-CM item known to `simple_icd_10_cm`.

    Returns:
        A shuffled list of dot-less codes. The list is empty when `code` has
        no parent or no grandparent; the original implementation passed the
        empty parent string back into the library and crashed in that case.
    """
    parent = cm.get_parent(code)
    if not parent:
        return []
    grandpa = cm.get_parent(parent)
    if not grandpa:
        return []

    cousines = [
        cousin.replace(".", "")
        for uncle in cm.get_children(grandpa)
        if uncle != parent
        for cousin in cm.get_children(uncle)
        if cm.is_leaf(cousin)
    ]
    # Shuffled so that callers taking a prefix of the list get a random subset.
    random.shuffle(cousines)
    return cousines


In [38]:
import re

In [47]:

# Read the ICD-10-CM order file and keep, in file order, the second
# whitespace-separated field of every line whose third field is "1".
# NOTE(review): presumably field 2 is the dot-less code and field 3 the
# "valid code" flag of the CMS order file; confirm against its layout.
orderedCodes = []
with open(DATA_DIR + "2020order.txt", 'r') as f:
    for line in f:
        splitted = line.split()
        # Blank or truncated lines have fewer than three fields; skip them
        # instead of failing with IndexError.
        if len(splitted) >= 3 and splitted[2] == "1":
            orderedCodes.append(splitted[1])

In [180]:
def read_range(range):
    """Expand a "START-END" code range into the codes of `orderedCodes` it spans.

    Spaces are ignored and dots are stripped from both bounds. The span runs
    from the first entry of `orderedCodes` that starts with START to the last
    entry that starts with END, both included.

    Args:
        range: text such as "A00-A09" or "A00.1 - A01.0". The parameter name
            shadows the builtin and is kept only for interface compatibility.

    Returns:
        A new list with the spanned codes. It is empty when the text is not
        exactly two non-empty parts separated by "-", when either bound
        matches no code (the original raised IndexError here), or when the
        start lies after the end.
    """
    bounds = [part.replace(" ", "") for part in range.split("-")]
    if len(bounds) != 2 or not bounds[0] or not bounds[1]:
        return []
    start_prefix = bounds[0].replace(".", "")
    end_prefix = bounds[1].replace(".", "")

    # Single pass: remember the first start match and the last end match.
    first = None
    last = None
    for position, candidate in enumerate(orderedCodes):
        if first is None and candidate.startswith(start_prefix):
            first = position
        if candidate.startswith(end_prefix):
            last = position

    if first is None or last is None:
        return []
    return orderedCodes[first:last + 1]

def read_family(family):
    """Expand a "CODE-" family reference (e.g. "T36.-") into its descendants.

    Args:
        family: text with exactly one "-", a non-empty part before it and
            nothing but spaces after it.

    Returns:
        The descendants of CODE reported by `cm.get_descendants`, without
        dots. The list is empty when the text does not have the family shape
        or when CODE is not a valid item for `cm.is_valid_item`; the original
        let the library raise in the latter case.
    """
    parts = [part.replace(" ", "") for part in family.split("-")]
    if len(parts) != 2 or not parts[0] or parts[1]:
        return []
    root = parts[0].replace(".", "")
    if not cm.is_valid_item(root):
        return []
    return [descendant.replace(".", "") for descendant in cm.get_descendants(root)]

def read_code(code):
    """Expand a single code reference into the concrete codes it stands for.

    Args:
        code: text without "-" (anything containing "-" yields an empty list).
            Spaces are ignored and dots are stripped.

    Returns:
        The dot-less descendants of the code when it has any, otherwise a
        one-element list with the dot-less code itself. The list is empty when
        the code is not a valid item for `cm.is_valid_item`; the original let
        `cm.get_descendants` raise in that case.
    """
    parts = [part.replace(" ", "") for part in code.split("-")]
    if len(parts) != 1:
        return []
    plain = parts[0].replace(".", "")
    if not cm.is_valid_item(plain):
        return []
    descendants = [descendant.replace(".", "") for descendant in cm.get_descendants(plain)]
    # A code without descendants stands for itself.
    return descendants if descendants else [plain]

def read_range_family_code(s):
    """Dispatch `s` to the reader matching its shape and return dot-less codes.

    The shapes are tried in order with `re.match` (anchored at the start of
    `s` only): a "START-END" range, a "CODE-" family, then any text without
    "-". The first shape that matches decides which reader is called; text
    matching none of them yields an empty list.
    """
    readers = (
        (r"[A-Z][A-Z0-9]*(\.[A-Z0-9]+)?-[A-Z][A-Z0-9]*(\.[A-Z0-9]+)?", read_range),
        (r"[A-Z][A-Z0-9]*(\.)?([A-Z0-9]+)?-", read_family),
        (r"^[^-]*$", read_code),
    )
    for pattern, reader in readers:
        if re.match(pattern, s):
            return [item.replace(".", "") for item in reader(s)]
    return []
    


def read_with(w):
    """Expand "<codes> with <suffixes>" into the codes matching a suffix.

    The text before the first "with" is a comma-separated list of code
    references, each expanded with `read_range_family_code`. The text between
    the first and second "with" (or the end) is a comma-separated list of
    suffixes, with spaces and dots removed. An expanded code is kept when the
    part after its first three characters starts with one of the suffixes.

    Args:
        w: the text to parse; a list is only produced when it contains "with".

    Returns:
        The matching codes without dots, each listed once per expansion
        (the original appended a code once per matching suffix). Empty
        prefixes and empty suffixes, e.g. from a trailing comma, are ignored;
        an empty suffix used to match every code.
    """
    if "with" not in w:
        return []

    pieces = w.split("with")
    prefix_specs = [spec.replace(" ", "") for spec in pieces[0].split(",")]
    suffixes = [raw.replace(" ", "").replace(".", "") for raw in pieces[1].split(",")]
    suffixes = [suffix for suffix in suffixes if suffix]

    candidates = []
    for spec in prefix_specs:
        if spec:
            candidates.extend(read_range_family_code(spec))

    return [
        candidate.replace(".", "")
        for candidate in candidates
        # candidate[3:] is the part after the three-character category.
        if any(candidate[3:].startswith(suffix) for suffix in suffixes)
    ]


        
def get_excludes(code):
    """Collect the codes referenced by the Excludes1/Excludes2 notes of `code`.

    Each note returned by `cm.get_excludes1` / `cm.get_excludes2` is handled
    as follows:
      * a note that is itself a valid item is taken as a code;
      * otherwise, if the note ends with ")", the content of its last
        parenthesised group is parsed: "... with ..." goes through
        `read_with`, anything else is split on commas and each piece goes
        through `read_range_family_code`;
      * every other note is ignored.

    Groups containing "with final characters" are skipped. The original had a
    no-op branch for them and then fell through to the generic "with"
    handling, because the second test was `if` instead of `elif`.

    Args:
        code: the item whose exclusion notes are read.

    Returns:
        A list of dot-less codes (possibly with repetitions); empty when
        `code` is not a valid item.
    """
    if not cm.is_valid_item(code):
        return []

    equivalentCodes = []
    # dict.fromkeys drops repeated notes while keeping their order; iterating
    # a set of strings would give a different order from run to run.
    notes = dict.fromkeys(cm.get_excludes1(code) + cm.get_excludes2(code))
    for note in notes:
        if cm.is_valid_item(note):
            equivalentCodes.append(note)
            continue
        if not note.endswith(")"):
            continue

        # Last "(...)" group of the note that contains no nested parentheses.
        match = re.search(r'\(([^()]*)\)(?!.*\([^()]*\))', note)
        if not match:
            continue
        extracted_code = match.group(1)

        if "with final characters" in extracted_code:
            continue
        if "with" in extracted_code:
            equivalentCodes.extend(read_with(extracted_code))
        else:
            for piece in extracted_code.split(","):
                piece = piece.replace(" ", "")
                if piece != "":
                    equivalentCodes.extend(read_range_family_code(piece))

    return [x.replace(".", "") for x in equivalentCodes]



        

In [198]:
cod2desc["U071"]

KeyError: 'U071'

In [205]:
# For every ordered code, collect its candidate hard negatives: the codes named
# by its exclusion notes plus its cousins. Only candidates present in the
# dictionary are kept, as (code, usage_count) tuples starting at 0, in random order.
negatives = {}
for code in orderedCodes:
    neg = set()
    if cm.is_valid_item(code):
        has_exclusion_notes = len(cm.get_excludes1(code) + cm.get_excludes2(code)) > 0
        if has_exclusion_notes:
            neg.update(get_excludes(code))
        neg.update(get_cousines(code))

    toups = [(n, 0) for n in neg if n in cod2desc]
    random.shuffle(toups)
    negatives[code] = toups


    

In [196]:
# Average number of candidates per list of excludes.
total = sum(len(v) for v in negatives.values()) / len(negatives)
print("Average number of elements per list of excludes: ", total)

# Number of codes whose list of excludes is empty.
total = sum(1 for v in negatives.values() if len(v) == 0)
print("Number of elements with zero excludes: ", total)


Average number of elements per list of excludes:  24.012772913665078
Number of elements with zero excludes:  1718


In [207]:
# Scratch check: sorted() with key=lambda x: x[1] orders the tuples by their second element, ascending.
# NOTE(review): leftover experiment; it rebinds the module-level name `neg`.
neg = [("come", 1), ("ciao", 0), ("stai", 2)]
neg = sorted(neg, key=lambda x: x[1])
print(neg)

[('ciao', 0), ('come', 1), ('stai', 2)]


In [214]:
# Build one row per (term, code) pair: the anchor term, the positive long title
# and NUM_NEGATIVES negative long titles. Up to MAX_HARD_NEGATIVES of them are
# "hard" negatives taken from `negatives[code]`; the rest are random codes.
# NOTE: this cell updates the usage counts stored in `negatives`, so re-running
# it continues from the counts left by the previous run.
NUM_NEGATIVES = 10
MAX_HARD_NEGATIVES = 5

touples = []

for term, code in pairs:
    upperTerm = term.upper()

    # Least-used candidates first, so hard negatives rotate across the terms
    # of the same code. sorted() is stable: ties keep their current order.
    neg = sorted(negatives.get(code, []), key=lambda x: x[1])
    hn = []
    size = min(MAX_HARD_NEGATIVES, len(neg))
    i = 0
    while len(hn) < size and i < len(neg):
        # A candidate the term is also anchored to is not a negative for it.
        if upperTerm not in cod2ents[neg[i][0]]:
            hn.append(neg[i])
            neg[i] = (neg[i][0], neg[i][1] + 1)
        i += 1
    negatives[code] = neg

    # Fill up with random codes. `chosen` prevents the same negative from
    # appearing twice in a row (the original could repeat a hard negative or
    # draw the same random code more than once).
    # NOTE: like the original, this loops until enough eligible codes are drawn.
    chosen = {candidate for candidate, _ in hn}
    rn = []
    nrand = NUM_NEGATIVES - len(hn)
    while len(rn) < nrand:
        rand = random.choice(tot)
        if rand in chosen or upperTerm in cod2ents[rand]:
            continue
        chosen.add(rand)
        rn.append((rand, 0))

    n = hn + rn
    touples.append((term, cod2desc[code], *(cod2desc[candidate] for candidate, _ in n)))
    
        

            
        



    
    
      

In [215]:
touples

[('Anterior exenteration',
  'Malignant neoplasm of bladder neck',
  'Malignant neoplasm of left ureter',
  'Malignant neoplasm of left renal pelvis',
  'Malignant neoplasm of overlapping sites of urinary organs',
  'Malignant neoplasm of urinary organ, unspecified',
  'Malignant neoplasm of urethra',
  'Nondisplaced fracture of head of right radius, initial encounter for open fracture type IIIA, IIIB, or IIIC',
  'Other otitis externa, right ear',
  'Unspecified injury of unspecified elbow, sequela',
  'Displaced oblique fracture of shaft of unspecified tibia, subsequent encounter for open fracture type IIIA, IIIB, or IIIC with delayed healing',
  'Toxic effect of tetrachloroethylene, intentional self-harm, initial encounter'),
 ('Atorvastatin',
  'Hyperlipidemia, unspecified',
  'Hypovolemia',
  'Classical phenylketonuria',
  'Defects in post-translational modification of lysosomal enzymes',
  'Cystic fibrosis with other manifestations',
  'Organ-limited amyloidosis',
  'Unspecified 

In [7]:
# Alternative builder: one row per (term, code) pair with the anchor term, the
# positive long title and 10 negative long titles. Up to 5 are hard negatives
# (valid leaf items among the exclusion notes and the cousins of the code); the
# rest are random distractors. Pairs whose code is not a valid item are skipped.
touples = []
for term, code in pairs:
    upperTerm = term.upper()
    if not cm.is_valid_item(code):
        continue

    candidates = cm.get_excludes1(code) + cm.get_excludes2(code) + get_cousines(code)
    similars = [
        x for x in candidates
        if x.replace(".", "") != code and cm.is_valid_item(x) and cm.is_leaf(x)
    ]
    hardNegatives = [
        x.replace(".", "") for x in similars
        if x.replace(".", "") in cod2ents and upperTerm not in cod2ents[x.replace(".", "")]
    ]
    # Exclusion notes and cousins can name the same code; keep each one once
    # (in first-seen order) before taking the first five.
    hardNegatives = list(dict.fromkeys(hardNegatives))[:5]

    # Draw a fresh sample until every drawn code is usable: not the positive,
    # not already a hard negative, and not anchored to this term.
    numDistractors = 10 - len(hardNegatives)
    distractors = []
    while len(distractors) < numDistractors:
        distractors = [
            x for x in random.sample(tot, numDistractors)
            if x != code
            and x not in hardNegatives
            and x.replace(".", "") in cod2ents
            and upperTerm not in cod2ents[x.replace(".", "")]
        ]
    hardNegatives += distractors

    touples.append((term, cod2desc[code], *(cod2desc[x] for x in hardNegatives[:10])))

In [278]:
# Print one randomly chosen row: anchor, positive and the ten negatives,
# one per line, followed by a blank line.
prova = random.sample(touples, 1)
for term, code, neg1, neg2, neg3, neg4, neg5, neg6, neg7, neg8, neg9, neg10 in prova:
    print(term, code, neg1, neg2, neg3, neg4, neg5, neg6, neg7, neg8, neg9, neg10, sep="\n")
    print()

Ileal loop
Fistula of intestine
Hemorrhage of anus and rectum
Inflammatory polyps of colon with fistula
Chronic vascular disorders of intestine
Volvulus
Rectal prolapse
Chronic embolism and thrombosis of unspecified vein
Unspecified fracture of lower end of right femur, subsequent encounter for closed fracture with routine healing
Poisoning by unspecified drugs, medicaments and biological substances, undetermined, initial encounter
Heat exposure on board other powered watercraft, initial encounter
Toxic effect of unspecified inorganic substance, accidental (unintentional)



In [148]:
# Take up to 200 random tuples and write them to a text file, one per line, as:
# anchor: t[0], positive: t[1], negative1: t[2], ..., negative5: t[6]
# The sample is kept in its own variable: rebinding `touples` here would leave
# only the sample for the cells below that save the full dataset.
sampleTouples = random.sample(touples, min(200, len(touples)))
with open(OUTPUT_DIR + "triplets.txt", 'w') as f:
    for t in sampleTouples:
        f.write(f"anchor: {t[0]}, positive: {t[1]}, negative1: {t[2]}, negative2: {t[3]}, negative3: {t[4]}, negative4: {t[5]}, negative5: {t[6]}\n")

[('Anterior exenteration',
  'Malignant neoplasm of bladder neck',
  'Malignant neoplasm of right kidney, except renal pelvis',
  'Malignant neoplasm of left kidney, except renal pelvis',
  'Malignant neoplasm of unspecified kidney, except renal pelvis',
  'Malignant neoplasm of right renal pelvis',
  'Malignant neoplasm of left renal pelvis',
  'Cytomegaloviral pancreatitis',
  'Chronic gout due to renal impairment, right knee, with tophus (tophi)',
  'Blister (nonthermal) of right front wall of thorax, sequela',
  'Toxic reaction to local anesthesia during pregnancy, third trimester',
  'Unspecified car occupant injured in collision with sport utility vehicle in nontraffic accident, subsequent encounter'),
 ('Atorvastatin',
  'Hyperlipidemia, unspecified',
  'Classical phenylketonuria',
  'Other hyperphenylalaninemias',
  'Disorders of tryptophan metabolism',
  'Disorder of aromatic amino-acid metabolism, unspecified',
  'Maple-syrup-urine disease',
  'Adverse effect of thrombolytic 

In [170]:
# Print one randomly chosen row: anchor, positive and the ten negatives,
# one per line, followed by a blank line.
prova = random.sample(touples, 1)
for term, code, neg1, neg2, neg3, neg4, neg5, neg6, neg7, neg8, neg9, neg10 in prova:
    print(term, code, neg1, neg2, neg3, neg4, neg5, neg6, neg7, neg8, neg9, neg10, sep="\n")
    print()



Myocarditis
Personal history of other diseases of the circulatory system
Personal history of tuberculosis
Personal history of gestational diabetes
Personal history of latent tuberculosis infection
Personal history of other infectious and parasitic diseases
Personal history of diabetic foot ulcer
Radiculopathy, lumbosacral region
Military operation involving other effects of nuclear weapons, military personnel
Contusion of right ring finger with damage to nail, subsequent encounter
Unspecified fracture of second metacarpal bone, right hand, initial encounter for open fracture
Hereditary and idiopathic neuropathy



In [171]:
# Save the tuples as a pandas DataFrame with 12 columns ("anchor", "positive",
# "negative_1" ... "negative_10") and write it to CSV without the index.
# NOTE(review): this reuses the name `df`, which earlier cells bind to the per-note frame.
df = pd.DataFrame(touples, columns=["anchor", "positive", "negative_1", "negative_2", "negative_3", "negative_4", "negative_5", "negative_6", "negative_7", "negative_8", "negative_9", "negative_10"])
df.to_csv(OUTPUT_DIR + "touples.csv", index=False)


In [172]:
df

Unnamed: 0,anchor,positive,negative_1,negative_2,negative_3,negative_4,negative_5,negative_6,negative_7,negative_8,negative_9,negative_10
0,Anterior exenteration,Malignant neoplasm of bladder neck,"Malignant neoplasm of unspecified kidney, exce...",Malignant neoplasm of unspecified renal pelvis,Malignant neoplasm of left ureter,Malignant neoplasm of paraurethral glands,Malignant neoplasm of left renal pelvis,"Unspecified kyphosis, cervical region",Unspecified superficial injuries of left back ...,Dome fracture of acetabulum,"Other fracture of left great toe, initial enco...",Unspecified fracture of upper end of unspecifi...
1,Atorvastatin,"Hyperlipidemia, unspecified",Other lactose intolerance,"Lipomatosis, not elsewhere classified","Mucopolysaccharidosis, type II",Hyperuricemia without signs of inflammatory ar...,"Volume depletion, unspecified","Glaucoma secondary to other eye disorders, uns...","Fracture of one rib, left side, subsequent enc...","Toxic effect of other tobacco and nicotine, ac...",Puncture wound without foreign body of left ri...,Nondisplaced fracture of epiphysis (separation...
2,Urostomy,Malignant neoplasm of bladder neck,"Malignant neoplasm of urinary organ, unspecified",Malignant neoplasm of overlapping sites of uri...,Malignant neoplasm of left ureter,Malignant neoplasm of urethra,Malignant neoplasm of left renal pelvis,Indeterminate leprosy,"Poisoning by other viral vaccines, accidental ...","Fracture of unspecified metatarsal bone(s), ri...",Sprain of tarsometatarsal ligament of unspecif...,Burn of first degree of multiple sites of left...
3,Bladder cancer,Malignant neoplasm of bladder neck,Malignant neoplasm of right renal pelvis,Malignant neoplasm of urethra,Malignant neoplasm of right ureter,"Malignant neoplasm of urinary organ, unspecified",Malignant neoplasm of left ureter,"Crepitant synovitis (acute) (chronic), unspeci...","Legal intervention, means unspecified, law enf...","Other paralytic strabismus, left eye",Contusion of right great toe with damage to na...,Displaced oblique fracture of shaft of left fe...
4,Losartan,Essential (primary) hypertension,Portal vein thrombosis,Cardiomyopathy in diseases classified elsewhere,Endocarditis and heart valve disorders in dise...,"Endocarditis, valve unspecified",Rheumatic fever without heart involvement,Complete traumatic metacarpophalangeal amputat...,Poisoning by other primarily systemic and hema...,Other specified injury of other blood vessels ...,Breakdown (mechanical) of internal fixation de...,Maternal care for unstable lie
...,...,...,...,...,...,...,...,...,...,...,...,...
473541,Lamivudine,Adverse effect of antineoplastic and immunosup...,Poisoning by antineoplastic and immunosuppress...,Poisoning by antineoplastic and immunosuppress...,Poisoning by antineoplastic and immunosuppress...,Poisoning by antineoplastic and immunosuppress...,Underdosing of antineoplastic and immunosuppre...,Pain associated with micturition,Other specified injury of blood vessel of righ...,"Laceration of extensor or abductor muscles, fa...","Toxic effect of cobra venom, assault","Obstructive and reflux uropathy, unspecified"
473542,Rituxan,Encounter for antineoplastic chemotherapy,Encounter for other specified aftercare,Encounter for therapeutic drug level monitoring,"Unspecified superficial injury of left thigh, ...",Nondisplaced fracture of triquetrum [cuneiform...,Diseases of the circulatory system complicatin...,Displaced fracture of olecranon process with i...,Legal intervention involving unspecified sharp...,"Displaced apophyseal fracture of right femur, ...",Displaced fracture of medial wall of unspecifi...,Displaced fracture of lateral end of left clav...
473543,MGUS,Monoclonal gammopathy,Neoplasm of uncertain behavior of prostate,Neoplasm of uncertain behavior of respiratory ...,Refractory cytopenia with multilineage dysplasia,Neoplasm of uncertain behavior of colon,Neoplasm of uncertain behavior of larynx,"Calcific tendinitis, multiple sites",Asphyxiation due to being trapped in a (discar...,Concussion with loss of consciousness greater ...,Poisoning by other primarily systemic and hema...,"Paper entering through skin, initial encounter"
473544,Aminotransferases,Nonspecific elevation of levels of transaminas...,Elevated erythrocyte sedimentation rate,Other abnormality of red blood cells,Other specified abnormalities of plasma proteins,"Abnormal finding of blood chemistry, unspecified",Finding of other drugs of addictive potential ...,"Burn of first degree of left axilla, sequela",Type 1 diabetes mellitus with diabetic macular...,Other specified injury of unspecified blood ve...,Fracture of radius or ulna following insertion...,"Other fracture of upper end of left tibia, sub..."
