In [100]:
import pandas as pd
import numpy as np
import random as rand
# Seed the stdlib `random` module; later cells draw from it through
# rand.uniform, rand.sample and rand.shuffle.
rand.seed(42)

In [101]:
# Root folders of the local MIMIC-IV 2.2 and MIMIC-IV-Note 2.2 copies.
# These are absolute, machine-specific paths; later cells append relative
# file names (e.g. "hosp/...") to mimicPath.
mimicPath = "D:/datasets/MIMIC-IV/files/mimiciv/2.2/"
mimicNotePath = "D:/datasets/MIMIC-IV/files/mimic-iv-note/2.2/"

In [102]:
# MIMIC-IV tables
# Raw strings keep the Windows backslashes literal: "\T", "\S" and "\m" are
# invalid escape sequences in a normal string literal and trigger warnings
# on recent Python versions. The resulting path text is unchanged.
icd9 = pd.read_feather(r"D:\TESI\Split\mimiciv_icd9\mimiciv_icd9.feather")
icd10 = pd.read_feather(r"D:\TESI\Split\mimiciv_icd10\mimiciv_icd10.feather")

In [103]:
# ICD code dictionaries.
# `icd_code` is read explicitly as text so that codes made only of digits keep
# their leading zeros and always match the string keys used for lookups,
# regardless of how pandas would otherwise infer the column type.

# ICD10 DIAGNOSIS CODES
d_icd_diagnoses = pd.read_csv(
    mimicPath + "hosp/d_icd_diagnoses.csv.gz",
    compression="gzip",
    dtype={"icd_code": str},
)
d_icd10_diagnoses = d_icd_diagnoses[d_icd_diagnoses.icd_version == 10]

# ICD10 PROCEDURE CODES
d_icd_procedures = pd.read_csv(
    mimicPath + "hosp/d_icd_procedures.csv.gz",
    compression="gzip",
    dtype={"icd_code": str},
)
d_icd10_procedures = d_icd_procedures[d_icd_procedures.icd_version == 10]

# Single table with the ICD-10 diagnosis rows followed by the procedure rows.
df_icd10_total = pd.concat([d_icd10_diagnoses, d_icd10_procedures], axis=0)

Unnamed: 0,icd_code,icd_version,long_title
4,001,10,"Central Nervous System and Cranial Nerves, Bypass"
12,0016070,10,Bypass Cerebral Ventricle to Nasopharynx with ...
13,0016071,10,Bypass Cerebral Ventricle to Mastoid Sinus wit...
14,0016072,10,Bypass Cerebral Ventricle to Atrium with Autol...
15,0016073,10,Bypass Cerebral Ventricle to Blood Vessel with...
...,...,...,...
85252,XW0DXV5,10,Introduction of Gilteritinib Antineoplastic in...
85253,XXE,10,"New Technology, Physiological Systems, Measure..."
85254,XXE5XM5,10,"Measurement of Infection, Whole Blood Nucleic ..."
85255,XY0,10,"New Technology, Extracorporeal, Introduction"


In [124]:
# Show five example diagnosis titles; the fixed seed makes the preview
# repeatable, like the other `.sample(...)` calls in this notebook.
d_icd10_diagnoses.sample(5, random_state=42)['long_title'].values

array(['Laceration of intrinsic muscle, fascia and tendon of right thumb at wrist and hand level, initial encounter',
       'Unspecified heterotropia', 'Transsexualism',
       'Unspecified injury of esophagus (thoracic part), initial encounter',
       'Unspecified focal traumatic brain injury with loss of consciousness greater than 24 hours without return to pre-existing conscious level with patient surviving, subsequent encounter'],
      dtype=object)

In [125]:
# Show five example procedure titles; the fixed seed makes the preview
# repeatable, like the other `.sample(...)` calls in this notebook.
d_icd10_procedures.sample(5, random_state=42)['long_title'].values

array(['Male Reproductive System, Release',
       'Release Right Radius, Percutaneous Endoscopic Approach',
       'Bypass Left Saphenous Vein to Lower Vein with Autologous Tissue Substitute, Open Approach',
       'Fluoroscopy of Uterus using Other Contrast',
       'Repair Hard Palate, Percutaneous Approach'], dtype=object)

In [104]:

# DICTIONARY CODE -> DESCRIPTION
# Pairs every icd_code with its long_title; if a code occurs more than once,
# the last title wins.
cod2lbl = dict(zip(df_icd10_total.icd_code, df_icd10_total.long_title))

# FUNCTION TO ASSIGN DESCRIPTION TO CODES
def assign_title(x):
    """Return the long titles of the ICD codes in ``x``, in the same order.

    Dots are removed from each code before it is looked up in ``cod2lbl``.
    A code that is not a key of ``cod2lbl`` raises ``KeyError``.
    """
    titles = []
    for code in x:
        titles.append(cod2lbl[code.replace('.', '')])
    return titles

In [105]:
# Split assignment table (one `split` label per `_id`).
# Raw string: "\T", "\S" and "\m" are invalid escape sequences in a normal
# string literal. The resulting path text is unchanged.
icd10_split = pd.read_feather(r"D:\TESI\Split\mimiciv_icd10\mimiciv_icd10_split.feather")

In [106]:
# THE SPLITS
# One frame per value of the `split` column, in the order train / val / test.
icd10_train, icd10_val, icd10_test = (
    icd10_split[icd10_split['split'] == split_name]
    for split_name in ('train', 'val', 'test')
)

In [107]:
# Keep only the rows of icd10 whose `_id` appears in the training split.
train_ids = icd10_train['_id']
icd10_train_df = icd10.loc[icd10['_id'].isin(train_ids)]

In [108]:
# RANDOM SAMPLE OF 10000 FOR TESTING
# Seeded draw of 10000 training rows, re-indexed from 0.
icd10_prova = (
    icd10_train_df
    .sample(n=10000, random_state=42)
    .reset_index(drop=True)
)

In [109]:
# REMOVE DOTS FROM CODES
def _strip_dots(codes):
    """Return a dot-free copy of ``codes`` without modifying the input."""
    cleaned = [code.replace('.', '') for code in codes]
    # Keep NumPy arrays as arrays so later cells see the same container type.
    if isinstance(codes, np.ndarray):
        return np.array(cleaned, dtype=codes.dtype)
    return cleaned


# Build new containers instead of assigning into the existing ones: the
# per-row objects in `target` are shared with the frames icd10_prova was
# sampled from, so in-place edits would silently change those frames too.
icd10_prova['target'] = icd10_prova['target'].map(_strip_dots)

In [110]:
# FUNCTION TO CREATE DISTRACTORS
def create_distractors(x, code_pool=None):
    """Draw random ICD-10 codes that are not among the target codes ``x``.

    The number of distractors is ``round(u * n)``, with ``u`` drawn uniformly
    from [0.25, 2] and ``n`` the number of distinct targets; the draw is
    repeated until the result is at least 1.

    Parameters
    ----------
    x : iterable of str
        Target codes, written the same way as the codes in the pool.
    code_pool : iterable of str, optional
        Codes to draw from. Defaults to ``df_icd10_total["icd_code"]``.

    Returns
    -------
    list of str
        Distinct codes taken from the pool, none of which is in ``x``.

    Raises
    ------
    ValueError
        If more distractors are requested than the pool can supply.
    """
    target_set = set(x)
    # With no targets every draw would round to 0 and the loop below would
    # never end, so size the draw as if there were a single target.
    num_targets = max(len(target_set), 1)

    num_distractors = 0
    while num_distractors == 0:
        num_distractors = round(rand.uniform(0.25, 2) * num_targets)

    if code_pool is None:
        code_pool = df_icd10_total["icd_code"]

    # dict.fromkeys de-duplicates while keeping the pool's own order. Sampling
    # from list(set(...)) instead would depend on per-process string hashing,
    # so a seeded RNG could still pick different codes after a kernel restart.
    available_codes = [
        code for code in dict.fromkeys(code_pool) if code not in target_set
    ]

    return rand.sample(available_codes, num_distractors)
    
    

In [111]:
# ADDING DISTRACTORS TO THE DATAFRAME
# One call per row, in row order, on that row's target codes.
target_codes = icd10_prova['target']
icd10_prova['distractors'] = target_codes.map(create_distractors)

In [112]:
# ASSERT THAT NO TARGET IS IN THE DISTRACTORS
# Walk the two columns in parallel and require each pair to be disjoint.
for row_targets, row_distractors in zip(icd10_prova['target'], icd10_prova['distractors']):
    assert set(row_targets).isdisjoint(set(row_distractors))



In [113]:
# CLONING THE TARGET COLUMN IN THE RESPONSE COLUMN
# `response` starts out holding the same per-row code lists as `target`.
icd10_prova = icd10_prova.assign(response=icd10_prova["target"])

In [114]:
# EXTRACTING 5% OF THE SAMPLE TO CREATE ROWS WITH ONLY DISTRACTORS
# Seeded draw of 5% of icd10_prova, re-indexed from 0.
onlyDistractors = (
    icd10_prova
    .sample(frac=0.05, random_state=42)
    .reset_index(drop=True)
)

In [115]:

# Draw a new set of distractors for each of the distractor-only rows.
onlyDistractors['distractors'] = onlyDistractors['target'].map(create_distractors)

# EMPTYING THE RESPONSE COLUMN
# Every row gets its own new empty list.
onlyDistractors["response"] = onlyDistractors["response"].map(lambda _codes: [])

In [175]:
# CONCATENATING THE TWO DATAFRAMES AND SHUFFLING THE ROWS
# Stack both frames, permute all rows with a fixed seed, then re-index from 0.
final_df = (
    pd.concat([icd10_prova, onlyDistractors], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [119]:
# Save the shuffled frame to CSV without writing the index as a column.
final_df.to_csv("D:/TESI/Datasets/test_with_distractors.csv", index=False)

In [130]:
# Inspect the first row of final_df (all columns of a single record).
final_df.iloc[0]

note_id                                            15427594-DS-5
subject_id                                              15427594
_id                                                     21908688
note_type                                                     DS
note_seq                                                       5
charttime                                    2175-01-30 00:00:00
storetime                                    2175-01-30 15:51:00
text           name unit no admission date discharge date dat...
icd10_proc                           [02HV33Z, 03VG0CZ, 3E0G76Z]
icd10_diag     [E03.9, E87.2, G43.909, G91.9, I47.1, I60.11, ...
target         [02HV33Z, 03VG0CZ, 3E0G76Z, E039, E872, G43909...
num_words                                                   2095
num_targets                                                   17
distractors    [037N46Z, 03SD0ZZ, S22082S, 0RB63ZZ, L02222, 0...
response       [02HV33Z, 03VG0CZ, 3E0G76Z, E039, E872, G43909...
Name: 0, dtype: object

In [176]:
# Remove the identifier, metadata and intermediate label columns.
columns_to_drop = [
    'note_id', 'subject_id', '_id', 'note_type', 'note_seq',
    'charttime', 'storetime', 'icd10_proc', 'icd10_diag',
    'target', 'num_words', 'num_targets',
]
final_df = final_df.drop(columns=columns_to_drop)

In [177]:
# Display final_df after the column drop in the previous cell.
final_df

Unnamed: 0,text,distractors,response
0,name unit no admission date discharge date dat...,"[037N46Z, 03SD0ZZ, S22082S, 0RB63ZZ, L02222, 0...","[02HV33Z, 03VG0CZ, 3E0G76Z, E039, E872, G43909..."
1,name unit no admission date discharge date dat...,"[0P9C0ZZ, T457X4A, 4A003RD, O99512, F07Z8DZ, T...","[D631, E839, E872, E875, N179, N186, N319, N39..."
2,name unit no admission date discharge date dat...,"[T809, S20441, 0G9R30Z, 02RL08Z, 02WA0RS, 0NQ4...","[C91Z0, E1151, E7800, G4730, I129, I2510, I503..."
3,name unit no admission date discharge date dat...,"[M856, S63655A, F1212, 0TL67DZ, T411X5A, O694X...","[0D568ZZ, D62, E1122, E11319, E669, E7800, E87..."
4,name unit no admission date discharge date dat...,"[00FUXZZ, J122, S63209A, S62667P, T5091, T3452]","[0HBHXZZ, 0HRKX74, 0JBN0ZZ, I129, I4891, N182,..."
...,...,...,...
10495,name unit no admission date discharge date dat...,"[01N60ZZ, W241]","[D320, E039, E785, G8191, G935, G936, R4701]"
10496,name unit no admission date discharge date dat...,"[V8666XD, T1721, T288, T7808XA, 0TTD4ZZ]","[0QS704Z, 0SRS0JZ, F0390, S72002A, S72112A, W1..."
10497,name unit no admission date discharge date dat...,"[S33120, S5640, F941]","[00160J6, 00P60JZ, E119, G919, I10, R251, Y831..."
10498,name unit no admission date discharge date dat...,"[Z3A25, D303, 7W04X7Z, 02CX0ZZ, B97, 0DVL0DZ, ...","[0QSG04Z, E669, E785, F17210, I10, M1711, R079..."


In [178]:
# For every row, merge the response codes with the distractors and shuffle
# the merged list in place with the stdlib RNG.
codes = []
for row_response, row_distractors in zip(final_df['response'], final_df['distractors']):
    merged = [*row_response, *row_distractors]
    rand.shuffle(merged)
    codes.append(merged)

final_df['codes'] = codes

In [179]:
# Display final_df with the newly added `codes` column.
final_df

Unnamed: 0,text,distractors,response,codes
0,name unit no admission date discharge date dat...,"[037N46Z, 03SD0ZZ, S22082S, 0RB63ZZ, L02222, 0...","[02HV33Z, 03VG0CZ, 3E0G76Z, E039, E872, G43909...","[02PA07Z, 3E0G76Z, F1495, 0RPVX0Z, 03SD0ZZ, T5..."
1,name unit no admission date discharge date dat...,"[0P9C0ZZ, T457X4A, 4A003RD, O99512, F07Z8DZ, T...","[D631, E839, E872, E875, N179, N186, N319, N39...","[0P9C0ZZ, W3183, 07QL3ZZ, N319, T8584, D631, 0..."
2,name unit no admission date discharge date dat...,"[T809, S20441, 0G9R30Z, 02RL08Z, 02WA0RS, 0NQ4...","[C91Z0, E1151, E7800, G4730, I129, I2510, I503...","[02WA0RS, I5030, 069D3ZX, I951, I2510, 0NQ40ZZ..."
3,name unit no admission date discharge date dat...,"[M856, S63655A, F1212, 0TL67DZ, T411X5A, O694X...","[0D568ZZ, D62, E1122, E11319, E669, E7800, E87...","[I739, E7800, E083543, T411X5A, 05P032Z, E1122..."
4,name unit no admission date discharge date dat...,"[00FUXZZ, J122, S63209A, S62667P, T5091, T3452]","[0HBHXZZ, 0HRKX74, 0JBN0ZZ, I129, I4891, N182,...","[J122, T3452, 0JBN0ZZ, T5091, Z7901, S62667P, ..."
...,...,...,...,...
10495,name unit no admission date discharge date dat...,"[01N60ZZ, W241]","[D320, E039, E785, G8191, G935, G936, R4701]","[W241, 01N60ZZ, R4701, D320, E785, G936, G8191..."
10496,name unit no admission date discharge date dat...,"[V8666XD, T1721, T288, T7808XA, 0TTD4ZZ]","[0QS704Z, 0SRS0JZ, F0390, S72002A, S72112A, W1...","[0TTD4ZZ, V8666XD, F0390, 0QS704Z, S72112A, S7..."
10497,name unit no admission date discharge date dat...,"[S33120, S5640, F941]","[00160J6, 00P60JZ, E119, G919, I10, R251, Y831...","[Y929, I10, 00160J6, S5640, E119, F941, Y831, ..."
10498,name unit no admission date discharge date dat...,"[Z3A25, D303, 7W04X7Z, 02CX0ZZ, B97, 0DVL0DZ, ...","[0QSG04Z, E669, E785, F17210, I10, M1711, R079...","[T23172A, B51L0ZZ, 0QSG04Z, I10, Z3A25, 0DVL0D..."


In [174]:
# NOTE(review): this cell's execution count (174) is lower than that of the
# cells above it, so its saved output may not match a top-to-bottom run.
final_df

Unnamed: 0,text,response,codes
0,name unit no admission date discharge date dat...,[Insertion of Infusion Device into Superior Ve...,"[M84312P, 0QHJ3BZ, E039, S24151A, 2W07X3Z, S66..."
1,name unit no admission date discharge date dat...,"[Anemia in chronic kidney disease, Disorder of...","[D631, S63266D, 4A003RD, E839, 0P9C0ZZ, N390, ..."
2,name unit no admission date discharge date dat...,[Other lymphoid leukemia not having achieved r...,"[N189, E1151, Z720, M1850, C91Z0, J1000, R7611..."
3,name unit no admission date discharge date dat...,"[Destruction of Stomach, Via Natural or Artifi...","[I130, S63655A, I2510, 0M9H4ZZ, 0Y6M0Z7, S8262..."
4,name unit no admission date discharge date dat...,"[Excision of Right Upper Leg Skin, External Ap...","[00FUXZZ, J122, T3452, S62667P, I4891, S63209A..."
...,...,...,...
10495,name unit no admission date discharge date dat...,"[Benign neoplasm of cerebral meninges, Hypothy...","[G8191, W241, R4701, D320, G936, E785, E039, G..."
10496,name unit no admission date discharge date dat...,[Reposition Left Upper Femur with Internal Fix...,"[0QS704Z, T288, W109XXA, Z85820, Y929, 0SRS0JZ..."
10497,name unit no admission date discharge date dat...,[Bypass Cerebral Ventricle to Peritoneal Cavit...,"[R251, S33120, G919, S5640, 00160J6, F941, Y92..."
10498,name unit no admission date discharge date dat...,[Reposition Right Tibia with Internal Fixation...,"[R079, T23172A, 02CX0ZZ, Z3A25, Z6841, 0QSG04Z..."


In [180]:
# Drop the distractor lists, then attach the long titles of the response
# codes and of the shuffled codes as two new columns.
final_df = final_df.drop(columns=['distractors'])
final_df['responseTitles'] = final_df['response'].apply(assign_title)
final_df['codeTitles'] = final_df['codes'].apply(assign_title)

In [None]:
# TODO: write a function that combines the `responseTitles` column and the
# `response` column into a new `response` column whose entries are formatted as:
# "response - response title"


In [181]:
# Flatten each list of codes into a single " | "-separated string.
separator = " | "
final_df['response'] = final_df['response'].map(separator.join)
final_df['codes'] = final_df['codes'].map(separator.join)

In [182]:
# Rename text -> input and response -> output. Both new columns end up at
# the end of the frame, in that order.
final_df = (
    final_df
    .assign(input=final_df['text'], output=final_df['response'])
    .drop(columns=['text', 'response'])
)

In [186]:
# NOTE(review): the saved output below shows titles in `codes` / `output`,
# whereas the cells above join the raw codes. The output appears to come from
# a different version of those cells; confirm which one is intended.
final_df

Unnamed: 0,codes,input,output
0,Removal of Autologous Tissue Substitute from H...,name unit no admission date discharge date dat...,Insertion of Infusion Device into Superior Ven...
1,"Drainage of Right Humeral Head, Open Approach ...",name unit no admission date discharge date dat...,Anemia in chronic kidney disease | Disorder of...
2,Revision of Biventricular Short-term External ...,name unit no admission date discharge date dat...,Other lymphoid leukemia not having achieved re...
3,"Peripheral vascular disease, unspecified | Pur...",name unit no admission date discharge date dat...,"Destruction of Stomach, Via Natural or Artific..."
4,Parainfluenza virus pneumonia | Frostbite with...,name unit no admission date discharge date dat...,"Excision of Right Upper Leg Skin, External App..."
...,...,...,...
10495,"Contact with transmission devices, not elsewhe...",name unit no admission date discharge date dat...,Benign neoplasm of cerebral meninges | Hypothy...
10496,"Resection of Urethra, Percutaneous Endoscopic ...",name unit no admission date discharge date dat...,Reposition Left Upper Femur with Internal Fixa...
10497,Unspecified place or not applicable | Essentia...,name unit no admission date discharge date dat...,Bypass Cerebral Ventricle to Peritoneal Cavity...
10498,"Burn of first degree of left wrist, initial en...",name unit no admission date discharge date dat...,Reposition Right Tibia with Internal Fixation ...


In [185]:
# Save the current final_df to CSV without writing the index as a column.
final_df.to_csv("D:/TESI/Datasets/test_with_distractors_alpaca.csv", index=False)