In [1]:
# Third-party imports used throughout the notebook.
import numpy as np
import pandas as pd
from google.colab import drive

# Make the Google Drive contents reachable under /content/drive.
drive.mount('/content/drive')

# Base folders of the MIMIC-IV-Note and MIMIC-IV files (version 2.2) on Drive.
MIMIC_NOTE_PATH = "/content/drive/MyDrive/MIMIC-IV/files/mimic-iv-note/2.2/"
MIMIC_PATH = "/content/drive/MyDrive/MIMIC-IV/files/mimiciv/2.2/"

Mounted at /content/drive


# IMPORT CODES AND FILTERING

In [2]:
# Read the ICD dictionary table and keep only the rows flagged as ICD-10.
d_icd_diagnoses = pd.read_csv(
    MIMIC_PATH + "hosp/d_icd_diagnoses.csv.gz",
    compression="gzip",
)
d_icd10_diagnoses = d_icd_diagnoses[d_icd_diagnoses["icd_version"] == 10]

# Set of ICD-10 codes with any '.' removed.
mimicCodes = {code.replace('.', '') for code in d_icd10_diagnoses["icd_code"]}

In [3]:
def read_order(file_path):
    """Parse a whitespace-separated order file into a DataFrame of codes and flags.

    Every line is split on whitespace. The first token is ignored, the second
    is stored as the code and the third is compared with ``'1'`` to obtain the
    boolean flag. Lines with fewer than three tokens (e.g. blank lines) are
    skipped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        pandas.DataFrame with the columns ``'Codice'`` (the code string) and
        ``'Flag'`` (``True`` when the third token is ``'1'``), one row per
        parsed line, in file order.
    """
    rows = []
    # Only the second and third tokens are used, so undecodable bytes in the
    # rest of a line are replaced instead of aborting the whole read.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            # split() without arguments collapses whitespace runs and drops the
            # trailing newline, so a flag that is the last token on its line
            # ('1\n') is still recognised.
            fields = line.split()
            if len(fields) < 3:
                continue
            rows.append((fields[1], fields[2] == '1'))
    return pd.DataFrame(rows, columns=['Codice', 'Flag'])

In [4]:
# Parse the order file and keep only the codes whose flag is set.
# NOTE(review): presumably the flag marks codes that may actually be assigned
# (hence the variable name) - confirm against the order-file specification.
file_path = '/content/drive/MyDrive/Data/2020order.txt'
assignableCodes = read_order(file_path)
assignableCodes = set(assignableCodes.loc[assignableCodes["Flag"].astype(bool), "Codice"])

# Codes that appear both in the MIMIC-IV ICD-10 dictionary and among the flagged codes.
filtered_mimicCodes = mimicCodes.intersection(assignableCodes)

# Sort instead of list(set): set iteration order for strings changes between
# kernel restarts, which would make the order of `codes` (and of everything
# indexed in parallel with it) non-reproducible.
codes = sorted(filtered_mimicCodes)

In [5]:
# Unbind the intermediates of the code-filtering step to free memory;
# `mimicCodes`, `codes` and `d_icd10_diagnoses` remain available.
del d_icd_diagnoses, assignableCodes, filtered_mimicCodes

# UTILITY FUNCTIONS

In [6]:
import re

def preprocess_text(
    text: str,
    lower: bool = True,
    remove_special_characters_mullenbach: bool = True,
    remove_special_characters: bool = False,
    remove_digits: bool = True,
    remove_accents: bool = False,
    remove_brackets: bool = False,
    convert_danish_characters: bool = False
) -> str:
    """Normalise a text with a fixed sequence of optional regex clean-ups.

    The enabled steps always run in the order listed below, regardless of the
    order of the keyword arguments. Whitespace runs are finally collapsed to a
    single space and the result is stripped.

    Args:
        text: Text to clean.
        lower: Lower-case the text first.
        convert_danish_characters: Replace "å", "æ", "ø" with "aa", "ae", "oe".
        remove_accents: Replace the lower-case accented vowels é/è/ê, á/à/â
            and ô/ó/ò with e, a and o.
        remove_brackets: Delete every ``[...]`` span, brackets included.
        remove_special_characters: Turn newlines, "/" and "-" into spaces, then
            delete every character that is not an ASCII letter, digit, space
            or period.
        remove_special_characters_mullenbach: Replace every run of characters
            other than ASCII letters, digits and periods with a single space.
        remove_digits: Replace runs of whitespace-separated all-digit tokens
            with a single space. The pattern needs whitespace on both sides,
            so digits at the very start or end of the text, or attached to
            other characters, are kept.

    Returns:
        The cleaned text.
    """
    if lower:
        text = text.lower()
    if convert_danish_characters:
        text = re.sub("å", "aa", text)
        text = re.sub("æ", "ae", text)
        text = re.sub("ø", "oe", text)
    if remove_accents:
        text = re.sub("é|è|ê", "e", text)
        text = re.sub("á|à|â", "a", text)
        text = re.sub("ô|ó|ò", "o", text)
    # Patterns containing regex escapes are raw strings: "\[", "\s" and "\d"
    # are invalid escape sequences in ordinary string literals and trigger a
    # warning on current Python versions.
    if remove_brackets:
        text = re.sub(r"\[[^]]*\]", "", text)
    if remove_special_characters:
        text = re.sub("\n|/|-", " ", text)
        text = re.sub("[^a-zA-Z0-9 .]", "", text)  # keeps periods
    if remove_special_characters_mullenbach:
        text = re.sub("[^A-Za-z0-9.]+", " ", text)  # keeps periods
    if remove_digits:
        text = re.sub(r"(\s\d+)+\s", " ", text)

    text = re.sub(r"\s+", " ", text)
    text = text.strip()

    return text

In [7]:
# DICTIONARY CODE -> DESCRIPTION
cod2lbl = dict(zip(d_icd10_diagnoses["icd_code"], d_icd10_diagnoses["long_title"]))

# DICTIONARY DESCRIPTION -> CODE
# (if several codes share one description, the last one in table order wins)
lbl2cod = dict(zip(d_icd10_diagnoses["long_title"], d_icd10_diagnoses["icd_code"]))

# FUNCTION TO ASSIGN DESCRIPTION TO CODES
def assign_title(x):
    """Return the descriptions of the codes in ``x``, in the same order.

    Any '.' is removed from a code before it is looked up in ``cod2lbl``;
    a code missing from that dictionary raises ``KeyError``.
    """
    descriptions = []
    for code in x:
        descriptions.append(cod2lbl[code.replace('.', '')])
    return descriptions

# FUNCTION TO ASSIGN CODES TO DESCRIPTION
def assign_codes(x):
    """Return the codes of the descriptions in ``x``, in the same order.

    Each description is looked up verbatim in ``lbl2cod``; a description
    missing from that dictionary raises ``KeyError``.
    """
    found_codes = []
    for description in x:
        found_codes.append(lbl2cod[description])
    return found_codes

In [8]:
# Unbind the ICD-10 dictionary frame now that cod2lbl / lbl2cod have been built from it.
# NOTE: the cell that builds those dictionaries reads this frame, so it cannot be
# re-run after this point without reloading the frame first.
del d_icd10_diagnoses

In [9]:
def textToSentences(text, max_words=30, chunk_size=10, tokenizer=None):
  """Split a text into sentences, cutting over-long sentences into word chunks.

  Args:
    text: Text to split.
    max_words: A sentence with more than this many whitespace-separated words
      is replaced by its chunks.
    chunk_size: Number of words per chunk (the last chunk may be shorter).
    tokenizer: Callable mapping a text to a list of sentences. Defaults to
      NLTK's ``sent_tokenize``, looked up when the function is called.

  Returns:
    List of strings in document order: short sentences unchanged, long
    sentences as consecutive chunks of at most ``chunk_size`` words.
  """
  if tokenizer is None:
    tokenizer = sent_tokenize
  new_sentences = []
  for sentence in tokenizer(text):
    words = sentence.split()
    if len(words) > max_words:
      # Keep the chunks: previously they were computed and then discarded,
      # so every long sentence vanished from the result.
      new_sentences.extend(
          " ".join(words[start:start + chunk_size])
          for start in range(0, len(words), chunk_size)
      )
    else:
      new_sentences.append(sentence)
  return new_sentences

# IMPORT NOTES

In [10]:
# MIMIC-IV SPLITS
# One table; the "split" column assigns each row to train, val or test.
split = pd.read_feather("/content/drive/MyDrive/MIMIC-IV-SPLIT/mimiciv_icd10/mimiciv_icd10_split.feather")
train, val, test = (
    split[split["split"] == partition] for partition in ("train", "val", "test")
)

In [11]:
# MIMIC-IV ELABORATED TABLES
# Only the test partition is materialised: rows whose _id appears in the test
# split, re-indexed from 0. The train / val partitions are not built here.
icd10_test_df = (
    pd.read_feather("/content/drive/MyDrive/Data/mimiciv_icd10.feather")
    .loc[lambda frame: frame['_id'].isin(test['_id'])]
    .reset_index(drop=True)
)

# Both names refer to the same test-partition frame.
icd10_df = icd10_test_df



In [12]:
# Draw a fixed 20-row sample of the test-partition frame (seeded, so the same rows every run).
# NOTE(review): this rebinds `test`, which until here held the rows of the split
# table - a distinct name would make the lineage clearer.
test = icd10_df.sample(20, random_state=42)

In [13]:
# Unbind both names of the full test-partition frame; the evaluation below
# iterates over `test`, the 20-row sample.
del icd10_test_df, icd10_df

In [14]:
import gc
# Run a garbage-collection pass now that the large frames have been unbound;
# the displayed value is the return value of gc.collect().
gc.collect()

0

# IMPORT SCISPACY

In [1]:
%%capture
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz

import spacy
import scispacy
from scispacy.linking import EntityLinker

In [2]:
# Value passed to the linker as its "threshold" setting.
# NOTE(review): presumably the minimum score a candidate concept must reach to
# be linked - confirm against the scispacy linker documentation.
THRESHOLD = 0.999999

# Load a pre-trained model and append the entity linker to its pipeline.
# add_pipe returns the component it created, so keep it directly as `linker`.
nlp = spacy.load("en_core_sci_sm")
linker = nlp.add_pipe(
    "scispacy_linker",
    config={
        "resolve_abbreviations": True,
        "linker_name": "umls",
        "threshold": THRESHOLD,
    },
)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


# IMPORT TOKENIZER AND SENTENCE TRANSFORMER

In [17]:
import nltk
from nltk.tokenize import sent_tokenize
# Sentence-tokenizer data for sent_tokenize: older NLTK releases load 'punkt',
# recent ones look for 'punkt_tab' instead, so fetch both to work with either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [18]:
%%capture
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('NeuML/pubmedbert-base-embeddings')

# CODE SELECTION

In [None]:
# Descriptions of the candidate codes, in the same order as `codes`, and their
# embeddings (row i of embeddings2 belongs to titles[i]).
titles = list(map(cod2lbl.__getitem__, codes))
embeddings2 = model.encode(titles)

In [None]:
# For each sampled note: link its entities to knowledge-base concepts, embed the
# concept names, keep the TOP_K code descriptions most similar to any of those
# names, and measure which share of the note's target descriptions was kept.
TOP_K = 100  # number of code descriptions selected per note

meanPercentage = 0
meanNumCodes = 0
evaluatedNotes = 0  # notes that actually contribute to the two averages

for index, row in test.iterrows():
  targetNames = set(assign_title(list(row["icd10_diag"])))
  if not targetNames:
    # Without target codes the percentage is undefined (division by zero),
    # so the note is left out of the averages.
    continue
  note = row["raw_text"]

  # Canonical name and aliases of every concept linked to an entity of the note.
  # A sentence-by-sentence variant (via textToSentences) existed here as
  # commented-out code and is not used.
  entities = set()
  for entity in nlp(note).ents:
    for umls_ent in entity._.kb_ents:
      # umls_ent[0] is the key into the linker's concept table.
      concept = linker.kb.cui_to_entity[umls_ent[0]]
      entities.add(concept.canonical_name)
      entities.update(concept.aliases)

  if entities:
    entities = list(entities)
    embeddings1 = model.encode(entities)

    # similarity[r, c]: entity name r versus code description c.
    similarity = model.similarity(embeddings1, embeddings2)

    # Best score per code description, reduced on the tensor in one call and
    # moved to NumPy once (instead of one transfer per column).
    maxvals = similarity.max(dim=0).values.cpu().numpy()

    top_indices = np.argsort(maxvals)[::-1][:TOP_K]
    selectedCodes = set(titles[i] for i in top_indices)
  else:
    # No linked entity: nothing to compare, so nothing is selected
    # (np.max over an empty column would raise).
    selectedCodes = set()

  intersection = targetNames.intersection(selectedCodes)
  percentage = (len(intersection) / len(targetNames)) * 100
  print(f"Percentage of targetNames in selectedCodes: {percentage:.2f}%")

  print(f"num of selected codes: {len(selectedCodes)}")

  meanPercentage += percentage
  meanNumCodes += len(selectedCodes)
  evaluatedNotes += 1

if evaluatedNotes:
  print(f"mean percentage: {meanPercentage / evaluatedNotes}")
  print(f"mean num of selected codes: {meanNumCodes / evaluatedNotes}")
else:
  print("no notes with target codes to evaluate")