In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive/My\ Drive/Colab\ Notebooks/SCH_Proposal

/content/drive/My Drive/Colab Notebooks/SCH_Proposal


## Clone stanza

In [None]:
# Clone the stanza source repository into the current working directory
!git clone https://github.com/stanfordnlp/stanza.git

Cloning into 'stanza'...
remote: Enumerating objects: 17013, done.[K
remote: Counting objects: 100% (1260/1260), done.[K
remote: Compressing objects: 100% (460/460), done.[K
remote: Total 17013 (delta 919), reused 1039 (delta 795), pack-reused 15753[K
Receiving objects: 100% (17013/17013), 8.57 MiB | 5.45 MiB/s, done.
Resolving deltas: 100% (12399/12399), done.
Checking out files: 100% (290/290), done.


In [None]:
# Install stanza straight from its GitHub repository (-q keeps pip output quiet)
!pip3 install -q git+https://github.com/stanfordnlp/stanza.git

[?25l[K     |██                              | 10 kB 37.8 MB/s eta 0:00:01[K     |███▉                            | 20 kB 43.7 MB/s eta 0:00:01[K     |█████▉                          | 30 kB 49.2 MB/s eta 0:00:01[K     |███████▊                        | 40 kB 51.5 MB/s eta 0:00:01[K     |█████████▋                      | 51 kB 37.5 MB/s eta 0:00:01[K     |███████████▋                    | 61 kB 39.8 MB/s eta 0:00:01[K     |█████████████▌                  | 71 kB 33.6 MB/s eta 0:00:01[K     |███████████████▍                | 81 kB 35.9 MB/s eta 0:00:01[K     |█████████████████▍              | 92 kB 38.1 MB/s eta 0:00:01[K     |███████████████████▎            | 102 kB 40.1 MB/s eta 0:00:01[K     |█████████████████████▏          | 112 kB 40.1 MB/s eta 0:00:01[K     |███████████████████████▏        | 122 kB 40.1 MB/s eta 0:00:01[K     |█████████████████████████       | 133 kB 40.1 MB/s eta 0:00:01[K     |███████████████████████████     | 143 kB 40.1 MB/s eta 0:

## Stanza biomedical and clinical models — usage guide: https://stanfordnlp.github.io/stanza/biomed_model_usage.html

In [None]:
!pip3 install jsonlines

Collecting jsonlines
  Downloading jsonlines-3.0.0-py3-none-any.whl (8.5 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.0.0


In [None]:
import stanza
import pandas as pd
import numpy as np
from collections import defaultdict
import os
import pickle as pk
import jsonlines
import json

## Method to extract named entities from a clinical note

In [None]:
def _extract_named_entities(clinical_note, i2b2_model):
  lst_entities = []

  doc_i2b2 = i2b2_model(clinical_note)
  for ent in doc_i2b2.entities:
    lst_entities.append(ent.text)
  
  lst_entities = list(set(lst_entities))
  str_entities = " | ".join(lst_entities)
  
  return str_entities
  

### Main method for heart failure data from UIC

In [None]:
def main(input_fileName="heart_failure_procedures.csv",
         output_fileName="heart_failure_procedures_w_named_entites.jsonl"):
  """Annotate the heart-failure procedures CSV with named entities.

  Reads `input_fileName` with pandas, runs the i2b2 NER model over the
  `finding` and `impression` columns of every row, and appends one JSON
  object per row to `output_fileName` (JSON Lines).

  Args:
    input_fileName: CSV file with the columns `record_id`, `procedure_name`,
      `finding` and `impression`.
    output_fileName: JSON Lines file to append to. It is opened in append
      mode, so running this twice adds the records twice.

  Raises:
    KeyError: if one of the required columns is missing from the CSV.

  NOTE: this notebook defines `main` a second time in a later cell; whichever
  definition cell ran last is the one a `main()` call executes.
  """
  # download and initialize a mimic pipeline with an i2b2 NER model
  stanza.download('en', package='mimic', processors={'ner': 'i2b2'})
  nlp_i2b2 = stanza.Pipeline('en', package='mimic', processors={'ner': 'i2b2'})

  # read the dataframe with findings and impressions
  df = pd.read_csv(input_fileName)
  columns = ["record_id", "procedure_name", "finding", "impression"]

  # Open the output once instead of once per record. flush=True keeps the
  # previous durability: every record reaches the file as soon as it is written,
  # so an interrupted run still leaves the processed rows on disk.
  with jsonlines.open(output_fileName, "a", flush=True) as writer:
    # itertuples keeps each column's own dtype and avoids a chained
    # df[col][idx] lookup per field
    for row in df[columns].itertuples(index=True):
      if row.Index % 1000 == 0:
        print("Iteration: ", row.Index)

      # NOTE(review): str() turns a missing cell (NaN) into the text "nan",
      # which is then sent to the NER model; kept as in the original - confirm
      # whether missing values should be skipped instead
      record_id = str(row.record_id)
      procedure_name = str(row.procedure_name)
      finding = str(row.finding)
      impression = str(row.impression)

      # call to the named entity extractor method for both finding and impression
      finding_entities = _extract_named_entities(finding, nlp_i2b2)
      impression_entities = _extract_named_entities(impression, nlp_i2b2)

      writer.write({"record_id" : record_id,
                    "procedure_name" : procedure_name,
                    "finding" : finding,
                    "impression" : impression,
                    "finding_entities" : finding_entities,
                    "impression_entities" : impression_entities
                    })



In [None]:
# Run the heart-failure extraction defined by main() in the cell above
if __name__ == "__main__":
  main()

### Main method for ECGEN data from Indiana University

In [None]:
def main(input_fileName="NLMCXR_reports_ecgen_radiology.jsonl",
         output_fileName="NLMCXR_reports_ecgen_radiology_w_named_entites.jsonl"):
  """Annotate the ECGEN radiology reports with named entities.

  Reads `input_fileName` line by line (one JSON object per line), runs the
  i2b2 NER model over the FINDINGS and IMPRESSION fields, and appends one
  JSON object per report to `output_fileName` (JSON Lines). The output holds
  named entities for both findings and impressions.

  Args:
    input_fileName: JSON Lines file whose objects have the keys
      "INDICATION", "FINDINGS" and "IMPRESSION".
    output_fileName: JSON Lines file to append to. It is opened in append
      mode, so running this twice adds the records twice.

  Raises:
    KeyError: if a report lacks one of the required keys.
    json.JSONDecodeError: if a non-blank line is not valid JSON.

  NOTE: this notebook also defines `main` in an earlier cell; whichever
  definition cell ran last is the one a `main()` call executes.
  """
  # download and initialize a mimic pipeline with an i2b2 NER model
  stanza.download('en', package='mimic', processors={'ner': 'i2b2'})
  nlp_i2b2 = stanza.Pipeline('en', package='mimic', processors={'ner': 'i2b2'})

  # Both files are opened once and closed by the with-statement. flush=True
  # pushes every record to disk immediately, as the former open-per-record
  # code did, so an interrupted run keeps the reports processed so far.
  with open(input_fileName) as fp, \
       jsonlines.open(output_fileName, "a", flush=True) as writer:
    for line_no, line in enumerate(fp):
      if line_no % 1000 == 0:
        print("Iteration: ", line_no)

      # a blank line (e.g. a trailing newline at the end of the file) is not
      # a record; skip it instead of crashing in json.loads
      if not line.strip():
        continue
      dict_data = json.loads(line)

      indication = str(dict_data["INDICATION"])
      finding = str(dict_data["FINDINGS"])
      impression = str(dict_data["IMPRESSION"])

      # call to the named entity extractor method for both finding and impression
      finding_entities = _extract_named_entities(finding, nlp_i2b2)
      impression_entities = _extract_named_entities(impression, nlp_i2b2)

      writer.write({"indication" : indication,
                    "finding" : finding,
                    "impression" : impression,
                    "finding_entities" : finding_entities,
                    "impression_entities" : impression_entities
                    })

In [None]:
# Run the ECGEN extraction defined by main() in the cell above
if __name__ == "__main__":
  main()

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2021-12-18 08:22:14 INFO: Downloading these customized packages for language: en (English)...
| Processor       | Package |
-----------------------------
| tokenize        | mimic   |
| pos             | mimic   |
| lemma           | mimic   |
| depparse        | mimic   |
| ner             | i2b2    |
| forward_charlm  | mimic   |
| pretrain        | mimic   |
| backward_charlm | mimic   |

2021-12-18 08:22:14 INFO: File exists: /root/stanza_resources/en/tokenize/mimic.pt.
2021-12-18 08:22:14 INFO: File exists: /root/stanza_resources/en/pos/mimic.pt.
2021-12-18 08:22:14 INFO: File exists: /root/stanza_resources/en/lemma/mimic.pt.
2021-12-18 08:22:14 INFO: File exists: /root/stanza_resources/en/depparse/mimic.pt.
2021-12-18 08:22:14 INFO: File exists: /root/stanza_resources/en/ner/i2b2.pt.
2021-12-18 08:22:14 INFO: File exists: /root/stanza_resources/en/forward_charlm/mimic.pt.
2021-12-18 08:22:14 INFO: File exists: /root/stanza_resources/en/pretrain/mimic.pt.
2021-12-18 08:22:15 INFO:

Iteration:  0
Iteration:  1000
Iteration:  2000
Iteration:  3000
