In [None]:
! pip install medcat==1.2.3

**If you are on Colab, restart the runtime now; this is sometimes necessary after installing packages.**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import json 

from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.config import Config
from medcat.vocab import Vocab
from medcat.meta_cat import MetaCAT
from medcat.preprocessing.tokenizers import TokenizerWrapperBPE
from tokenizers import ByteLevelBPETokenizer

  from tqdm.autonotebook import tqdm


In [None]:
# Folders (relative to the notebook) holding the input data and the models
DATA_DIR = "./data/"
MODEL_DIR = "./models/"
# Location of the MedCAT model pack archive inside MODEL_DIR
model_pack_path = f"{MODEL_DIR}medmen_wstatus_2021_oct.zip"

In [None]:
# Download the models and required data
!wget https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/pt_notes.csv -P ./data/
!wget https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/MedCAT_Export.json -P ./data/

# Download the medcat modelpack
!wget https://medcat.rosalind.kcl.ac.uk/media/medmen_wstatus_2021_oct.zip -P ./models/



2021-10-24 21:01:49 (18.3 MB/s) - ‘./models/medmen_wstatus_2021_oct.zip’ saved [561947681/561947681]



In [None]:
# Load the model pack and create CAT - the main class from medcat used for concept annotation.
# This creates the full pipeline, with models for meta-annotations.
# NOTE(review): which meta-annotation models are present depends on the model pack contents;
# later cells rely on a "Status" meta-annotation being available.
cat = CAT.load_model_pack(model_pack_path)

Unziping the model pack and loading models.


In [None]:
# Restrict detection to diseases and mental disorders, selected by semantic type:
#   T047 - Disease or Syndrome
#   T048 - Mental or Behavioral Dysfunction
type_ids_filter = ['T047', 'T048']
type_id2cuis = cat.cdb.addl_info['type_id2cuis']
# Union of all CUIs that belong to any of the selected semantic types
cui_filters = set().union(*(type_id2cuis[type_id] for type_id in type_ids_filter))
# Install the resulting set as the CUI filter of the linking configuration
cat.cdb.config.linking['filters']['cuis'] = cui_filters

In [None]:
!wget https://raw.githubusercontent.com/CogStack/MedCAT/master/tutorial/data/pt_notes.csv -P ./data/

--2021-10-24 21:05:03--  https://raw.githubusercontent.com/CogStack/MedCAT/master/tutorial/data/pt_notes.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3644222 (3.5M) [text/plain]
Saving to: ‘./data/pt_notes.csv.1’


2021-10-24 21:05:03 (48.7 MB/s) - ‘./data/pt_notes.csv.1’ saved [3644222/3644222]



## Document annotation

The following is a replica of the document annotation code done in [Part 3.2.](https://colab.research.google.com/drive/1q29RbHlZoFK7TcvMKITi3ABbE-E_fw30), with the only change that we have meta-annotations in the pipeline now.

In [None]:
# Read the patient notes table from the data folder and preview its first rows
notes_csv_path = DATA_DIR + "pt_notes.csv"
data = pd.read_csv(notes_csv_path)
data.head()

Unnamed: 0,Unnamed: 0_x,subject_id,chartdate,category,text,create_year,Unnamed: 0_y,gender,dob,dob_year,age_year
0,6,1,2079-01-01,General Medicine,"HISTORY OF PRESENT ILLNESS:, The patient is a ...",2079,1,F,2018-01-01,2018,61
1,7,1,2079-01-01,Rheumatology,"HISTORY OF PRESENT ILLNESS: , A 71-year-old fe...",2079,1,F,2018-01-01,2018,61
2,8,1,2079-01-01,Consult - History and Phy.,"HISTORY OF PRESENT ILLNESS:, The patient is a ...",2079,1,F,2018-01-01,2018,61
3,9,2,2037-01-01,Consult - History and Phy.,"CHIEF COMPLAINT:,1. Infection.,2. Pelvic pai...",2037,2,F,2018-01-01,2018,19
4,10,2,2037-01-01,Dermatology,"SUBJECTIVE:, This is a 29-year-old Vietnamese...",2037,2,F,2018-01-01,2018,19


In [None]:
# Dimensions of the notes table as (number of rows, number of columns)
data.shape

(1088, 11)

In [None]:
# Format the df to match the input required for multiprocessing:
# [(doc_id, doc_text), (doc_id, doc_text), ...]
def data_iterator(data):
    """Yield one ``(doc_id, doc_text)`` tuple per row of ``data``.

    Args:
        data: DataFrame with a ``text`` column. The index label of each row
            is used as the document id.

    Yields:
        Tuples ``(index_label, text)``. The text is always passed through
        ``str``, so non-string values (e.g. a missing value) are yielded as
        their string form rather than raising later in the pipeline.
    """
    # Iterating the column directly avoids building a Series for every row
    # (as `iterrows` does) and does not shadow the builtin `id`.
    for doc_id, text in data['text'].items():
        yield (doc_id, str(text))

In [None]:
# Size each batch by total number of characters (not number of documents)
# to control for the variability between document sizes
batch_size_chars = 500000 # Batch size (BS) in number of characters

# Formatted data: a stream of (doc_id, doc_text) tuples
doc_stream = data_iterator(data)

# Run the model over all documents
results = cat.multiprocessing(
    doc_stream,
    batch_size_chars=batch_size_chars,
    nproc=8,  # Number of processors
)

In [None]:
# Map from CUI to the list of documents in which it appears:
# {"cui": [<doc_id>, <doc_id>, ...], ...}
cui_location = dict()
# Same kind of map, but keyed by TUI (semantic type)
type_ids_location = dict()

In [None]:
# Keep only the CUIs whose Status meta-annotation is "Affirmed", together with
# their corresponding documents
for doc_id, doc_result in results.items():
  for entity in doc_result['entities'].values():
    cui = entity['cui']
    if entity['meta_anns']['Status']['value'] != 'Affirmed':
      continue
    # The document id is recorded once per affirmed mention of the CUI
    cui_location.setdefault(cui, []).append(doc_id)

In [None]:
# Build the map from type_id (semantic type) to the documents in which it appears.
# A CUI may map to more than one type_id (one-to-many mapping), so its documents
# are recorded under every one of them. Documents are accumulated per type_id:
# a plain assignment would let each CUI overwrite the documents of the previous
# CUI that shares the same type_id.
type_ids_location = {}  # start fresh so re-running this cell does not duplicate entries
for cui, doc_ids in cui_location.items():
  # sorted() gives a deterministic order over the type_ids of this CUI
  for type_id in sorted(cat.cdb.cui2type_ids[cui]):
    type_ids_location.setdefault(type_id, []).extend(doc_ids)

## Done

We have now annotated all documents in our dataset, and for each CUI (Concept Unique Identifier) we know in which documents it appears. We also know that all the detected entities have the status "Affirmed".

---

Please note that the number of examples I have provided is not enough to fully train the "Status" meta-annotation and one would need to provide more.

In [None]:
# For example, the concept with CUI: C0020538 (hypertension) appears in the documents below.
# `set` collapses repeated document ids, since a document is stored once per affirmed mention.
set(cui_location['C0020538'])

In [None]:
# Save concept location in corpus as JSON (`json` is imported in the imports cell at the top).
# The context manager closes the file, flushing everything to disk, as soon as
# the dump finishes instead of leaving an open handle behind.
with open("./cui_location.json", 'w') as out_file:
    json.dump(cui_location, out_file)

End of tutorial