In [1]:
! pip install medcat==1.3.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**If you are running on Colab, restart the runtime now - this is sometimes necessary after the installation above.**

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import json 

from matplotlib import pyplot as plt
from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.config import Config
from medcat.vocab import Vocab
from medcat.meta_cat import MetaCAT
from medcat.preprocessing.tokenizers import TokenizerWrapperBPE
from tokenizers import ByteLevelBPETokenizer

 from tqdm.autonotebook import tqdm, trange


In [3]:
# Directories used for the downloaded data and models (the trailing slash is kept
# so that file paths can be built by plain string concatenation)
DATA_DIR = "./data/"
MODEL_DIR = "./models/"
# Full path of the MedCAT model pack archive
model_pack_path = f"{MODEL_DIR}medmen_wstatus_2021_oct.zip"

In [4]:
# Download the models and required data
!wget https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/pt_notes.csv -P ./data/
!wget https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/MedCAT_Export.json -P ./data/

# Download the medcat modelpack
!wget https://medcat.rosalind.kcl.ac.uk/media/medmen_wstatus_2021_oct.zip -P ./models/


--2022-08-24 11:39:28-- https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/pt_notes.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3644222 (3.5M) [text/plain]
Saving to: ‘./data/pt_notes.csv’


2022-08-24 11:39:28 (193 MB/s) - ‘./data/pt_notes.csv’ saved [3644222/3644222]

--2022-08-24 11:39:28-- https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/MedCAT_Export.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 272538 (266K) [text/plain]
Saving

In [5]:
# Load the model pack found at `model_pack_path` and create `cat`, an instance of CAT -
# the main class from medcat used for concept annotation.
# The model pack is expected to provide the full pipeline, including the models for meta-annotations.
cat = CAT.load_model_pack(model_pack_path)

INFO:medcat:{
 "Model ID": null,
 "Last Modified On": null,
 "History (from least to most recent)": [],
 "Description": "No description",
 "Source Ontology": null,
 "Location": null,
 "MetaCAT models": {},
 "Basic CDB Stats": {},
 "Performance": {
 "ner": {},
 "meta": {}
 },
 "Important Parameters (Partial view, all available in cat.config)": {
 "config.ner['min_name_len']": {
 "value": 3,
 "description": "Minimum detection length (found terms/mentions shorter than this will not be detected)."
 },
 "config.ner['upper_case_limit_len']": {
 "value": 3,
 "description": "All detected terms shorter than this value have to be uppercase, otherwise they will be ignored."
 },
 "config.linking['similarity_threshold']": {
 "value": 0.2,
 "description": "If the confidence of the model is lower than this a detection will be ignore."
 },
 "config.general['spell_check']": {
 "value": true,
 "description": "Is spell checking enabled."
 },
 "config.general['spell_check_len_limit']": {
 "value": 7,
 "de

In [6]:
# Restrict detection to Diseases and Mental Disorders, selected by type id:
#   T047 - Disease or Syndrome
#   T048 - Mental or Behavioral Dysfunction
type_ids_filter = ['T047', 'T048']

# Union of the CUIs registered under each selected type id
cui_filters = set().union(
    *(cat.cdb.addl_info['type_id2cuis'][type_id] for type_id in type_ids_filter)
)

# Install the resulting set as the CUI filter in the linking configuration
cat.cdb.config.linking['filters']['cuis'] = cui_filters

In [7]:
!wget https://raw.githubusercontent.com/CogStack/MedCAT/master/tutorial/data/pt_notes.csv -P ./data/

--2022-08-24 11:40:00-- https://raw.githubusercontent.com/CogStack/MedCAT/master/tutorial/data/pt_notes.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2022-08-24 11:40:01 ERROR 404: Not Found.



## Document annotation

The following replicates the document annotation code from [Part 3.2](https://colab.research.google.com/drive/1q29RbHlZoFK7TcvMKITi3ABbE-E_fw30); the only change is that the pipeline now includes meta-annotations.

In [8]:
# Read the patient notes CSV from DATA_DIR and preview the first rows
notes_csv_path = f"{DATA_DIR}pt_notes.csv"
data = pd.read_csv(notes_csv_path)
data.head()

Unnamed: 0,Unnamed: 0_x,subject_id,chartdate,category,text,create_year,Unnamed: 0_y,gender,dob,dob_year,age_year
0,6,1,2079-01-01,General Medicine,"HISTORY OF PRESENT ILLNESS:, The patient is a ...",2079,1,F,2018-01-01,2018,61
1,7,1,2079-01-01,Rheumatology,"HISTORY OF PRESENT ILLNESS: , A 71-year-old fe...",2079,1,F,2018-01-01,2018,61
2,8,1,2079-01-01,Consult - History and Phy.,"HISTORY OF PRESENT ILLNESS:, The patient is a ...",2079,1,F,2018-01-01,2018,61
3,9,2,2037-01-01,Consult - History and Phy.,"CHIEF COMPLAINT:,1. Infection.,2. Pelvic pai...",2037,2,F,2018-01-01,2018,19
4,10,2,2037-01-01,Dermatology,"SUBJECTIVE:, This is a 29-year-old Vietnamese...",2037,2,F,2018-01-01,2018,19


In [9]:
# Dimensions of the loaded table as (number of rows, number of columns)
data.shape

(1088, 11)

In [10]:
# Format the df to match the input required for multiprocessing:
# [(doc_id, doc_text), (doc_id, doc_text), ...]
def data_iterator(data):
    """Yield one ``(doc_id, doc_text)`` tuple per row of ``data``.

    Args:
        data: pandas DataFrame with a ``text`` column. The row's index label
            is used as the document id.

    Yields:
        tuple: ``(index_label, text)`` where ``text`` has been passed through
        ``str``, so every yielded text is a string.
    """
    # Iterate the column directly: this avoids building a Series for every row
    # (as iterrows does) and avoids shadowing the built-in `id`.
    for doc_id, text in data['text'].items():
        yield (doc_id, str(text))

In [11]:
# Batch size (BS) in number of characters; batching by characters rather than by
# document count controls for the variability between document sizes
batch_size_chars = 500_000

# Run the model over all documents
results = cat.multiprocessing(
    data_iterator(data),                 # Formatted data: (doc_id, doc_text) tuples
    nproc=8,                             # Number of processors
    batch_size_chars=batch_size_chars,
)

INFO:medcat:Annotated until now: 0 docs; Current BS: 157 docs; Elapsed time: 0.00 minutes
INFO:medcat:Annotated until now: 157 docs; Current BS: 165 docs; Elapsed time: 1.18 minutes
INFO:medcat:Annotated until now: 322 docs; Current BS: 151 docs; Elapsed time: 2.36 minutes
INFO:medcat:Annotated until now: 473 docs; Current BS: 160 docs; Elapsed time: 3.54 minutes
INFO:medcat:Annotated until now: 633 docs; Current BS: 152 docs; Elapsed time: 4.68 minutes
INFO:medcat:Annotated until now: 785 docs; Current BS: 130 docs; Elapsed time: 5.85 minutes
INFO:medcat:Annotated until now: 915 docs; Current BS: 157 docs; Elapsed time: 7.03 minutes
INFO:medcat:Annotated until now: 1072 docs; Current BS: 16 docs; Elapsed time: 8.21 minutes


In [12]:
# CUI -> list of documents in which it appears: {"cui": [<doc_id>, <doc_id>, ...], ..}
cui_location = dict()
# The same kind of map keyed by TUI (semantic type) instead of CUI
type_ids_location = dict()

In [13]:
# Keep only annotations whose "Status" meta-annotation is "Affirmed" and record,
# per CUI, the documents in which they were found
for doc, doc_result in results.items():
    for annotation in doc_result['entities'].values():
        if annotation['meta_anns']['Status']['value'] != 'Affirmed':
            continue
        # Create the list the first time a CUI is seen, then append the document id
        cui_location.setdefault(annotation['cui'], []).append(doc)

In [14]:
# Map every type id to the documents of all CUIs that belong to it.
# A CUI may map to more than one type id (one-to-many mapping), so the CUI's
# documents are recorded under each of its type ids.
# The map is rebuilt from scratch so that re-running this cell does not duplicate entries.
type_ids_location = {}
for cui, doc_ids in cui_location.items():
    for type_id in cat.cdb.cui2type_ids[cui]:
        # Extend a per-type list instead of assigning: plain assignment would
        # overwrite the documents of earlier CUIs with the same type id and
        # would share (alias) the list stored in cui_location.
        type_ids_location.setdefault(type_id, []).extend(doc_ids)

## Done

We have now annotated all documents in our dataset, and for each CUI (Concept identifier) we know in which documents it appears. We also know that every entity we recorded has the status "Affirmed".

---

Please note that the number of examples I have provided is not enough to fully train the "Status" meta-annotation and one would need to provide more.

In [15]:
# Example: the distinct documents in which the concept with CUI C0020538 (hypertension) appears
hypertension_cui = 'C0020538'
set(cui_location[hypertension_cui])

{0,
 1,
 2,
 8,
 9,
 10,
 11,
 12,
 13,
 20,
 21,
 22,
 23,
 24,
 25,
 35,
 36,
 39,
 40,
 41,
 43,
 44,
 45,
 47,
 51,
 52,
 53,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 79,
 80,
 81,
 83,
 92,
 93,
 94,
 99,
 100,
 101,
 102,
 105,
 107,
 108,
 110,
 111,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 123,
 128,
 129,
 130,
 131,
 133,
 135,
 139,
 154,
 163,
 166,
 168,
 169,
 170,
 172,
 175,
 177,
 187,
 188,
 195,
 196,
 197,
 198,
 199,
 200,
 201,
 202,
 204,
 206,
 207,
 208,
 227,
 228,
 229,
 230,
 231,
 233,
 235,
 239,
 241,
 242,
 244,
 248,
 250,
 252,
 253,
 269,
 273,
 274,
 277,
 282,
 287,
 288,
 289,
 290,
 291,
 292,
 293,
 294,
 295,
 298,
 307,
 308,
 309,
 310,
 311,
 312,
 319,
 320,
 323,
 324,
 325,
 326,
 327,
 328,
 329,
 330,
 331,
 332,
 333,
 334,
 335,
 336,
 340,
 362,
 363,
 364,
 365,
 366,
 367,
 368,
 369,
 370,
 371,
 372,
 373,
 374,
 375,
 377,
 392,
 394,
 403,
 405,
 406,
 409,
 413,
 414,
 415,
 418,
 419,
 420,
 421,
 422,
 

In [16]:
# Save concept location in corpus.
# The context manager closes (and thereby flushes) the file even if serialisation
# fails; `json` is already imported in the imports cell at the top of the notebook.
with open("./cui_location.json", 'w') as out_file:
    json.dump(cui_location, out_file)

End of tutorial