In [1]:
# Install medcat
! pip install medcat==1.3.0
# Get the spacy model
! python -m spacy download en_core_web_md

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.1.0
 Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.1.0/en_core_web_md-3.1.0-py3-none-any.whl (45.4 MB)
[K |████████████████████████████████| 45.4 MB 45.9 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


**Restart the runtime if on colab, sometimes necessary after installing models**

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import json 

from matplotlib import pyplot as plt
from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.config import Config
from medcat.vocab import Vocab
from medcat.meta_cat import MetaCAT
from medcat.config_meta_cat import ConfigMetaCAT
from medcat.preprocessing.tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT
from tokenizers import ByteLevelBPETokenizer


 from tqdm.autonotebook import tqdm, trange


In [3]:
# Folder that holds the data and model files used throughout the notebook
DATA_DIR = "./data/"
# Locations of the MedCAT vocabulary and concept database inside DATA_DIR
vocab_path = f"{DATA_DIR}vocab.dat"
cdb_path = f"{DATA_DIR}cdb-medmen-v1_2.dat"

In [4]:
# Download the models and required data
!wget https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/MedCAT_Export.json -P ./data/
# You can also use the models created in Part 4.1 of the Tutorial
!wget https://medcat.rosalind.kcl.ac.uk/media/mc_status.zip -P ./data/

# Get MedCAT models components (Alternatively you can use a previously created MedCAT model packs)
!wget https://medcat.rosalind.kcl.ac.uk/media/vocab.dat -P ./data/
!wget https://medcat.rosalind.kcl.ac.uk/media/cdb-medmen-v1_2.dat -P ./data/

--2022-08-24 11:24:03-- https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/MedCAT_Export.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 272538 (266K) [text/plain]
Saving to: ‘./data/MedCAT_Export.json’


2022-08-24 11:24:03 (8.14 MB/s) - ‘./data/MedCAT_Export.json’ saved [272538/272538]

--2022-08-24 11:24:04-- https://medcat.rosalind.kcl.ac.uk/media/mc_status.zip
Resolving medcat.rosalind.kcl.ac.uk (medcat.rosalind.kcl.ac.uk)... 193.61.202.225
Connecting to medcat.rosalind.kcl.ac.uk (medcat.rosalind.kcl.ac.uk)|193.61.202.225|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 37947480 (36M) [application/zip]
Saving to: ‘./data/mc_status.zip’


2022-08-24 11:24:06 (18.7 MB/s) - ‘./data/mc_s

## MedCATtrainer Export

In [5]:
#@title
# Load the MedCATtrainer export. The context manager closes the file handle as soon
# as parsing is done, and the encoding is fixed so the result does not depend on the
# platform's default locale.
with open(DATA_DIR + "MedCAT_Export.json", encoding="utf-8") as export_file:
    data = json.load(export_file)

In [6]:
#@title
# Show the top-level keys of the loaded export
print(data.keys())

dict_keys(['projects'])


In [7]:
#@title
# Keys of the first project in the export
data['projects'][0].keys()

dict_keys(['name', 'id', 'cuis', 'tuis', 'documents'])

In [8]:
#@title
# Keys of the first document of the first project
data['projects'][0]['documents'][0].keys()

dict_keys(['id', 'name', 'text', 'last_modified', 'annotations'])

In [9]:
#@title
# Keys of the first annotation in the first document of the first project
data['projects'][0]['documents'][0]['annotations'][0].keys()

dict_keys(['id', 'user', 'cui', 'value', 'start', 'end', 'validated', 'correct', 'deleted', 'alternative', 'killed', 'last_modified', 'manually_created', 'acc', 'meta_anns'])

In [10]:
#@title
# Keys of the first meta-annotation attached to the first annotation of the first document
data['projects'][0]['documents'][0]['annotations'][0]['meta_anns'][0].keys()

dict_keys(['name', 'value', 'acc', 'validated'])

## Fine-tuning the NER+L model

First we load the existing MedCAT models that we will fine-tune.



In [11]:
# Configuration first: a fresh config that points MedCAT at the spaCy model
config = Config()
config.general['spacy_model'] = 'en_core_web_md'

# Vocabulary, read from disk
vocab = Vocab.load(vocab_path)

# CDB (Concept Database), read from disk
cdb = CDB.load(cdb_path)

# CAT ties the three pieces together - it is the main medcat class for concept annotation
cat = CAT(cdb=cdb, config=config, vocab=vocab)



To fine-tune the existing models we use the `train_supervised()` method from MedCAT. The method has the following options:


---



`data_path` - Path to the JSON file exported from MedCATtrainer

`reset_cui_count` - Each cui has an internal counter that is used for weight decay, this will reset it.

`nepochs` - Number of epochs

`reset_cui_count` - Used for training with weight_decay (annealing). Each concept has a count that is there from the beginning of the CDB, and that count is used for annealing. Resetting the count will significantly increase the training impact. This will reset the count only for concepts that exist in the training data.

`print_stats` - Print statistics during training (F1/P/R)

`test_set` - Provide another file for testing

`use_filters` - If True, the cui/tui filters from the MedCATtrainer project configuration are applied during training.

`never_terminate` - If True no termination of concepts will be applied
 
`terminate_last` - If true, concept termination will be done after all training.

`use_groups` - If True concepts that have groups will be combined and stats will be reported on groups.

`use_overlaps` - Allow overlapping entities, nearly always False as it is very difficult to annotate overlapping entities.
 
`use_cui_doc_limit` - If True the metrics for a CUI will be only calculated if that CUI appears in a document.
 
`train_from_false_positives` - If True it will use false positive examples detected by medcat and train from them as negative examples.



In [12]:
# Fine-tune the NER+L model on the MedCATtrainer export for a single epoch.
# reset_cui_count=False leaves the per-concept counters untouched, print_stats=True
# prints the P/R/F1 statistics, and use_filters=True applies the cui/tui filters
# from the MedCATtrainer project configuration.
cat.train_supervised(data_path=DATA_DIR + "MedCAT_Export.json", 
 nepochs=1,
 reset_cui_count=False,
 print_stats=True, 
 use_filters=True) 

INFO:medcat:Running without a test set, or train==test


Stats project: 0%| | 0/1 [00:00<?, ?it/s]

Stats document: 0%| | 0/27 [00:00<?, ?it/s]

Epoch: 0, Prec: 0.7948717948717948, Rec: 0.7828282828282829, F1: 0.7888040712468194

Docs with false positives: 1737; 1734; 899; 466; 1687; 1383; 2124; 898; 323; 1694

Docs with false negatives: 1737; 1734; 466; 1383; 73; 1070; 898; 323; 1694; 641



False Positives

Diabetes - C0011847 - 18
Obesity - C0028754 - 5
Hypertensive disease - C0020538 - 5
nervous system disorder - C0027765 - 4
Disease - C0012634 - 3
Alzheimer's Disease - C0002395 - 3
Anxiety Disorders - C0003469 - 3
Kidney Diseases - C0022658 - 3
Coronary Arteriosclerosis - C0010054 - 2
Myocardial Diseases, Secondary - C0036529 - 2


False Negatives

disorder lesions skin - C0037284 - 13
Diabetes Mellitus, Non-Insulin-Dependent - C0011860 - 9
Diabetes Mellitus - C0011849 - 7
Lymphadenopathy - C0497156 - 6
Heart Diseases - C0018799 - 4
Arthritis - C0003864 - 3
Pseudomonas aeruginosa infection - C0854135 - 3
Pulmonary Embolism - C0034065 - 3
Renal Insufficiency - C1565489 - 3
Coronary heart disease - C0010068 - 2


True Positi

Epoch: 0%| | 0/1 [00:00<?, ?it/s]

Project: 0%| | 0/1 [00:00<?, ?it/s]

Document: 0%| | 0/27 [00:00<?, ?it/s]

Stats project: 0%| | 0/1 [00:00<?, ?it/s]

Stats document: 0%| | 0/27 [00:00<?, ?it/s]

Epoch: 1, Prec: 0.8439024390243902, Rec: 0.8737373737373737, F1: 0.858560794044665

Docs with false positives: 1737; 466; 1687; 1383; 2124; 898; 323; 1694; 641; 96

Docs with false negatives: 1605; 1070; 1737; 611; 688; 323; 716; 1881; 1377; 466



False Positives

Diabetes - C0011847 - 15
Hypertensive disease - C0020538 - 5
Diabetes Mellitus - C0011849 - 4
Disease - C0012634 - 4
Cognition Disorders - C0009241 - 3
Anxiety Disorders - C0003469 - 3
Coronary Arteriosclerosis - C0010054 - 2
Myocardial Diseases, Secondary - C0036529 - 2
Diabetes Mellitus, Non-Insulin-Dependent - C0011860 - 2
disorder lesions skin - C0037284 - 2


False Negatives

Diabetes Mellitus, Non-Insulin-Dependent - C0011860 - 9
Diabetes Mellitus - C0011849 - 7
Pseudomonas aeruginosa infection - C0854135 - 3
Pulmonary Embolism - C0034065 - 3
Coronary heart disease - C0010068 - 2
Arthritis - C0003864 - 2
Syndrome - C0039082 - 2
disorder lesions skin - C0037284 - 2
Ulcer - C0041582 - 2
Diabetes Mellitus, Insulin-Depende

({'C0011847': 15,
 'C0020538': 5,
 'C0011849': 4,
 'C0012634': 4,
 'C0009241': 3,
 'C0003469': 3,
 'C0010054': 2,
 'C0036529': 2,
 'C0011860': 2,
 'C0037284': 2,
 'C0233794': 2,
 'C0276289': 2,
 'C0349782': 2,
 'C0029456': 2,
 'C0028754': 1,
 'C0038443': 1,
 'C0002792': 1,
 'C0009319': 1,
 'C0018939': 1,
 'C0339573': 1,
 'C1704436': 1,
 'C0038454': 1,
 'C0018801': 1,
 'C0023351': 1,
 'C0029408': 1,
 'C0030567': 1,
 'C0031099': 1,
 'C0519066': 1},
 {'C0011860': 9,
 'C0011849': 7,
 'C0854135': 3,
 'C0034065': 3,
 'C0010068': 2,
 'C0003864': 2,
 'C0039082': 2,
 'C0037284': 2,
 'C0041582': 2,
 'C0011854': 2,
 'C0018939': 1,
 'C0238792': 1,
 'C0008679': 1,
 'C1301700': 1,
 'C0021167': 1,
 'C0009324': 1,
 'C0018799': 1,
 'C0002871': 1,
 'C3844825': 1,
 'C0018889': 1,
 'C0033377': 1,
 'C0042769': 1,
 'C0263746': 1,
 'C0206172': 1,
 'C0021400': 1,
 'C0085762': 1},
 {'C0020538': 43,
 'C0037284': 19,
 'C0020473': 12,
 'C0010054': 10,
 'C0497156': 10,
 'C0011570': 10,
 'C0041834': 9,
 'C0018802':

In [13]:
# If we want to know the F1, P, R for each cui, we can call the stats method.
# The export is re-read inside a context manager so the file handle is closed
# right after parsing (and with an explicit encoding, independent of the locale).
with open(DATA_DIR + "MedCAT_Export.json", encoding="utf-8") as export_file:
    data = json.load(export_file)
# The call returns the fp/fn/tp counts, the per-cui precision/recall/F1 mappings,
# the per-cui counts and the collected examples, in that order.
fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = cat._print_stats(data, extra_cui_filter=True)

Stats project: 0%| | 0/1 [00:00<?, ?it/s]

Stats document: 0%| | 0/27 [00:00<?, ?it/s]

Epoch: 0, Prec: 0.07726965532777652, Rec: 0.8511166253101737, F1: 0.1416769929781082

Docs with false positives: 1737; 1734; 899; 466; 1687; 1383; 73; 1070; 2124; 898

Docs with false negatives: 1737; 466; 1383; 73; 1070; 323; 1694; 641; 96; 496



False Positives

Patients - C0030705 - 200
Normal - C0205307 - 67
Medical History - C0262926 - 57
Daily - C0332173 - 46
year - C0439234 - 41
Bilateral - C0238767 - 40
Right - C0205090 - 33
Pain - C0030193 - 28
Physical Examination - C0031809 - 27
Lung - C0024109 - 25


False Negatives

Diabetes Mellitus, Non-Insulin-Dependent - C0011860 - 9
Diabetes Mellitus - C0011849 - 7
disorder lesions skin - C0037284 - 4
Pseudomonas aeruginosa infection - C0854135 - 3
Pulmonary Embolism - C0034065 - 3
Coronary heart disease - C0010068 - 2
Arthritis - C0003864 - 2
C1561826 - C1561826 - 2
Syndrome - C0039082 - 2
Ulcer - C0041582 - 2


True Positives

Hypertensive disease - C0020538 - 43
disorder lesions skin - C0037284 - 17
Hyperlipidemia - C0020473 - 12


In [14]:
# Per-concept scores are looked up by CUI; printed in the order F1, precision, recall
cui = "C0020538" # Hypertension
print(*(scores[cui] for scores in (cui_f1, cui_prec, cui_rec)))

0.9662921348314606 0.9347826086956522 1.0


In [15]:
# inspect fn, fp examples for specific CUIs of interest...
# Here: the false-negative ('fn') examples collected for CUI C0010068
examples['fn']['C0010068']

[{'text': ' appendectomy, oophorectomy.,FAMILY HISTORY: , Positive for coronary artery disease in her father and brother in their 40s.,SOCIAL HISTORY: , S',
 'cui': 'C0010068',
 'source value': 'coronary artery disease',
 'acc': 1,
 'project index': 0,
 'document inedex': 1},
 {'text': 'on fraction of 20%-25% in December 2005, COPD, mild diffuse coronary artery disease, and renal insufficiency.,ALLERGIES:, NO KNOWN DRUG ALLERGI',
 'cui': 'C0010068',
 'source value': 'coronary artery disease',
 'acc': 1,
 'project index': 0,
 'document inedex': 25}]

In [16]:
# Flatten the annotations of every document in the first project and show the first five
[
    annotation
    for document in data['projects'][0]['documents']
    for annotation in document['annotations']
][:5]

[{'id': 45580,
 'user': 'wish',
 'cui': 'C0017168',
 'value': 'gastroesophageal reflux',
 'start': 332,
 'end': 355,
 'validated': True,
 'correct': True,
 'deleted': False,
 'alternative': False,
 'killed': False,
 'last_modified': '2020-04-01 22:06:34.303633+00:00',
 'manually_created': False,
 'acc': 1.0,
 'meta_anns': [{'name': 'Status',
 'value': 'Other',
 'acc': 1.0,
 'validated': True}]},
 {'id': 45581,
 'user': 'wish',
 'cui': 'C0020538',
 'value': 'hypertension',
 'start': 255,
 'end': 267,
 'validated': True,
 'correct': True,
 'deleted': False,
 'alternative': False,
 'killed': False,
 'last_modified': '2020-04-01 22:06:30.394941+00:00',
 'manually_created': False,
 'acc': 1.0,
 'meta_anns': [{'name': 'Status',
 'value': 'Confirmed',
 'acc': 1.0,
 'validated': True}]},
 {'id': 45582,
 'user': 'wish',
 'cui': 'C0012634',
 'value': 'disorder',
 'start': 356,
 'end': 364,
 'validated': True,
 'correct': False,
 'deleted': False,
 'alternative': False,
 'killed': True,
 'last_mo

## MetaAnnotations

During the annotation process we have created only one meta-annotation called "Status". Here we are going to train a BiLSTM to detect that meta-annotation.

### MetaCAT

The class we are going to use to train meta-annotations is called MetaCAT. As input it takes:

`tokenizer` - A BBPE tokenizer from [huggingface](https://github.com/huggingface/tokenizers)

`embeddings` - The precalculated embeddings for the tokens produced by the tokenizer. Can be anything from Word2Vec to BERT. This is a numpy matrix, or python list of embeddings. 

`cntx_left` - Size of context from the left side of the entity that will be taken into account.

`cntx_right` - Size of context from the right side of the entity that will be taken into account.

`save_dir` - Where do we want to save the trained models.

`pad_id` - Padding index in the embeddings matrix. 

`device` - On which device to run this `cpu` or `cuda`

In [17]:
!unzip data/mc_status.zip

Archive: data/mc_status.zip
 creating: Status/
 inflating: Status/model.dat 
 inflating: Status/config.json 
 inflating: Status/bbpe-vocab.json 
 inflating: Status/bbpe-merges.txt 


In [18]:
# Load the pre-trained MetaCAT model from the 'Status' directory
# (note that we have already downloaded the required models)
mc = MetaCAT.load('Status')

### MetaCAT configuration

For a full list of all the configurable parameters, follow this [link](https://github.com/CogStack/MedCAT/blob/master/medcat/config_meta_cat.py).

Some notable parameters:

`category_name` - What is the name of this meta-annotation (same as the name in the MedCATtrainer)

`model_name` - for now only `lstm`

`lr` - Learning rate

`test_size` - Proportion of the test set

`batch_size` - Batch size

`nepochs` - Number of epochs to run for

`lowercase` - Do you want to lowercase the text

`class_weights` - Pytorch LSTM parameter for unbalanced classes

`ignore_cpos` - The position of the entity will be ignored, do not use this.

`auto_save_model` - This will autosave the top performing epoch during the training process

In [19]:
# Example of how to change parameters
# Model settings
# NOTE(review): `mc` was loaded from a saved model; confirm that changing the model
# sizes here is compatible with the already-loaded weights/embeddings.
mc.config.model['input_size'] = 768
mc.config.model['hidden_size'] = 300

# Training settings: number of epochs, and auto-saving of the top performing epoch
mc.config.train['nepochs'] = 55
mc.config.train['auto_save_model'] = True

### Train MetaCAT
To run the training we use the `train` method that allows us to specify:

`json_path`: Path to a MedCATtrainer export containing the meta_annotations we want to train for.


`save_dir_path`: (optional, defaults to `None`): In case we have auto_save_model (meaning during the training the best model will be saved) we need to set a save path.


In [20]:
# Train on the meta-annotations contained in the MedCATtrainer export. With
# auto_save_model enabled, the best model is written to the 'status' directory.
mc.train(json_path= DATA_DIR+"MedCAT_Export.json", save_dir_path='status')
# Saving the model this way will only save the model epoch with the best performance



Epoch: 0 ************************************************** Train
 precision recall f1-score support

 0 0.00 0.00 0.00 0
 1 0.44 0.46 0.45 113
 2 0.76 0.69 0.72 253

 accuracy 0.62 366
 macro avg 0.40 0.38 0.39 366
weighted avg 0.66 0.62 0.64 366

Epoch: 0 ************************************************** Test
 precision recall f1-score support

 1 1.00 0.13 0.24 15
 2 0.66 1.00 0.79 25

 accuracy 0.68 40
 macro avg 0.83 0.57 0.51 40
weighted avg 0.79 0.68 0.58 40



 _warn_prf(average, modifier, msg_start, len(result))
 _warn_prf(average, modifier, msg_start, len(result))
 _warn_prf(average, modifier, msg_start, len(result))



##### Model saved to status/model.dat at epoch: 0 and weighted avg/f1-score: 0.5842670401493931 #####

Epoch: 1 ************************************************** Train
 precision recall f1-score support

 1 1.00 0.17 0.29 113
 2 0.73 1.00 0.84 253

 accuracy 0.74 366
 macro avg 0.86 0.58 0.57 366
weighted avg 0.81 0.74 0.67 366

Epoch: 1 ************************************************** Test
 precision recall f1-score support

 1 1.00 0.20 0.33 15
 2 0.68 1.00 0.81 25

 accuracy 0.70 40
 macro avg 0.84 0.60 0.57 40
weighted avg 0.80 0.70 0.63 40


##### Model saved to status/model.dat at epoch: 1 and weighted avg/f1-score: 0.6290322580645161 #####

Epoch: 2 ************************************************** Train
 precision recall f1-score support

 1 1.00 0.33 0.49 113
 2 0.77 1.00 0.87 253

 accuracy 0.79 366
 macro avg 0.88 0.66 0.68 366
weighted avg 0.84 0.79 0.75 366

Epoch: 2 ************************************************** Test
 precision recall f1-score support

 1 0.90 0.

{'report': {'1': {'precision': 0.8571428571428571,
 'recall': 0.8,
 'f1-score': 0.8275862068965518,
 'support': 15},
 '2': {'precision': 0.8846153846153846,
 'recall': 0.92,
 'f1-score': 0.9019607843137256,
 'support': 25},
 'accuracy': 0.875,
 'macro avg': {'precision': 0.8708791208791209,
 'recall': 0.8600000000000001,
 'f1-score': 0.8647734956051387,
 'support': 40},
 'weighted avg': {'precision': 0.8743131868131867,
 'recall': 0.875,
 'f1-score': 0.8740703177822855,
 'support': 40}},
 'epoch': 10}

We can now save the models using the `save` function. It has only one argument, `full_save`: if `True`, it will also save the embeddings and tokenizers (note that this is slightly redundant, as no training was done on the embeddings/tokenizers).

In [21]:
# Alternative way to save
# Explicitly save the MetaCAT model to a directory of our choosing
mc.save("alternative_status_metamodel")

## Test of the whole pipeline

In [22]:
# Build the concept filter: collect every CUI that belongs to the selected type ids
tui_filter = ['T047'] # Detect only diseases
cui_filters = set()
for tui in tui_filter:
    cui_filters |= set(cdb.addl_info['type_id2cuis'][tui])
# Store the collected CUIs as the linking filter on the CDB's config
cdb.config.linking['filters']['cuis'] = cui_filters

# Recreate the pipeline, this time with the meta-annotation model attached
cat = CAT(cdb=cdb, config=config, vocab=vocab, meta_cats=[mc])

In [23]:
# Sanity check: confirm that a specific CUI ended up in the filter set
"C0035078" in cui_filters

True

In [24]:
# Sample sentence: contains a misspelled mention ("hypertention") and a negated mention of hypertension
text = "John Doe has epilepsy and hypertention but does not suffer from hypertension"
# Run the pipeline (built with the Status meta-annotation model attached) on the text
doc = cat(text)

In [25]:
# Show every detected entity together with its meta-annotations
for ent in doc.ents:
    print(f"Entity: {ent.text}")
    print(f"Meta Annotations: {ent._.meta_anns!s}")
    print("\n")

Entity: epilepsy
Meta Annotations: {'Status': {'value': 'Confirmed', 'confidence': 0.8530008792877197, 'name': 'Status'}}


Entity: hypertention
Meta Annotations: {'Status': {'value': 'Confirmed', 'confidence': 0.946444034576416, 'name': 'Status'}}


Entity: hypertension
Meta Annotations: {'Status': {'value': 'Other', 'confidence': 0.5102694630622864, 'name': 'Status'}}




Notice how the MedCAT meta-annotation model labels the entities "epilepsy" and "hypertention" (the misspelled first mention) as __"Confirmed"__.

Whilst the second, negated mention "hypertension" ("does not suffer from hypertension") is labelled as __"Other"__.

This is extremely useful when conducting a context-based extract of concepts from text.

End of Tutorial