In [None]:
# Install medcat
! pip install medcat==1.2.3
# Get the spacy model
! python -m spacy download en_core_web_md

Collecting medcat
  Downloading medcat-1.2.3-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 7.7 MB/s 
[?25hCollecting elasticsearch>=7.10
  Downloading elasticsearch-7.15.1-py2.py3-none-any.whl (378 kB)
[K     |████████████████████████████████| 378 kB 43.7 MB/s 
Collecting scipy<=1.7.1,>=1.5.4
  Downloading scipy-1.7.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (28.5 MB)
[K     |████████████████████████████████| 28.5 MB 49 kB/s 
[?25hCollecting transformers~=4.11.3
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 42.2 MB/s 
Collecting spacy<3.1.4,>=3.0.1
  Downloading spacy-3.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
[K     |████████████████████████████████| 5.9 MB 46.9 MB/s 
[?25hCollecting jsonpickle~=2.0.0
  Downloading jsonpickle-2.0.0-py2.py3-none-any.whl (37 kB)
Collecting datasets~=1.14.0
  Downloading datasets-1.14.0-py3-none-any.whl (290 kB)


**Restart the runtime if on colab, sometimes necessary after installing models**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import json 

from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.config import Config
from medcat.vocab import Vocab
from medcat.meta_cat import MetaCAT
from medcat.config_meta_cat import ConfigMetaCAT
from medcat.preprocessing.tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT
from tokenizers import ByteLevelBPETokenizer


In [None]:
# Directory that holds every file downloaded for this tutorial
DATA_DIR = "./data/"
# Locations of the MedCAT vocabulary and concept database inside DATA_DIR
vocab_path = f"{DATA_DIR}vocab.dat"
cdb_path = f"{DATA_DIR}cdb-medmen-v1_2.dat"

In [None]:
# Download the models and required data
!wget https://raw.githubusercontent.com/CogStack/MedCAT/master/tutorial/data/MedCAT_Export.json -P ./data/
# You can also use the models created in Part 4.1 of the Tutorial
!wget https://medcat.rosalind.kcl.ac.uk/media/mc_status.zip -P ./data/

# Get MedCAT models components (Alternatively you can use a previously created MedCAT model packs)
!wget https://medcat.rosalind.kcl.ac.uk/media/vocab.dat -P ./data/
!wget https://medcat.rosalind.kcl.ac.uk/media/cdb-medmen-v1_2.dat -P ./data/

--2021-10-25 13:08:19--  https://raw.githubusercontent.com/CogStack/MedCAT/master/tutorial/data/MedCAT_Export.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 272538 (266K) [text/plain]
Saving to: ‘./data/MedCAT_Export.json’


2021-10-25 13:08:20 (13.0 MB/s) - ‘./data/MedCAT_Export.json’ saved [272538/272538]

--2021-10-25 13:08:20--  https://medcat.rosalind.kcl.ac.uk/media/mc_status.zip
Resolving medcat.rosalind.kcl.ac.uk (medcat.rosalind.kcl.ac.uk)... 193.61.202.225
Connecting to medcat.rosalind.kcl.ac.uk (medcat.rosalind.kcl.ac.uk)|193.61.202.225|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 37947480 (36M) [application/zip]
Saving to: ‘./data/mc_status.zip’


2021-10-25 13:08:24 (13.1 MB/s) - ‘./data/mc_status.zip’ saved [3

## MedCATtrainer Export

In [None]:
#@title
# Load the MedCATtrainer export. The context manager closes the file handle as soon
# as parsing is done instead of leaving it open until garbage collection.
with open(DATA_DIR + "MedCAT_Export.json") as export_file:
    data = json.load(export_file)

In [None]:
#@title
# Top-level keys of the loaded export
print(data.keys())

dict_keys(['projects'])


In [None]:
#@title
# Fields available on the first project of the export
data['projects'][0].keys()

dict_keys(['name', 'id', 'cuis', 'tuis', 'documents'])

In [None]:
#@title
# Fields available on the first document of the first project
data['projects'][0]['documents'][0].keys()

dict_keys(['id', 'name', 'text', 'last_modified', 'annotations'])

In [None]:
#@title
# Fields available on the first annotation of that document
data['projects'][0]['documents'][0]['annotations'][0].keys()

dict_keys(['id', 'user', 'cui', 'value', 'start', 'end', 'validated', 'correct', 'deleted', 'alternative', 'killed', 'last_modified', 'manually_created', 'acc', 'meta_anns'])

In [None]:
#@title
# Fields available on the first meta-annotation attached to that annotation
data['projects'][0]['documents'][0]['annotations'][0]['meta_anns'][0].keys()

dict_keys(['name', 'value', 'acc', 'validated'])

## Fine-tuning the NER+L model

First we load the existing MedCAT models that we will fine-tune.



In [None]:
# Load the CDB (Concept Database) from the file at cdb_path
cdb = CDB.load(cdb_path)

# Load the Vocabulary from the file at vocab_path
vocab = Vocab.load(vocab_path)

# Create a fresh config and select the spaCy model downloaded in the install cell
config = Config()
config.general['spacy_model'] = 'en_core_web_md'

# Create CAT - the main class from medcat used for concept annotation
cat = CAT(cdb=cdb, config=config, vocab=vocab)

To fine-tune the existing models we use the `train_supervised()` method from MedCAT. The method has the following options:


---



`data_path` - Path to the JSON file exported from MedCATtrainer

`reset_cui_count` - Each cui has an internal counter that is used for weight decay, this will reset it.

`nepochs` - Number of epochs

`reset_cui_count` - Used for training with weight_decay (annealing). Each concept has a count that is there from the beginning of the CDB, and that count is used for annealing. Resetting the count will significantly increase the training impact. This will reset the count only for concepts that exist in the training data.

`print_stats` - Print statistics during training (F1/P/R)

`test_set` - Provide another file for testing

`use_filters` - Whether to apply the cui/tui filters from the MedCATtrainer project configuration during training.

`never_terminate` - If True no termination of concepts will be applied
          
`terminate_last` - If true, concept termination will be done after all training.

`use_groups` - If True concepts that have groups will be combined and stats will be reported on groups.

`use_overlaps` - Allow overlapping entities, nearly always False as it is very difficult to annotate overlapping entities.
            
`use_cui_doc_limit` - If True the metrics for a CUI will be only calculated if that CUI appears in a document.
            
`train_from_false_positives` - If True it will use false positive examples detected by medcat and train from them as negative examples.



In [None]:
# Fine-tune the NER+L model on the MedCATtrainer export: a single epoch, keeping the
# existing per-concept counts, printing statistics and applying the project filters.
cat.train_supervised(data_path=DATA_DIR + "MedCAT_Export.json", 
                     nepochs=1,
                     reset_cui_count=False,
                     print_stats=True, 
                     use_filters=True) 

In [None]:
# If we want to know the F1, P, R for each cui, we can call the stats method.
# The export is read inside a context manager so the file handle is closed right away.
with open(DATA_DIR + "MedCAT_Export.json") as export_file:
    data = json.load(export_file)
fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = cat._print_stats(data, extra_cui_filter=True)

Stats project:   0%|          | 0/1 [00:00<?, ?it/s]

Stats document:   0%|          | 0/27 [00:00<?, ?it/s]

Epoch: 0, Prec: 0.0678230364451374, Rec: 0.8635235732009926, F1: 0.12576797976147452

Docs with false positives: 1070; 899; 1737; 605; 611; 716; 1734; 1687; 1694; 323

Docs with false negatives: 1070; 1737; 611; 716; 1694; 323; 1383; 73; 688; 496



False Positives

Patients                                                               - C0030705             -        199
Normal                                                                 - C0205307             -         71
Medical History                                                        - C0262926             -         59
Daily                                                                  - C0332173             -         47
Bilateral                                                              - C0238767             -         43
year                                                                   - C0439234             -         41
Right                                                                  - C0205090          

In [None]:
# Show F1, precision and recall (in that order) for a single concept
cui = "C0020538" # Hypertension
print(*(scores[cui] for scores in (cui_f1, cui_prec, cui_rec)))

0.945054945054945 0.8958333333333334 1.0


In [None]:
# Inspect the recorded examples for a specific CUI of interest; here the false
# negatives ('fn') for C0010068.
examples['fn']['C0010068']

[{'acc': 1,
  'cui': 'C0010068',
  'document inedex': 1,
  'project index': 0,
  'source value': 'coronary artery disease',
  'text': ' appendectomy, oophorectomy.,FAMILY HISTORY: , Positive for coronary artery disease in her father and brother in their 40s.,SOCIAL HISTORY: , S'},
 {'acc': 1,
  'cui': 'C0010068',
  'document inedex': 25,
  'project index': 0,
  'source value': 'coronary artery disease',
  'text': 'on fraction of 20%-25% in December 2005, COPD, mild diffuse coronary artery disease, and renal insufficiency.,ALLERGIES:, NO KNOWN DRUG ALLERGI'}]

In [None]:
# First five annotations, taken across all documents of the first project
[
    annotation
    for document in data['projects'][0]['documents']
    for annotation in document['annotations']
][:5]

## MetaAnnotations

During the annotation process we have created only one meta-annotation called "Status". Here we are going to train a BiLSTM to detect that meta-annotation.

### MetaCAT

The class we are going to use to train meta-annotations is called MetaCAT. As input it takes:

`tokenizer` - A BBPE tokenizer from [huggingface](https://github.com/huggingface/tokenizers)

`embeddings` - The precalculated embeddings for the tokens produced by the tokenizer. Can be anything from Word2Vec to BERT. This is a numpy matrix, or python list of embeddings. 

`cntx_left` - Size of context from the left side of the entity that will be taken into account.

`cntx_right` - Size of context from the right side of the entity that will be taken into account.

`save_dir` - Where do we want to save the trained models.

`pad_id` - Padding index in the embeddings matrix. 

`device` - On which device to run this `cpu` or `cuda`

In [None]:
!unzip data/mc_status.zip

Archive:  data/mc_status.zip
   creating: Status/
  inflating: Status/model.dat        
  inflating: Status/config.json      
  inflating: Status/bbpe-vocab.json  
  inflating: Status/bbpe-merges.txt  


In [None]:
# Load the MetaCAT model from the 'Status' directory extracted above
# (note that we have already downloaded the required models)
mc = MetaCAT.load('Status')

### MetaCAT configuration

For a full list of all the configurable parameters, follow this [link](https://github.com/CogStack/MedCAT/blob/master/medcat/config_meta_cat.py).

Some notable parameters:

`category_name` - What is the name of this meta-annotation (same as the name in the MedCATtrainer)

`model_name` - for now only `lstm`

`lr` - Learning rate

`test_size` - Proportion of the test set

`batch_size` - Batch size

`nepochs` - Number of epochs to run for

`lowercase` - Do you want to lowercase the text

`class_weights` - Pytorch LSTM parameter for unbalanced classes

`ignore_cpos` - The position of the entity will be ignored, do not use this.

`auto_save_model` - This will autosave the top performing epoch during the training process

In [None]:
# Example of how to change parameters
# Model section of the config.
# NOTE(review): `mc` was loaded from a saved model; confirm these sizes are compatible with it.
mc.config.model['input_size'] = 768
mc.config.model['hidden_size'] = 300

# Training section of the config: number of epochs, and auto-saving of the model during training
mc.config.train['nepochs'] = 55
mc.config.train['auto_save_model'] = True

dict_keys(['model_name', 'num_layers', 'input_size', 'hidden_size', 'dropout', 'num_directions', 'nclasses', 'padding_idx', 'emb_grad', 'ignore_cpos'])

### Train MetaCAT
To run the training we use the `train` method that allows us to specify:

`json_path`: Path to a MedCATtrainer export containing the meta_annotations we want to train for.


`save_dir_path`: (optional, defaults to `None`): In case we have auto_save_model (meaning during the training the best model will be saved) we need to set a save path.


In [None]:
# Train the meta-annotation model on the MedCATtrainer export.
# NOTE(review): 'status' is lowercase, unlike the 'Status' directory passed to MetaCAT.load;
# confirm that writing to a separate directory is intended.
mc.train(json_path= DATA_DIR+"MedCAT_Export.json", save_dir_path='status')
# Saving the model this way will only save the model epoch with the best performance

We can now save the models using the `save` function. It has only one argument `full_save`; if `True` it will also save the embeddings and tokenizers (note that this is slightly redundant, as no training was done on the embeddings/tokenizers). 

In [None]:
# Alternative way to save: call save explicitly with the name of the target directory
mc.save("alternative_status_metamodel")

## Test of the whole pipeline

In [None]:
# Restrict linking to the concepts that belong to the selected type ids
tui_filter = ['T047'] # Detect only diseases
cui_filters = {
    cui
    for tui in tui_filter
    for cui in cdb.addl_info['type_id2cuis'][tui]
}
cdb.config.linking['filters']['cuis'] = cui_filters

# Rebuild the pipeline so that it also runs the meta-annotation model
cat = CAT(cdb=cdb, config=config, vocab=vocab, meta_cats=[mc])

In [None]:
# Check that a specific CUI is part of the filter built above
"C0035078" in cui_filters

True

In [None]:
# Sample sentence; note that the first mention is spelled "hypertention" and the second "hypertension"
text = "John Doe has epilepsy and hypertention but does not suffer from hypertension"
# Run the full pipeline (NER+L plus the meta-annotation model) on the text
doc = cat(text)

In [None]:
# Print every detected entity with its meta-annotations, followed by blank lines
for ent in doc.ents:
    print(
        f"Entity: {ent.text}",
        f"Meta Annotations: {ent._.meta_anns!s}",
        "\n",
        sep="\n",
    )

Entity: epilepsy
Meta Annotations: {'Status': {'value': 'Confirmed', 'confidence': 0.9999226927757263, 'name': 'Status'}}


Entity: hypertention
Meta Annotations: {'Status': {'value': 'Confirmed', 'confidence': 0.9990968108177185, 'name': 'Status'}}


Entity: suffer
Meta Annotations: {'Status': {'value': 'Other', 'confidence': 0.9998617172241211, 'name': 'Status'}}


Entity: hypertension
Meta Annotations: {'Status': {'value': 'Other', 'confidence': 0.9998865127563477, 'name': 'Status'}}




Notice how the medcat metamodel labels the entities "epilepsy" and "hypertention" (the misspelt first mention of hypertension) as __"Confirmed"__.

Whilst "suffer" and "hypertension" are labelled as __"Other"__.

This is extremely useful when conducting a context-based extract of concepts from text.

End of Tutorial