In [None]:
import os

# Root directory for every data/model path used in this notebook. Later cells
# build paths by plain string concatenation (BASE_PATH + "data/..."), so the
# value must end with a path separator.
# Set the MEDCAT_BASE_PATH environment variable to run on another machine;
# os.path.join(..., "") appends the trailing separator if it is missing.
BASE_PATH = os.path.join(os.environ.get("MEDCAT_BASE_PATH", '/home/wish/'), "")

In [None]:
import sys
sys.path.insert(0, BASE_PATH + "projects/MedCAT/")

%load_ext autoreload
%autoreload 2

In [None]:
import os

# Environment flags set before the ML libraries are imported:
# W&B reporting off, tokenizers parallelism off.
os.environ.update({
    "WANDB_DISABLED": "true",
    "TOKENIZERS_PARALLELISM": "false",
})

In [None]:
import json
import pandas as pd
import numpy as np
import os

import datasets
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments

from medcat.datasets import medcat_ner
from medcat.datasets.tokenizer_ner import TokenizerNER
from medcat.datasets.data_collator import CollateAndPadNER

from medcat.cdb import CDB
from medcat.vocab import Vocab
from medcat.cat import CAT

In [None]:
# Full MedMentions export; DATA_PATH is re-pointed at a filtered copy further down.
DATA_PATH = f"{BASE_PATH}data/medmentions/medmentions.json"

In [None]:
# Load the annotated dataset. The cells below expect the layout
# data['projects'][i]['documents'][j]['annotations'][k]['cui'].
# A context manager closes the file handle as soon as parsing is done.
with open(DATA_PATH) as f:
    data = json.load(f)

In [None]:
# Tally how many annotations each CUI has across all projects and documents.
cnts = {}
for project in data['projects']:
    for doc in project['documents']:
        for ann in doc['annotations']:
            cui = ann['cui']
            if cui in cnts:
                cnts[cui] += 1
            else:
                cnts[cui] = 1

In [None]:
# Display the CUI -> annotation-count mapping built in the previous cell.
cnts

In [None]:
# Load the concept database (CDB). If you do not have the MIMIC CDB, use the
# MedMentions CDB available in the MedCAT repository instead.
# Alternative CDB file:
#cdb = CDB.load(BASE_PATH + "data/models/cdb_mimic_md_21-April-2021.dat")
# Built from BASE_PATH like every other path in this notebook, so changing the
# root directory in one place is enough.
cdb = CDB.load(BASE_PATH + "data/medcat_paper/cdb_mimic.dat")

In [None]:
# Record the spaCy model name in the CDB's general config.
# NOTE(review): presumably picked up when the CAT pipeline is built below — confirm against MedCAT.
cdb.config.general['spacy_model'] = 'en_core_sci_md'

In [None]:
# Load the vocabulary that is handed to CAT together with the CDB.
vocab = Vocab.load(f"{BASE_PATH}data/vocabs/vocab.dat")

In [None]:
# Adjust the NER config before constructing CAT, which is given this same config object.
# NOTE(review): exact meaning of 'upper_case_limit_len' is defined by MedCAT — confirm there.
cdb.config.ner['upper_case_limit_len'] = 2
# Assemble the MedCAT annotator from the CDB, its config and the vocabulary.
cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)

### Subset to frequent concepts - for testing

In [None]:
# Keep only annotations whose CUI occurs strictly more than MIN_FREQ times.
# This rewrites each document's 'annotations' list inside `data` itself.
MIN_FREQ = 300
for project in data['projects']:
    for doc in project['documents']:
        doc['annotations'] = [
            ann for ann in doc['annotations'] if cnts[ann['cui']] > MIN_FREQ
        ]

In [None]:
# Persist the filtered annotations. DATA_PATH is rebound here, so every later
# cell (dataset loading, MedCAT evaluation) reads the filtered file.
# The file name follows MIN_FREQ so it cannot drift from the threshold actually used.
DATA_PATH = BASE_PATH + f"data/medmentions/medmentions_only_above_{MIN_FREQ}.json"
# The context manager flushes and closes the file before it is read back below.
with open(DATA_PATH, 'w') as f:
    json.dump(data, f)

In [None]:
# Build a datasets.Dataset from the JSON file, using MedCAT's `medcat_ner`
# module file as the loading script.
dataset = datasets.load_dataset(
    path=os.path.abspath(medcat_ner.__file__),
    split=datasets.Split.TRAIN,
    data_files=DATA_PATH,
    cache_dir='/tmp/',
)

In [None]:
# Display the loaded dataset's summary repr.
dataset

In [None]:
hf_tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Tag every vocabulary id as a word-piece continuation ('sub', token text
# starts with "##") or as the beginning of a word ('start').
id2type = {
    token_id: (
        'sub'
        if hf_tokenizer.convert_ids_to_tokens(token_id).startswith("##")
        else 'start'
    )
    for token_id in range(hf_tokenizer.vocab_size)
}

# MedCAT wrapper around the Hugging Face tokenizer; exposes encode() and label_map used below.
tokenizer = TokenizerNER(hf_tokenizer, id2type=id2type)

In [None]:
# Encode the dataset in batches with the NER tokenizer and drop the raw
# text/entity columns from the result.
encoded_dataset = dataset.map(
    function=lambda batch: tokenizer.encode(batch, ignore_subwords=True),
    remove_columns=['ent_cuis', 'ent_ends', 'ent_starts', 'text'],
    batched=True,
)

In [None]:
# Display the encoded dataset's summary repr.
encoded_dataset

In [None]:
# Token-classification head on Bio_ClinicalBERT with one output per entry in
# the tokenizer's label map.
model = AutoModelForTokenClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=len(tokenizer.label_map),
)

In [None]:
# Hold out 20% of the examples for evaluation. The seed is fixed so that a
# Restart & Run All reproduces the same split and the reported metrics stay
# comparable between runs.
# Note: this rebinds `encoded_dataset` to the {'train', 'test'} split object.
encoded_dataset = encoded_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Batch collator built with the tokenizer's pad token id; handed to the Trainer as data_collator.
collate_fn = CollateAndPadNER(hf_tokenizer.pad_token_id)

In [None]:
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule
    num_train_epochs=10,
    learning_rate=4.4670352057797207e-05,
    weight_decay=0.1431478776404838,
    warmup_steps=15,
    # Batch sizes per device
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Logging and evaluation cadence
    logging_steps=200,
    do_eval=True,
    evaluation_strategy='steps',
    eval_steps=500,
    eval_accumulation_steps=1,
    # Reload the best checkpoint once training ends
    load_best_model_at_end=True,
)

In [None]:
from sklearn.metrics import classification_report
def metrics(p):
    """Print a per-label classification report for one evaluation pass.

    Positions whose gold label is -100 (padding / ignored tokens) are removed
    before scoring. Previously they were relabelled to -100 on both sides,
    which made -100 a perfectly predicted extra class and inflated the
    accuracy and the averaged scores.

    Args:
        p: object exposing `predictions` (scores, arg-maxed over axis 2) and
            `label_ids` (gold label ids with -100 at ignored positions).

    Returns:
        A placeholder dict ``{'none': 0}``; the report itself is only printed.
    """
    labels = np.asarray(p.label_ids)
    preds = np.argmax(p.predictions, axis=2)

    # Boolean mask of the positions that carry a real label.
    keep = labels != -100
    if not keep.any():
        # Nothing to score; skip the report rather than call it on empty input.
        return {'none': 0}

    # Boolean indexing flattens both arrays to 1-D, as the report expects.
    print(classification_report(labels[keep], preds[keep]))
    return {'none': 0}

In [None]:
# Wire model, data splits, collator and the metrics callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    compute_metrics=metrics,
    tokenizer=None,
)

In [None]:
# Fine-tune the model with the arguments configured above; the call's return value is shown as the cell output.
trainer.train()

In [None]:
# Run the trained model on the held-out split; the cells below read p.predictions and p.label_ids.
# Note: the name `p` is reused for other values later in the notebook.
p = trainer.predict(encoded_dataset['test'])

In [None]:
# Pick the highest-scoring label index along axis 2.
# assumes predictions are (batch, sequence, num_labels) — TODO confirm
preds = np.argmax(p.predictions, axis=2)

In [None]:
# Score only positions that carry a real label. Gold label -100 marks
# padding / ignored tokens; relabelling those positions to -100 on both sides
# would add a perfectly predicted "-100" class to the report and inflate the
# accuracy and the averaged scores, so they are dropped instead.
valid = p.label_ids != -100
# Boolean indexing flattens both arrays to 1-D.
report = classification_report(p.label_ids[valid], preds[valid], output_dict=True)

In [None]:
# Invert label -> id so that numeric report keys can be mapped back to labels.
r_label_map = {idx: label for label, idx in tokenizer.label_map.items()}

for key, scores in report.items():
    # Numeric keys are label ids; any other key is passed through unchanged.
    cui = r_label_map.get(int(key), key) if key.isdigit() else key
    print(cdb.get_name(cui))
    print(scores)
    print()

## Anonymise a document

In [None]:
from medcat.utils.deid import deid_document

In [None]:
# Take the raw text of one document (index 12) from the un-encoded dataset as a demo input.
text = dataset[12]['text']

In [None]:
# Run MedCAT's deid_document helper on the text with the NER tokenizer and the fine-tuned model.
# NOTE(review): presumably replaces detected entity spans in the text — confirm in medcat.utils.deid.
new_text = deid_document(text, tokenizer, model, verbose=True)

In [None]:
# Display the text returned by deid_document.
new_text

## Test MedCAT on the same data

In [None]:
# Reload the annotations from disk. DATA_PATH was rebound earlier to the
# frequency-filtered file, so this is the same data the NER model was trained on.
# A context manager closes the file handle as soon as parsing is done.
with open(DATA_PATH) as f:
    data = json.load(f)

In [None]:
# Recount annotations per CUI on the reloaded data and display the result.
cnts = {}
for project in data['projects']:
    for doc in project['documents']:
        for ann in doc['annotations']:
            cnts.setdefault(ann['cui'], 0)
            cnts[ann['cui']] += 1
cnts

In [None]:
# Set the linking CUI filter to exactly the CUIs present in the loaded data.
cat.config.linking['filters']['cuis'] = set(cnts.keys())
# NOTE(review): presumably the minimum context similarity for accepting a link — confirm in MedCAT's config docs.
cat.config.linking['similarity_threshold'] = 0.25

In [None]:
# Print stats before training.
# _print_stats is a private CAT method; its 8 return values are unpacked here.
# Note: `p` is rebound by this unpacking, replacing the trainer.predict() output stored under that name.
# NOTE(review): f1 / p / r are used as dicts in the following cells — presumably keyed by CUI; confirm.
fp, fn, tp, p, r, f1, cui_counts, examples = cat._print_stats(data)

In [None]:
# Unweighted mean of the F1 values before supervised training.
np.mean(list(f1.values()))

In [None]:
# Unweighted mean of the values in `p` before supervised training.
np.mean(list(p.values()))

In [None]:
# Unweighted mean of the values in `r` before supervised training.
np.mean(list(r.values()))

In [None]:
# Supervised MedCAT training on the filtered file; the returned statistics
# overwrite the pre-training values held under the same names.
fp, fn, tp, p, r, f1, cui_counts, examples = cat.train_supervised(
    data_path=DATA_PATH,
    nepochs=5,
    test_size=0.2,
    devalue_others=True,
    train_from_false_positives=True,
    print_stats=1,
)

In [None]:
# Unweighted mean of the F1 values after supervised training.
np.mean(list(f1.values()))

In [None]:
# Unweighted mean of the values in `p` after supervised training.
np.mean(list(p.values()))

In [None]:
# Unweighted mean of the values in `r` after supervised training.
np.mean(list(r.values()))