## Task: Install / Setup MedCATtrainer

In [None]:
# Only one person on each host needs to do this...
!git clone https://github.com/CogStack/MedCATtrainer.git
!cd MedCATTrainer
!export MCTRAINER_PORT=8005   # Optionally set a port for the trainer, default is 8001
!docker-compose up

## MedCATtrainer
- Admin UI
 - Create projects
 - Upload Models etc.
- User UI Walkthrough
 - Navigate documents
 - Annotate / validate documents

## Annotate Documents for T047,T048 semantic types
Use the dataset exported from the previous notebook (data/train_data.csv). You can download it to your local machine from Jupyter by using the explorer view (http://localhost:8000/notebooks/tmp/medcat-cogstack-workshop).

Once you've annotated some documents, continue with the export below.

## Export the Annotations Using the Trainer API
- Explore what's available in the downloaded dataset

In [108]:
# Base URL of the MedCATtrainer instance; the API requests in the cells below are built from it.
URL = "http://localhost:8001"

In [109]:
# `json` and `requests` are used by the export cells from here on; importing them here lets this
# section run on a fresh kernel.
import json

import requests

# Credentials of the trainer account used for the export.
# NOTE(review): hard-coded for a local workshop instance; read them from the environment or
# getpass for anything that is shared.
payload = {"username": "admin", "password": "admin"}

# Exchange the credentials for an API token. The endpoint is built from URL (rather than
# repeating the host and port) so that changing URL redirects every request in this notebook.
auth_response = requests.post(f"{URL}/api/api-token-auth/", json=payload)
auth_response.raise_for_status()  # fail here on an HTTP error rather than with a KeyError on "token"
headers = {
    'Authorization': f'Token {json.loads(auth_response.text)["token"]}',
}
# NOTE: the displayed value contains the live token; clear this output before sharing the notebook.
headers

{'Authorization': 'Token affd97a49175d8ddd7da99216a8ed251506acf9c'}

In [110]:
# List the annotation projects known to the trainer as ("Project Name:<name>", "id:<id>") pairs.
projects_response = requests.get(f'{URL}/api/project-annotate-entities/', headers=headers)
available_projects = json.loads(projects_response.text)['results']
[
    (f"Project Name:{project['name']}", f"id:{project['id']}")
    for project in available_projects
]

[('Project Name:Example Annotation Project - UMLS (Diseases / Symptoms / Findings)',
  'id:1'),
 ('Project Name:Cogstack Training Project', 'id:2')]

In [111]:
# Project ids to export, kept as strings so they can be joined into the download URL.
# Fill in your own project ids here, e.g.:
project_ids_to_download = ['1', '2']

In [114]:
# Download the annotations of the selected projects in a single request.
with_text = True  # presumably asks the trainer to include the document text in the export
download_url = f'{URL}/api/download-annos/?project_ids={",".join(project_ids_to_download)}'
if with_text:
    download_url += '&with_text=True'
download_response = requests.get(download_url, headers=headers)
# Stop on an HTTP error (e.g. a rejected token) instead of parsing the error body as if it
# were the export and writing it to disk later on.
download_response.raise_for_status()
projects = json.loads(download_response.text)

In [117]:
# write output to disk
# Set this to your own username; it becomes part of the export file name.
username = 'searlt'
# The context manager guarantees the file is flushed and closed before later cells read it back.
with open(f'data/MedCATtrainer_export_{username}.json', 'w') as export_file:
    json.dump(projects, export_file)

## Re-train MedCAT NER+L
- instantiate MedCAT
- train supervised using your annotations

In [121]:
# Load the CDB from disk.
cdb_path = 'data/cdb-medmen.dat'
cdb = CDB()
cdb.load_dict(cdb_path)

# Load the Vocab from disk.
vocab_path = 'data/vocab.dat'
vocab = Vocab()
vocab.load_dict(vocab_path)

# Combine both into the CAT instance used for training below.
cat = CAT(cdb, vocab)

In [None]:
# Supervised training pass over the annotations exported from the trainer.
supervised_training_options = dict(
    nepochs=1,
    lr=0.1,
    anneal=False,  # Unless we are resetting the CDB or cui_count this is False
    print_stats=True,
    use_filters=True,
)
cat.train_supervised(data_path=f"data/MedCATtrainer_export_{username}.json",
                     **supervised_training_options)

## Run the re-trained model on the wider dataset
- Re-use the code from the 3.0 Notebook to extract diseases for each doc
- How do the results differ?

Things that still might be of interest (not covered here)
- Cross Validation - train/test splits
- Plotting learning curves

## Bonus: Train MetaCAT Models
- The meta annotation modes you defined (Temporality / Status)

In [127]:
from medcat.meta_cat import MetaCAT
from tokenizers import ByteLevelBPETokenizer
from itertools import chain
import numpy as np

In [None]:
# download pre-trained tokeniser models
# meta anno model: tokeniser models trained via the github MedCAT tutorials
# https://colab.research.google.com/drive/1rxzBZCTDcqsIjRXZ3u4yRZFOkUCCuwyy
!wget https://zkcl.s3-eu-west-1.amazonaws.com/embeddings.npy -P ./data/
!wget https://zkcl.s3-eu-west-1.amazonaws.com/medmen-merges.txt -P ./data/
!wget https://zkcl.s3-eu-west-1.amazonaws.com/medmen-vocab.json -P ./data/

# Get MedCAT models
!wget https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat -P ./data/
!wget https://s3-eu-west-1.amazonaws.com/zkcl/cdb-medmen.dat -P ./data/

In [128]:
# Tokenizer instantiation
tokenizer = ByteLevelBPETokenizer(vocab_file='data/medmen-vocab.json', merges_file='data/medmen-merges.txt')
# Given a path, np.load opens and closes the file itself, so no file handle is left open.
embeddings = np.load('data/embeddings.npy')

In [130]:
# MetaCAT model built on the tokenizer and embeddings loaded above; pad_id is the index of the
# last row of the embedding matrix.
metacat = MetaCAT(tokenizer=tokenizer, embeddings=embeddings, 
                  pad_id=len(embeddings) -1, save_dir='mc_status', device='cpu')

# Read the trainer export back in; the context manager closes the file once it has been parsed.
with open(f'data/MedCATtrainer_export_{username}.json') as export_file:
    proj = json.load(export_file)

In [131]:
# Tasks we are training for...
# Collect the meta-annotation task names across every exported project, not only the first one,
# so that tasks defined only in a later project are listed as well.
{
    task_name
    for project in proj['projects']
    for document in project['documents']
    for annotation in document['annotations']
    for task_name in annotation['meta_anns'].keys()
}

set()

In [None]:
# Pick one of the above tasks
meta_task = 'Status'
metacat.train(f'data/MedCATtrainer_export_{username}.json', meta_task, nepochs=5)