In [1]:
import json
import os
from datetime import date
from medcat.cat import CAT
from medcat.meta_cat import MetaCAT
from medcat.config_meta_cat import ConfigMetaCAT
from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT
from tokenizers import ByteLevelBPETokenizer

In [2]:
# Optional: switch on INFO-level log output for this notebook.
# force=True makes the call take effect even if the root logger already
# has handlers attached (existing handlers are removed first).
import logging

logging.basicConfig(force=True, level=logging.INFO)

# Set parameters

In [3]:
# Relative path to the working_with_cogstack folder
_rel_path = os.path.join("..", "..", "..")
# Absolute path to the working_with_cogstack folder
base_path = os.path.abspath(_rel_path)

# Load mct export
ann_dir = os.path.join(base_path, "data", "medcattrainer_export")
mctrainer_export_name = ""  # name of your mct export
# Use os.path.join so a path separator always sits between the folder and
# the file name (plain string concatenation would glue the two together).
mctrainer_export_path = os.path.join(ann_dir, mctrainer_export_name)

# Load model
model_dir = os.path.join(base_path, "models", "modelpack")
modelpack = ''  # name of modelpack
model_pack_path = os.path.join(model_dir, modelpack)

# Will be used to date the trained model (format: YYYYMMDD)
today = date.today().strftime("%Y%m%d")
# output_modelpack = os.path.join(model_dir, f"{today}_trained_model")

# Initialise meta_ann models: the meta models are looked up in the model pack
# folder, i.e. the model pack path without a trailing '.zip' extension.
if model_pack_path.endswith('.zip'):
    base_dir_meta_models = model_pack_path[:-len('.zip')]
else:
    base_dir_meta_models = model_pack_path

# Collect the meta models contained in the model pack.
# These meta-annotation tasks should correspond to the ones labelled in the
# mct export. Only direct sub-folders named 'meta_<task>' are considered,
# because the training cells resolve each model as
# <base_dir_meta_models>/meta_<task>. Names are sorted for a stable order.
meta_model_names = []
if os.path.isdir(base_dir_meta_models):
    meta_model_names = [
        entry[len('meta_'):]
        for entry in sorted(os.listdir(base_dir_meta_models))
        if entry.startswith('meta_')
        and os.path.isdir(os.path.join(base_dir_meta_models, entry))
    ]
else:
    # Do not fail here: the "training from scratch" section does not need a
    # model pack. Just make the empty result visible instead of silent.
    print(f"Model pack folder not found: {base_dir_meta_models!r}. "
          "Check `modelpack` (and unzip the model pack) before training existing meta models.")

Before you run the next section, please double-check that the meta-annotation names of the model match those specified in the mct export.



# For LSTM model

In [None]:
# Train one meta model per meta-annotation task found in the model pack,
# using the byte-level BPE tokenizer files stored next to each model.
# Training results of every task are kept in `lstm_results`, keyed by task
# name (`results` alone only holds the outcome of the last task).
lstm_results = {}
for meta_model in meta_model_names:
    # Folder of this meta model inside the model pack
    meta_model_dir = os.path.join(base_dir_meta_models, "meta_" + meta_model)

    vocab_file = os.path.join(meta_model_dir, 'bbpe-vocab.json')
    merges_file = os.path.join(meta_model_dir, 'bbpe-merges.txt')
    tokenizer = TokenizerWrapperBPE(ByteLevelBPETokenizer(vocab=vocab_file,
                                    merges=merges_file,
                                    lowercase=True))

    # Load and sort out the config: every top-level entry of config.json keeps
    # its values under ['py/state']['__dict__'], which is copied onto a fresh
    # ConfigMetaCAT attribute of the same name.
    config_file = os.path.join(meta_model_dir, "config.json")
    with open(config_file, 'r') as jfile:
        config_dict = json.load(jfile)
    config = ConfigMetaCAT()
    for key, value in config_dict.items():
        setattr(config, key, value['py/state']['__dict__'])

    # Where to save the meta_model and results.
    # Ideally this should replace the meta_models inside the modelpack
    save_dir_path = "test_meta_" + meta_model
    # Make sure the output folder exists before training writes into it
    os.makedirs(save_dir_path, exist_ok=True)

    # Initialise and train meta_model
    mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)
    results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)
    lstm_results[meta_model] = results

# For BERT model

In [None]:
# Train one BERT meta model per meta-annotation task found in the model pack.
# Training results of every task are kept in `bert_results`, keyed by task
# name (`results` alone only holds the outcome of the last task).
bert_results = {}
for meta_model in meta_model_names:
    # Folder of this meta model inside the model pack
    meta_model_dir = os.path.join(base_dir_meta_models, "meta_" + meta_model)

    # Load and sort out the config: every top-level entry of config.json keeps
    # its values under ['py/state']['__dict__'], which is copied onto a fresh
    # ConfigMetaCAT attribute of the same name.
    config_file = os.path.join(meta_model_dir, "config.json")
    with open(config_file, 'r') as jfile:
        config_dict = json.load(jfile)
    config = ConfigMetaCAT()
    for key, value in config_dict.items():
        setattr(config, key, value['py/state']['__dict__'])

    # The tokenizer is loaded from the meta model folder for the BERT variant
    # named in the config.
    tokenizer = TokenizerWrapperBERT.load(meta_model_dir,
                                          config.model['model_variant'])

    # change model name if training BERT for the first time
    config.model['model_name'] = 'bert'

    # Where to save the meta_model and results.
    # Ideally this should replace the meta_models inside the modelpack
    save_dir_path = "test_meta_" + meta_model
    # Make sure the output folder exists before training writes into it
    os.makedirs(save_dir_path, exist_ok=True)

    # Initialise and train meta_model
    mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)
    results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)
    bert_results[meta_model] = results

## If you don't have the model packs and are training from scratch

In [None]:
# Start from a default MetaCAT configuration (no model pack involved).
config = ConfigMetaCAT()
# Before training, make sure to change the following parameters:
#   config.model['nclasses']
#   config.general['category_name']

# Output location for the meta_model and results.
# Ideally this should replace the meta_models inside the modelpack
save_dir_path = "test_meta"

# change model name if training BERT for the first time
config.model["model_name"] = "bert"

# No tokenizer folder is given (empty path); the model variant named in the
# config is passed along to the loader.
tokenizer = TokenizerWrapperBERT.load(
    "",
    config.model["model_variant"],
)

# Initialise and train meta_model
mc = MetaCAT(config=config, tokenizer=tokenizer, embeddings=None)
results = mc.train_from_json(
    mctrainer_export_path,
    save_dir_path=save_dir_path,
)