In [None]:
import json
import os
from datetime import date
from medcat.cat import CAT
from medcat.components.addons.meta_cat.meta_cat import MetaCATAddon
from medcat.config.config_meta_cat import ConfigMetaCAT
from medcat.components.addons.meta_cat.mctokenizers.bpe_tokenizer import TokenizerWrapperBPE
from medcat.components.addons.meta_cat.mctokenizers.bert_tokenizer import TokenizerWrapperBERT
from medcat.utils.legacy.identifier import is_legacy_model_pack
from medcat.storage.serialisers import deserialise
from medcat.tokenizing.tokenizers import create_tokenizer
from tokenizers import ByteLevelBPETokenizer

In [None]:
# Optional: run this cell to get INFO-level log output for the session.
# force=True makes the call take effect even if the root logger has
# already been configured earlier in this kernel.
import logging
logging.basicConfig(force=True, level=logging.INFO)

# Set parameters

In [None]:
# relative path to working_with_cogstack folder
_rel_path = os.path.join("..", "..", "..")
# absolute path to working_with_cogstack folder
base_path = os.path.abspath(_rel_path)

# Load mct export
ann_dir = os.path.join(base_path, "data", "medcattrainer_export")
mct_export_name = ""  # name of your mct export
# Joined with os.path.join so the file name is never glued onto the
# directory name without a path separator.
mctrainer_export_path = os.path.join(ann_dir, mct_export_name)

# Load model
model_dir = os.path.join(base_path, "models", "modelpack")
modelpack = ''  # name of modelpack
model_pack_path = os.path.join(model_dir, modelpack)

# will be used to date the trained model (YYYYMMDD)
today = date.today().strftime("%Y%m%d")
# output_modelpack = os.path.join(model_dir, f"{today}_trained_model")

# Initialise meta_ann models
# The meta models are looked up in the unpacked model pack folder, i.e. the
# model pack path without a trailing '.zip'.
if model_pack_path.endswith('.zip'):
    base_dir_meta_models = model_pack_path[:-len('.zip')]
else:
    base_dir_meta_models = model_pack_path

# Work out where the meta models live and how their folders are named.
#   exp_start   - prefix of every meta model folder
#   config_path - path components from a meta model folder to its config
model_is_legacy = is_legacy_model_pack(base_dir_meta_models)
if model_is_legacy:
    # NOTE: when loaded, will be auto-converted
    exp_start = "meta_"
    config_path = ["config.json"]
else:
    exp_start = "addon_meta_cat"
    base_dir_meta_models = os.path.join(base_dir_meta_models, "saved_components")
    config_path = ["meta_cat", "config"]

# Collect the meta models contained in the model.
# These Meta_annotation tasks should correspond to the ones labelled in the mcttrainer export.
# Only the top level of base_dir_meta_models is inspected: later cells build
# each meta model path as base_dir_meta_models/<exp_start + name>, so a
# matching folder nested deeper down would yield a path that does not exist.
# A missing base directory results in an empty list.
_, _top_level_dirs, _ = next(os.walk(base_dir_meta_models), (None, [], []))
meta_model_names = [
    dirname[len(exp_start):]
    for dirname in _top_level_dirs
    if dirname.startswith(exp_start)
]

Before you run the next section, please double-check that the model's meta_annotation names match those specified in the mct export.



Depending on the model pack you have, please run the LSTM model or BERT model section. <br>
If you are unsure, use this section to check the model type.

In [None]:
# Print the underlying model type (e.g. LSTM or BERT) of every meta model.
for meta_model in meta_model_names:
    # NOTE: use a separate name for the full path; `config_path` holds the
    #       relative path components and must stay intact for the next
    #       iteration (and for re-running this cell).
    meta_config_path = os.path.join(base_dir_meta_models, exp_start + meta_model, *config_path)
    if model_is_legacy:
        # legacy model packs keep the config as a plain JSON file
        with open(meta_config_path, 'r') as jfile:
            config_dict = json.load(jfile)
        model_name = config_dict['model']['model_name']
    else:
        # otherwise the config is deserialised into a ConfigMetaCAT
        cnf: ConfigMetaCAT = deserialise(meta_config_path)
        model_name = cnf.model.model_name
    print(f"Model used for meta_{meta_model}:", model_name)

# For LSTM and BERT models

In [None]:
# Re-train every meta model found in the model pack on the MedCATtrainer
# export and write each training report to its own results file.

# NOTE: we need to provide a BaseTokenizer to add the relevant additional data paths
#       to the relevant Entity/Span and Document implementation
#       we'll use the regex tokenizer here for example since it's easier to initialise
#       but you can use a spacy-based one, you just need to also pass:
#       - the model name (e.g 'en_core_web_md')
#       - the names of the disabled components (e.g ['ner', 'parser', 'vectors', 'textcat',
#           'entity_linker', 'sentencizer', 'entity_ruler', 'merge_noun_chunks', 'merge_entities', 'merge_subtokens'])
#       - whether diacritics should be used
#       - max document length (e.g 1_000_000)
base_tokenizer = create_tokenizer("regex")

if model_is_legacy:
    # only needed for legacy model packs, so imported on demand (once)
    from medcat.utils.legacy.convert_meta_cat import get_meta_cat_from_old

for meta_model in meta_model_names:
    meta_cat_path = os.path.join(base_dir_meta_models, exp_start + meta_model)
    if model_is_legacy:
        meta_cat: MetaCATAddon = get_meta_cat_from_old(meta_cat_path, base_tokenizer)
    else:
        # NOTE: the expected workflow when loading the model
        #       is one where the config is stored as part of the overall config
        #       and thus using it for loading is trivial
        #       but here we need to manually load the config from disk
        cnf_path = os.path.join(meta_cat_path, "config")
        if not os.path.exists(cnf_path):
            # NOTE(review): the model type check earlier in this notebook
            #       reads the config from "<meta model dir>/meta_cat/config";
            #       fall back to that location if the one above is missing.
            #       Confirm which layout your model pack uses.
            cnf_path = os.path.join(meta_cat_path, "meta_cat", "config")
        cnf: ConfigMetaCAT = deserialise(cnf_path)
        # load the meta_model
        meta_cat = MetaCATAddon.load_existing(cnf, base_tokenizer, meta_cat_path)
    mc = meta_cat.mc

    # changing parameters
    mc.config.train.nepochs = 15

    # Where to save the meta_model and results.
    # Ideally this should replace the meta_models inside the modelpack
    save_dir_path = "test_meta_" + meta_model

    # train the meta_model
    results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)

    # Save results
    # make sure the target folder exists before writing the report into it
    os.makedirs(save_dir_path, exist_ok=True)
    results_path = os.path.join(save_dir_path, 'meta_' + meta_model + '_results.json')
    # the context manager closes (and flushes) the file even if dumping fails
    with open(results_path, 'w') as results_file:
        json.dump(results['report'], results_file)

## If you don't have the model packs and are training from scratch

In [None]:
# Train a brand new meta model (no existing model pack required) on the
# MedCATtrainer export and write the training report to a results file.
config = ConfigMetaCAT()
# make sure to change the following parameters:
# config.model.nclasses
# config.general.category_name

# change model name if training BERT for the first time
config.model.model_name = 'bert'

# Where to save the meta_model and results.
# Ideally this should replace the meta_models inside the modelpack
save_dir_path = "test_meta"

# NOTE: we need to provide a BaseTokenizer to add the relevant additional data paths
#       to the relevant Entity/Span and Document implementation
#       we'll use the regex tokenizer here for example since it's easier to initialise
#       but you can use a spacy-based one, you just need to also pass:
#       - the model name (e.g 'en_core_web_md')
#       - the names of the disabled components (e.g ['ner', 'parser', 'vectors', 'textcat',
#           'entity_linker', 'sentencizer', 'entity_ruler', 'merge_noun_chunks', 'merge_entities', 'merge_subtokens'])
#       - whether diacritics should be used
#       - max document length (e.g 1_000_000)
base_tokenizer = create_tokenizer("regex")

# Initialise and train meta_model
# NOTE: here `mc` is the MetaCATAddon wrapper; the trainable model is `mc.mc`
mc = MetaCATAddon.create_new(config, base_tokenizer)
results = mc.mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)

# Save results
# The file is named after the category configured above. This cell must not
# rely on a `meta_model` loop variable left over from another section, since
# that variable does not exist when training from scratch on a fresh kernel.
category_name = config.general.category_name or "unnamed"
# make sure the target folder exists before writing the report into it
os.makedirs(save_dir_path, exist_ok=True)
results_path = os.path.join(save_dir_path, 'meta_' + category_name + '_results.json')
# the context manager closes (and flushes) the file even if dumping fails
with open(results_path, 'w') as results_file:
    json.dump(results['report'], results_file)