In [None]:
import os
# Limit each numerical back end to a single thread. These are set before the
# heavy imports below so the libraries see them when they are loaded.
# NOTE: the MKL variable is MKL_NUM_THREADS (plural); the previous
# 'MKL_NUM_THREAD' key is not a variable MKL reads, so it had no effect.
os.environ['MKL_NUM_THREADS'] = '1'
os.environ['NUMEXPR_NUM_THREADS'] = '1'
os.environ['OMP_NUM_THREADS'] = '1'

from medcat.cat import CAT
from medcat.vocab import Vocab
from medcat.cdb import CDB
from tokenizers import ByteLevelBPETokenizer

import pandas as pd
import numpy as np
import json
from tqdm.notebook import tqdm

In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


# Paths and Config

In [None]:
# Input data: location of the file to annotate and the columns to read from it.
data_dir = './data/'

data_path = os.path.join(data_dir, "<data_file>")  # Add your data file here
doc_id_column = "id"
doc_text_column = "description"

# Model: directory holding the model packs and the pack to load.
model_dir = '../../models/'

modelpack = ''  # Enter your model here. Should be the output of the trained 'output_modelpack'.
model_pack_path = os.path.join(model_dir, modelpack)

# Path to a concept filter file; None means no filter file is configured.
# NOTE(review): presumably the whitelist read in the "Annotate" section - confirm.
filter_path = None

# Output: annotations are written below data_dir/annotated_docs.
# exist_ok=True creates the folder when missing and is a no-op when it is
# already there, without a separate existence check.
ann_folder_path = os.path.join(data_dir, 'annotated_docs')
os.makedirs(ann_folder_path, exist_ok=True)

save_path_annotations_per_doc = os.path.join(ann_folder_path, "<output_filename>.json")


# Load MedCAT model

In [None]:
# Create CAT - the main class from medcat used for concept annotation.
# The pack is loaded from model_pack_path (model_dir joined with modelpack).
cat = CAT.load_model_pack(model_pack_path)

# Annotate

In [None]:
# Set snomed filter if needed
# This is a white list filter of concepts. It is only applied when filter_path
# (set in the config cell) points to a JSON file; with filter_path left as
# None this cell changes nothing.
if filter_path is not None:
    # Context manager so the file handle is closed once the JSON is parsed.
    with open(filter_path) as filter_file:
        snomed_filter = json.load(filter_file)
    # NOTE(review): json.load returns a list for a JSON array - confirm
    # whether medcat expects a set of CUIs here.
    cat.config.linking['filters']['cuis'] = snomed_filter


In [None]:
# Print the stats reported by the loaded model's concept database (cat.cdb).
cat.cdb.print_stats()

In [None]:
# Load only the id and text columns. `usecols` makes pandas skip every other
# column while parsing, instead of reading the whole file and slicing it
# afterwards. The trailing selection pins the column order to
# (id, text), since usecols keeps the order found in the file.
required_columns = [doc_id_column, doc_text_column]
df = pd.read_csv(data_path, usecols=required_columns)[required_columns]



In [None]:
# Create generator object
def data_iterator(data, doc_name, doc_text):
    """Yield one ``(document id, document text)`` pair per row of ``data``.

    Args:
        data: DataFrame holding the documents.
        doc_name: Name of the column with the document identifier.
        doc_text: Name of the column with the document text.

    Yields:
        tuple: ``(row[doc_name], row[doc_text])`` for each row, in row order.
    """
    # The index label returned by iterrows is not needed; `_` also avoids
    # shadowing the builtin `id`.
    for _, row in data.iterrows():
        yield (row[doc_name], row[doc_text])

In [None]:
# Batch size (BS) in number of characters
batch_char_size = 500000
# Number of processors
nproc = 8

# Stream (id, text) pairs from the dataframe into the annotator.
doc_stream = data_iterator(df, doc_id_column, doc_text_column)
results = cat.multiprocessing(doc_stream, batch_char_size=batch_char_size, nproc=nproc)


In [None]:
# Save the multiprocessing results to file.
# The context manager closes the handle when the block exits, so the JSON is
# completely written out before any later cell touches the file.
with open(save_path_annotations_per_doc, "w") as out_file:
    json.dump(results, out_file)

### Inspect the model

In [None]:
# Run the loaded model on one example sentence and print the entities
# attached to the returned document.
text = "He was diagnosed with heart failure"
doc = cat(text)
print(doc.ents)

In [None]:
# Display Snomed codes
# Prints each entity with its CUI and preferred name. The name is looked up
# with .get (like the semantic-type lookup in the next cell) so a CUI without
# a preferred-name entry prints None instead of raising KeyError.
for ent in doc.ents:
    cui = ent._.cui
    print(ent, " - ", cui, " - ", cat.cdb.cui2preferred_name.get(cui))

In [None]:
# To show semantic types for each entity
type_ids_by_cui = cat.cdb.cui2type_ids
for entity in doc.ents:
    print(entity, " - ", type_ids_by_cui.get(entity._.cui))

In [None]:
# Display
# Render the annotated example document inline in the notebook using
# displacy's entity style.
from spacy import displacy
displacy.render(doc, style='ent', jupyter=True)

# Alternative approach

In [None]:
# This approach does not use multiprocessing. But iterates line by line through your dataset.
# The result maps each document id to its annotations.

docs = {}
print(f"Len of df: {len(df)}")

for _, record in tqdm(df.iterrows(), total=len(df)):
    doc_id = record[doc_id_column]
    doc_body = str(record[doc_text_column])

    # Texts of 10 characters or fewer are not annotated; they get an empty list.
    docs[doc_id] = cat.get_entities(doc_body) if len(doc_body) > 10 else []

In [None]:
# Print the concept database stats again, after the annotation loop above.
cat.cdb.print_stats()

In [None]:
# Save to file (docs maps each document id to its annotations).
# NOTE: this is the same path the multiprocessing results are written to, so
# running this cell overwrites that file.
# The context manager closes the handle so the JSON is completely written.
with open(save_path_annotations_per_doc, "w") as out_file:
    json.dump(docs, out_file)
