In [None]:
# Based off of https://www.kaggle.com/code/finalepoch/medical-ner-using-spacy/notebook
# Used Kaggle NER training data from https://www.kaggle.com/datasets/finalepoch/medical-ner

In [163]:
import json
with open("Corona2.json") as f:
    annotation = json.load(f)

In [164]:
TRAIN_DATA  = []
for e in annotation["examples"]:
    content = e["content"]
    entities = []
    for an in e["annotations"]:        
        if len(an["value"]) == len(an["value"].strip()):          
            if len(an['human_annotations']) == 0:
                continue
            info = (an["start"],an["end"],an["tag_name"])
            entities.append(info)
            #print(an["start"],an["end"],an["tag_name"])
    if len(entities) > 0:
        TRAIN_DATA.append(([content,{"entities":entities}])) 

In [165]:
from __future__ import unicode_literals, print_function
from spacy.training import Example
import random
from pathlib import Path
from spacy.util import minibatch, compounding
import spacy
import sys

In [166]:
def train_model(model=None, output_dir="medical-ner", n_iter=1000):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    if "ner" not in nlp.pipe_names:
        # ner = nlp.create_pipe("ner")
        ner = nlp.add_pipe('ner', last=True)
    else:
        ner = nlp.get_pipe('ner')

    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 64.0, 1.2))
            for batch in batches:
                for text, annotations in batch:
                    # create Example
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    # Update the model
                    nlp.update([example], losses=losses, drop=0.3)
            print("Losses", losses)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [167]:
train_model()

Created blank 'en' model
Losses {'ner': 1380.881360591498}
Losses {'ner': 418.4219550736255}
Losses {'ner': 364.65683555653874}
Losses {'ner': 416.88413357305933}
Losses {'ner': 295.6486300994761}
Losses {'ner': 303.9262292908744}
Losses {'ner': 259.23501064110076}
Losses {'ner': 417.892643926387}
Losses {'ner': 217.00161839981104}
Losses {'ner': 168.73166043619582}
Losses {'ner': 172.1254223142889}
Losses {'ner': 160.8461420117716}
Losses {'ner': 184.37519378984038}
Losses {'ner': 147.4706719645223}
Losses {'ner': 121.72896785945765}
Losses {'ner': 153.06502626228874}
Losses {'ner': 108.06463072879335}
Losses {'ner': 96.90231992506243}
Losses {'ner': 107.17515659136588}
Losses {'ner': 92.66783222367708}
Losses {'ner': 89.46968337162352}
Losses {'ner': 84.97027627827188}
Losses {'ner': 89.56852492587171}
Losses {'ner': 75.93285128534477}
Losses {'ner': 77.76256390159321}
Losses {'ner': 70.43855782122176}
Losses {'ner': 58.90694831115776}
Losses {'ner': 72.44141201475774}
Losses {'ner':