In [29]:
import json
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from spacy.training import Example


In [50]:
# Custom entity labels that are added to the NER component before training.
LABELS = [
    "FRAMEWORKS",
    "PROG_LANG",
    "DB",
    "PLATFORM",
]
# Module-level stores that __preprocess_data appends to: the records used for
# training and the records held back for verification (two distinct lists).
TRAINING_DATA, VERIFICATION_DATA = [], []

In [51]:
def test_model(nlp, data):
    for item in data:
        doc = nlp(item)
        print('------------------------')
        print("Entities in '%s'" % item)
        for entity in doc.ents:
            print(entity.label_, entity.text)


In [7]:
def __preprocess_data(file_name, max_records=40, train_fraction=0.7):
    """Load annotated JSONL records and split them into train/verification sets.

    Each non-blank line of `file_name` is parsed as one JSON document. The
    first `max_records` documents are shuffled with the global `random` state
    and split: the leading `train_fraction` share is appended to the
    module-level TRAINING_DATA, the remainder to VERIFICATION_DATA.

    Args:
        file_name: Path of the JSON-lines file to read.
        max_records: Number of leading records to use; None uses every record.
        train_fraction: Share of the selected records that goes to training.

    Note:
        The function appends to TRAINING_DATA / VERIFICATION_DATA and never
        empties them, so calling it repeatedly accumulates records.
    """
    # The context manager closes the handle; JSON text is read as UTF-8 rather
    # than the platform default so non-ASCII characters decode consistently.
    with open(file_name, 'r', encoding='utf-8') as source:
        # A blank line (e.g. a trailing newline) is not a JSON document.
        records = [json.loads(line) for line in source if line.strip()]

    selected = records[:max_records]
    random.shuffle(selected)

    split_at = int(round(train_fraction * len(selected)))
    TRAINING_DATA.extend(selected[:split_at])
    VERIFICATION_DATA.extend(selected[split_at:])

In [10]:
# Fill the module-level TRAINING_DATA / VERIFICATION_DATA lists from the annotated JSONL file.
# NOTE(review): __preprocess_data appends to those lists, so re-running this cell
# without resetting them first duplicates records.
__preprocess_data("data_after_doccano.jsonl")

In [36]:
# Number of records currently held in the training split.
# NOTE(review): one call to __preprocess_data with its 40-record cap yields
# round(0.7 * 40) = 28 training records, so a displayed value of 252 points to
# stale output or repeated appends -- re-run from a fresh kernel to confirm.
len(TRAINING_DATA)

252

In [39]:
# Load the en_core_web_sm pipeline into a notebook-level variable.
# NOTE(review): model_training_testing() loads its own pipeline from its `model`
# argument, so this `nlp` is not what gets trained there.
nlp = spacy.load("en_core_web_sm")


In [47]:
# Preview only the first few records: displaying the whole list floods the
# notebook output with the full annotated texts.
TRAINING_DATA[:3]

[{'id': 378,
  'text': "NTT DATA – part of NTT Group – is a trusted global innovator of IT and business services headquartered in Tokyo. We help clients transform through consulting, industry solutions, business process services, digital & IT modernization, and managed services. NTT DATA enables them, as well as society, to move confidently into the digital future. We are committed to our client's long-term success and combine global reach with local client attention to serve them in over 50 countries around the globe.NTT DATA is the 6th most valuable IT Company in the world dedicated to Consulting and Outsourcing services, with more than 139,000 professionals located in 50 countries.Based in the north of Morocco since 2016, NTT DATA is today a leader and the first IT employer in the region with more than 300 experts across the Kingdom.Our objective is to continue our growth reaching 1000 professionals by 2025.By joining us, you’ll be able to collaborate with large multinationals while

In [61]:
# Entity labels to add to the NER component.
LABELS = [
    "FRAMEWORKS",
    "PROG_LANG",
    "DB",
    "PLATFORM",
]
# Rebound to fresh empty lists each time this cell runs, so the datasets are
# rebuilt from scratch by the preprocessing step rather than appended to.
TRAINING_DATA, VERIFICATION_DATA = [], []
def model_training_testing(model='en_core_web_sm', new_model_name='ner', output_dir='./model', n_iter=30,
                           data_file="admin.jsonl"):
    """Train the NER component on the custom LABELS, then test and save the pipeline.

    Steps: seed `random`, rebuild TRAINING_DATA / VERIFICATION_DATA from
    `data_file`, load the spaCy pipeline `model`, register every entry of
    LABELS on its "ner" component, update that component for `n_iter` passes
    over the training records, print the entities found in the verification
    texts, and (unless `output_dir` is None) save the pipeline, reload it and
    print the verification entities again.

    Args:
        model: Name or path passed to `spacy.load`.
        new_model_name: Value stored in the saved pipeline's `meta["name"]`.
        output_dir: Directory to save the pipeline to; None skips saving.
        n_iter: Number of passes over the training data.
        data_file: Annotated JSON-lines file handed to `__preprocess_data`.
            Each record must provide "text" and "label" keys.

    Raises:
        AssertionError: If the reloaded NER component's move names differ from
            the ones captured before training.
    """
    random.seed(0)

    # __preprocess_data appends to the module-level lists. Empty them in place
    # first so that calling this function again does not train on duplicated
    # records or leak earlier verification records into the training split.
    TRAINING_DATA.clear()
    VERIFICATION_DATA.clear()
    __preprocess_data(data_file)

    # Load existing spaCy model
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

    # Add an entity recognizer if the pipeline has none; otherwise reuse it.
    if "ner" not in nlp.pipe_names:
        # spaCy v3: add_pipe takes the registered factory name and returns the
        # component. Passing an object built with create_pipe raises an error.
        # NOTE(review): a freshly added component has no trained weights, while
        # resume_training() below targets already-trained pipelines -- confirm
        # whether this branch needs an initialization step before updating.
        ner = nlp.add_pipe("ner")
        print("Created NER model")
    else:
        ner = nlp.get_pipe("ner")
        print("Got NER model")

    # Add new labels to entity recognizer
    for label in LABELS:
        ner.add_label(label)

    # We assume an existing model modification
    optimizer = nlp.resume_training()

    # Captured after the labels were added; compared with the reloaded model below.
    move_names = list(ner.move_names)

    # Disable every other pipe so only NER is updated during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.select_pipes(disable=other_pipes):
        for _ in range(n_iter):
            losses = {}
            random.shuffle(TRAINING_DATA)
            # One update per record (batch size 1); losses accumulate per pass.
            for train in TRAINING_DATA:
                doc = nlp.make_doc(train["text"])
                example = Example.from_dict(doc, {"entities": train["label"]})
                nlp.update([example], sgd=optimizer, drop=0.2, losses=losses)
            print("Losses", losses)

    verification_texts = [item["text"] for item in VERIFICATION_DATA]

    # Test the trained model via verification dataset
    test_model(nlp, verification_texts)

    # Save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents/exist_ok: nested output paths work and reruns do not fail.
        output_dir.mkdir(parents=True, exist_ok=True)

        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        assert list(nlp2.get_pipe("ner").move_names) == move_names
        test_model(nlp2, verification_texts)


In [62]:
model_training_testing()

Loaded model 'en_core_web_sm'
Got NER model
Losses {'ner': 510.396685730518}
Losses {'ner': 452.33562485006337}
Losses {'ner': 188.2008633441289}
Losses {'ner': 182.07863125421395}
Losses {'ner': 137.65485908789486}
Losses {'ner': 150.7848796928488}
Losses {'ner': 165.78783287837595}
Losses {'ner': 137.6586895233064}
Losses {'ner': 101.37857452201949}
Losses {'ner': 110.74641801370176}
Losses {'ner': 97.29087288980779}
Losses {'ner': 80.81741993028965}
Losses {'ner': 238.54213086295684}
Losses {'ner': 39.88323071293224}
Losses {'ner': 44.78022935569012}
Losses {'ner': 40.45687625659695}
Losses {'ner': 30.408819510683195}
Losses {'ner': 30.18765437090482}
Losses {'ner': 28.233556403904526}
Losses {'ner': 26.460598274109717}
Losses {'ner': 29.105329828111792}
Losses {'ner': 23.333114005194588}
Losses {'ner': 23.928986569615013}
Losses {'ner': 27.315433741723865}
Losses {'ner': 15.12285279079867}
Losses {'ner': 15.393058568355142}
Losses {'ner': 14.39591259983696}
Losses {'ner': 12.245507