# Read Data

In [8]:
import json
with open('Resumes.json', 'r') as f:
    content = f.readlines()
entities = [json.loads(ent) for ent in content]
entities[0]

{'annotation': [{'label': ['Skills'],
   'points': [{'end': 1621,
     'start': 1295,
     'text': '\n• Programming language: C, C++, Java\n• Oracle PeopleSoft\n• Internet Of Things\n• Machine Learning\n• Database Management System\n• Computer Networks\n• Operating System worked on: Linux, Windows, Mac\n\nNon - Technical Skills\n\n• Honest and Hard-Working\n• Tolerant and Flexible to Different Situations\n• Polite and Calm\n• Team-Player'}]},
  {'label': ['Skills'],
   'points': [{'end': 1153,
     'start': 993,
     'text': 'C (Less than 1 year), Database (Less than 1 year), Database Management (Less than 1 year),\nDatabase Management System (Less than 1 year), Java (Less than 1 year)'}]},
  {'label': ['College Name'],
   'points': [{'end': 956, 'start': 939, 'text': 'Kendriya Vidyalaya'}]},
  {'label': ['College Name'],
   'points': [{'end': 904, 'start': 883, 'text': 'Woodbine modern school'}]},
  {'label': ['Graduation Year'],
   'points': [{'end': 860, 'start': 856, 'text': '2017\

# formatting Data

In [9]:
def formatted_entities(entities):
    all_content = list()
    for ent in entities:
        text = ent['content']
        for points in ent['annotation']:
            if points['label']:
                label = points['label'][0]
                all_labels = list()
                if label in ['Years of Experience']:
                    for point in points['points']:
                        start = point['start']
                        end = point['end']
                        all_labels.append([start, end, label])
                if all_labels:
                    all_content.append([text, {'entities': all_labels}])
    return all_content
            

In [14]:
all_entities = formatted_entities(entities)
TRAIN_DATA = all_entities

# Model Training

In [None]:
import spacy
import random
from spacy.util import minibatch, compounding
from pathlib import Path

In [19]:
def train_new_NER(model=None, output_dir='./', n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
            
# Finally train the model by calling above function
train_new_NER()


Created blank 'en' model

Losses {'ner': 17966.968251883984}
Losses {'ner': 23.4943203208176}
Losses {'ner': 8.00000000000056}
Losses {'ner': 7.999999761689498}
Losses {'ner': 7.999623494840164}
Losses {'ner': 7.477797263073761}
Losses {'ner': 6.1259036826882625}
Losses {'ner': 7.466712524281424}
Losses {'ner': 68.7964251057954}
Losses {'ner': 5.9250731515073936}
Losses {'ner': 60.21956814813694}
Losses {'ner': 69.48833667231018}
Losses {'ner': 6.8656166301142445}
Losses {'ner': 6.0142688929962}
Losses {'ner': 10.551918549925395}
Losses {'ner': 94.74899599216182}
Losses {'ner': 51.83855203194679}
Losses {'ner': 5.047740785780824}
Losses {'ner': 4.809749605035985}
Losses {'ner': 9.060285774337789}
Losses {'ner': 40.66991355981214}
Losses {'ner': 34.048893573366406}
Losses {'ner': 12.468541544741615}
Losses {'ner': 66.77463851598091}
Losses {'ner': 4.654174881680675}
Losses {'ner': 4.471883091090889}
Losses {'ner': 6.145820438793653}
Losses {'ner': 1.836381662681863}
Losses {'ner': 8.687

In [20]:
nlp2 = spacy.load('./')
doc2 = nlp2('i have more than eight years of experience in python')

for token in doc2:
    print(token, token.ent_type_)

i 
have 
more 
than 
eight 
years 
of 
experience 
in 
python 
