### CR with spaCy Example

In [1]:
import json, logging, sys
import pandas as pd

In [2]:
# Load the token-level NER annotations (tab-separated, read as Latin-1).
ner_file = "spaCy_data/se_ner_annotated.tsv"
# Propagate the last seen value into empty cells. DataFrame.ffill() is the
# supported spelling; fillna(method='ffill') is deprecated in recent pandas.
df_data = pd.read_csv(ner_file, sep="\t", encoding="latin1").ffill()

In [3]:
# Keep only the token and its tag, then write them as a headerless two-column
# TSV. tsv_to_json_format treats every line of this file as a token row, so a
# "Word\tTag" header line would be ingested as a token "Word" labelled "Tag".
df_data = df_data[['Word', 'Tag']]
df_data.to_csv('spacy_ner.tsv', sep='\t', index=False, header=False)

In [4]:
# Convert .tsv file to dataturks json format. 

def _group_spans_by_text(spans):
    """Group one label's spans by surface text, in order of first occurrence.

    Spans whose text is empty are skipped. The first occurrence keeps its
    original dict (keys text/start/end); repeats are stored as new dicts with
    keys in start/end/text order so the serialized JSON keeps its layout.
    """
    grouped = {}
    for span in spans:
        text = span['text']
        if text == '':
            continue
        if text not in grouped:
            grouped[text] = [span]
        else:
            grouped[text].append(
                {'start': span['start'], 'end': span['end'], 'text': text})
    return list(grouped.values())


def _write_sentence_record(fp, content, label_dict):
    """Write one sentence as a single JSON line: content plus its annotations."""
    annotations = []
    for label, spans in label_dict.items():
        for points in _group_spans_by_text(spans):
            annotations.append({'label': [label], 'points': points})
    json.dump({'content': content, 'annotation': annotations}, fp)
    fp.write('\n')


def tsv_to_json_format(input_path,output_path,unknown_label):
    """Convert a two-column ``word<TAB>tag`` file into Dataturks-style JSON lines.

    Tokens are accumulated into a sentence (each word followed by one space)
    until a row that is exactly ``.<TAB>O`` is read; that row closes the
    sentence and is not itself added to the content. One JSON object per
    sentence is written to ``output_path`` with the keys ``content`` and
    ``annotation``. Each annotation holds one label and the list of points
    (``text``, ``start``, inclusive ``end``) for every occurrence of the same
    word carrying that label.

    Args:
        input_path: path of the TSV file to read.
        output_path: path of the JSON-lines file to write.
        unknown_label: tag to ignore. Single-character tags (such as ``O``)
            are always ignored as well.

    Returns:
        None. Any exception is logged with ``logging.exception`` rather than
        raised.
    """
    try:
        # Context managers guarantee both files are flushed and closed, even
        # when a malformed row aborts the conversion.
        with open(input_path, 'r') as f, open(output_path, 'w') as fp:
            content = ''
            start = 0
            label_dict = {}
            for line in f:
                # Strip only the newline; the final row may not have one, and
                # chopping a character blindly would truncate its tag.
                line = line.rstrip('\n')
                if not line:
                    continue  # blank separator / trailing empty line
                if line == '.\tO':
                    _write_sentence_record(fp, content, label_dict)
                    content = ''
                    start = 0
                    label_dict = {}
                    continue
                word, entity = line.split('\t')
                content += word + " "
                if entity != unknown_label and len(entity) != 1:
                    label_dict.setdefault(entity, []).append(
                        {'text': word, 'start': start, 'end': start + len(word) - 1})
                # +1 accounts for the space appended after every word.
                start += len(word) + 1
            # Emit tokens that follow the last sentence terminator instead of
            # silently dropping them.
            if content:
                _write_sentence_record(fp, content, label_dict)
    except Exception as e:
        logging.exception("Unable to process file" + "\n" + "error = " + str(e))
        return None

# 'abc' is not one of the tags in the data, so it filters nothing by itself;
# single-character tags are dropped inside tsv_to_json_format by a length check.
tsv_to_json_format(
    input_path="spacy_ner.tsv",
    output_path='spaCy_data/se_ner_spacy.json',
    unknown_label='abc',
)

In [None]:
# Convert json file to spaCy format.
import plac
import logging
import argparse
import sys
import os
import json
import pickle

#@plac.annotations(input_file=("Input file", "option", "i", str), output_file=("Output file", "option", "o", str))

def to_spacy_format(input_file=None, output_file=None):
    """Convert a Dataturks-style JSON-lines file into pickled spaCy training data.

    Each non-blank line of ``input_file`` is parsed as a JSON object with a
    ``content`` string and an ``annotation`` list. Every annotation carries a
    ``label`` (a string or a list of strings) and ``points`` (dicts with
    ``start`` and inclusive ``end`` character offsets). Every point of every
    annotation becomes a ``(start, end + 1, label)`` tuple, so repeated
    occurrences grouped under one annotation are all kept.

    The resulting list of ``(text, {"entities": [...]})`` tuples is written to
    ``output_file`` with ``pickle`` (binary, whatever the file extension).

    Args:
        input_file: path of the JSON-lines file to read.
        output_file: path of the pickle file to write.

    Returns:
        None. Any exception is logged with ``logging.exception`` rather than
        raised.
    """
    try:
        training_data = []
        with open(input_file, 'r') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines between records
                data = json.loads(line)
                text = data['content']
                entities = []
                # A null annotation field is treated as "no entities".
                for annotation in data['annotation'] or []:
                    labels = annotation['label']
                    if not isinstance(labels, list):
                        labels = [labels]
                    # Use every point, not just the first: one annotation may
                    # group several occurrences of the same word.
                    for point in annotation['points']:
                        for label in labels:
                            # Stored ends are inclusive; the tuple uses an
                            # exclusive end offset, hence the +1.
                            entities.append((point['start'], point['end'] + 1, label))

                training_data.append((text, {"entities": entities}))

        # Report a summary instead of dumping the whole dataset to the output.
        print("Converted %d examples" % len(training_data))

        with open(output_file, 'wb') as fp:
            pickle.dump(training_data, fp)

    except Exception as e:
        # str() keeps the handler itself from failing when input_file is None.
        logging.exception("Unable to process " + str(input_file) + "\n" + "error = " + str(e))
        return None

# The output file is a pickle of the training tuples, despite its .json name.
to_spacy_format(
    input_file='spaCy_data/se_ner_spacy.json',
    output_file='spaCy_data/se_ner_spacy_new.json',
)




In [None]:
import spacy
# Build an empty English pipeline and create an NER component for it.
# NOTE(review): the component is created but not added via nlp.add_pipe in
# this cell, and train_2 builds its own local nlp/ner objects, so this cell
# looks like a leftover sanity check — confirm before relying on these names.
nlp = spacy.blank('en')  # create blank Language class
ner = nlp.create_pipe('ner')

In [None]:
# Distinct tags present in the annotated data, in order of first appearance.
list(df_data["Tag"].unique())

['O',
 'B-syscon',
 'B-grp',
 'B-seterm',
 'B-opcon',
 'I-opcon',
 'B-mea',
 'I-mea',
 'B-loc',
 'I-loc',
 'B-abb',
 'I-grp',
 'I-syscon',
 'B-cardinal',
 'B-org',
 'I-org',
 'B-event',
 'I-event',
 'I-seterm',
 'I-abb',
 'B-art']

In [None]:
#!/usr/bin/env python
# coding: utf8

# Training additional entity types using spaCy
from __future__ import unicode_literals, print_function
import pickle
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


# Every distinct tag found in the annotated data (this includes the 'O' tag).
LABEL = list(df_data["Tag"].unique())

# Read back the pickled (text, annotations) examples written by to_spacy_format.
# pickle.load executes arbitrary code from the file: only load files this
# notebook produced itself.
with open('spaCy_data/se_ner_spacy_new.json', 'rb') as fp:
    FULL_DATA = pickle.load(fp)

# At this point TRAIN_DATA is the same list object as FULL_DATA.
TRAIN_DATA = FULL_DATA
num_of_examples = len(FULL_DATA)
print(num_of_examples)

3606


In [None]:
# Train on the first 80% of the examples and hold out the remainder for
# evaluation. Slicing creates new lists, so FULL_DATA itself is left untouched.
split_index = int(num_of_examples*0.8)
TRAIN_DATA = FULL_DATA[:split_index]
TEST_DATA = FULL_DATA[split_index:]

In [None]:
def train_2(model=None, new_model_name='cr', output_dir="spaCy_data", n_iter=20,
            train_data=None):
    """Set up the pipeline and entity recognizer, and train the new entity.

    Args:
        model: name or path of an existing spaCy model to continue training;
            when None a blank English pipeline is created.
        new_model_name: value stored in ``nlp.meta['name']`` before saving.
        output_dir: directory the trained pipeline is saved to; pass None to
            skip saving.
        n_iter: number of passes over the training examples.
        train_data: list of ``(text, {"entities": [(start, end, label), ...]})``
            tuples. Defaults to the module-level ``TRAIN_DATA``.

    Returns:
        None. Losses, a sample prediction and the save location are printed.
    """
    # Train on a private copy so the per-epoch shuffle does not reorder the
    # caller's list (the module-level TRAIN_DATA when train_data is omitted).
    examples = list(TRAIN_DATA if train_data is None else train_data)

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    # Register every label that occurs in the training examples.
    for _, annotations in examples:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # begin_training() re-initialises the model weights, which would wipe
        # a loaded model. Use resume_training() for existing models when the
        # installed spaCy provides it; otherwise fall back to begin_training().
        if model is not None and hasattr(nlp, 'resume_training'):
            optimizer = nlp.resume_training()
        else:
            optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}
            for text, annotations in examples:
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # test the trained model
    test_text = 'Acceptable Risk is the risk that is understood and agreed to by the program/project.'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for i, ent in enumerate(doc.ents):
        print("Entity number %s is %s in text: '%s'" % (i, ent.label_, ent.text))

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents/exist_ok make nested or pre-existing directories safe.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

In [None]:
# Train with the function's defaults: a blank 'en' pipeline, 20 iterations,
# and the result saved under 'spaCy_data'.
train_2()

Created blank 'en' model
{'ner': 9072.742579937174}
{'ner': 5180.43596956917}
{'ner': 4508.110472343519}
{'ner': 4078.040244156246}
{'ner': 3759.650847261542}
{'ner': 3602.6802451707213}


In [None]:
import spacy
from spacy.gold import GoldParse
from spacy.scorer import Scorer

def evaluate(ner_model, examples):
    """Score a pipeline's predictions against gold entity annotations.

    Args:
        ner_model: loaded spaCy pipeline used both to tokenize and to predict.
        examples: iterable of ``(text, annotations)`` pairs where
            ``annotations['entities']`` holds the gold entity offsets.

    Returns:
        The ``scores`` attribute of the ``Scorer`` after all examples have
        been scored.
    """
    scorer = Scorer()
    for text, annotations in examples:
        # Gold parse is built on an unprocessed doc; prediction runs the full pipeline.
        gold = GoldParse(ner_model.make_doc(text), entities=annotations['entities'])
        scorer.score(ner_model(text), gold)
    return scorer.scores

# Load the pipeline from the 'spaCy_data' directory (train_2's default output
# directory) and score it on the held-out examples.
ner_model = spacy.load('spaCy_data') # for spaCy's pretrained use 'en_core_web_sm'
results = evaluate(ner_model, TEST_DATA)
print(results)

### References: 

https://towardsdatascience.com/custom-named-entity-recognition-using-spacy-7140ebbb3718
https://timkuhn.github.io/TextMining/spacy/ner/2018/01/24/spaCy_NER_Training.html