# Import mosaic and spacy libraries

In [1]:
from __future__ import unicode_literals, print_function
from mosaicml import *
from mosaicml.constants import MLModelFlavours
import json
import sys
import logging
import argparse
import os
import cloudpickle
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Creating Dataset

In [2]:
# Convert .tsv file to json format. 

def _group_annotations(spans_by_label):
    """Merge same-text spans of each label into one annotation with several points."""
    annotations = []
    for label, spans in spans_by_label.items():
        points_by_text = {}
        for span in spans:
            text = span['text']
            if text == '':
                continue  # empty words never produce an annotation
            if text in points_by_text:
                # Repeated word under the same label: extend the existing annotation.
                points_by_text[text].append(span)
            else:
                points = points_by_text[text] = [span]
                annotations.append({'label': [label], 'points': points})
    return annotations


def tsv_to_json_format(input_path,output_path,unknown_label):
    """Convert a token-per-line TSV file into JSON lines, one object per sentence.

    Each input line is expected to be ``word<TAB>tag``. A line equal to
    ``.<TAB>O`` ends the current sentence; that period is not added to the
    sentence text. Words are joined with single spaces (the text keeps a
    trailing space) and character offsets are computed against that text.

    Every written object has the form::

        {"content": <sentence text>,
         "annotation": [{"label": [<tag>],
                         "points": [{"text": ..., "start": ..., "end": ...}, ...]}]}

    ``end`` is the inclusive index of the word's last character. Within a
    sentence, repeated occurrences of the same word under the same tag are
    collected into the ``points`` of a single annotation.

    Tokens that follow the last ``.<TAB>O`` line are not written.

    Args:
        input_path: Path of the TSV file to read.
        output_path: Path of the JSON-lines file to write (overwritten).
        unknown_label: Tag to ignore. Tags that are exactly one character
            long (such as ``O``) are ignored as well.

    Returns:
        None. Failures are logged via ``logging.exception`` instead of raised.
    """
    try:
        # `with` guarantees both files are closed (and the output flushed),
        # even when a malformed line raises part-way through.
        with open(input_path, 'r') as tsv_file, open(output_path, 'w') as json_file:
            words = []
            spans_by_label = {}
            offset = 0
            for raw_line in tsv_file:
                # Strip only the newline, so a final line without one is still
                # recognised (blindly dropping the last character broke it).
                line = raw_line.rstrip('\n')
                if line == '.\tO':
                    record = {
                        'content': ''.join(word + ' ' for word in words),
                        'annotation': _group_annotations(spans_by_label),
                    }
                    json.dump(record, json_file)
                    json_file.write('\n')
                    # Reset the per-sentence state.
                    words, spans_by_label, offset = [], {}, 0
                    continue

                word, entity = line.split('\t')
                words.append(word)
                if entity != unknown_label and len(entity) != 1:
                    spans_by_label.setdefault(entity, []).append(
                        {'text': word, 'start': offset, 'end': offset + len(word) - 1})
                offset += len(word) + 1  # +1 for the joining space
    except Exception as e:
        logging.exception("Unable to process file" + "\n" + "error = " + str(e))
        return None

# Convert the project's TSV file to JSON lines. 'abc' is passed as unknown_label;
# presumably it matches no real tag, so only single-character tags are skipped — confirm against the data.
tsv_to_json_format("spacy/spacy.tsv",'spacy/spacy.json','abc')

In [3]:
# Convert json file to spaCy format.

def json_to_spacy(input_file=None, output_file=None):
    """Convert JSON-lines annotations into spaCy training tuples and pickle them.

    Each non-blank line of ``input_file`` must be a JSON object with a
    ``content`` string and an ``annotation`` list whose items carry ``label``
    (a string or a list of strings) and ``points`` (dicts with inclusive
    ``start``/``end`` character offsets). Every point of every annotation
    becomes one ``(start, end + 1, label)`` tuple, i.e. the end offset is
    converted to an exclusive one.

    The resulting list of ``(text, {"entities": [...]})`` tuples is printed
    and then written to ``output_file`` with ``cloudpickle`` (a binary pickle,
    whatever the file extension says).

    Args:
        input_file: Path of the JSON-lines file to read.
        output_file: Path of the pickle file to write (overwritten).

    Returns:
        None. Any failure is logged via ``logging.exception`` rather than raised.
    """
    try:
        training_data = []
        with open(input_file, 'r') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines, e.g. a trailing newline

                data = json.loads(line)
                entities = []
                for annotation in data['annotation']:
                    labels = annotation['label']
                    if not isinstance(labels, list):
                        labels = [labels]

                    # An annotation may hold several points (repeated mentions
                    # of the same text); emit a span for each of them instead
                    # of only the first.
                    for point in annotation['points']:
                        for label in labels:
                            entities.append((point['start'], point['end'] + 1, label))

                training_data.append((data['content'], {"entities": entities}))

        print(training_data)

        with open(output_file, 'wb') as fp:
            cloudpickle.dump(training_data, fp)

    except Exception as e:
        # Lazy %-formatting also copes with input_file being None, where string
        # concatenation would raise a second error inside this handler.
        logging.exception("Unable to process %s\nerror = %s", input_file, e)
        return None
    
# Paths for the JSON-lines input and the training-data output.
# NOTE: despite its .json extension, the output file is written with cloudpickle (binary).
input_file="spacy/spacy.json"
output_file="spacy/traindata.json"  

json_to_spacy(input_file,output_file)

[('Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country ', {'entities': [(48, 54, 'B-geo'), (77, 81, 'B-geo'), (111, 118, 'B-gpe')]}), ('Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings ', {'entities': [(109, 113, 'B-per')]}), ('" They marched from the Houses of Parliament to a rally in Hyde Park ', {'entities': [(59, 63, 'B-geo'), (64, 68, 'I-geo')]}), ('Police put the number of marchers at 10000 while organizers claimed it was 1,00,000 ', {'entities': []}), ("The protest comes on the eve of the annual conference of Britain 's ruling Labor Party in the southern English seaside resort of Brighton ", {'entities': [(57, 64, 'B-geo'), (129, 137, 'B-geo'), (75, 80, 'B-org'), (81, 86, 'I-org'), (103, 110, 'B-gpe')]}), ("The party is divided over Britain 's participation in the Iraq conflict 

# New entity label

In [4]:
# Entity labels that create_model() registers with the NER component.
# Edit this list to add or remove labels.
# NOTE(review): 'O' is listed as a label here, while the TSV converter skips
# single-character tags — confirm it is meant to be registered.
LABEL = [
    'I-geo', 'B-geo',
    'I-art', 'B-art',
    'B-tim', 'B-nat', 'B-eve',
    'O',
    'I-per', 'I-tim', 'I-nat', 'I-eve',
    'B-per',
    'I-org', 'B-gpe', 'B-org', 'I-gpe',
]


# Loading training data

In [5]:
# Load the (text, {"entities": [...]}) tuples written by json_to_spacy.
# NOTE: the file is a pickle despite its .json name; only unpickle files you created yourself.
with open ("spacy/traindata.json", 'rb') as fp:
    TRAIN_DATA = cloudpickle.load(fp)

# Creating model

In [6]:
def create_model(model=None):
    """Build a pipeline with an NER component that knows every label in LABEL.

    Args:
        model: Name or path handed to ``spacy.load``. When None, a blank
            English pipeline is created instead.

    Returns:
        A ``(nlp, optimizer)`` tuple. For a blank pipeline the optimizer comes
        from ``nlp.begin_training()``; for a loaded one it comes from
        ``nlp.entity.create_optimizer()``.
    """
    starting_blank = model is None

    if starting_blank:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)  # load existing spacy model
        print("Loaded model '%s'" % model)

    # Reuse the pipeline's NER component when present, otherwise add a new one.
    if 'ner' in nlp.pipe_names:
        ner = nlp.get_pipe('ner')
    else:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)

    # Register the labels before the optimizer is created.
    for label_name in LABEL:
        ner.add_label(label_name)

    if starting_blank:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.entity.create_optimizer()
    return nlp, optimizer

In [8]:
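# Download the English spaCy model via the `en` shortcut (the output below shows en_core_web_sm 2.2.0 being installed and linked)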
! python -m spacy download en

Collecting en_core_web_sm==2.2.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz (12.0MB)
[K     |████████████████████████████████| 12.0MB 1.2MB/s eta 0:00:01     |██████████████████████████▉     | 10.1MB 8.3MB/s eta 0:00:01
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.0-cp36-none-any.whl size=12019124 sha256=957f3dfe538b40ca2b1434f7f40ecdb47417b7b67256c1e9977c37fda213206a
  Stored in directory: /tmp/pip-ephem-wheel-cache-i4212unk/wheels/48/5c/1c/15f9d02afc8221a668d2172446dd8467b20cdb9aef80a172a4
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-2.2.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/hom

In [9]:
# For models other than English, we have to download the model and then link it
#!python -m spacy link fr_core_news_sm fr

In [10]:
# Load the linked 'en' model, register the labels from LABEL, and create the NER optimizer.
nlp,optimizer=create_model(model="en")

Loaded model 'en'


# Train the model

In [11]:
# Train only the NER component: every other pipe is disabled for the duration.
n_iter=100
# Seed Python's RNG so random.shuffle below visits the examples in the same
# order on every Restart & Run All. Other sources of randomness inside spaCy
# are not seeded here.
random.seed(0)
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)  # shuffles TRAIN_DATA in place
        losses = {}  # reset each iteration, so the printed loss is per pass over the data
        batches = minibatch(TRAIN_DATA, size=50)
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                        losses=losses)
        print(itn,'Losses', losses)



0 Losses {'ner': 4125.500686168671}
1 Losses {'ner': 3338.1041989326477}
2 Losses {'ner': 3022.024684906006}
3 Losses {'ner': 2817.759494781494}
4 Losses {'ner': 2849.693374633789}
5 Losses {'ner': 3115.110846042633}
6 Losses {'ner': 3265.4912400245667}
7 Losses {'ner': 3342.6642713546753}
8 Losses {'ner': 3412.3434829711914}
9 Losses {'ner': 3191.7587518692017}
10 Losses {'ner': 3096.8059997558594}
11 Losses {'ner': 3197.781873703003}
12 Losses {'ner': 3099.5258235931396}
13 Losses {'ner': 3138.420846939087}
14 Losses {'ner': 3046.1580834388733}
15 Losses {'ner': 3039.970220565796}
16 Losses {'ner': 3042.918041229248}
17 Losses {'ner': 3078.621211051941}
18 Losses {'ner': 3032.7487239837646}
19 Losses {'ner': 2919.697310447693}
20 Losses {'ner': 2970.0690956115723}
21 Losses {'ner': 2929.069356918335}
22 Losses {'ner': 2918.2553033828735}
23 Losses {'ner': 2859.6253757476807}
24 Losses {'ner': 3031.8664956092834}
25 Losses {'ner': 2901.8711433410645}
26 Losses {'ner': 2898.84306716918

# Test the model on test data

In [12]:
# Smoke-test the trained pipeline on one sentence and print each predicted entity.
# NOTE(review): in the recorded output the person name is tagged B-org/I-org, so
# this single example does not show that the model is accurate.
test_text = 'Gianni Infantino is the president of FIFA.'
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent.label_, ent.text)

Entities in 'Gianni Infantino is the president of FIFA.'
B-org Gianni
I-org Infantino
B-org FIFA


# Define scoring function 

In [13]:
def score(model, request):
    """Run the given pipeline on the request payload and return its entities.

    Args:
        model: Callable pipeline; it is called with the payload text and must
            return an object exposing ``ents`` (items with ``label_`` and
            ``text``).
        request: Object whose ``json`` attribute is a mapping that contains
            the text under the ``"payload"`` key.

    Returns:
        dict mapping each entity label to the entity text. When several
        entities share a label, only the last one is kept.
    """
    payload = request.json["payload"]
    # Use the model that was passed in rather than the notebook-global `nlp`,
    # so the function also works where that global does not exist.
    doc = model(payload)
    return {ent.label_: ent.text for ent in doc.ents}
    

# creating payload to test scoring function

In [14]:
# Build a stand-in request: requests.Request is used here only as an object
# that can carry the .json attribute which score() reads.
import requests
req = requests.Request()
req.json = {"payload":"I am Narendra Modi"}
score(nlp,req)

{'B-per': 'Narendra', 'I-per': 'Modi'}

# Registering the NLP model with mosaic

In [16]:
# Register the trained pipeline together with its scoring function.
# Arguments as passed: model object, scoring function, model name, description, flavour.
# (register_model presumably comes from the `from mosaicml import *` at the top — confirm.)
register_model(nlp, score, "spacy_test6", "nlp using spacy", MLModelFlavours.spacy)

{'created_by': 'akhil.lawrence',
 'created_on': '2019-11-04T11:27:15+00:00',
 'description': 'nlp using spacy',
 'flavour': 'spacy',
 'id': 'b2831372-18b1-4878-8e49-a023232afefc',
 'init_script': '',
 'last_modified_by': 'akhil.lawrence',
 'last_modified_on': '2019-11-04T11:27:15+00:00',
 'name': 'spacy_test6',
 'project_id': '1',
 'versions': [{'created_by': 'akhil.lawrence',
   'created_on': '2019-11-04T11:27:16+00:00',
   'deployments': [],
   'description': None,
   'id': '50972500-f8d4-4404-a74c-cff98e7b6a4e',
   'last_modified_by': 'akhil.lawrence',
   'last_modified_on': '2019-11-04T11:27:24+00:00',
   'metadata_info': None,
   'ml_model_id': 'b2831372-18b1-4878-8e49-a023232afefc',
   'object_url': 'b2831372-18b1-4878-8e49-a023232afefc/50972500-f8d4-4404-a74c-cff98e7b6a4e/ml_model.tar.gz',
   'profiling': [],
   'schema': None}]}

# Load registered model back from mosaic

In [14]:
# Fetch a registered model back; the recorded output is a (model, score function) tuple.
# NOTE(review): these two IDs differ from the model id / version id shown in the
# registration output above — confirm they point at the intended model.
# The arguments are presumably (model id, version id); verify against the mosaic API.
load_model("d2288cd9-8ca8-480d-982d-599cf0e41eae", "cce596df-f7e8-44aa-a59c-d965c5256651")

(<spacy.lang.en.English at 0x7fb5344c9e80>,
 <function __main__.score(model, request)>)