In [1]:
from doctest import Example
import spacy
import json 
import random
from spacy.util import minibatch
from spacy.training import Example

def load_data(file):
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return (data)

def train_spacy(TRAIN_DATA, iterations):
    nlp = spacy.blank("fr")
    nlp.add_pipe("ner", name="from_to_location")
    nlp.get_pipe("from_to_location").add_label("FROM_LOC")
    nlp.get_pipe("from_to_location").add_label("TO_LOC")

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "from_to_location"]
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        examples = []
        for text, annots in TRAIN_DATA:    
            examples.append(Example.from_dict(nlp.make_doc(text), annots))
        nlp.initialize(lambda: examples)
        for itn in range(iterations):
            print(f"Starting iteration {str(itn)}.")
            random.shuffle(TRAIN_DATA)
            losses = {}
            for batch in minibatch(examples, size=8):
                nlp.update(batch)
            print(losses)
    return (nlp)


In [2]:
TRAIN_DATA = load_data("data/lang/fr-annotated.json")
random.shuffle(TRAIN_DATA)

In [18]:
trained = train_spacy(TRAIN_DATA, 5)
trained.to_disk("pipeline/ner")

Starting iteration 0.
{}
Starting iteration 1.
{}
Starting iteration 2.
{}
Starting iteration 3.
{}
Starting iteration 4.
{}


In [11]:
doc = trained("Je souhaiterais aller de Haubourdin Ã  Loos")
for ent in doc.ents:
    print(ent.text, ent.label_)

Haubourdin FROM_LOC
Loos TO_LOC
