In [7]:
from doctest import Example
import spacy
import json 
import random
from spacy.util import minibatch
from spacy.training import Example

def load_data(file):
    """Read the JSON document stored at ``file`` and return the decoded object.

    Args:
        file: Path of the JSON file; it is opened as UTF-8 text.

    Returns:
        The Python object produced by decoding the file's JSON content.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        parsed = json.loads(handle.read())
    return parsed

def train_spacy(TRAIN_DATA, iterations):
    """Train a blank French pipeline holding a single NER component.

    The NER component is registered under the name ``from_to_location`` and
    knows two labels, ``FROM_LOC`` and ``TO_LOC``.

    Args:
        TRAIN_DATA: Iterable of ``(text, annotations)`` pairs. ``annotations``
            is passed straight to ``Example.from_dict`` (presumably a dict
            with an ``"entities"`` list -- confirm against the data file).
            The argument is only read; it is not reordered.
        iterations: Number of full passes over the training examples.

    Returns:
        The trained spaCy ``Language`` object.
    """
    nlp = spacy.blank("fr")
    ner = nlp.add_pipe("ner", name="from_to_location")
    for label in ("FROM_LOC", "TO_LOC"):
        ner.add_label(label)

    # Build the Example objects once; they are what the training loop consumes.
    examples = [
        Example.from_dict(nlp.make_doc(text), annots)
        for text, annots in TRAIN_DATA
    ]

    # Only the NER component is trained. select_pipes is the spaCy v3
    # replacement for the deprecated disable_pipes.
    with nlp.select_pipes(enable="from_to_location"):
        # initialize() both sets up the model weights and returns the
        # optimizer, so the deprecated begin_training() call is not needed.
        optimizer = nlp.initialize(lambda: examples)
        for itn in range(iterations):
            print(f"Starting iteration {itn}.")
            # Shuffle the list that is actually batched below. Shuffling
            # TRAIN_DATA here had no effect on the order of `examples`.
            random.shuffle(examples)
            losses = {}
            for batch in minibatch(examples, size=8):
                nlp.update(batch, sgd=optimizer, losses=losses)
            print(losses)
    return nlp


In [2]:
# Load the annotated French samples; the path is relative to the notebook's
# working directory.
TRAIN_DATA = load_data("data/lang/fr-annotated.json")
# Shuffle the list in place. No random seed is set in the visible cells, so
# the resulting order presumably differs from run to run.
random.shuffle(TRAIN_DATA)

In [8]:
# Train the NER, add the language detector pipe, and save the NLP component to disk

# The module name is never referenced below, so this import is presumably
# needed for its side effect of registering the "language_detector" factory
# used by add_pipe -- TODO confirm against the spacy_fastlang documentation.
import spacy_fastlang

# Five training passes over TRAIN_DATA.
trained = train_spacy(TRAIN_DATA, 5)
# The detector is inserted after training and placed ahead of the NER
# component in the pipeline order.
trained.add_pipe("language_detector", before="from_to_location")
# Persist the full pipeline (detector + NER) to a relative directory.
trained.to_disk("pipeline/ner")

Starting iteration 0.
{'from_to_location': 986.8641767562799}
Starting iteration 1.
{'from_to_location': 137.09842827591865}
Starting iteration 2.
{'from_to_location': 17.463515311726805}
Starting iteration 3.
{'from_to_location': 3.229296257470024}
Starting iteration 4.
{'from_to_location': 2.2223147317431224}




In [6]:
# Run the trained pipeline on one French sentence as a smoke check.
doc = trained("Je souhaiterais aller de Haubourdin à Loos")
for ent in doc.ents:
    # Entity text followed by its predicted label.
    print(ent.text, ent.label_)
    # The language is read from the doc but printed inside the loop, so it is
    # repeated once per entity and not printed at all when no entity is found.
    print(doc._.language)

Haubourdin FROM_LOC
fr
Loos TO_LOC
fr
