In [32]:
from doctest import Example
import spacy
import json 
import random
from spacy.util import minibatch
from spacy.training import Example

def load_data(file):
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return (data)

def train_spacy(TRAIN_DATA, iterations):
    nlp = spacy.blank("fr")
    nlp.add_pipe("ner", name="from_to_location")
    nlp.get_pipe("from_to_location").add_label("FROM_LOC")
    nlp.get_pipe("from_to_location").add_label("TO_LOC")

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "from_to_location"]
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        examples = []
        for text, annots in TRAIN_DATA:    
            examples.append(Example.from_dict(nlp.make_doc(text), annots))
        nlp.initialize(lambda: examples)
        for itn in range(iterations):
            print(f"Starting iteration {str(itn)}.")
            random.shuffle(TRAIN_DATA)
            losses = {}
            for batch in minibatch(examples, size=8):
                nlp.update(batch, losses=losses)
            print(losses)
    return (nlp)


In [33]:
TRAIN_DATA = load_data("data/lang/fr-annotated.json")
random.shuffle(TRAIN_DATA)

In [48]:
# Train the NER, add the language detector pipe, and save the NLP component to disk
from spacy_fastlang import LanguageDetector
from spacy.language import Language

@Language.factory("detect_lang")
def get_lang_detector(nlp, name):
    return LanguageDetector()

trained = train_spacy(TRAIN_DATA, 5)
trained.add_pipe("detect_lang", name="detect_lang", before="from_to_location")
trained.to_disk("lang_detector_from_to_model")
print(trained.pipe_names)
print(trained)

Starting iteration 0.
{'from_to_location': 985.2185245658754}
Starting iteration 1.
{'from_to_location': 250.44565675412548}
Starting iteration 2.
{'from_to_location': 15.009479746299364}
Starting iteration 3.
{'from_to_location': 2.0579397564555446}
Starting iteration 4.
{'from_to_location': 0.672209210526212}
['detect_lang', 'from_to_location']
<spacy.lang.fr.French object at 0x000001E55739DB88>




In [49]:
doc = trained("Je souhaiterais aller de Haubourdin à Loos")
for ent in doc.ents:
    print(ent.text, ent.label_)
    print(doc._.language)

Haubourdin FROM_LOC
fr
Loos TO_LOC
fr
