In [32]:
# Import necessary libraries
import spacy
from spacy.training.example import Example
import random
import json

# Load your dataset in JSON format
# Replace 'your_dataset.json' with the path to your JSON dataset
# The JSON should have a list of objects with 'text' and 'label' keys
with open('../dataset2.json', 'r') as file:
    data = json.load(file)
print(data[:3])

[{'text': "COUR DE CASSATION \n Première Présidence\n _______\n \n N/réf à rappeler : Ord n° 31656\n Pourvoi N° : D 22-23.988\n Demanderesse : La Ste Excilys\n représentée par la SCP Bénabent\n Défendeur : Monsieur le procureur général près la cour d'appel de Paris\n \n \n \n ORDONNANCE\n \n de la déléguée du premier président de la Cour de cassation,\n \n \n Vu le pourvoi n°D 22-23.988, formé par la société Excilys le 9 décembre 2022 contre un arrêt (RG: 22/19210) rendu le 29 novembre 2022 par la cour d'appel d'Aix-en-Provence pôle 1 chambre 7 ; \n \n Vu la constitution en demande de la SCP Bénabent pour la société Excilys ;\n \n Vu le mémoire ampliatif déposé le 15 décembre 2022 ;\n \n Vu la requête présentée le 15 décembre 2022 par la société Excilys et tendant à l'application de l'article 1009 du code de procédure civile ;\n \n Vu l'avis présenté par M. le procureur général le 20 décembre 2022 ;\n \n S'agissant d'un litige incident intervenant à l'occasion d'une instance actuelleme

In [21]:
from sklearn.model_selection import train_test_split

# Create a spaCy training data format
# This assumes binary classification (you can modify it for multi-class)
train_data = []
for entry in data:
    text = entry['text']
    label = "PROTAGONISTS"
    positions = entry["protagonistsPositions"]
    pos_with_label = []
    for pos in positions:
        pos_with_label.append((pos[0],pos[1], label))
    pos_with_label = list(set(pos_with_label))
    ents = {"entities": pos_with_label}
    
    train_data.append((text, ents))


# Split the examples into training and evaluation sets
train_data, eval_data = train_test_split(train_data, test_size=0.2, random_state=42)



In [33]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm


# Create a blank spaCy model
nlp = spacy.blank("fr")

# Add a Named Entity Recognition (NER) component to the pipeline
ner = nlp.add_pipe("ner")


for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])
optimizer = nlp.begin_training()
# Number of training iterations
n_iter = 15

# Train the model
for itn in range(n_iter):
    random.shuffle(train_data)
    losses = {}

    for text, annotations in tqdm(train_data, desc=f"Iteration {itn+1}/{n_iter}"):
        doc = nlp.make_doc(text)
        # print(doc,annotations)
        example = Example.from_dict(doc, annotations)
        # print("--------------------")
        examples = [example]

        # Update the model with the training examples
        nlp.update(examples, drop=0.5, losses=losses, sgd=optimizer)

    print(f"Iteration {itn+1}/{n_iter} Losses: {losses}")

Iteration 1/15: 100%|██████████| 800/800 [02:47<00:00,  4.78it/s]


Iteration 1/15 Losses: {'ner': 21072.258998938716}


Iteration 2/15: 100%|██████████| 800/800 [02:43<00:00,  4.90it/s]


Iteration 2/15 Losses: {'ner': 3655.7310521945496}


Iteration 3/15: 100%|██████████| 800/800 [02:44<00:00,  4.87it/s]


Iteration 3/15 Losses: {'ner': 2387.563285379872}


Iteration 4/15: 100%|██████████| 800/800 [02:43<00:00,  4.90it/s]


Iteration 4/15 Losses: {'ner': 1824.285329054384}


Iteration 5/15: 100%|██████████| 800/800 [02:46<00:00,  4.81it/s]


Iteration 5/15 Losses: {'ner': 1293.0933566196184}


Iteration 6/15: 100%|██████████| 800/800 [02:43<00:00,  4.90it/s]


Iteration 6/15 Losses: {'ner': 1064.3328172054073}


Iteration 7/15: 100%|██████████| 800/800 [02:43<00:00,  4.90it/s]


Iteration 7/15 Losses: {'ner': 879.7502342678474}


Iteration 8/15: 100%|██████████| 800/800 [02:42<00:00,  4.92it/s]


Iteration 8/15 Losses: {'ner': 863.8062330328215}


Iteration 9/15: 100%|██████████| 800/800 [02:44<00:00,  4.87it/s]


Iteration 9/15 Losses: {'ner': 742.4294944063869}


Iteration 10/15: 100%|██████████| 800/800 [02:41<00:00,  4.96it/s]


Iteration 10/15 Losses: {'ner': 628.7364960028017}


Iteration 11/15: 100%|██████████| 800/800 [02:42<00:00,  4.94it/s]


Iteration 11/15 Losses: {'ner': 535.2589434056188}


Iteration 12/15: 100%|██████████| 800/800 [02:45<00:00,  4.82it/s]


Iteration 12/15 Losses: {'ner': 503.8647681013595}


Iteration 13/15: 100%|██████████| 800/800 [02:43<00:00,  4.90it/s]


Iteration 13/15 Losses: {'ner': 433.8947744667079}


Iteration 14/15: 100%|██████████| 800/800 [02:45<00:00,  4.84it/s]


Iteration 14/15 Losses: {'ner': 425.8595721769032}


Iteration 15/15: 100%|██████████| 800/800 [02:44<00:00,  4.86it/s]

Iteration 15/15 Losses: {'ner': 331.52891228556234}





In [36]:
nlp.to_disk('./model')

In [34]:
for text, _ in train_data[:50]:
    doc = nlp(text)
    print(doc.ents)


(Anne, Lacroix, Anne, Lacroix)
(Laetitia, Olivie, Gautier, Ledoux, Antoine, Weiss, Olivie, Gautier, Weiss, Laetitia, Marchand)
(Bazin, Vincent, Marthe, Vincent)
()
(Valentin, Chauveau, Catherine, Valentin, Gabrielle, Marion, Valentin, Caroline, Chauveau, Adélaïde, Chauveau, Catherine, Valentin, Gabrielle, Marion, Catherine, Valentin, Gabrielle, Marion)
(Antoine, Jourdan, Antoine, Jourdan, Antoine, Jourdan, Jourdan, Jourdan, Jourdan, Jourdan, Jourdan, Jourdan, Jourdan)
(Nathalie, Charpentier, Charpentier, Charpentier, Charpentier, Marchand)
(Françoise, Caron, Caron, Caron, Caron, Caron, Caron)
(Charles, André, Charles, Blanc, André, Charles, Blanc)
(Gabrielle, Prévost, Guillaume, Gabrielle, Prévost, Maryse, Gosselin, Gabrielle, Prévost, Gabrielle, Prévost, Gabrielle, Prévost, Gabrielle, Prévost, Gabrielle, Prévost, Gabrielle, Prévost, Gabrielle, Prévost, Gabrielle, Prévost, Gabrielle, Prévost, Gabrielle, Prévost, Gabrielle, Prévost, Gabrielle, Prévost, Gabrielle, Prévost, Gabrielle, Pré

In [35]:
import spacy
from spacy.training.example import Example
from sklearn.metrics import precision_score, recall_score, f1_score
from spacy.scorer import Scorer

examples = []
scorer = Scorer()
for text, annotations in eval_data:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    example.predicted = nlp(str(example.predicted))
    examples.append(example)
scorer.score(examples)

{'token_acc': 1.0,
 'token_p': 1.0,
 'token_r': 1.0,
 'token_f': 1.0,
 'sents_p': None,
 'sents_r': None,
 'sents_f': None,
 'tag_acc': None,
 'pos_acc': None,
 'morph_acc': None,
 'morph_micro_p': None,
 'morph_micro_r': None,
 'morph_micro_f': None,
 'morph_per_feat': None,
 'dep_uas': None,
 'dep_las': None,
 'dep_las_per_type': None,
 'ents_p': 0.9678044057129025,
 'ents_r': 0.9808635917566242,
 'ents_f': 0.9742902400389911,
 'ents_per_type': {'PROTAGONISTS': {'p': 0.9678044057129025,
   'r': 0.9808635917566242,
   'f': 0.9742902400389911}},
 'cats_score': 0.0,
 'cats_score_desc': 'macro F',
 'cats_micro_p': 0.0,
 'cats_micro_r': 0.0,
 'cats_micro_f': 0.0,
 'cats_macro_p': 0.0,
 'cats_macro_r': 0.0,
 'cats_macro_f': 0.0,
 'cats_macro_auc': 0.0,
 'cats_f_per_type': {},
 'cats_auc_per_type': {}}