In [1]:
import os
import spacy
import random
from spacy.training.example import Example
import json

# Initialize spacy model
nlp = spacy.blank("en")

# Add NER component to the pipeline
ner = nlp.add_pipe("ner")

# Process JSON data from all files in the folder
folder_path = "./LabeledDocuments"
train_data = []
i = 0
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        with open(os.path.join(folder_path, filename), "r", encoding='utf-8') as file:
            data = json.load(file)
            for item in data:
                text = item['text']
                entities = item['entities']
                annotated_entities = [(entity['start_idx'], entity['end_idx'], entity['type']) for entity in entities]
                train_data.append((text, {'entities': annotated_entities}))
            print(i)
            i += 1
    if i == 5:
        break

print("read")

# Add labels to the NER model
for _, annotations in train_data:
    for ent in annotations['entities']:
        ner.add_label(ent[2])

# Disable other pipeline components for training efficiency
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Train NER model
with nlp.disable_pipes(*unaffected_pipes):
    optimizer = nlp.begin_training()
    for itn in range(50):
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            try:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], drop=0.5, losses=losses)
            except ValueError as e:
                # print(f"Skipping due to error: {e}")
                continue
        print(losses)

# Save model to disk
nlp.to_disk("ner_model")

0
1
2
3
4
read
{'ner': 1923.2981208569893}
{'ner': 1518.1529237388527}
{'ner': 1416.0077459701604}
{'ner': 1257.227313960797}
{'ner': 1203.5412200579958}
{'ner': 1181.5695284072667}
{'ner': 1168.5182141037058}
{'ner': 1082.522767270963}
{'ner': 1048.942361488448}
{'ner': 925.0700986947335}
{'ner': 960.2922230940351}
{'ner': 918.0603782547332}
{'ner': 876.4667416217623}
{'ner': 924.0409272217455}
{'ner': 890.2406860320168}
{'ner': 811.3436742375773}
{'ner': 810.4277843185271}
{'ner': 775.9716264332129}
{'ner': 709.2989910975}
{'ner': 737.3036989655823}
{'ner': 748.8324331081252}
{'ner': 708.753897760465}
{'ner': 671.3431049138585}
{'ner': 662.629989722966}
{'ner': 666.8415357894233}
{'ner': 626.5273407139224}
{'ner': 579.8109965143909}



KeyboardInterrupt

