<a href="https://colab.research.google.com/github/ArielUW/FundamentalsNLP/blob/main/LST_NLP_01042025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NER CUSTOM MODEL

## imports

In [None]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json
import pandas as pd
from spacy import displacy
from pathlib import Path
from multiprocessing import Pool

## annotations processing

In [None]:
# Load the raw annotation export; it is passed to transform_data() further down.
with open(
    '/content/annotations(18).json',
    mode='r',
    encoding='utf-8',
) as annotations_file:
    data_to_transform = json.load(annotations_file)


def transform_data(data):
    """Normalise raw annotation records into ``[text, {"entities": [...]}]`` pairs.

    Args:
        data: Mapping with an ``'annotations'`` key holding the records. A
            usable record is a two-element list ``[text, annotation]`` whose
            ``annotation`` is a dict with an ``'entities'`` entry made of
            ``[start, end, label]`` triples.

    Returns:
        list: One ``[text, {"entities": [[start, end, label], ...]}]`` entry
        per usable record, in input order. Unusable records are reported on
        stdout and skipped.

    Raises:
        KeyError: If ``data`` has no ``'annotations'`` key.
        ValueError: If an entity does not unpack into exactly three items.
    """
    transformed_data = []
    for record in data['annotations']:
        # Guard: only non-empty two-element lists are treated as records.
        if not (record and isinstance(record, list) and len(record) == 2):
            print(f'Pominięto nieprawidłowy rekord: {record}')
            continue

        text, entity_data = record
        # The annotation half must be a dict. ``None`` would make the ``in``
        # test raise TypeError, and a string could pass it as a substring
        # match and then fail on the key lookup.
        if not isinstance(entity_data, dict) or 'entities' not in entity_data:
            print(f'Pominięto rekord bez "entities": {record}')
            continue

        # Rebuild every entity as a fresh [start, end, label] list.
        transformed_entities = [
            [start, end, label] for start, end, label in entity_data["entities"]
        ]
        transformed_data.append([text, {"entities": transformed_entities}])
    return transformed_data

# Normalise the raw annotations and persist them as pretty-printed UTF-8 JSON.
transformed_annotations = transform_data(data_to_transform)

output_file_path = '/content/data_transformed.json'
with open(output_file_path, 'w', encoding='utf-8') as file:
    json.dump(transformed_annotations, file, ensure_ascii=False, indent=4)

print(f'Transformacja zakończona. Wynik zapisano w pliku: {output_file_path}')

# Read the saved file back. The file was written as UTF-8 with
# ensure_ascii=False, so it must be decoded as UTF-8 explicitly rather than
# with the platform default; the context manager also closes the handle.
with open(output_file_path, 'r', encoding='utf-8') as file:
    cv_data = json.load(file)



## configuration files

In [None]:
#config template: https://spacy.io/usage/training

In [None]:
# Expand /content/base_config.cfg into a complete training config and save it
# as /content/config.cfg (spaCy fills in every value the base file leaves out).
!python -m spacy init fill-config /content/base_config.cfg /content/config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


## prep for training function

In [None]:
def get_spacy_doc(file, data):
  """Build a spaCy ``DocBin`` from character-offset entity annotations.

  Each text is tokenised with a blank English pipeline. Entities are taken in
  the order given; an entity whose character range overlaps one seen earlier
  in the same text is dropped. Remaining entities are converted to spans with
  ``alignment_mode='strict'``.

  Args:
    file: Writable text handle used as an error log. It receives one line per
      entity that could not be turned into a span and one line per document
      that could not be added.
    data: Iterable of ``(text, annot)`` pairs where ``annot['entities']``
      yields ``(start, end, label)`` triples of character offsets.

  Returns:
    The ``DocBin`` holding every document whose entities were assigned.
  """
  nlp = spacy.blank('en')
  db = DocBin()
  for text, annot in tqdm(data):
    doc = nlp.make_doc(text)
    annot = annot['entities']

    ents = []
    # Character offsets already claimed by an earlier entity of this text.
    # A set keeps the overlap test cheap even for long texts.
    claimed_chars = set()
    for start, end, label in annot:
      char_range = range(start, end)
      # First come, first served: skip entities overlapping an earlier one.
      if not claimed_chars.isdisjoint(char_range):
        continue

      # The range is reserved before alignment is checked, so an entity that
      # is rejected below still blocks later overlapping entities.
      claimed_chars.update(char_range)
      try:
        span = doc.char_span(start, end, label=label, alignment_mode='strict')
      except Exception as exc:
        # Best effort: record the failure and move on. KeyboardInterrupt and
        # SystemExit are deliberately not caught.
        file.write(str([start, end]) + "    " + repr(exc) + "    " + str(text) + "\n")
        continue

      if span is None:
        # char_span produced no span for these offsets; log them with the text.
        err_data = str([start, end]) + "    " + str(text) + "\n"
        file.write(err_data)
      else:
        ents.append(span)

    try:
      doc.ents = ents
      db.add(doc)
    except Exception as exc:
      # The document is left out of the DocBin, but the reason is logged
      # instead of being discarded silently.
      file.write("doc skipped: " + repr(exc) + "    " + str(text) + "\n")

  return db

## train/test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the records. The fixed seed keeps the split identical
# across kernel restarts, so training runs are comparable.
train, test = train_test_split(cv_data, test_size=0.2, random_state=42)
print(f'train: {len(train)}, test: {len(test)}')

# One shared log collects the problem spans of both splits. The context manager
# closes it even if conversion fails, and UTF-8 is set explicitly because the
# logged texts may contain non-ASCII characters.
with open('/content/train_file.txt', 'w', encoding='utf-8') as file:
    db = get_spacy_doc(file, train)
    db.to_disk('/content/train_data.spacy')
    db = get_spacy_doc(file, test)
    db.to_disk('/content/test_data.spacy')

100%|██████████| 7/7 [00:00<00:00, 528.00it/s]
100%|██████████| 2/2 [00:00<00:00, 389.66it/s]


## training procedure

In [None]:
# Train with the filled config and write the results to /content/output.
# Note: the held-out test split is passed as the dev set (--paths.dev).
!python -m spacy train /content/config.cfg  --output /content/output  --paths.train /content/train_data.spacy  --paths.dev /content/test_data.spacy


## implementation

In [None]:
# Load the trained pipeline from the model-best directory under the training output path.
nlp = spacy.load('/content/output/model-best')

In [None]:
# Sample bibliographic reference used to try out the loaded pipeline.
text = "Fetting, Constanze. 'The European green deal.' ESDN Report, December 2.9 (2020): 53."

In [None]:
# Run the loaded pipeline on the sample text; entities are read from doc.ents below.
doc = nlp(text)

In [None]:
# List every entity found in the document: its text followed by its label.
for entity in doc.ents:
  print(entity.text, entity.label_)

Fetting, Constanze AUTHOR
2020 DATE
