In [6]:
import bz2
import multiprocessing
import pickle
from typing import Optional

import spacy
import tqdm
from datasets import Dataset, ClassLabel, Sequence

In [2]:
# Load the spaCy pipeline once at module level; process_spacy() reads this global `nlp`.
nlp = spacy.load("nl_core_news_md")

In [3]:
# Read the pickled corpus from disk. Unpickling can execute arbitrary code,
# so this file must come from a trusted source.
with open("nos.pkl", "rb") as file:
  corpus = pickle.load(file)

# The corpus carries its own token and document counts; echo them as a sanity check.
print("Loaded corpus, containing", corpus["tokens"], "tokens in", corpus["docs"], "documents.")

Loaded corpus, containing 1000682 tokens in 5897 documents.


In [4]:
# Label tables are built once at definition time instead of on every call;
# relabel() is looked up for every token that gets converted.
_CONLL_LABELS = {
  "PERSON": "PER", "COMPANY": "ORG", "GPE": "LOC",
  "EVENT": "MISC", "FAC": "MISC", "LANGUAGE": "MISC", "LAW": "MISC",
  "NORP": "MISC", "PRODUCT": "MISC", "WORK_OF_ART": "MISC",
  # Labels that are already in the target scheme map to themselves.
  "MISC": "MISC", "PER": "PER", "ORG": "ORG", "LOC": "LOC",
}
# Entity types that are dropped, i.e. treated as "no entity" by the caller.
_NON_ENTITY_LABELS = frozenset({"CARDINAL", "ORDINAL", "DATE", "PERCENT", "QUANTITY", "TIME", "MONEY"})


def relabel(ent_label: str) -> Optional[str]:
  """
  Returns the CoNLL-2002 label (PER, ORG, LOC or MISC) for a spaCy entity label.

  Returns None when `ent_label` is the empty string or one of the excluded
  types (CARDINAL, ORDINAL, DATE, PERCENT, QUANTITY, TIME, MONEY).

  Raises KeyError for any other label that has no mapping, so an unexpected
  label is not silently dropped.
  """
  if ent_label == "" or ent_label in _NON_ENTITY_LABELS:
    return None
  return _CONLL_LABELS[ent_label]

def convert_ent(token) -> str:
  """
  Returns the CoNLL-2002 IOB-style entity label for a spaCy token.

  The result is the token's `ent_iob_` prefix joined with the relabelled
  entity type (for example "B-PER"), or "O" when relabel() yields no label
  for the token's `ent_type_`.
  """
  # Look the label up once; the original evaluated relabel() twice per token.
  label = relabel(token.ent_type_)
  if not label:
    return "O"
  return token.ent_iob_ + "-" + label

def process_spacy(docs: list):
  """
  Runs the module-level spaCy pipeline `nlp` over `docs` and returns a
  `datasets.Dataset` with one row per document.

  Columns:
    ids      -- running document index as a string ("0", "1", ...)
    ner_tags -- one class id per token, typed as Sequence(ClassLabel) over
                O, B-/I-PER, B-/I-ORG, B-/I-LOC, B-/I-MISC
    tokens   -- the text of each token

  `docs` is passed straight to `nlp.pipe`; labels come from convert_ent().
  """
  # `import tqdm` alone does not load the `tqdm.notebook` submodule, so the
  # attribute access only worked when another import had loaded it already.
  # tqdm.auto picks the notebook widget when available and falls back to the
  # console bar otherwise.
  from tqdm.auto import tqdm as progress_bar

  classlabels = ClassLabel(num_classes=9, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'])

  # nlp.pipe yields lazily, so the bar cannot infer a length on its own;
  # hand it the input size when the input has one.
  total = len(docs) if hasattr(docs, "__len__") else None
  pipeline = nlp.pipe(docs, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])

  ids = []
  store = []
  tokens = []
  for index, doc in enumerate(progress_bar(pipeline, total=total)):
    ids.append(str(index))
    store.append([classlabels.str2int(convert_ent(tok)) for tok in doc])
    tokens.append([tok.text for tok in doc])

  ds = Dataset.from_dict({"ids": ids,
                          "ner_tags": store,
                          "tokens": tokens})

  # Assigning into `ds.features[...]` is not a supported way to retype a
  # column; cast_column returns a dataset whose schema actually carries the
  # ClassLabel names.
  return ds.cast_column("ner_tags", Sequence(feature=classlabels))

In [5]:
# Tag the texts of the loaded corpus and collect the result as a datasets.Dataset.
dataset = process_spacy(corpus["texts"])

0it [00:00, ?it/s]

In [8]:
# Write the dataset as a bz2-compressed pickle. The context manager closes
# (and thereby flushes) the compressed stream even if pickling raises.
with bz2.BZ2File("data/nos.bz2", 'w') as target_file:
  pickle.dump(dataset, target_file)

In [81]:
import zipfile

# NOTE(review): "nos.train" is not created by any cell visible in this notebook —
# confirm where it comes from before relying on Restart & Run All.
# The archive must be closed for its central directory to be written; the
# context manager guarantees that instead of relying on garbage collection.
with zipfile.ZipFile("data/nos.zip", "w") as archive:
  archive.write("nos.train", compress_type=zipfile.ZIP_DEFLATED)