In [3]:
!pip install spacy-transformers
!python -m spacy download ru_core_news_lg

Collecting ru-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.8.0/ru_core_news_lg-3.8.0-py3-none-any.whl (513.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.4/513.4 MB[0m [31m768.4 kB/s[0m eta [36m0:00:00[0m00:01[0m00:16[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_lg')


In [5]:
import json

import typer
from pathlib import Path

from spacy.tokens import Span, DocBin, Doc
from spacy.vocab import Vocab
from wasabi import Printer
from spacy.tokenizer import Tokenizer
from spacy.lang.ru import Russian
from spacy.util import compile_infix_regex
import re
import spacy

# Blank Russian pipeline; convert_json calls it to tokenize every document.
nlp = spacy.blank("ru")

# Console printer used for status messages.
msg = Printer()

# Relation labels. MAP_LABELS maps each label onto itself; its values are the
# label set that convert_json fills in with 0.0 for unannotated entity pairs.
SYMM_LABELS = ["Binds"]
MAP_LABELS = {
    name: name
    for name in ("PART-OF", "LOCATED-AT", "CONNECTED-WITH", "IN-MANNER-OF")
}

# Input annotation files (JSON), one per split.
ann_train = "./dataset/all_relations_train.json"
ann_dev = "./dataset/all_relations_dev.json"
ann_test = "./dataset/all_relations_test.json"

# Output corpora (serialized DocBin), one per split.
train_file = "./rel_component/data/train.spacy"
dev_file = "./rel_component/data/dev.spacy"
test_file = "./rel_component/data/test.spacy"

def convert_json(json_loc: Path, train_file: Path) -> None:
    """Convert a JSON annotation file into a serialized spaCy corpus.

    Each example in the JSON list is expected to provide:
      * ``"document"``  - the raw text,
      * ``"tokens"``    - entity spans with ``"start"``/``"end"`` character
        offsets, an ``"entityLabel"`` and a ``"token_start"`` identifier,
      * ``"relations"`` - items with ``"head"``/``"child"`` (matching an
        entity's ``"token_start"``) and a ``"relationLabel"``.

    For every example a ``Doc`` is built with the entities as ``doc.ents`` and
    ``doc._.rel`` set to a dict keyed by ``(head_start, child_start)`` token
    index pairs (all ordered pairs of entity starts, self-pairs included).
    Each value maps a relation label to ``1.0`` (annotated) or ``0.0``
    (filled in for every label of ``MAP_LABELS`` that was not annotated).
    Only documents with at least one positive relation are kept.

    Entity spans whose character offsets do not align with token boundaries,
    and relations whose head or child cannot be resolved to a kept entity,
    are skipped and reported in a warning instead of aborting the conversion.

    Args:
        json_loc: Path (or path string) of the JSON annotation file.
        train_file: Path (or path string) of the ``.spacy`` file to write.
            Missing parent directories are created.

    Raises:
        KeyError: If an example lacks one of the expected keys.
    """
    Doc.set_extension("rel", default={}, force=True)

    # Load everything up front so the file handle is released before processing.
    with open(json_loc, encoding="utf8") as jsonfile:
        examples = json.load(jsonfile)

    docs = []
    skipped_entities = 0
    skipped_relations = 0

    for example in examples:
        # Tokenize, then rebuild a clean Doc from the words and spacing only.
        tokenized = nlp(example["document"])
        doc = Doc(
            nlp.vocab,
            words=[tok.text for tok in tokenized],
            spaces=[bool(tok.whitespace_) for tok in tokenized],
        )

        # Parse the entities and remember, for each annotation "token_start",
        # the index of the first token of the resulting span.
        entities = []
        span_starts = set()
        token_start_to_ent_start = {}
        for span in example["tokens"]:
            entity = doc.char_span(
                span["start"],
                span["end"],
                label=span["entityLabel"],
            )
            if entity is None:
                # char_span returns None when the offsets do not fall on
                # token boundaries; skip instead of crashing on entity.start.
                skipped_entities += 1
                continue
            token_start_to_ent_start[span["token_start"]] = entity.start
            entities.append(entity)
            span_starts.add(entity.start)

        doc.ents = entities

        # One (initially empty) label dict per ordered pair of entity starts.
        rels = {(x1, x2): {} for x1 in span_starts for x2 in span_starts}

        # Mark the annotated relations as positive.
        pos = 0
        for relation in example["relations"]:
            head = token_start_to_ent_start.get(relation["head"])
            child = token_start_to_ent_start.get(relation["child"])
            # Compare against None explicitly: token index 0 is a valid start.
            if head is None or child is None:
                skipped_relations += 1
                continue
            label = relation["relationLabel"]
            pair_labels = rels[(head, child)]
            if label not in pair_labels:
                pair_labels[label] = 1.0
                pos += 1

        # The annotation is complete, so fill in zeros where data is missing.
        for pair_labels in rels.values():
            for label in MAP_LABELS.values():
                pair_labels.setdefault(label, 0.0)

        doc._.rel = rels

        # Only keep documents with at least one positive case.
        if pos > 0:
            docs.append(doc)

    out_path = Path(train_file)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # store_user_data=True is required so the custom "rel" extension is saved.
    DocBin(docs=docs, store_user_data=True).to_disk(out_path)

    if skipped_entities or skipped_relations:
        msg.warn(
            f"Skipped {skipped_entities} misaligned entities and "
            f"{skipped_relations} relations with an unresolved head or child"
        )
    msg.info(
        f"{len(docs)} total sentences"
    )

In [6]:
# Build the training corpus from the training annotations.
convert_json(json_loc=ann_train, train_file=train_file)

[38;5;4mℹ 2005 total sentences[0m


In [7]:
# Build the development corpus (the output parameter is named `train_file`).
convert_json(json_loc=ann_dev, train_file=dev_file)

[38;5;4mℹ 253 total sentences[0m


In [8]:
# Build the test corpus (the output parameter is named `train_file`).
convert_json(json_loc=ann_test, train_file=test_file)

[38;5;4mℹ 251 total sentences[0m


In [None]:
# Run the spaCy project command "train_gpu" from ./rel_component. In the
# recorded run it invoked `spacy train configs/rel_trf.cfg` on data/train.spacy
# and data/dev.spacy with --gpu-id 0, writing the model to the `training` dir.
!cd ./rel_component && spacy project run train_gpu

[1m
Running command: /usr/bin/python3 -m spacy train configs/rel_trf.cfg --output training --paths.train data/train.spacy --paths.dev data/dev.spacy -c ./scripts/custom_functions.py --gpu-id 0
[38;5;4mℹ Saving to output directory: training[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
2025-03-01 23:04:21.682549: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740870261.703305   37630 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740870261.709736   37630 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-01 23:04:21.731447: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in perf

In [None]:
# Run the spaCy project command "evaluate" from ./rel_component. In the
# recorded run it invoked ./scripts/evaluate.py on training/model-best with
# data/test.spacy.
!cd ./rel_component && spacy project run evaluate

[1m
Running command: /usr/bin/python3 ./scripts/evaluate.py training/model-best data/test.spacy False
2025-03-02 00:01:05.500605: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740873665.547840   52146 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740873665.563192   52146 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-02 00:01:05.641303: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  self._model.load_sta