In [None]:
!pip install spacy
!pip install spacy-transformers
!python -m spacy download ru_core_news_lg

In [None]:
!python -m spacy convert ./"init dataset"/all_ner_test.tsv ./data -t json -n 1 -c iob -l ru
!python -m spacy convert ./"init dataset"/all_ner_test.tsv ./data -t json -n 1 -c iob -l ru

In [None]:
# Convert the train and test JSON files in ./data into the `spacy` file type
# (written back to ./data/); the training command later in this notebook reads
# ./data/all_ner_train.spacy and ./data/all_ner_test.spacy.
!python -m spacy convert ./data/all_ner_train.json ./data/ -t spacy
!python -m spacy convert ./data/all_ner_test.json ./data/ -t spacy

In [None]:
import typer
from pathlib import Path

import spacy

def create_config(model_name: str, component_to_update: str, output_path: Path):
    """Write a training config that updates a single component of a pipeline.

    The config is derived from the config of the pipeline loaded via
    ``spacy.load(model_name)``. Every component is sourced from that pipeline;
    all components except ``component_to_update`` are listed as frozen.

    Args:
        model_name: Name passed to ``spacy.load`` and used as the ``source``
            of every component, and as the tokenizer/vocab source in the
            ``before_init`` callback.
        component_to_update: Name of the component left out of
            ``frozen_components``; its entry also gets ``replace_listeners``.
        output_path: Destination handed to the config's ``to_disk``.
    """
    pipeline = spacy.load(model_name)

    # Work on a copy so the loaded pipeline's own config is left untouched.
    cfg = pipeline.config.copy()

    # Take the corpora section and the training logger from a blank pipeline
    # of the same language instead of the values stored in the loaded one.
    blank_cfg = spacy.blank(pipeline.lang).config
    cfg["corpora"] = blank_cfg["corpora"]
    cfg["training"]["logger"] = blank_cfg["training"]["logger"]

    # Tokenizer and vocab are copied from the base model before init, so the
    # separate lookups / vectors initialization entries are cleared.
    cfg["initialize"].update(
        {
            "before_init": {
                "@callbacks": "spacy.copy_from_base_model.v1",
                "tokenizer": model_name,
                "vocab": model_name,
            },
            "lookups": None,
            "vectors": None,
        }
    )

    # Everything but the target component is frozen during training.
    cfg["training"]["frozen_components"] = [
        name for name in pipeline.component_names if name != component_to_update
    ]

    # Source each component from the base model; only the target component
    # additionally replaces its tok2vec listener.
    for name in pipeline.component_names:
        entry = {"source": model_name}
        if name == component_to_update:
            entry["replace_listeners"] = ["model.tok2vec"]
        cfg["components"][name] = entry

    cfg.to_disk(output_path)


In [None]:
!pip install "ru_patents_ner @ https://huggingface.co/Delicalib/ru_patents_ner/resolve/main/ru_patents_ner-any-py3-none-any.whl"

In [None]:
# Generate ./pretrain_config.cfg from the installed "ru_patents_ner" pipeline,
# with "ner" as the only component that is not frozen.
create_config("ru_patents_ner", "ner", "./pretrain_config.cfg")

In [None]:
# Train from the generated config and write results to ./training/.
# The train/dev corpus paths and the eval_frequency, max_steps and patience
# training settings are overridden on the command line; --gpu-id 0 requests
# GPU 0.
!python -m spacy train ./pretrain_config.cfg --output ./training/ --paths.train ./data/all_ner_train.spacy --paths.dev ./data/all_ner_test.spacy --training.eval_frequency 10 --training.max_steps 500 --training.patience 50 --gpu-id 0