In [5]:
!unzip archive.zip

Archive:  archive.zip
  inflating: dev.jsonl               
  inflating: test.jsonl              
  inflating: train.jsonl             


In [6]:
!pip install spacy
%pip install -U 'spacy[transformers]'
!pip install thinc==8.2.3

Collecting spacy-transformers<1.4.0,>=1.1.2 (from spacy[transformers])
  Using cached spacy_transformers-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (197 kB)
Collecting transformers<4.37.0,>=3.4.0 (from spacy-transformers<1.4.0,>=1.1.2->spacy[transformers])
  Using cached transformers-4.36.2-py3-none-any.whl (8.2 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.8.0->spacy-transformers<1.4.0,>=1.1.2->spacy[transformers])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.8.0->spacy-transformers<1.4.0,>=1.1.2->spacy[transformers])
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch>=1.8.0->spacy-transformers<1.4.0,>=1.1.2->spacy[transformers])
  Using cached nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)
Collecting nvidia-cusparse-cu12==12.1.0.106 (from torc

In [36]:
#!pip install spacy-transformers
%pip install -U 'spacy[transformers]'




In [1]:
import pandas as pd
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans
from sklearn.model_selection import train_test_split
from datetime import datetime

In [2]:
train = pd.read_json('train.jsonl', lines=True)
dev = pd.read_json('dev.jsonl', lines=True)
test = pd.read_json('test.jsonl', lines=True)

train, val = train_test_split(train, test_size=0.2, random_state=42)
train.reset_index(inplace=True)
val.reset_index(inplace=True)

dev.rename(columns={"senences": "sentences"}, inplace=True)
test.rename(columns={"senences": "sentences"}, inplace=True)

In [9]:
train.head()

Unnamed: 0,index,ners,sentences,id
0,433,"[[4, 12, CITY], [64, 82, DATE], [326, 333, CIT...",Под Алма-Атой разбился пассажирский самолёт\n\...,433
1,517,"[[0, 9, NATIONALITY], [58, 72, PERSON], [101, ...",Российский магнат устроил самую дорогую свадьб...,517
2,208,"[[11, 18, STATE_OR_PROVINCE], [27, 31, NUMBER]...",Стрельба в Мичигане: убиты шесть человек\nКала...,208
3,332,"[[0, 10, PERSON], [29, 39, PERSON], [45, 47, P...",Юрий Лужков займётся грибами\nЮрий Лужков\nЭкс...,332
4,220,"[[42, 47, COUNTRY], [66, 72, NATIONALITY], [82...","Одолев в четвертьфинале канадцев, сборная Росс...",220


In [10]:
with open('base_config.cfg', 'w') as f:
    f.write("""
# This is an auto-generated partial config. To use it with 'spacy train'
# you can run spacy init fill-config to auto-fill all default settings:
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
[paths]
train = null
dev = null
vectors = null
[system]
gpu_allocator = "pytorch"

[nlp]
lang = "ru"
pipeline = ["transformer","ner"]
batch_size = 1

[components]

[components.transformer]
factory = "transformer"

[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v3"
name = "bert-base-multilingual-uncased"
tokenizer_config = {"use_fast": true}

[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96

[components.ner]
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 32
maxout_pieces = 2
use_upper = false
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0

[components.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0

[training]
accumulate_gradient = 3
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"

[training.optimizer]
@optimizers = "Adam.v1"

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 5e-5

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 1000
buffer = 64

[initialize]
vectors = ${paths.vectors}
""")

In [11]:
!python -m spacy init fill-config base_config.cfg config.cfg

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [12]:
nlp = spacy.blank("ru")
doc_bin = DocBin()

In [13]:
def prepare_dataset(data, out_name="./training_data.spacy"):
    skipped = 0
    for _, row in tqdm(data.iterrows()):
        tokens = row['sentences']
        doc = nlp.make_doc(tokens)
        ents = []
        index = 0
        for ner in row['ners']:
            start, end, label = ner
            span = doc.char_span(start, end+1, label=label, alignment_mode="contract")

            if span is None or span.text != span.text.strip():
                skipped += 1
                continue
            ents.append(span)
        filtered_ents = filter_spans(ents)
        doc.ents = filtered_ents
        doc_bin.add(doc)

    print(skipped)
    doc_bin.to_disk(out_name)

In [14]:
prepare_dataset(train, "./training_data.spacy")
prepare_dataset(val, "./validation_data.spacy")

415it [00:04, 100.36it/s]


146


104it [00:00, 106.24it/s]


29


In [15]:
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./validation_data.spacy --gpu-id 0 #--gpu-id 1

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
tokenizer_config.json: 100% 48.0/48.0 [00:00<00:00, 225kB/s]
config.json: 100% 625/625 [00:00<00:00, 3.18MB/s]
vocab.txt: 100% 872k/872k [00:00<00:00, 9.48MB/s]
tokenizer.json: 100% 1.72M/1.72M [00:00<00:00, 20.5MB/s]
model.safetensors: 100% 672M/672M [00:05<00:00, 122MB/s]
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  0       0        1179.42    447.41    0.50    0.46    0.54    0.00
  1     200      380412.44  71387.06   50.71   59.50   44.19    0.51
  2     400       34257.96  28126.42   75.40   78.36   72.66    0.75
  4     600       26412.14  20732.51   83.34   83.67   83.02    0.83
  5     800       12181.72 

In [3]:
def predict_ner(text):
    nlp_ner = spacy.load("/content/model-best")
    doc = nlp_ner(text)
    out = []
    for ent in doc.ents:
        start = ent.start_char
        end = ent.end_char - 1
        label = ent.label_
        out_list = [start, end, label]
        out.append(out_list)

    return out

In [None]:
test['ners'] = test.sentences.apply(predict_ner)
test.drop(['sentences'], axis=1, inplace=True)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
output_path = 'test.jsonl'
with open(output_path, "w") as f:
    f.write(test.to_json(orient='records', lines=True, force_ascii=False))
!zip test test.jsonl

In [34]:
!zip -r model-best.zip model-best

updating: model-best/ (stored 0%)
  adding: model-best/transformer/ (stored 0%)
  adding: model-best/transformer/cfg (stored 0%)
  adding: model-best/transformer/model (deflated 8%)
  adding: model-best/tokenizer (deflated 84%)
  adding: model-best/meta.json (deflated 70%)
  adding: model-best/vocab/ (stored 0%)
  adding: model-best/vocab/vectors (deflated 45%)
  adding: model-best/vocab/strings.json (deflated 90%)
  adding: model-best/vocab/lookups.bin (stored 0%)
  adding: model-best/vocab/vectors.cfg (stored 0%)
  adding: model-best/vocab/key2row (stored 0%)
  adding: model-best/ner/ (stored 0%)
  adding: model-best/ner/cfg (deflated 33%)
  adding: model-best/ner/moves (deflated 80%)
  adding: model-best/ner/model (deflated 8%)
  adding: model-best/config.cfg (deflated 61%)


In [32]:
ls

archive.zip      config.cfg  [0m[01;34mmodel-best[0m/  [01;34msample_data[0m/  test.zip             train.jsonl
base_config.cfg  dev.jsonl   [01;34mmodel-last[0m/  test.jsonl    training_data.spacy  validation_data.spacy
