In [1]:
import spacy
import pandas as pd
from tqdm.notebook import tqdm
from spacy.tokens import DocBin
from sklearn.model_selection import train_test_split

In [7]:
# Load the pretrained Russian pipeline once; later cells use the shared `nlp` object.
MODEL_NAME = "ru_core_news_sm"
nlp = spacy.load(MODEL_NAME)

# Show which components the loaded pipeline contains.
print(nlp.pipe_names)

['tok2vec', 'morphologizer', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [8]:
# Load the tab-separated review dataset (path is relative to the notebook).
df = pd.read_csv("../women-clothing-accessories.3-class.balanced.csv", delimiter='\t')

# Remove the 'neautral' rows: make_docs labels every non-'negative' row as
# positive, so this filter is what keeps that class out of the positives.
# NOTE(review): 'neautral' looks like a typo but presumably mirrors the label
# spelling used inside the CSV -- confirm against the data before changing it.
df = df[df['sentiment'] != 'neautral']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible. `.values` yields one array per row, which make_docs unpacks
# as (text, label).
train_ds, test_ds = train_test_split(df.values, test_size=0.2, random_state=42)

In [9]:
def make_docs(data, n_process=8):
    """
        Turn (text, label) pairs into spaCy documents carrying text-categorizer
        labels.

        Every text is processed by the notebook-level ``nlp`` pipeline and the
        resulting document gets two mutually exclusive entries in ``doc.cats``:
        "positive" and "negative".

        data: sequence of (text, label) pairs; must support ``len()`` so the
              progress bar can show a total
        n_process: number of worker processes passed to ``nlp.pipe``
                   (defaults to 8, the value that used to be hard-coded)
        returns: list of spaCy ``Doc`` objects with ``cats`` filled in
    """
    docs = []
    # as_tuples=True makes nlp.pipe yield (doc, label) for each (text, label) input.
    annotated = nlp.pipe(data, as_tuples=True, n_process=n_process)
    for doc, label in tqdm(annotated, total=len(data)):
        # Only the exact label 'negative' counts as negative; any other label
        # is treated as positive, so callers must filter out other classes first.
        is_negative = label == 'negative'
        doc.cats["positive"] = int(not is_negative)
        doc.cats["negative"] = int(is_negative)
        docs.append(doc)
    return docs

# Convert both splits to labelled spaCy documents (training split first, then
# the held-out split).
train_docs, valid_docs = make_docs(train_ds), make_docs(test_ds)

  0%|          | 0/48000 [00:00<?, ?it/s]

  0%|          | 0/12000 [00:00<?, ?it/s]

In [10]:
# Serialize each split into spaCy's binary DocBin format so the training CLI
# can read it from disk.
for out_path, split_docs in (("train.spacy", train_docs), ("valid.spacy", valid_docs)):
    doc_bin = DocBin(docs=split_docs)
    doc_bin.to_disk(out_path)

In [12]:
# Expand base_config.cfg into a complete training config with all values
# auto-filled and save it as config.cfg.
# base_config.cfg is expected to already exist next to this notebook.
! python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [13]:
# Train the pipeline described by config.cfg on train.spacy, using valid.spacy
# as the dev set. Results are saved under ./output and training runs on GPU 0.
! python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./valid.spacy --output ./output --gpu-id 0

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'textcat'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  -------------  ------------  ----------  ------
  0       0           0.00          0.75       33.67    0.34
  0     200           0.35        113.91       91.98    0.92
  0     400           1.66         45.06       92.35    0.92
  0     600           2.21         43.61       93.02    0.93
  0     800           2.62         41.33       93.41    0.93
  1    1000           2.73         40.08       93.30    0.93
  1    1200           2.69         32.39       93.72    0.94
  1    1400           3.49         31.17       93.11    0.93
  1    1600           3.90         36.20       93.35    0.93
  1    1800           4.60         42.15       92.77    0.93
  2    2000           4.55         40.23       93.23 