https://huggingface.co/cointegrated/LaBSE-en-ru
https://huggingface.co/DeepPavlov/rubert-base-cased

In [None]:
pip install transformers datasets evaluate accelerate

In [10]:
import pandas as pd
import numpy as np
import torch
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer,BertTokenizer, BertForSequenceClassification,DataCollatorWithPadding

# Взаимоотношение между текстами

In [None]:
model_checkpoint = 'cointegrated/rubert-base-cased-nli-twoway'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

In [None]:
if torch.cuda.is_available():
    model.cuda()

In [None]:
text1 = 'Сократ - человек, а все люди смертны.'
text2 = 'Сократ никогда не умрёт.'

In [None]:
with torch.inference_mode():
    out = model(**tokenizer(text1, text2, return_tensors='pt').to(model.device))
    proba = torch.softmax(out.logits, -1).cpu().numpy()[0]

In [None]:
print({v: proba[k] for k, v in model.config.id2label.items()})

# Классификация текстов

In [14]:
model_name = "cointegrated/rubert-tiny2"  # компактная русскоязычная модель
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)  # если у вас два класса

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
if torch.cuda.is_available():
    model.cuda()

In [43]:
ds = load_dataset("ai-forever/ru-reviews-classification")

In [17]:
def preprocess_function(examples):
    return tokenizer(examples["text"],truncation=True)

In [18]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label_text', 'label', 'id'],
        num_rows: 45000
    })
    validation: Dataset({
        features: ['text', 'label_text', 'label', 'id'],
        num_rows: 15000
    })
    test: Dataset({
        features: ['text', 'label_text', 'label', 'id'],
        num_rows: 15000
    })
})

In [44]:
train = ds['train']
val = ds['validation']
test = ds['test']
del ds

In [20]:
len(set([elem['label'] for elem in train]))

3

In [21]:
train[0]

{'text': 'всё пришло спасибо. только немного короче чем я ожидала\nтак всё супер',
 'label_text': 'positive',
 'label': 2,
 'id': '64971'}

In [22]:
train = train.map(preprocess_function,remove_columns=['text','id','label_text'])

Map:   0%|          | 0/45000 [00:00<?, ? examples/s]

In [23]:
val = val.map(preprocess_function,remove_columns=['text','id','label_text'])

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [27]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [24]:
accuracy = evaluate.load("accuracy")

In [25]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [33]:
training_args = TrainingArguments(
    output_dir="cache",
    learning_rate=3e-4,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=3,
    fp16=True
)   

# Создание тренера
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)



In [34]:
# Обучение модели
trainer.train()

  0%|          | 0/1408 [00:00<?, ?it/s]

{'loss': 0.17, 'grad_norm': 6.078038215637207, 'learning_rate': 0.00019346590909090908, 'epoch': 0.71}


  0%|          | 0/1875 [00:00<?, ?it/s]

{'eval_loss': 0.9716960787773132, 'eval_accuracy': 0.7182666666666667, 'eval_runtime': 8.125, 'eval_samples_per_second': 1846.154, 'eval_steps_per_second': 230.769, 'epoch': 1.0}
{'loss': 0.156, 'grad_norm': 7.450245380401611, 'learning_rate': 8.693181818181818e-05, 'epoch': 1.42}


  0%|          | 0/1875 [00:00<?, ?it/s]

{'eval_loss': 1.2525031566619873, 'eval_accuracy': 0.7192, 'eval_runtime': 7.591, 'eval_samples_per_second': 1976.024, 'eval_steps_per_second': 247.003, 'epoch': 2.0}
{'train_runtime': 85.9629, 'train_samples_per_second': 1046.963, 'train_steps_per_second': 16.379, 'train_loss': 0.14866039698774164, 'epoch': 2.0}


TrainOutput(global_step=1408, training_loss=0.14866039698774164, metrics={'train_runtime': 85.9629, 'train_samples_per_second': 1046.963, 'train_steps_per_second': 16.379, 'total_flos': 211378782826368.0, 'train_loss': 0.14866039698774164, 'epoch': 2.0})

In [35]:
torch.cuda.empty_cache()

In [36]:
model.eval() 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(83828, 312, padding_idx=0)
      (position_embeddings): Embedding(2048, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-

In [46]:
del train
del val

In [38]:
from tqdm import tqdm

In [45]:
label = test['label']
test = test.remove_columns(['label'])

In [47]:
test[0]

{'text': 'Майка очень коротка, после стирки ещё сильнее села, к продавцу претензий нет)',
 'label_text': 'negative',
 'id': '17467'}

In [48]:
lst=[]
with torch.no_grad():
    for input in tqdm(test['text']):
        t = tokenizer(input, padding=True, truncation=True, return_tensors='pt')
        outputs = model(**{k: v.to(model.device) for k, v in t.items()}).logits.argmax(-1)
        lst.append(int(outputs.cpu()))

100%|██████████| 15000/15000 [00:46<00:00, 325.26it/s]


In [51]:
print(accuracy.compute(predictions=lst, references=label))

{'accuracy': 0.7193333333333334}
