In [1]:
# Directory passed to both BertClassifier.from_pretrained and BertTokenizer.from_pretrained.
MODEL_NAME = "./models/self_model_multi"
# A label is reported as present only when its probability is strictly above this value.
THRESHOLD = 0.9

# Human-readable class names; the list position is the integer class id.
LABELS = ['greeting', 'how are you', 'unknown']
# id -> name
LABEL_MAP = dict(enumerate(LABELS))
# name -> id (inverse of LABEL_MAP)
ID_MAP = {name: index for index, name in LABEL_MAP.items()}

In [2]:
from transformers import BertTokenizer, BertModel, BertConfig
import os
import json
import torch
import torch.nn as nn

# File names used inside a saved model directory.
MODEL_FILE = "model.bin"      # serialized state_dict
CONFIG_FILE = "config.json"   # serialized model configuration


class BertClassifier(nn.Module):
    """BERT encoder followed by a linear head that emits per-label sigmoid probabilities.

    Each output unit goes through its own sigmoid (not a softmax), so the
    labels are scored independently of one another.
    """

    def __init__(self, config):
        """Build the encoder and the classification head.

        Args:
            config: configuration object forwarded to ``BertModel.from_pretrained``;
                must expose ``hidden_size`` and ``num_labels``.
        """
        super().__init__()
        self.config = config
        # The encoder is always initialised from this hub checkpoint; when the
        # instance is created through ``from_pretrained`` below, those weights
        # are subsequently overwritten by the saved state dict.
        self.bert = BertModel.from_pretrained('ai-forever/sbert_large_nlu_ru', config=config)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.dropout = nn.Dropout(0.1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask=None):
        """Return per-label probabilities for a batch of token ids.

        Args:
            input_ids: token id tensor passed straight to the encoder.
            attention_mask: optional mask passed straight to the encoder.

        Returns:
            Tensor of sigmoid probabilities whose last dimension is
            ``config.num_labels`` (presumably shaped (batch, num_labels)).
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        logits = self.classifier(pooled_output)
        return self.sigmoid(logits)

    def save_pretrained(self, save_directory):
        """Write the config and the state dict into ``save_directory``.

        The directory (including missing parents) is created when needed.

        Args:
            save_directory: target directory path.
        """
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(save_directory, exist_ok=True)

        # Use the config's own serializer: dumping ``__dict__`` directly fails
        # as soon as the config holds a non-JSON value (e.g. a torch dtype).
        config_path = os.path.join(save_directory, CONFIG_FILE)
        self.config.to_json_file(config_path)

        model_path = os.path.join(save_directory, MODEL_FILE)
        torch.save(self.state_dict(), model_path)

    @classmethod
    def from_pretrained(cls, pretrained_model_path, *model_args, **kwargs):
        """Recreate a classifier from a directory written by ``save_pretrained``.

        Args:
            pretrained_model_path: directory containing CONFIG_FILE and MODEL_FILE.
            *model_args, **kwargs: forwarded to the class constructor.

        Returns:
            The model with the saved weights loaded, switched to eval mode.
        """
        config = BertConfig.from_pretrained(pretrained_model_path)
        model = cls(config, *model_args, **kwargs)
        weights_path = os.path.join(pretrained_model_path, MODEL_FILE)
        # map_location="cpu" lets a checkpoint saved on a GPU be loaded on a
        # machine without CUDA; the caller moves the model to its device later.
        # NOTE: torch.load unpickles the file - only load trusted checkpoints.
        state_dict = torch.load(weights_path, map_location="cpu")
        model.load_state_dict(state_dict)
        model.eval()
        return model

# Rebuild the classifier from the config and state dict stored under MODEL_NAME
# (from_pretrained returns it already switched to eval mode).
model = BertClassifier.from_pretrained(MODEL_NAME)
# The tokenizer is loaded from the same directory as the model.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to that device; as the cell's last expression this also
# displays the module repr below.
model.to(device)

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(120138, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12, 

In [4]:
import pandas as pd
import numpy as np

DATASET_NAME = "./datasets/ru-plus.csv"
# Seed for drawing the synthetic two-label samples, so that a fresh
# "Restart & Run All" evaluates exactly the same texts every time.
SAMPLING_SEED = 42
# How many synthetic "class 0 text, class 1 text" samples to append.
N_COMBINED_SAMPLES = 50
# Integer class id -> multi-hot target. Position 0 flags class 0, position 1
# flags class 1; class 2 sets neither and the synthetic class 3 sets both.
LABEL_VECTORS = {0: [1, 0], 1: [0, 1], 2: [0, 0], 3: [1, 1]}

df = pd.read_csv(DATASET_NAME, delimiter="|")
df.columns = ['text', 'label']
df['label'] = df['label'].astype(int)
df_label_0 = df[df["label"] == 0]
df_label_1 = df[df["label"] == 1]

# One seeded generator shared by all draws: its state advances between calls,
# so the samples differ from each other but are reproducible across runs.
sampling_rng = np.random.RandomState(SAMPLING_SEED)
multi_class = []
for i in range(N_COMBINED_SAMPLES):
    row_0 = df_label_0.sample(n=1, random_state=sampling_rng)
    row_1 = df_label_1.sample(n=1, random_state=sampling_rng)
    # Join the two texts with a comma; the second part is lower-cased.
    text = f"{row_0['text'].values[0]}, {row_1['text'].values[0].lower()}"
    multi_class.append({'text': text, 'label': 3})

df = pd.concat([df, pd.DataFrame(multi_class)], ignore_index=True)

# .map() would silently turn an unexpected class id into NaN; fail loudly instead.
unexpected_labels = set(df['label'].unique()) - set(LABEL_VECTORS)
if unexpected_labels:
    raise ValueError(f"Unexpected label values in {DATASET_NAME}: {sorted(unexpected_labels)}")
df['label'] = df['label'].map(LABEL_VECTORS)

texts = df['text'].tolist()
labels = df['label'].tolist()

In [5]:
def classify_text(model, tokenizer, text, threshold, max_length=128):
    """Classify one text and return thresholded label flags plus raw scores.

    Args:
        model: module called as ``model(input_ids, attention_mask=...)``; its
            output is indexed with ``[0]`` to pick the first batch item.
        tokenizer: callable that returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        text: input passed to the tokenizer. Only the first item of the model
            output is used, so this is meant for a single text.
        threshold: a label is flagged (1) when its score is strictly greater
            than this value.
        max_length: length the tokenizer pads/truncates to (default 128).

    Returns:
        Tuple ``(flags, scores)``: ``flags`` is a list of 0/1 ints and
        ``scores`` the list of raw model outputs for the first batch item.

    Side effects:
        Switches ``model`` to eval mode.
    """
    model.eval()
    # Send the inputs to wherever the model actually lives rather than relying
    # on a notebook-global ``device``; fall back to CPU for parameter-less models.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
        encoded = tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        )
        probabilities = model(
            encoded['input_ids'].to(target_device),
            attention_mask=encoded['attention_mask'].to(target_device),
        )
        predictions = probabilities[0].cpu().numpy()

    flags = predictions > threshold
    return flags.astype(int).tolist(), predictions.tolist()

In [6]:
import evaluate
# Bundle the four metrics so that one compute() call reports all of them.
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])

In [7]:
# Classify every text, print the samples whose thresholded prediction differs
# from the reference label vector, and count them.
counter = 0
results = []
for test_text, test_label in zip(texts, labels):
    # classify_text tokenizes the text itself, so no separate tokenizer call is needed here.
    outputs, predictions = classify_text(model, tokenizer, test_text, THRESHOLD)
    results.append(outputs)
    if outputs != test_label:
        counter += 1
        print(f"Текст: {test_text}")
        print(f"Предсказано: {outputs}, Значение: {test_label}, Результат: {predictions}")
        print()
print(f"Всего ошибочно: {counter}")

# The metrics are computed over the flattened arrays, i.e. every individual
# label decision counts as one binary prediction.
scores = metric.compute(
    predictions=np.array(results).reshape(-1),
    references=np.array(labels).reshape(-1),
)
print(', '.join(f"{key}: {value:.4f}" for key, value in scores.items()))

Всего ошибочно: 0
accuracy: 1.0000, f1: 1.0000, precision: 1.0000, recall: 1.0000
