In [1]:
import evaluate
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import BertModel, BertConfig, BertTokenizer
from torch.nn import BCELoss
import torch
import torch.nn as nn
import os
import json

In [2]:
DATASET_NAME = "./datasets/ru-plus.csv"
TEST_SIZE = 0.3
MODEL_NAME = "ai-forever/sbert_large_nlu_ru"
SAVE_DIRECTORY = "./models/self_model_multi"
LABELS = 3

In [3]:
df = pd.read_csv(DATASET_NAME, delimiter="|")
df.columns = ["text", "label"]
df_label_0 = df[df["label"] == 0]
df_label_1 = df[df["label"] == 1]
multi_class = []
for i in range(50):
    row_0 = df_label_0.sample(n=1, random_state=np.random.RandomState())
    row_1 = df_label_1.sample(n=1, random_state=np.random.RandomState())
    text = f"{row_0['text'].values[0]}, {row_1['text'].values[0].lower()}"
    multi_class.append({'text': text, 'label': 3})

df = pd.concat([df, pd.DataFrame(multi_class)], ignore_index=True)
df['label'] = df['label'].map({0: [1, 0, 0], 1: [0, 1, 0], 2: [0, 0, 1], 3: [1, 1, 0]})

In [4]:
train, test_valid = train_test_split(df, test_size=TEST_SIZE, shuffle=True)
test, valid = train_test_split(test_valid, test_size=0.5)

In [5]:
THRESHOLD = 0.9
LEARNING_RATE = 1e-5
BATCH_SIZE = 16
NUM_EPOCHS = 32

In [6]:
bert_config = BertConfig.from_pretrained(MODEL_NAME, num_labels=LABELS)

In [7]:
MODEL_FILE = "model.bin"
CONFIG_FILE = "config.json"

class BertClassifier(nn.Module):
    def __init__(self, config):
        super(BertClassifier, self).__init__()
        self.config = config
        self.bert = BertModel.from_pretrained(MODEL_NAME, config=config)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        probabilities = self.sigmoid(logits)
        return probabilities
    def save_pretrained(self, save_directory):
        if not os.path.exists(save_directory):
            os.makedirs(save_directory)

        config_path = os.path.join(save_directory, CONFIG_FILE)
        with open(config_path, 'w') as f:
            json.dump(self.config.__dict__, f)

        model_path = os.path.join(save_directory, MODEL_FILE)
        torch.save(self.state_dict(), model_path)
    @classmethod
    def from_pretrained(cls, pretrained_model_path, *model_args, **kwargs):
        config = BertConfig.from_pretrained(pretrained_model_path)
        model = cls(config, *model_args, **kwargs)
        model.load_state_dict(torch.load(os.path.join(pretrained_model_path, MODEL_FILE)))
        model.eval()
        return model

In [8]:
model = BertClassifier(bert_config)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print()




In [10]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = BCELoss()

In [11]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=128)
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        item['texts'] = self.texts[idx]
        return item

In [12]:
train_texts = train['text'].tolist()
train_labels = train['label'].tolist()
dataset = TextDataset(train_texts, train_labels)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE)

valid_texts = valid['text'].tolist()
valid_labels = valid['label'].tolist()
valid_dataset = TextDataset(valid_texts, valid_labels)
validation_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

In [13]:
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(predictions, references):
    return metric.compute(predictions=np.array(predictions).astype(int).reshape(-1), references=np.array(references).astype(int).reshape(-1))

In [14]:
def validate(model, dataloader, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].float().to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            preds = outputs.cpu().numpy() > THRESHOLD
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    return total_loss / len(dataloader), compute_metrics(all_preds, all_labels)

In [15]:
prev_valid_loss = float('inf')
for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    model.train()
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].float().to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
         
    validation_loss, scores = validate(model, validation_dataloader, device)
    tqdm.write(f"Training Loss: {loss.item():.4f}, Validation Loss: {validation_loss:.4f}, {', '.join(f"{key}: {value:.4f}" for key, value in scores.items())}")
    print()
    if prev_valid_loss < validation_loss:
        break
    else:
        prev_valid_loss = validation_loss
        model.save_pretrained(SAVE_DIRECTORY)

Epoch 1/32


Training: 100%|██████████| 13/13 [00:16<00:00,  1.25s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training Loss: 0.5591, Validation Loss: 0.5050, accuracy: 0.6111, f1: 0.0000, precision: 0.0000, recall: 0.0000

Epoch 2/32


Training: 100%|██████████| 13/13 [00:15<00:00,  1.23s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training Loss: 0.3204, Validation Loss: 0.4029, accuracy: 0.6111, f1: 0.0000, precision: 0.0000, recall: 0.0000

Epoch 3/32


Training: 100%|██████████| 13/13 [00:15<00:00,  1.19s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training Loss: 0.2695, Validation Loss: 0.3172, accuracy: 0.6111, f1: 0.0000, precision: 0.0000, recall: 0.0000

Epoch 4/32


Training: 100%|██████████| 13/13 [00:13<00:00,  1.04s/it]


Training Loss: 0.1712, Validation Loss: 0.2389, accuracy: 0.6508, f1: 0.1852, precision: 1.0000, recall: 0.1020

Epoch 5/32


Training: 100%|██████████| 13/13 [00:14<00:00,  1.09s/it]


Training Loss: 0.1277, Validation Loss: 0.1831, accuracy: 0.7778, f1: 0.6000, precision: 1.0000, recall: 0.4286

Epoch 6/32


Training: 100%|██████████| 13/13 [00:12<00:00,  1.03it/s]


Training Loss: 0.1062, Validation Loss: 0.1419, accuracy: 0.8571, f1: 0.7750, precision: 1.0000, recall: 0.6327

Epoch 7/32


Training: 100%|██████████| 13/13 [00:11<00:00,  1.10it/s]


Training Loss: 0.0763, Validation Loss: 0.1158, accuracy: 0.9603, f1: 0.9462, precision: 1.0000, recall: 0.8980

Epoch 8/32


Training: 100%|██████████| 13/13 [00:13<00:00,  1.02s/it]


Training Loss: 0.0715, Validation Loss: 0.1020, accuracy: 0.9603, f1: 0.9462, precision: 1.0000, recall: 0.8980

Epoch 9/32


Training: 100%|██████████| 13/13 [00:13<00:00,  1.04s/it]


Training Loss: 0.0535, Validation Loss: 0.0932, accuracy: 0.9524, f1: 0.9362, precision: 0.9778, recall: 0.8980

Epoch 10/32


Training: 100%|██████████| 13/13 [00:13<00:00,  1.02s/it]


Training Loss: 0.0473, Validation Loss: 0.0872, accuracy: 0.9683, f1: 0.9583, precision: 0.9787, recall: 0.9388

Epoch 11/32


Training: 100%|██████████| 13/13 [00:12<00:00,  1.04it/s]


Training Loss: 0.0408, Validation Loss: 0.0809, accuracy: 0.9762, f1: 0.9691, precision: 0.9792, recall: 0.9592

Epoch 12/32


Training: 100%|██████████| 13/13 [00:13<00:00,  1.04s/it]


Training Loss: 0.0323, Validation Loss: 0.0758, accuracy: 0.9841, f1: 0.9796, precision: 0.9796, recall: 0.9796

Epoch 13/32


Training: 100%|██████████| 13/13 [00:10<00:00,  1.18it/s]


Training Loss: 0.0306, Validation Loss: 0.0733, accuracy: 0.9841, f1: 0.9796, precision: 0.9796, recall: 0.9796

Epoch 14/32


Training: 100%|██████████| 13/13 [00:11<00:00,  1.13it/s]


Training Loss: 0.0258, Validation Loss: 0.0792, accuracy: 0.9683, f1: 0.9583, precision: 0.9787, recall: 0.9388



In [16]:
tokenizer.save_pretrained(SAVE_DIRECTORY)

('./models/self_model_multi\\tokenizer_config.json',
 './models/self_model_multi\\special_tokens_map.json',
 './models/self_model_multi\\vocab.txt',
 './models/self_model_multi\\added_tokens.json')

In [17]:
test_texts = test['text'].tolist()
test_labels = test['label'].tolist()

In [18]:
def classify_text(model, tokenizer, text, threshold):
    model.eval()
    with torch.no_grad():
        tokenized_value = tokenizer(text,  padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        output_value = model(tokenized_value['input_ids'].to(device), attention_mask=tokenized_value['attention_mask'].to(device))
        predictions = output_value[0].cpu().numpy()
        output_value = predictions > threshold
        return output_value.astype(int).tolist(), predictions.tolist()

In [19]:
counter = 0
results = []
for test_text, test_label in zip(test_texts, test_labels):
    inputs = tokenizer(test_text, return_tensors="pt", truncation=True, padding='max_length', max_length=128)
    outputs, predictions = classify_text(model, tokenizer, test_text, THRESHOLD)
    results.append(outputs)
    if outputs != test_label:
        counter += 1
        print(f"Текст: {test_text}")
        print(f"Предсказано: {outputs}, Значение: {test_label}, Результат: {predictions}")
        print()
print(f"Всего ошибочно: {counter}")

print(', '.join(f"{key}: {value:.4f}" for key, value in metric.compute(predictions=np.array(results).reshape(-1), references=np.array(test_labels).reshape(-1)).items()))

Текст: Сколько времени?
Предсказано: [0, 0, 0], Значение: [0, 0, 1], Результат: [0.012759151868522167, 0.3429080545902252, 0.7172077894210815]

Всего ошибочно: 1
accuracy: 0.9921, f1: 0.9901, precision: 1.0000, recall: 0.9804
