In [1]:
import evaluate
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, AutoModelForSequenceClassification)
from torch.optim import AdamW
from torch.nn import BCEWithLogitsLoss
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [2]:
DATASET_NAME = "./datasets/ru-plus.csv"
TEST_SIZE = 0.3
MODEL_NAME = "ai-forever/sbert_large_nlu_ru"
SAVE_DIRECTORY = "./models/self_multi"

In [3]:
df = pd.read_csv(DATASET_NAME, delimiter="|")
df.columns = ["text", "label"]
df_label_0 = df[df["label"] == 0]
df_label_1 = df[df["label"] == 1]
multi_class = []
for i in range(50):
    row_0 = df_label_0.sample(n=1, random_state=np.random.RandomState())
    row_1 = df_label_1.sample(n=1, random_state=np.random.RandomState())
    text = f"{row_0['text'].values[0]}, {row_1['text'].values[0].lower()}"
    multi_class.append({'text': text, 'label': 3})

df = pd.concat([df, pd.DataFrame(multi_class)], ignore_index=True)
df['label'] = df['label'].map({0: [1, 0, 0], 1: [0, 1, 0], 2: [0, 0, 1], 3: [1, 1, 0]})

In [4]:
train, test_valid = train_test_split(df, test_size=TEST_SIZE, shuffle=True)
test, valid = train_test_split(test_valid, test_size=0.5)

In [16]:
THRESHOLD = 0.9
LEARNING_RATE = 1e-5
BATCH_SIZE = 16
NUM_EPOCHS = 16

In [17]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    problem_type="multi_label_classification",
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = BCEWithLogitsLoss()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/sbert_large_nlu_ru and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [19]:
train_texts = train['text'].tolist()
train_labels = train['label'].tolist()
dataset = TextDataset(train_texts, train_labels)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE)

valid_texts = valid['text'].tolist()
valid_labels = valid['label'].tolist()
valid_dataset = TextDataset(valid_texts, valid_labels)
validation_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

In [20]:
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def predict(val):
    return (val > THRESHOLD).astype(int).reshape(-1)

def compute_metrics(predictions, references):
    return metric.compute(predictions=np.array(predictions).astype(bool).reshape(-1), references=np.array(references).astype(bool).reshape(-1))

In [21]:
def validate(model, dataloader, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].float().to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            
            loss = criterion(outputs.logits, labels)
            total_loss += loss.item()
            
            preds = torch.sigmoid(outputs.logits).cpu().numpy() > THRESHOLD
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    return total_loss / len(dataloader), compute_metrics(all_preds, all_labels)

In [22]:
for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    model.train()
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].float().to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
         
    validation_loss, scores = validate(model, validation_dataloader, device)
    tqdm.write(f"Training Loss: {loss.item():.4f}, Validation Loss: {validation_loss:.4f}, {', '.join(f"{key}: {value:.4f}" for key, value in scores.items())}")
    print()

Epoch 1/16


Training: 100%|██████████| 13/13 [00:01<00:00,  7.64it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training Loss: 0.5787, Validation Loss: 0.5001, accuracy: 0.6032, f1: 0.0000, precision: 0.0000, recall: 0.0000

Epoch 2/16


Training: 100%|██████████| 13/13 [00:01<00:00,  8.80it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training Loss: 0.4453, Validation Loss: 0.3895, accuracy: 0.6032, f1: 0.0000, precision: 0.0000, recall: 0.0000

Epoch 3/16


Training: 100%|██████████| 13/13 [00:01<00:00,  8.82it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training Loss: 0.2687, Validation Loss: 0.2985, accuracy: 0.6032, f1: 0.0000, precision: 0.0000, recall: 0.0000

Epoch 4/16


Training: 100%|██████████| 13/13 [00:01<00:00,  8.61it/s]


Training Loss: 0.1630, Validation Loss: 0.2096, accuracy: 0.6667, f1: 0.2759, precision: 1.0000, recall: 0.1600

Epoch 5/16


Training: 100%|██████████| 13/13 [00:01<00:00,  8.57it/s]


Training Loss: 0.1165, Validation Loss: 0.1542, accuracy: 0.7302, f1: 0.4848, precision: 1.0000, recall: 0.3200

Epoch 6/16


Training: 100%|██████████| 13/13 [00:01<00:00,  8.56it/s]


Training Loss: 0.0861, Validation Loss: 0.1114, accuracy: 0.8571, f1: 0.7805, precision: 1.0000, recall: 0.6400

Epoch 7/16


Training: 100%|██████████| 13/13 [00:01<00:00,  8.55it/s]


Training Loss: 0.0692, Validation Loss: 0.0796, accuracy: 0.9444, f1: 0.9247, precision: 1.0000, recall: 0.8600

Epoch 8/16


Training: 100%|██████████| 13/13 [00:01<00:00,  8.61it/s]


Training Loss: 0.0609, Validation Loss: 0.0675, accuracy: 0.9762, f1: 0.9691, precision: 1.0000, recall: 0.9400

Epoch 9/16


Training: 100%|██████████| 13/13 [00:01<00:00,  8.51it/s]


Training Loss: 0.0465, Validation Loss: 0.0645, accuracy: 0.9841, f1: 0.9796, precision: 1.0000, recall: 0.9600

Epoch 10/16


Training: 100%|██████████| 13/13 [00:01<00:00,  8.78it/s]


Training Loss: 0.0435, Validation Loss: 0.0473, accuracy: 0.9921, f1: 0.9899, precision: 1.0000, recall: 0.9800

Epoch 11/16


Training: 100%|██████████| 13/13 [00:01<00:00,  8.83it/s]


Training Loss: 0.0349, Validation Loss: 0.0398, accuracy: 0.9921, f1: 0.9899, precision: 1.0000, recall: 0.9800

Epoch 12/16


Training: 100%|██████████| 13/13 [00:01<00:00,  8.39it/s]


Training Loss: 0.0292, Validation Loss: 0.0332, accuracy: 0.9921, f1: 0.9899, precision: 1.0000, recall: 0.9800

Epoch 13/16


Training: 100%|██████████| 13/13 [00:01<00:00,  8.60it/s]


Training Loss: 0.0264, Validation Loss: 0.0313, accuracy: 0.9921, f1: 0.9899, precision: 1.0000, recall: 0.9800

Epoch 14/16


Training: 100%|██████████| 13/13 [00:01<00:00,  8.73it/s]


Training Loss: 0.0240, Validation Loss: 0.0284, accuracy: 0.9921, f1: 0.9899, precision: 1.0000, recall: 0.9800

Epoch 15/16


Training: 100%|██████████| 13/13 [00:01<00:00,  8.72it/s]


Training Loss: 0.0234, Validation Loss: 0.0256, accuracy: 0.9921, f1: 0.9899, precision: 1.0000, recall: 0.9800

Epoch 16/16


Training: 100%|██████████| 13/13 [00:01<00:00,  8.46it/s]


Training Loss: 0.0193, Validation Loss: 0.0237, accuracy: 0.9921, f1: 0.9899, precision: 1.0000, recall: 0.9800



In [23]:
tokenizer.save_pretrained(SAVE_DIRECTORY)
model.save_pretrained(SAVE_DIRECTORY)

In [24]:
from transformers import pipeline

MODEL_TASK = "sentiment-analysis"
classifier = pipeline(MODEL_TASK, model=model, tokenizer=tokenizer, top_k=3)
texts = test['text'].tolist()
labels = test['label'].tolist()

results = classifier(texts)

In [25]:
counter = 0
mapped_results = []

for text, label, result in zip(texts, labels, results):
    res = {el['label']: 1 if el['score'] > THRESHOLD else 0 for el in result}
    mapped = [res['LABEL_0'], res['LABEL_1'], res['LABEL_2']]
    mapped_results.append(mapped)
    if mapped != label:
        counter += 1
        print(f"Текст: {text}")
        print(f"Предсказано: {mapped}, Значение: {label}, Результат: {result}")
        print()
        
print(f"Всего ошибочно: {counter}")

Текст: Как насчжёт кино вечером?
Предсказано: [0, 0, 0], Значение: [0, 0, 1], Результат: [{'label': 'LABEL_2', 'score': 0.8348129391670227}, {'label': 'LABEL_1', 'score': 0.40166231989860535}, {'label': 'LABEL_0', 'score': 0.0037557545583695173}]

Текст: Нормально?
Предсказано: [0, 0, 0], Значение: [0, 0, 1], Результат: [{'label': 'LABEL_1', 'score': 0.7760507464408875}, {'label': 'LABEL_2', 'score': 0.2226688712835312}, {'label': 'LABEL_0', 'score': 0.007004767190665007}]

Всего ошибочно: 2


In [26]:
', '.join(f"{key}: {value:.4f}" for key, value in metric.compute(predictions=np.array(mapped_results).reshape(-1), references=np.array(labels).reshape(-1)).items())

'accuracy: 0.9841, f1: 0.9792, precision: 1.0000, recall: 0.9592'