In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

In [2]:
class IntentDataset(Dataset):
    """Text/intent pairs read from a ``;``-separated CSV and tokenized on access.

    The CSV must contain a ``text`` column and an ``intent`` column.

    Args:
        csv_path: Path of the CSV file to read.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        classes: Optional ordered list of intent names. When omitted, the
            sorted unique values of the ``intent`` column are used. Pass the
            training set's ``classes`` when building a validation set so both
            share the same label ids.
        max_length: Fixed token length of every returned sample. Shorter texts
            are padded and longer texts are truncated to this length.

    Raises:
        KeyError: If the CSV contains an intent that is not in ``classes``.
    """

    def __init__(self, csv_path, tokenizer, classes=None, max_length=512):
        df = pd.read_csv(csv_path, sep=";")
        if classes is None:
            self.classes = sorted(df.intent.unique())
        else:
            self.classes = classes

        self.id2class = {i: x for i, x in enumerate(self.classes)}
        self.class2id = {x: i for i, x in enumerate(self.classes)}

        # Fail with a readable message instead of a bare KeyError on the first
        # unknown label (typically a validation intent absent from training).
        unknown = [x for x in df.intent.unique() if x not in self.class2id]
        if unknown:
            raise KeyError(f"intents not present in `classes`: {unknown}")

        self.labels = [self.class2id[x] for x in df.intent]
        self.objects = list(df.text)
        self.num_classes = len(self.classes)

        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.objects)

    def __getitem__(self, i):
        """Return the raw text, its token ids, attention mask and label id for row ``i``."""
        text = str(self.objects[i])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            # Without truncation a text longer than `max_length` tokens would
            # come back longer than the padded samples, so the batch could not
            # be stacked and the model's position limit could be exceeded.
            truncation=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # The tokenizer returns tensors of shape (1, max_length); drop the batch axis.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(self.labels[i], dtype=torch.long),
        }

In [3]:
# --- Training configuration and setup ---------------------------------------
SEED = 42
MODEL_NAME = "cointegrated/rubert-tiny"
epochs = 50

# Seed torch before the loaders and the model are built so that the shuffle
# order and the randomly initialised classification head are repeatable on a
# fresh kernel.
torch.manual_seed(SEED)

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

train_set = IntentDataset("train.csv", tokenizer)
train_loader = DataLoader(train_set, batch_size=3, shuffle=True)

# Reuse the training label list so class ids agree between the two splits.
valid_set = IntentDataset("test.csv", tokenizer, train_set.classes)
valid_loader = DataLoader(valid_set, batch_size=1)


model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=train_set.num_classes)
model = model.cuda()
model = model.train()

# NOTE(review): transformers.AdamW is deprecated in favour of torch.optim.AdamW;
# it is kept here because `correct_bias=False` has no direct torch equivalent.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
loss_function = torch.nn.CrossEntropyLoss()

# Linear decay of the learning rate over every optimisation step of the run.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs
)

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not i

In [4]:
# Sanity check: number of intent classes and number of training examples.
train_set.num_classes, len(train_set)

(7, 47)

In [5]:
def train_epoch(model, device='cuda'):
    """Run one optimisation pass over the notebook-level ``train_loader``.

    Uses the module-level ``loss_function``, ``optimizer`` and ``scheduler``.
    The function does not change the model's train/eval mode; the caller is
    expected to have set it.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``.
        device: Device every batch tensor is moved to.

    Returns:
        Tuple ``(accuracy, mean_loss)``: the number of correct predictions
        divided by ``len(train_set)`` (a double tensor) and the mean of the
        per-batch losses.
    """
    batch_losses = []
    n_correct = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        predicted = logits.argmax(dim=1)
        loss = loss_function(logits, targets)

        n_correct += (predicted == targets).sum()
        batch_losses.append(loss.item())

        # One optimiser step and one scheduler step per batch, then clear gradients.
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = n_correct.double() / len(train_set)
    return accuracy, np.mean(batch_losses)


def eval_epoch(model, device='cuda'):
    """Score the model on the notebook-level ``valid_loader`` without gradients.

    Uses the module-level ``loss_function``. The function does not change the
    model's train/eval mode; the caller is expected to have set it.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``.
        device: Device every batch tensor is moved to.

    Returns:
        Tuple ``(accuracy, mean_loss)``: the number of correct predictions
        divided by ``len(valid_set)`` (a double tensor) and the mean of the
        per-batch losses.
    """
    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for batch in valid_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["targets"].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            predicted = logits.argmax(dim=1)
            loss = loss_function(logits, targets)

            n_correct += (predicted == targets).sum()
            batch_losses.append(loss.item())

    accuracy = n_correct.double() / len(valid_set)
    return accuracy, np.mean(batch_losses)

In [6]:
# Train for `epochs` epochs and keep the weights of the best validation epoch.
# The original loop initialised `best_accuracy` but never updated it, so the
# model left in memory was always the last epoch's.
best_accuracy = 0
best_val_loss = float("inf")
best_state = None  # copy of the state dict from the best epoch seen so far

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    model = model.train()
    train_acc, train_loss = train_epoch(model)
    print(f'Train loss {train_loss} accuracy {train_acc}')

    model = model.eval()
    val_acc, val_loss = eval_epoch(model)
    print(f'Val loss {val_loss} accuracy {val_acc}')
    print('-' * 10)

    # Higher validation accuracy wins; equal accuracy is broken by lower loss.
    epoch_accuracy = float(val_acc)
    improved = epoch_accuracy > best_accuracy or (
        epoch_accuracy == best_accuracy and val_loss < best_val_loss
    )
    if improved:
        best_accuracy, best_val_loss = epoch_accuracy, val_loss
        # Clone the tensors: state_dict() returns references that the next
        # optimisation steps would otherwise overwrite.
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

# Leave the best weights in `model` for whatever uses it after training.
if best_state is not None:
    model.load_state_dict(best_state)
    print(f'Restored best checkpoint: val loss {best_val_loss} accuracy {best_accuracy}')

Epoch 1/50
Train loss 1.9179854094982147 accuracy 0.2127659574468085
Val loss 1.8382936205182756 accuracy 0.2857142857142857
----------
Epoch 2/50
Train loss 1.7026153728365898 accuracy 0.3829787234042553
Val loss 1.6636668954576765 accuracy 0.2857142857142857
----------
Epoch 3/50
Train loss 1.4779823049902916 accuracy 0.5106382978723404
Val loss 1.5360872064317976 accuracy 0.2857142857142857
----------
Epoch 4/50
Train loss 1.242090530693531 accuracy 0.6808510638297872
Val loss 1.4106640688010625 accuracy 0.42857142857142855
----------
Epoch 5/50
Train loss 1.043381068855524 accuracy 0.8936170212765957
Val loss 1.3337423545973641 accuracy 0.5714285714285714
----------
Epoch 6/50
Train loss 0.8901304490864277 accuracy 0.9574468085106382
Val loss 1.165014433009284 accuracy 0.42857142857142855
----------
Epoch 7/50
Train loss 0.7554843481630087 accuracy 1.0
Val loss 1.0795178839138575 accuracy 0.5714285714285714
----------
Epoch 8/50
Train loss 0.6312139257788658 accuracy 1.0
Val loss 0

In [7]:
# Put the model in evaluation mode before running predictions.
model = model.eval()
# Attach the id -> intent-name mapping and the tokenizer to the model object;
# `inference` reads both from `model`.
# NOTE(review): presumably this is so they are pickled together with the model
# by `torch.save(model, ...)` — confirm.
model.id2class = train_set.id2class
model.tokenizer = tokenizer

def inference(text, device='cuda'):
    """Predict the intent name for a single text.

    Uses the notebook-level ``model`` together with the ``tokenizer`` and
    ``id2class`` attributes attached to it.

    Args:
        text: Text to classify.
        device: Device the encoded inputs are moved to; must match the model's.

    Returns:
        The intent name that ``model.id2class`` maps to the highest-scoring logit.
    """
    encoding = model.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        # Without truncation a text longer than 512 tokens would be passed to
        # the model at its full length.
        truncation=True,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    # `.item()` copies the winning index to the host, so no explicit
    # `.detach().cpu()` is needed on the logits.
    return model.id2class[logits.argmax(dim=1).item()]


In [8]:
# Print every validation text next to the intent predicted for it.
for sample in valid_set:
    sample_text = sample['text']
    print(sample_text, inference(sample_text))

Нужно сделать аринжировку arrangement
У меня есть тексти и ноты, можно у вас записать вокал? recording
Хотим приятный дизайн обложки album_cover
Хотим чтобы новый трек был на радио promotion
Есть хорошая демка, хочу в скором времени выпустить её на спотифай release
Почему так дорого trade
Добрый день other


In [9]:
# torch.save does not create missing directories, so make sure `weights/`
# exists before writing the checkpoint.
checkpoint_path = Path("weights") / "best.pt"
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): this pickles the whole model object (including the attached
# tokenizer and id2class) rather than a state dict; loading it needs the same
# class definitions to be importable.
torch.save(model, checkpoint_path)