In [56]:
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer, BertForSequenceClassification

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [10]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: indexable collection of raw texts (each item is passed through ``str``).
        targets: indexable collection of class labels, aligned with ``texts``.
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, targets, tokenizer, max_len=512):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return it together with its label.

        Returns:
            dict with the raw ``text``, flattened ``input_ids`` and
            ``attention_mask`` tensors, and ``targets`` as a ``torch.long`` tensor.
        """
        text = str(self.texts[idx])
        target = self.targets[idx]

        encoding = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=self.max_len,
                return_token_type_ids=False,
                # Without truncation a text longer than max_len yields a longer
                # tensor than its neighbours and the default collate step cannot
                # stack the batch; this also matches what predict() does.
                truncation=True,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt',
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading batch axis; flatten drops it so
            # the DataLoader can add its own.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [11]:
class BertClassifier:
    """Fine-tuning wrapper around ``BertForSequenceClassification``.

    Typical use: construct, call ``preparation`` with train/validation data,
    call ``train``, then call ``predict`` on individual texts.

    Args:
        model_path: checkpoint name or path passed to ``from_pretrained`` for the model.
        tokenizer_path: checkpoint name or path passed to ``from_pretrained`` for the tokenizer.
        n_classes: number of output classes of the replacement classification head.
        epochs: number of training epochs run by ``train``.
        model_save_path: file the best model (by validation accuracy) is saved to.
    """

    def __init__(self, model_path, tokenizer_path, n_classes=2, epochs=1, model_save_path='/content/bert.pt'):
        self.model = BertForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model_save_path = model_save_path
        self.max_len = 512
        self.epochs = epochs
        # Take the head's input width from the existing head rather than from
        # encoder.layer[1], which does not exist on single-layer encoders.
        self.out_features = self.model.classifier.in_features
        self.model.classifier = torch.nn.Linear(self.out_features, n_classes)
        # Keep the model's own label count in sync with the replaced head.
        self.model.num_labels = n_classes
        self.model.config.num_labels = n_classes
        self.model.to(self.device)

    def preparation(self, X_train, y_train, X_valid, y_valid, batch_size=2):
        """Build datasets, data loaders, optimizer, LR scheduler and loss.

        Args:
            X_train, y_train: training texts and labels.
            X_valid, y_valid: validation texts and labels.
            batch_size: batch size for both loaders (defaults to the previous
                hard-coded value of 2).
        """
        # create datasets (same max_len as predict() uses)
        self.train_set = CustomDataset(X_train, y_train, self.tokenizer, max_len=self.max_len)
        self.valid_set = CustomDataset(X_valid, y_valid, self.tokenizer, max_len=self.max_len)

        # create data loaders; validation order is fixed so metrics are reproducible
        self.train_loader = DataLoader(self.train_set, batch_size=batch_size, shuffle=True)
        self.valid_loader = DataLoader(self.valid_set, batch_size=batch_size, shuffle=False)

        # helpers initialization
        self.optimizer = AdamW(self.model.parameters(), lr=2e-5, correct_bias=False)
        self.scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=0,
                num_training_steps=len(self.train_loader) * self.epochs
            )
        self.loss_fn = torch.nn.CrossEntropyLoss().to(self.device)

    def _forward_batch(self, data):
        """Run one batch through the model; return (loss, number of correct predictions)."""
        input_ids = data["input_ids"].to(self.device)
        attention_mask = data["attention_mask"].to(self.device)
        targets = data["targets"].to(self.device)

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask
            )

        preds = torch.argmax(outputs.logits, dim=1)
        loss = self.loss_fn(outputs.logits, targets)
        return loss, torch.sum(preds == targets)

    def fit(self):
        """Train for one pass over ``train_loader``.

        Returns:
            (train_acc, train_loss): accuracy over the training set and the
            mean of the per-batch losses.
        """
        self.model = self.model.train()
        losses = []
        correct_predictions = 0

        for data in self.train_loader:
            loss, batch_correct = self._forward_batch(data)

            correct_predictions += batch_correct
            losses.append(loss.item())

            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()

        train_acc = correct_predictions.double() / len(self.train_set)
        train_loss = np.mean(losses)
        return train_acc, train_loss

    def eval(self):
        """Evaluate on ``valid_loader`` without gradient tracking.

        Returns:
            (val_acc, val_loss): accuracy over the validation set and the
            mean of the per-batch losses.
        """
        self.model = self.model.eval()
        losses = []
        correct_predictions = 0

        with torch.no_grad():
            for data in self.valid_loader:
                loss, batch_correct = self._forward_batch(data)
                correct_predictions += batch_correct
                losses.append(loss.item())

        val_acc = correct_predictions.double() / len(self.valid_set)
        val_loss = np.mean(losses)
        return val_acc, val_loss

    def train(self):
        """Run ``epochs`` rounds of fit/eval, keeping the best model by validation accuracy.

        The model is written to ``model_save_path`` whenever validation
        accuracy improves, and the best saved model is restored at the end.
        """
        best_accuracy = 0
        checkpoint_saved = False
        for epoch in range(self.epochs):
            print(f'Epoch {epoch + 1}/{self.epochs}')
            train_acc, train_loss = self.fit()
            print(f'Train loss {train_loss} accuracy {train_acc}')

            val_acc, val_loss = self.eval()
            print(f'Val loss {val_loss} accuracy {val_acc}')
            print('-' * 10)

            if val_acc > best_accuracy:
                torch.save(self.model, self.model_save_path)
                best_accuracy = val_acc
                checkpoint_saved = True

        # Only reload what this run actually wrote; otherwise the path may be
        # missing or hold a stale model from an earlier run.
        if checkpoint_saved:
            # NOTE(review): newer PyTorch releases default torch.load to
            # weights_only=True, which rejects whole-module pickles - confirm
            # against the installed version.
            self.model = torch.load(self.model_save_path, map_location=self.device)

    def predict(self, text):
        """Return the predicted class index for a single text.

        Args:
            text: input passed to the tokenizer's ``encode_plus``.

        Returns:
            The argmax class index as a NumPy integer scalar.
        """
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Normalise to a single-row batch: (1, sequence_length).
        input_ids = encoding['input_ids'].reshape(1, -1).to(self.device)
        attention_mask = encoding['attention_mask'].reshape(1, -1).to(self.device)

        # Inference must not depend on whichever mode the model was left in,
        # and does not need an autograd graph.
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

        prediction = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]

        return prediction

## Подгружаем логи

In [39]:
# Read the labelled dataset from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer='/content/df.csv')
df.head(n=5)

Unnamed: 0,Тема,label
0,Утром опять завис и рестартнул контейнер аттач...,10
1,Задержка в выборке писем из ящика после простоев,10
2,Проблема с отображением Related Tickets,10
3,Некорректное отображение кнопок UI-action на ф...,6
4,GRPC Server is unavailable,10


In [40]:
# Shuffle once with a fixed seed, then cut into 60% train / 20% validation /
# 20% test. Explicit iloc slices replace np.split on a DataFrame, and .copy()
# makes each part an independent frame so that adding columns to it later does
# not operate on a slice of the shuffled frame.
shuffled = df.sample(frac=1, random_state=42)
train_end, val_end = int(.6 * len(df)), int(.8 * len(df))
train_data = shuffled.iloc[:train_end].copy()
val_data = shuffled.iloc[train_end:val_end].copy()
test_data = shuffled.iloc[val_end:].copy()

In [64]:
# The same Hugging Face checkpoint supplies both the model weights and the tokenizer.
model_path = tokenizer_path = 'cointegrated/rubert-tiny'
# One output unit per distinct label value found in the data.
# NOTE(review): assumes label values are exactly 0..n_classes-1 - TODO confirm.
n_classes = df['label'].nunique()
epochs, model_save_path = 20, '/content/bert.pt'

In [65]:
# Build the classifier from the configuration values defined above
# (arguments are given in the constructor's positional order).
classifier = BertClassifier(model_path, tokenizer_path, n_classes, epochs, model_save_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
# Hand the train/validation texts and labels to the classifier as plain lists
# (arguments follow the order X_train, y_train, X_valid, y_valid).
classifier.preparation(
    list(train_data['Тема']),
    list(train_data['label']),
    list(val_data['Тема']),
    list(val_data['label']),
)




In [67]:
# Run the full training loop; per-epoch train/validation loss and accuracy are printed below.
classifier.train()

Epoch 1/20
Train loss 1.337244370335562 accuracy 0.6809470124013529
Val loss 0.9718160103704478 accuracy 0.7905405405405406
----------
Epoch 2/20
Train loss 0.8661147346510342 accuracy 0.8105975197294251
Val loss 0.7451069305455815 accuracy 0.8682432432432432
----------
Epoch 3/20
Train loss 0.6117439213820297 accuracy 0.8478015783540023
Val loss 0.6116389734387347 accuracy 0.8817567567567568
----------
Epoch 4/20
Train loss 0.4163380428411377 accuracy 0.8906426155580609
Val loss 0.6151809416667281 accuracy 0.8885135135135135
----------
Epoch 5/20
Train loss 0.3596540912482384 accuracy 0.910935738444194
Val loss 0.5542054603960227 accuracy 0.902027027027027
----------
Epoch 6/20
Train loss 0.24650081700665313 accuracy 0.9357384441939121
Val loss 0.4974125379065643 accuracy 0.9155405405405406
----------
Epoch 7/20
Train loss 0.17713960196373477 accuracy 0.9616685456595265
Val loss 0.49206829936293933 accuracy 0.9121621621621622
----------
Epoch 8/20
Train loss 0.1317656191502727 accurac

## Запускаем на тестовой выборке

In [68]:
# Pull the held-out texts and their true labels out of the test frame.
texts = list(test_data.loc[:, 'Тема'])
labels = list(test_data.loc[:, 'label'])

# Classify the test texts one at a time, in order.
predictions = list(map(classifier.predict, texts))

In [69]:
# Attach predictions as a new column on a fresh frame instead of assigning into
# the split slice in place: this avoids pandas' SettingWithCopyWarning and
# keeps the cell safe to re-run.
test_data = test_data.assign(pred=predictions)

In [70]:
# Preview true labels next to predictions for the first test rows.
test_data.head(n=5)

Unnamed: 0,Тема,label,pred
546,Дата в устройствах Apple,10,10
658,Не отображается содержимое шапки,10,10
1208,не работает дев стенд. ошибка Request failed w...,5,5
1019,При создании записей в таблице itsm_request не...,10,10
1469,Пропадает обязательность поля,8,8


In [71]:
# Share of test texts whose predicted label equals the true label.
print('accuracy: {}'.format(accuracy_score(labels, predictions)))

accuracy: 0.9324324324324325


In [72]:
# Macro-averaged metrics; the fourth returned element is dropped by the slice.
precision, recall, f1score = precision_recall_fscore_support(
    y_true=labels,
    y_pred=predictions,
    average='macro',
)[:3]

print('precision: {}, recall: {}, f1score: {}'.format(precision, recall, f1score))

precision: 0.9366252635961908, recall: 0.9038239538239536, f1score: 0.9107628962980753
