# Цель.

Решить задачу классификации с помощью коробочного решения, основанного на трансформерах.

# Подготовка данных

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

In [2]:
tab = pd.read_csv('file.csv', sep=';')

In [3]:
from processing import y_dict
CLASSES = ["", 'Бизнес-карта', 'Зарплатные проекты', 'Открытие банковского счета', 'Эквайринг']

In [4]:
tab.reset_index(inplace=True)
tab['text'] = tab['text_employer']
tab['category'] = tab['ACTION_ITEM_RESULT_PRODUCT_NAME']
tab = tab[['text', 'category']]
tab['id'] = tab.index

# Определение структуры трансформеров (BERT).

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_cosine_schedule_with_warmup, AdamW

In [6]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, phase='test'):
        self.phase = phase
        
        if self.phase == 'train':
            self.labels = [labels[label] for label in df['category']]
        elif self.phase == 'test':
            self.id = [oid for oid in df['id']]
        self.texts = [tokenizer(text, 
                                padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        if self.phase == 'train':
            return len(self.labels)
        elif self.phase == 'test':
            return len(self.id)

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])
    
    def get_batch_oid(self, idx):
        return np.array(self.id[idx])

    def get_batch_texts(self, idx):
        return self.texts[idx]

    def __getitem__(self, idx):
        if self.phase == 'train':
            batch_texts = self.get_batch_texts(idx)
            batch_y = self.get_batch_labels(idx)
            return batch_texts, batch_y
        elif self.phase == 'test':
            batch_texts = self.get_batch_texts(idx)
            batch_oid = self.get_batch_oid(idx)
            return batch_texts, batch_oid
   

In [7]:
class BertClassifier:
    def __init__(self, model_path, tokenizer_path, data, n_classes=13, epochs=5):
        self.model = BertForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.data = data
        self.device = torch.device('cuda')
        self.max_len = 512
        self.epochs = epochs
        self.out_features = self.model.bert.encoder.layer[1].output.dense.out_features
        self.model.classifier = torch.nn.Linear(self.out_features, n_classes).cuda()
        self.model = self.model.cuda()

    
    def preparation(self):
        self.df_train, self.df_val, self.df_test = np.split(self.data.sample(frac=1, random_state=42), 
                                     [int(.85*len(self.data)), int(.95*len(self.data))])
        
        self.train, self.val = CustomDataset(self.df_train, self.tokenizer, phase='train'), CustomDataset(self.df_val, self.tokenizer, phase='train')
        self.train_dataloader = torch.utils.data.DataLoader(self.train, batch_size=4, shuffle=True)
        self.val_dataloader = torch.utils.data.DataLoader(self.val, batch_size=4)
    
       
        self.optimizer = AdamW(self.model.parameters(), lr=2e-5, correct_bias=False)
        self.scheduler = get_cosine_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=0,
                num_training_steps=len(self.train_dataloader) * self.epochs
            )
        self.loss_fn = torch.nn.CrossEntropyLoss().cuda()
            
    def fit(self):
        self.model = self.model.train()
        
        for epoch_num in range(self.epochs):
            total_acc_train = 0
            total_loss_train = 0
            for train_input, train_label in tqdm(self.train_dataloader):
                train_label = train_label.cuda()
                mask = train_input['attention_mask'].cuda()
                input_id = train_input['input_ids'].squeeze(1).cuda()
                output = self.model(input_id.cuda(), mask.cuda())

                batch_loss = self.loss_fn(output[0], train_label.long())
                total_loss_train += batch_loss.item()

                acc = (output[0].argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                self.model.zero_grad()
                batch_loss.backward()
                self.optimizer.step()
                self.scheduler.step()
            total_acc_val, total_loss_val = self.eval()
           
            print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(self.df_train): .3f} \
            | Train Accuracy: {total_acc_train / len(self.df_train): .3f} \
            | Val Loss: {total_loss_val / len(self.df_val): .3f} \
            | Val Accuracy: {total_acc_val / len(self.df_val): .3f}')

            
            os.makedirs('checkpoint', exist_ok=True)
            torch.save(self.model, f'checkpoint/BertClassifier{epoch_num}.pt')

        return total_acc_train, total_loss_train
    
    def eval(self):
        self.model = self.model.eval()
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():
            for val_input, val_label in tqdm(self.val_dataloader):
                val_label = val_label.cuda()
                mask = val_input['attention_mask'].cuda()
                input_id = val_input['input_ids'].squeeze(1).cuda()

                output = self.model(input_id.to('cuda'), mask.to('cuda'))

                batch_loss = self.loss_fn(output[0], val_label.long())
                total_loss_val += batch_loss.item()

                acc = (output[0].argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
            
        return total_acc_val, total_loss_val

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
train_val, test = train_test_split(tab, random_state=239, test_size=0.2)

# Применение трансформеров.

In [10]:
model_path = 'cointegrated/rubert-tiny2'
tokenizer_path = 'cointegrated/rubert-tiny2'
bert_tiny = BertClassifier(model_path, tokenizer_path, train_val, epochs=10)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

Мы выбираем не большой BERT, а его дистиллированную и обработанную версию, чтобы получить результаты побыстрее, пожертвовав, возможно точностью.

In [11]:
labels = y_dict

In [12]:
bert_tiny.preparation()



In [13]:
bert_tiny.fit()

100%|████████████████████████████████████████████████████████████████████████████████| 927/927 [01:17<00:00, 11.90it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 109/109 [00:02<00:00, 47.18it/s]


Epochs: 1 | Train Loss:  0.200             | Train Accuracy:  0.693             | Val Loss:  0.154             | Val Accuracy:  0.773


100%|████████████████████████████████████████████████████████████████████████████████| 927/927 [01:07<00:00, 13.68it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 109/109 [00:02<00:00, 47.13it/s]


Epochs: 2 | Train Loss:  0.138             | Train Accuracy:  0.775             | Val Loss:  0.145             | Val Accuracy:  0.771


100%|████████████████████████████████████████████████████████████████████████████████| 927/927 [01:07<00:00, 13.66it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 109/109 [00:02<00:00, 46.92it/s]


Epochs: 3 | Train Loss:  0.115             | Train Accuracy:  0.825             | Val Loss:  0.148             | Val Accuracy:  0.761


100%|████████████████████████████████████████████████████████████████████████████████| 927/927 [01:11<00:00, 13.01it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 109/109 [00:01<00:00, 55.93it/s]


Epochs: 4 | Train Loss:  0.090             | Train Accuracy:  0.867             | Val Loss:  0.157             | Val Accuracy:  0.761


100%|████████████████████████████████████████████████████████████████████████████████| 927/927 [00:58<00:00, 15.83it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 109/109 [00:01<00:00, 55.86it/s]


Epochs: 5 | Train Loss:  0.067             | Train Accuracy:  0.905             | Val Loss:  0.167             | Val Accuracy:  0.764


100%|████████████████████████████████████████████████████████████████████████████████| 927/927 [00:58<00:00, 15.87it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 109/109 [00:01<00:00, 56.56it/s]


Epochs: 6 | Train Loss:  0.051             | Train Accuracy:  0.932             | Val Loss:  0.175             | Val Accuracy:  0.773


100%|████████████████████████████████████████████████████████████████████████████████| 927/927 [00:58<00:00, 15.77it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 109/109 [00:01<00:00, 55.83it/s]


Epochs: 7 | Train Loss:  0.039             | Train Accuracy:  0.950             | Val Loss:  0.185             | Val Accuracy:  0.757


100%|████████████████████████████████████████████████████████████████████████████████| 927/927 [00:59<00:00, 15.69it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 109/109 [00:02<00:00, 54.36it/s]


Epochs: 8 | Train Loss:  0.032             | Train Accuracy:  0.960             | Val Loss:  0.200             | Val Accuracy:  0.741


100%|████████████████████████████████████████████████████████████████████████████████| 927/927 [00:59<00:00, 15.61it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 109/109 [00:02<00:00, 53.92it/s]


Epochs: 9 | Train Loss:  0.028             | Train Accuracy:  0.965             | Val Loss:  0.197             | Val Accuracy:  0.745


100%|████████████████████████████████████████████████████████████████████████████████| 927/927 [01:00<00:00, 15.23it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 109/109 [00:02<00:00, 54.02it/s]


Epochs: 10 | Train Loss:  0.026             | Train Accuracy:  0.968             | Val Loss:  0.198             | Val Accuracy:  0.745


(3587, 96.23585230961908)

Таким образом, на валидационном множестве получается порядка 76.1%

Так как у catboost, и у трансформеров разные валидационные множества сделаем сравнение на тестовом множестве.

In [14]:
test_dataset = CustomDataset(test, bert_tiny.tokenizer, phase='test')
test_dataloader = DataLoader(test_dataset, batch_size=4)

In [15]:
def inference(model, dataloader):
    all_oid = []
    all_labels = []
    label_prob = []
    
    model.cuda()
    model.eval()
    with torch.no_grad():
        for test_input, test_oid in tqdm(dataloader):
            test_oid = test_oid.cuda()
            mask = test_input['attention_mask'].cuda()
            input_id = test_input['input_ids'].squeeze(1).cuda()
            output = model(input_id, mask)
            all_oid.extend(test_oid)
            all_labels.extend(torch.argmax(output[0].softmax(1), dim=1))
            
            for prob in output[0].softmax(1):
                label_prob.append(prob)
        return ([oid.item() for oid in all_oid], [CLASSES[labels] for labels in all_labels], label_prob)

In [16]:
inference_model = torch.load('./checkpoint/BertClassifier2.pt')

In [17]:
inference_result = inference(inference_model, test_dataloader)

100%|████████████████████████████████████████████████████████████████████████████████| 273/273 [00:04<00:00, 56.52it/s]


In [18]:
new_labels = [i for i in inference_result[1]]

In [19]:
len(test[test['category'] == new_labels])/len(test)

0.7568807339449541

Итак, 75.69%

Результат получился сравнимый с catboost, но всё-таки похуже.