# Классификация текста с помощью трансформера BERT

Оригинальная идея почерпнута отсюда: https://www.kaggle.com/c/learn-ai-bbc и отсюда: https://habr.com/ru/post/655517/


### Импорт библиотек

In [None]:
!pip install torch
!pip install transformers[torch]

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import transformers
import torch.nn as nn
from transformers import AutoModel, BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

class BERT_Arch(nn.Module):
    """Classification head stacked on top of a BERT-style encoder.

    The second value returned by the encoder is passed through
    ``Linear(768, 512) -> ReLU -> Dropout(0.3) -> Linear(512, num_classes)``
    and normalised with a log-softmax over dimension 1.

    Args:
        bert: Encoder callable. It is invoked as
            ``bert(sent_id, attention_mask=mask, return_dict=False)`` and must
            return exactly two values; the second one needs 768 features per
            row (presumably the pooled ``[CLS]`` vector -- confirm against the
            encoder in use).
        num_classes: Width of the output layer.
    """

    def __init__(self, bert, num_classes = 96):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(p=0.3)
        self.relu = nn.ReLU()
        # fc1 is created before fc2 so that seeded initialisation stays reproducible.
        self.fc1 = nn.Linear(in_features=768, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities for a batch.

        Args:
            sent_id: Token-id tensor forwarded to the encoder unchanged.
            mask: Attention mask forwarded to the encoder as ``attention_mask``.

        Returns:
            Tensor with one row per input row and ``num_classes`` columns.
        """
        encoder_out = self.bert(sent_id, attention_mask=mask, return_dict=False)
        # Only the second encoder output feeds the head; the first is discarded.
        _, pooled = encoder_out
        hidden = self.dropout(self.relu(self.fc1(pooled)))
        return self.softmax(self.fc2(hidden))

class TansformerRuBERT:
    """Inference wrapper around a frozen RuBERT encoder and a ``BERT_Arch`` head.

    On construction the pretrained encoder and tokenizer are loaded, the
    encoder parameters are frozen, the head weights are restored from a
    checkpoint file and the model is switched to evaluation mode.
    ``predict`` then classifies a single text line.
    """

    # Hugging Face checkpoint used for both the encoder and the tokenizer.
    PRETRAINED_NAME = "DeepPavlov/rubert-base-cased-sentence"
    # Every input line is truncated/padded to exactly this many tokens.
    MAX_LENGTH = 15

    def __init__(self, weights = 'saved_weights_2_digits.pt', dev='cpu'):
        """Load the encoder, tokenizer and trained weights.

        Args:
            weights: Path to the ``state_dict`` checkpoint for ``BERT_Arch``.
            dev: Torch device string; pass ``'cuda'`` to run on a GPU.
        """
        self.device = torch.device(dev)
        self.bert = AutoModel.from_pretrained(self.PRETRAINED_NAME)
        self.tokenizer = BertTokenizer.from_pretrained(self.PRETRAINED_NAME)
        # The encoder is used as a fixed feature extractor.
        for param in self.bert.parameters():
            param.requires_grad = False
        self.model = BERT_Arch(self.bert).to(self.device)
        # NOTE: torch.load unpickles the file -- only load checkpoints from a
        # trusted source.
        state = torch.load(weights, map_location=self.device)
        self.model.load_state_dict(state)
        self.model.eval()

    def predict(self, line):
        """Classify one text line and return its integer label.

        Args:
            line: Text to classify.

        Returns:
            The arg-max class index shifted to the label numbering: indices
            ``0..77`` become ``1..78`` and indices above 77 become
            ``index + 2`` (so the value 79 is never returned).
        """
        encoded = self.tokenizer(
            line,
            max_length=self.MAX_LENGTH,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        input_ids = encoded['input_ids'].to(self.device)
        # Use the tokenizer's mask so that padding positions are not attended
        # to; an all-ones mask would treat [PAD] tokens as real input.
        attention_mask = encoded['attention_mask'].to(self.device)

        # Inference only: skip building the autograd graph through the head.
        with torch.no_grad():
            log_probs = self.model(input_ids, attention_mask)

        # .item() avoids int() on a 1-element ndarray, which NumPy deprecates.
        class_index = log_probs.argmax(dim=1).item()

        # NOTE(review): presumably the training labels had a gap at 79 --
        # confirm against the label encoding used when the weights were trained.
        if class_index > 77:
            return class_index + 2
        return class_index + 1
    
# Build the classifier with its default weights file and device.
myModel = TansformerRuBERT()

# Sample text to classify.
input_line = (
    'изделия прочие пластмасс изделия прочих материалов товарных позиций '
    '3901 3914 прочие прочие прочие прочие'
)
res = myModel.predict(input_line)

# Show the predicted label.
print(res)

  from .autonotebook import tqdm as notebook_tqdm


39
