# Обучение и предсказание Типа обращения и Типа переклассификации при помощи RuBert

В качестве предобученной модели используется `DeepPavlov/rubert-base-cased-sentence`, которая дообучается на наших данных на 5 эпохах

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m94.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


Создание класса `Dataset'а` и самого классификатора, а также функции для обучения, валидации и предсказания

In [1]:
import transformers
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import torch
import nltk
import warnings
import string
import re
import random
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

warnings.filterwarnings('ignore')

tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased-sentence')
model = BertModel.from_pretrained('DeepPavlov/rubert-base-cased-sentence')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def seed(value):
    random.seed(value)
    torch.manual_seed(value)
    torch.cuda.manual_seed(value)
    
class CBCaseDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        target = self.targets[item]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }
    
class CBCaseClassifier(nn.Module):
    def __init__(self, n_classes):
        super(CBCaseClassifier, self).__init__()
        self.bert = model
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state_cls = outputs[0][:, 0, :]
        
        return self.out(self.drop(last_hidden_state_cls))
    
def create_train_dataloader(X_data, y_data, tokenizer, batch_size, max_len):
    dataset = CBCaseDataset(
        texts=X_data,
        targets=y_data,
        tokenizer=tokenizer,
        max_len=max_len
    )
    
    return DataLoader(
        dataset,
        sampler = RandomSampler(dataset),
        batch_size=batch_size,
    )

def create_test_dataloader(X_data, tokenizer, batch_size, max_len):
    dataset = CBCaseDataset(
        texts=X_data,
        targets=[0] * len(X_data),
        tokenizer=tokenizer,
        max_len=max_len
    )
    
    return DataLoader(
        dataset,
        sampler = SequentialSampler(dataset),
        batch_size=batch_size,
    )

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in tqdm(data_loader, desc='TRAIN'):
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)

        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)

        losses.append(loss.item())
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
    return correct_predictions.double() / n_examples, np.mean(losses)

@torch.no_grad()
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0

    for d in tqdm(data_loader, desc='EVALUATION'):
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)

        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)

        losses.append(loss.item())
        
    return correct_predictions.double() / n_examples, np.mean(losses)

@torch.no_grad()
def get_predictions(model, data_loader):
    model.eval()
    
    predictions = []
    prediction_probs = []
    real_values = []
    
    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)

        outputs = F.softmax(model(
            input_ids=input_ids,
            attention_mask=attention_mask
        ))
        _, preds = torch.max(outputs, dim=1)

        predictions.extend(preds)
        prediction_probs.extend(outputs)

    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    
    return predictions, prediction_probs

def training(data, test_data, main_feature, target_feature, batch_size, epochs, learning_rate_optimizer, n_classes):
    seed(42)

    sample = data.sample(frac=1, random_state=42)
    train_data = sample.iloc[data.shape[0] // 100 * 8:]
    valid_data = sample.iloc[:data.shape[0] // 100 * 8]
    
    X_train = train_data[main_feature].values
    y_train = train_data[target_feature].values
    
    X_valid = valid_data[main_feature].values
    y_valid = valid_data[target_feature].values

    X_test = test_data[main_feature].values

    train_tokenized = [tokenizer.encode(x, add_special_tokens=True) for x in X_train]
    valid_tokenized = [tokenizer.encode(x, add_special_tokens=True) for x in X_valid]
    test_tokenized = [tokenizer.encode(x, add_special_tokens=True) for x in X_test]

    train_max_len = max(map(len, train_tokenized))
    valid_max_len = max(map(len, valid_tokenized))
    test_max_len = max(map(len, test_tokenized))

    train_data_loader = create_train_dataloader(X_train, y_train, tokenizer, batch_size, train_max_len)
    valid_data_loader = create_test_dataloader(X_valid, tokenizer, batch_size, valid_max_len)
    test_data_loader = create_test_dataloader(X_test, tokenizer, batch_size, test_max_len)

    model = CBCaseClassifier(n_classes)
    model = model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate_optimizer, correct_bias=False)

    total_steps = len(train_data_loader) * epochs

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    loss_fn = nn.CrossEntropyLoss().to(device)

    for epoch in range(epochs):
        print(f'Epoch {epoch + 1:2d}/{epochs:2d}')
        print('-' * 25)

        train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(X_train))
        valid_acc, valid_loss = eval_model(model, valid_data_loader, loss_fn, device, len(X_valid))
        
        print(f'Train loss {train_loss:.4f} accuracy {train_acc:.4f}')
        print(f'Valid loss {valid_loss:.4f} accuracy {valid_acc:.4f}')

    predicted_valid_labels, prediction_probs_valid = get_predictions(model, valid_data_loader)
    predicted_test_labels, prediction_probs_test = get_predictions(model, test_data_loader)

    print('Результаты:')
    print(classification_report(y_valid,
                                predicted_valid_labels,
                                target_names=['Запрос', 'Инцидент']))
    
    print('Результаты с порогом в 0.35:')
    print(classification_report(y_valid, 
                                np.array(prediction_probs_valid[:, 1] > 0.35).astype(int),
                                target_names=['Запрос', 'Инцидент']))

    return predicted_test_labels, prediction_probs_test

Чтение данных

In [2]:
TRAIN_PATH = 'train.csv'
TEST_PATH = 'test.csv'

In [3]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

Предобработка данных

In [4]:
TYPE_DICT = {'Запрос': 0, 'Инцидент': 1}

def lowercasing(text: str) -> str:
    return text.lower()

def delete_punctuation(text: str) -> str:
    return re.sub(r'[^\w\s]','', text)

    return new_text

def preprocess(df: pd.DataFrame, is_train: bool = True) -> pd.DataFrame:
    if is_train:
        df['Тип обращения итоговый'] = df['Тип обращения итоговый'].map(TYPE_DICT)
    
    df['Содержание'] = df['Содержание'].fillna('нет данных')
    df['Содержание'] = df['Содержание'].apply(lowercasing)
    df['Содержание'] = df['Содержание'].apply(delete_punctuation)

    return df

In [5]:
train = preprocess(train)
test = preprocess(test, False)

Запуск обучения на 5 эпохах и вывод результатов по метрикам

In [6]:
predicted_labels, prediction_probs = training(data=train,
                                              test_data=test,
                                              main_feature='Содержание',
                                              target_feature='Тип обращения итоговый',
                                              batch_size=32,
                                              epochs=5,
                                              learning_rate_optimizer=2e-5,
                                              n_classes=2)

Epoch  1/ 5
-------------------------


TRAIN: 100%|██████████| 426/426 [01:22<00:00,  5.17it/s]
EVALUATION: 100%|██████████| 37/37 [00:02<00:00, 18.29it/s]


Train loss 0.0783 accuracy 0.9678
Valid loss 0.0377 accuracy 0.9882
Epoch  2/ 5
-------------------------


TRAIN: 100%|██████████| 426/426 [01:21<00:00,  5.24it/s]
EVALUATION: 100%|██████████| 37/37 [00:02<00:00, 18.12it/s]


Train loss 0.0464 accuracy 0.9818
Valid loss 0.0586 accuracy 0.9764
Epoch  3/ 5
-------------------------


TRAIN: 100%|██████████| 426/426 [01:21<00:00,  5.24it/s]
EVALUATION: 100%|██████████| 37/37 [00:02<00:00, 13.66it/s]


Train loss 0.0291 accuracy 0.9904
Valid loss 0.0745 accuracy 0.9848
Epoch  4/ 5
-------------------------


TRAIN: 100%|██████████| 426/426 [01:20<00:00,  5.31it/s]
EVALUATION: 100%|██████████| 37/37 [00:02<00:00, 18.17it/s]


Train loss 0.0230 accuracy 0.9922
Valid loss 0.0927 accuracy 0.9806
Epoch  5/ 5
-------------------------


TRAIN: 100%|██████████| 426/426 [01:21<00:00,  5.20it/s]
EVALUATION: 100%|██████████| 37/37 [00:02<00:00, 15.72it/s]


Train loss 0.0191 accuracy 0.9933
Valid loss 0.0893 accuracy 0.9823
Результаты:
              precision    recall  f1-score   support

      Запрос       0.99      1.00      0.99      1154
    Инцидент       0.86      0.60      0.71        30

    accuracy                           0.99      1184
   macro avg       0.92      0.80      0.85      1184
weighted avg       0.99      0.99      0.99      1184

Результаты с порогом в 0.35:
              precision    recall  f1-score   support

      Запрос       0.99      1.00      0.99      1154
    Инцидент       0.83      0.63      0.72        30

    accuracy                           0.99      1184
   macro avg       0.91      0.81      0.86      1184
weighted avg       0.99      0.99      0.99      1184



Создание submission-файлов

In [7]:
submission = pd.read_csv('submission.csv')

In [8]:
submission.head()

Unnamed: 0,id,Тип переклассификации,Тип обращения итоговый
0,14803,,
1,14804,,
2,14805,,
3,14806,,
4,14807,,


In [9]:
REV_TYPE_DICT = {0: 'Запрос', 1: 'Инцидент'}

def make_submisssion_file(df_subm: pd.DataFrame, df_test: pd.DataFrame, results: np.array, file_path: str) -> pd.DataFrame:
    df_subm['Тип обращения итоговый'] = results
    df_subm['Тип обращения итоговый'] = df_subm['Тип обращения итоговый'].map(REV_TYPE_DICT)

    df_subm_m = df_subm.merge(df_test[['id', 'Тип обращения на момент подачи']], on='id')

    df_subm_m.loc[df_subm_m['Тип обращения итоговый'] == df_subm_m['Тип обращения на момент подачи'], 
                 'Тип переклассификации'] = 0
    df_subm_m.loc[(df_subm_m['Тип обращения итоговый'] == 'Инцидент') &
                  (df_subm_m['Тип обращения на момент подачи'] == 'Запрос'), 
                  'Тип переклассификации'] = 1
    df_subm_m.loc[(df_subm_m['Тип обращения итоговый'] == 'Запрос') &
                  (df_subm_m['Тип обращения на момент подачи'] == 'Инцидент'), 
                  'Тип переклассификации'] = 2

    df_subm_m['Тип переклассификации'] = df_subm_m['Тип переклассификации'].astype(int)

    df_subm = df_subm_m[['id', 'Тип переклассификации', 'Тип обращения итоговый']]
    display(df_subm['Тип переклассификации'].value_counts())

    df_subm.to_csv(file_path, index=False)

    return df_subm

In [10]:
submission_wo_thr = make_submisssion_file(submission, 
                                          test, 
                                          np.array(predicted_labels), 
                                          'submission_bert_wo_threshold.csv')

0    5946
2     362
1      36
Name: Тип переклассификации, dtype: int64

In [11]:
submission_w_thr = make_submisssion_file(submission, 
                                          test, 
                                          np.array(prediction_probs[:, 1] > 0.35).astype(int), 
                                          'submission_bert_w_threshold_0_35.csv')

0    5947
2     359
1      38
Name: Тип переклассификации, dtype: int64