In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

In [4]:
data = pd.read_csv('eth_group_lemm_text.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,document.id,lemm_text,eth_group_to_code,is_ethicity_superior_meaning,is_ethicity_aggressor_meaning,is_ethicity_dangerous_meaning
0,0,885072939,грузин mia бред написать какой русский вообще ...,грузин,irrel,irrel,no
1,1,885072939,грузин mia бред написать какой русский вообще ...,грузин,irrel,irrel,irrel
2,2,885072939,грузин mia бред написать какой русский вообще ...,грузин,irrel,irrel,yes
3,10,885072939,грузин mia бред написать какой русский вообще ...,грузин,irrel,agressor,no
4,11,885072939,грузин mia бред написать какой русский вообще ...,грузин,irrel,agressor,irrel


In [5]:
import random

# Фиксирую сулчайность

def seed_all(seed_value=42):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    
# seed_all()

In [6]:
from sklearn.preprocessing import LabelEncoder


# Добавляю столбцы с лейблами для каждой категории
def encode_columns(data, columns):
    encoders = []
    labels = []
    for column in columns:
        le = LabelEncoder()
        data[f'label_{column}'] = le.fit_transform(data[column].values)
        encoders.append(le)
        labels.append(f'label_{column}')
        
    return data, encoders, labels

column_to_encode = [
                    'is_ethicity_superior_meaning',
                   'is_ethicity_aggressor_meaning',
                   'is_ethicity_dangerous_meaning'
                    ]

data, encoders, labels = encode_columns(data, column_to_encode)

In [7]:
from torch.utils.data import DataLoader , Dataset
import nlpaug.augmenter.word
import nlpaug.augmenter.sentence

# Класс датасета для имплементации аугментации (она работает плохо)
class MyDataset(Dataset):
    def __init__(self, text, targets, document_ids, tokenizer=None, augment=False, augment_probs=None):
        self.text = text
        self.targets = targets
        self.document_ids = document_ids
        self.augment = augment
        self.tokenizer = tokenizer
#         p = 0.5
        if self.augment:
            p1, p2 = augment_probs
            self.augmenters = [
                nlpaug.augmenter.word.random.RandomWordAug(aug_p=p, action='delete', aug_max=None, stopwords=stopwords),
                nlpaug.augmenter.word.random.RandomWordAug(aug_p=p, action='swap', aug_max=None, stopwords=stopwords),
             ]
    
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, idx):
        text = str(self.text[idx])

        if self.augment:
            for aug in self.augmenters:
                text = aug.augment(text, n=1)[0]

    
        if self.tokenizer is not None:
    
            encoded_dict = tokenizer.encode_plus(
                                text,                      # Sentence to encode.
                                add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                                max_length = 256,           # Pad & truncate all sentences.
                                pad_to_max_length = True,
                                return_attention_mask = True,   # Construct attn. masks.
                                return_tensors = 'pt',     # Return pytorch tensors.
                                padding="max_length" ,
                                truncation = True ,
                           )

            ids = encoded_dict['input_ids'][0]
        elif self.tokenizer is None:

            ids = text[idx]

        targets = self.targets[idx]
        return ids, targets, self.document_ids[idx]
    
stopwords = np.unique(data['eth_group_to_code'].values)

2023-06-14 23:53:43.810502: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
labels

['label_is_ethicity_superior_meaning',
 'label_is_ethicity_aggressor_meaning',
 'label_is_ethicity_dangerous_meaning']

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Загружаю токенизатор
checkpoint = 'cointegrated/rubert-tiny-toxicity'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [36]:
import torch
from torch.utils.data import random_split
from torch.utils.data import WeightedRandomSampler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

seed = 11

# Инициализирую даталоадеры для обучения
def get_dataloaders(X, y, document_id, is_balanced=False,
                    augment=False, augment_probs=None, tokenizer=None, seed_value=42):
      
    train_idx, test_idx = train_test_split(np.arange(len(X)), test_size=0.15,
                                                        stratify=y[:, -1], random_state=seed_value)
    train_idx, val_idx = train_test_split(np.arange(len(X))[train_idx], test_size=0.15,
                                                          stratify=y[train_idx][:, -1], random_state=seed_value)

    if is_balanced:    
        ros = RandomOverSampler()
        indices = np.arange(len(train_idx)).reshape(-1, 1)
        indices_os, _ = ros.fit_resample(indices, np.array(y[train_idx][:, -1]))
        train_idx = indices_os.flatten()
    
    train_dataset = MyDataset(X[train_idx], y[train_idx], document_id[train_idx],
                              augment=augment, augment_probs=augment_probs, tokenizer=tokenizer)
    val_dataset = MyDataset(X[val_idx], y[val_idx], document_id[val_idx], tokenizer=tokenizer)
    test_dataset = MyDataset(X[test_idx], y[test_idx], document_id[test_idx], tokenizer=tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=16, num_workers=12)
    val_loader = DataLoader(val_dataset, batch_size=16, num_workers=12)
    test_loader = DataLoader(test_dataset, batch_size=16, num_workers=12)
        
    return train_loader, val_loader, test_loader
    
    
# df = pd.read_csv('data/is_ethicity_superior_meaning|is_ethicity_aggressor_meaning|is_ethicity_dangerous_meaning_eth_group_lemm_text.csv')
# data, encoders, labels = encode_columns(df, column_to_encode)
    
# train_loader, val_loader, test_loader = get_dataloaders(X=data['lemm_text'].values,
#                                                         y=torch.tensor(data[labels].values),
#                                                         document_id=torch.tensor(data['document.id'].values),
#                                                         is_balanced=False, augment=False, tokenizer=tokenizer,
#                                                         seed_value=1
#                                                        )


In [55]:
from torch import optim, nn, utils, Tensor
import pytorch_lightning as pl
import torch.nn as nn
from transformers import BertModel
from torcheval.metrics.functional import multiclass_f1_score

# Модель с bert слоем
class BERT(pl.LightningModule):
    def __init__(self, input_dim, labels, head_dims):
        super().__init__()     
#         checkpoint = 's-nlp/russian_toxicity_classifier'
#         checkpoint = 'cointegrated/rubert-tiny-toxicity'
        self.bert = BertModel.from_pretrained(checkpoint, return_dict=False)
#         self.fc1 = nn.Linear(input_dim, 256)

#         self.lstm = nn.LSTM(input_size=1, hidden_size=256, num_layers=2, batch_first=True, dropout=0.2)
        
        self.fc2 = nn.Linear(312, 256)
        
        self.heads = nn.ModuleList()
        self.loss_fns = []
        for i, label in enumerate(labels):
            head = nn.Linear(256, head_dims[i]).cuda()
            self.heads.append(head)
            
            loss_fn = nn.CrossEntropyLoss()
            self.loss_fns.append(loss_fn)
        
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ELU()
    
        self.labels = labels
        self.head_dims = head_dims
    
    def training_step(self, batch, batch_idx):
        x, y, _ = batch
#         label1, label2 = y[:, 0], y[:, 1]

        x = self.forward(x)
        
        losses = []
        for i, label in enumerate(self.labels):
            loss = self.loss_fns[i](x[i], y[:, i])
            self.log(f'train_loss_head_{i}', loss, prog_bar=True, on_step=False, on_epoch=True)
            losses.append(loss)
            
        loss = sum(losses)
        
        self.log("train_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
        return loss
        
    def validation_step(self, batch, batch_idx):
        x, y, _ = batch
#         label1, label2 = y[:, 0], y[:, 1]
        
        x = self.forward(x)
        losses = []
        for i, label in enumerate(self.labels):
            loss = self.loss_fns[i](x[i], y[:, i])
            self.log(f'val_loss_head_{i}', loss, prog_bar=False, on_step=False, on_epoch=True)
            losses.append(loss)
            
        loss = sum(losses)
                
        self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True)

        def log_f1_score(raw_preds, targets, name, num_classes):
            preds = torch.argmax(raw_preds, dim=-1)
            f1_acc = multiclass_f1_score(preds, targets, num_classes=num_classes, average='macro', )
            self.log(f'f1_score_{name}', f1_acc, prog_bar=True, on_step=False, on_epoch=True)
            return f1_acc
            
            
        scores = []
        for i, label in enumerate(self.labels): 
            f1_acc = log_f1_score(x[i], y[:, i], f'head_{i}', self.head_dims[i])
            scores.append(f1_acc)
            
        scores = torch.tensor(scores).mean()
        self.log(f'f1_score_mean', scores, prog_bar=False, on_step=False, on_epoch=True)
            
        
    def forward(self, x):
        _, x = self.bert(x)
        x = self.relu(self.dropout(x))
        x = self.relu(self.dropout(self.fc2(x)))
        
        return [head(x) for head in self.heads]

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=2e-5)
        return optimizer


In [62]:
scores

{'predictions': tensor([[2, 1, 1],
         [1, 0, 2],
         [0, 0, 2],
         ...,
         [1, 1, 1],
         [1, 2, 1],
         [1, 1, 1]]),
 'scores': [0.5709244457705759, 0.6011517651819198, 0.5874676119423218],
 'accuracy_scores': [0.7112630628306025,
  0.7473874338794995,
  0.8126693329892917]}

In [38]:
import os
# import warnings

os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.set_float32_matmul_precision('high')
# warnings.filterwarnings("ignore") 

In [39]:
from sklearn.metrics import f1_score, accuracy_score

# Получаю предсказания
def get_predictions(model, test_loader):
    
    predictions = torch.zeros_like(test_loader.dataset.targets)
    
    for i, item in enumerate(test_loader.dataset):
        x, y, document_id = item
        preds = model(x.unsqueeze(0).cpu())
        predictions[i] = torch.cat([one_pred.argmax(1) for one_pred in preds])
        
    scores = []
    accuracy_scores = []
    for i in range(predictions.shape[1]):
        scores.append(f1_score(test_loader.dataset.targets[:, i], predictions[:, i], average='macro'))
        accuracy_scores.append(accuracy_score(test_loader.dataset.targets[:, i], predictions[:, i]))
        
    return {
        'predictions': predictions,
        'scores': scores,
        'accuracy_scores': accuracy_scores
    }


In [40]:
from IPython.display import clear_output
from imblearn.over_sampling import RandomOverSampler


In [63]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

all_results = []

for label in labels:
    one_head_results = []
    for seed_val in range(1):

        train_loader, val_loader, test_loader = get_dataloaders(X=data['lemm_text'].values,
                                                                y=torch.tensor(data[[labels[0]]].values),
                                                                document_id=torch.tensor(data['document.id'].values),
                                                                is_balanced=True,
                                                                augment=False,
                                                                tokenizer=tokenizer,
                                                                seed_value=seed_val
                                                               )

        early_stop_callback = EarlyStopping(monitor="f1_score_mean", min_delta=0.00,
                                            patience=2, verbose=False, mode="max")

        model = BERT(input_dim=0, labels=[labels[0]], head_dims=[3, 3, 3])
        trainer = pl.Trainer(max_epochs=10, callbacks=[early_stop_callback])
        trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=val_loader)

        model.eval()
    #     model.cuda()

        scores = get_predictions(model, test_loader)

        one_head_results.append(scores)
#         all_results[seed_val] = scores

#     clear_output(wait=True)
    all_results.append(one_head_results)
    break
    

Some weights of the model checkpoint at cointegrated/rubert-tiny-toxicity were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params
---------------------------------------
0 | bert    | BertModel  | 11.8 M
1 | fc2     | Linear     | 80.1 K
2 | heads   | ModuleList | 771   
3 | dropout | Drop

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [64]:
scores

{'predictions': tensor([[2],
         [2],
         [1],
         ...,
         [1],
         [1],
         [1]]),
 'scores': [0.6072354470732253],
 'accuracy_scores': [0.729325248355051]}

In [46]:

for head in range(3):
    f1_scores = []
    accuracy_scores = []
    for i in range(1):
        f1_scores.append(all_results[head][i]['scores'][0])
        accuracy_scores.append(all_results[head][i]['accuracy_scores'][0])
        
    f1_scores = np.round(f1_scores, 2).mean()
    accuracy_scores = np.round(accuracy_scores, 2).mean()
    print(f1_scores, accuracy_scores)


0.6 0.74
0.62 0.75
0.59 0.81


In [50]:
labels

['label_is_ethicity_superior_meaning',
 'label_is_ethicity_aggressor_meaning',
 'label_is_ethicity_dangerous_meaning']

In [58]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

all_results = []

for seed_val in range(1,2):

    train_loader, val_loader, test_loader = get_dataloaders(X=data['lemm_text'].values,
                                                            y=torch.tensor(data[labels].values),
                                                            document_id=torch.tensor(data['document.id'].values),
                                                            is_balanced=False,
                                                            augment=False,
                                                            tokenizer=tokenizer,
                                                            seed_value=seed_val
                                                           )

    early_stop_callback = EarlyStopping(monitor="f1_score_mean", min_delta=0.00,
                                        patience=2, verbose=False, mode="max")

    model = BERT(input_dim=0, labels=labels, head_dims=[3, 3, 3])
    trainer = pl.Trainer(max_epochs=10, callbacks=[early_stop_callback])
    trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=val_loader)

    model.eval()
#     model.cuda()

    scores = get_predictions(model, test_loader)
#         all_results[seed_val] = scores

#     clear_output(wait=True)
    all_results.append(scores)
    
    

Some weights of the model checkpoint at cointegrated/rubert-tiny-toxicity were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params
---------------------------------------
0 | bert    | BertModel  | 11.8 M
1 | fc2     | Linear     | 80.1 K
2 | heads   | ModuleList | 2.3 K 
3 | dropout | Drop

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [57]:
scores

{'predictions': tensor([[1, 1, 1],
         [1, 1, 1],
         [1, 0, 1],
         ...,
         [1, 1, 1],
         [1, 1, 1],
         [1, 2, 1]]),
 'scores': [0.5174703745486348, 0.5502895815352725, 0.43371814235001777],
 'accuracy_scores': [0.7033931105663785,
  0.7297122951877177,
  0.8086698490517352]}

In [48]:

for head in range(3):
    f1_scores = []
    accuracy_scores = []
    for i in range(1):
        f1_scores.append(all_results[head][i]['scores'][0])
        accuracy_scores.append(all_results[head][i]['accuracy_scores'][0])
        
    f1_scores = np.round(f1_scores, 2).mean()
    accuracy_scores = np.round(accuracy_scores, 2).mean()
    print(f1_scores, accuracy_scores)


0.52 0.68
0.56 0.7
0.49 0.79


In [24]:
label

'label_is_ethicity_dangerous_meaning'

In [42]:
all_results[0][0]

{'predictions': tensor([[1],
         [1],
         [1],
         ...,
         [1],
         [1],
         [1]]),
 'scores': [0.4817457929717897],
 'accuracy_scores': [0.7227454521997162]}

In [104]:
accuracy_scores

[0.8061029879211697, 0.8424454333545243, 0.9102564102564102]

In [112]:
f1_scores = np.mean(f1_scores, axis=0)
np.round(f1_scores, 2)

array([0.55, 0.62, 0.46])

In [111]:
accuracy_scores = np.mean(accuracy_scores, axis=0)
np.round(accuracy_scores, 2)

array([0.81, 0.84, 0.91])

In [110]:
import numpy as np

f1_scores = []
accuracy_scores = []

    
for random_seed in range(4):
    f1_scores.append(unbalanced_results[random_seed]['scores'])
    accuracy_scores.append(all_results[random_seed]['accuracy_scores'])

In [99]:
f1_scores = np.mean(f1_scores, axis=0)
f1_scores

array([0.54811702, 0.61588839, 0.4561468 ])

In [69]:
# unbalanced_results

In [97]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
k = 2
print(metrics.classification_report(test_loader.dataset.targets[:, k], predictions[:, k]))
print(metrics.confusion_matrix(test_loader.dataset.targets[:, k], predictions[:, k]))


              precision    recall  f1-score   support

           0       0.15      0.30      0.20       186
           1       0.92      0.84      0.88      3771
           2       0.28      0.40      0.33       348

    accuracy                           0.78      4305
   macro avg       0.45      0.51      0.47      4305
weighted avg       0.84      0.78      0.81      4305

[[  55   98   33]
 [ 272 3180  319]
 [  46  164  138]]
