In [None]:
# Работа состоит из нескольких частей
    - код
    - гипотезы
    - текст
    - обзор литературы

Мы хотим оценить влияние различных адверсариальных атак на современные NLP модели

В рамках данного исследования мы остановимся на задаче классификации

Следовательно: нам нужны
    - Датасеты для классификации
        - 2 на Английском (common domain, specific domain)
        - 2 на Русском (common domain, specific domain)
        
    - Модели для классификации
        - Мы можем использовать TF-IDF на log-reg
        - Bert и его разновидности
            - Английский Берт (bert-base-uncased, distilbert-base-uncased)
            - Мультиязычный Берт ?
            - Русский Берт (DeepPavlov, дистиллированная модель от Давида Дале)
            
    - Атаки:
        - BAE
        - TextFooler
        - Другие атаки из модуля TextAttack

Постановка эксперимента:
    - Пока что не трогаем лог-рег
    - Исследуем Берт
    
Задача1:
    - fine-tuning Берта под задачу классификации
    - Оценка качества на валидации
    - Подготовка адверсариальных примеров на основе валидационного датасета
    - Оценка качества на адверсариальных примерах

Этапы работы:
    1) Загрузка и препроцессинг данных
    2) Fine-tuning соответствующей модели
    3) Валидация
    4) Генерация адверсариальных примеров
    5) Оценка качества
        - Автоматическая валидация:
            - accuracy
            - semantic score
        - Human evaluation
            - Классификация примеров
            - Оценка "реалистичности и грамотности сгенерированных примеров"

## Step 1

In [None]:
построим пайплайн на основе ноутбука https://www.kaggle.com/kashnitsky/distillbert-catalyst-amazon-product-reviews

%pip install -U catalyst transformers


In [15]:
%pip install -U catalyst transformers > /dev/null


You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m


In [17]:
%pip install git+https://github.com/catalyst-team/catalyst@master --upgrade


Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/catalyst-team/catalyst@master
  Cloning https://github.com/catalyst-team/catalyst (to revision master) to /tmp/pip-req-build-ohzvqj2r
  Running command git clone -q https://github.com/catalyst-team/catalyst /tmp/pip-req-build-ohzvqj2r
    Preparing wheel metadata ... [?25ldone
Building wheels for collected packages: catalyst
  Building wheel for catalyst (PEP 517) ... [?25ldone
[?25h  Created wheel for catalyst: filename=catalyst-21.12rc0-py3-none-any.whl size=536895 sha256=2d6a1dab6672161c1d8b48ab51250509504aad3a7ee4ca415cc065eb5b05e0b7
  Stored in directory: /tmp/pip-ephem-wheel-cache-ld8v2rz7/wheels/91/6b/6a/134b3760024ee0155acd92bb4627ac0642505795c7cf20f204
Successfully built catalyst
Installing collected packages: catalyst
  Attempting uninstall: catalyst
    Found existing installation: catalyst 21.11
    Uninstalling catalyst-21.11:
      Successfully uninstalled ca

In [19]:
# Python 
import os
import warnings
import logging
from typing import Mapping, List
from pprint import pprint

# Numpy and Pandas 
import numpy as np
import pandas as pd

# PyTorch 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Transformers 
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Catalyst
from catalyst.dl import SupervisedRunner
#from catalyst.dl.callbacks import AccuracyCallback, F1ScoreCallback, OptimizerCallback
#from catalyst.dl.callbacks import CheckpointCallback, InferCallback
from catalyst.utils import set_global_seed, prepare_cudnn

In [20]:
# Pretrained checkpoint name, resolved through Transformers.
MODEL_NAME = 'distilbert-base-uncased'
# Output directory for training logs and tensorboard visualizations.
LOG_DIR = "./logdir_amazon_reviews"
# Number of fine-tuning epochs (around 2-6 is typically fine for transformers).
NUM_EPOCHS = 3
# Batch size; limited by available GPU memory together with MAX_SEQ_LENGTH.
BATCH_SIZE = 72
# Maximal sequence length in tokens; limited by GPU memory together with BATCH_SIZE.
MAX_SEQ_LENGTH = 256
# Learning rate (typically around 1e-5 for transformers).
LEARN_RATE = 5e-5
# One optimization step per that many backward passes.
# NOTE(review): not referenced by the training cells visible in this notebook.
ACCUM_STEPS = 4
# Random seed for reproducibility.
SEED = 17

## Dataset

In [None]:

Amazon product reviews - competition. Given the text of a review, we need to classify it into one of 6 categories: dogs, cats, fish aquatic pets, birds, bunny rabbit central, and small animals.

In [None]:
#!unzip data/amazon-pet-product-reviews-classification.zip


In [31]:
# to reproduce, download the data and customize this path
PATH_TO_DATA = 'data/'


In [32]:
# Read the three splits in one pass: `id` is used as the index and missing
# values are replaced with empty strings.
train_df, valid_df, test_df = [
    pd.read_csv(PATH_TO_DATA + file_name, index_col='id').fillna('')
    for file_name in ('train.csv', 'valid.csv', 'test.csv')
]

In [33]:
train_df.head()


Unnamed: 0_level_0,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Sam has an everlast treat each nite before bed...,dogs
1,The product is as it says. I keep an eye on it...,dogs
2,My Kitty thinks these are treats! He loves the...,dogs
3,This is the third or fourth time that we've or...,dogs
4,Put this on both my dogs. And they are scratch...,dogs


In [34]:
# target distribution
train_df['label'].value_counts(normalize=True)

dogs                    0.537872
cats                    0.355284
fish aquatic pets       0.069001
birds                   0.020324
bunny rabbit central    0.010950
small animals           0.006570
Name: label, dtype: float64

In [35]:
# statistics of text length (in words)
train_df['text'].apply(lambda s: len(s.split())).describe()

count    52057.000000
mean        84.420443
std         80.027988
min          1.000000
25%         35.000000
50%         61.000000
75%        106.000000
max       2360.000000
Name: text, dtype: float64

## Torch Dataset
This part is left for the user to define. Catalyst will take care of the rest.

In [36]:
class TextClassificationDataset(Dataset):
    """
    Torch Dataset that tokenizes raw texts for transformer-based
    text classification.

    Every item is a dict with fixed-length ``features`` (token ids) and
    ``attention_mask`` tensors and, when labels were supplied, a scalar
    ``targets`` tensor holding the class id.
    """
    def __init__(self,
                 texts: List[str],
                 labels: List[str] = None,
                 label_dict: Mapping[str, int] = None,
                 max_seq_length: int = 512,
                 model_name: str = 'distilbert-base-uncased'):
        """
        Args:
            texts (List[str]): a list with texts to classify or to train the
                classifier on
            labels List[str]: a list with classification labels (optional)
            label_dict (dict): a dictionary mapping class names to class ids,
                to be passed to the validation data (optional); when omitted
                and ``labels`` are given, it is built from the sorted set of
                labels
            max_seq_length (int): maximal sequence length in tokens,
                texts will be truncated / padded to this length
            model_name (str): transformer model name, needed to perform
                appropriate tokenization

        """

        self.texts = texts
        self.labels = labels
        self.label_dict = label_dict
        self.max_seq_length = max_seq_length

        if self.label_dict is None and labels is not None:
            # {'class1': 0, 'class2': 1, 'class3': 2, ...}
            # using this instead of `sklearn.preprocessing.LabelEncoder`
            # to easily handle unknown target values
            self.label_dict = {
                label: idx for idx, label in enumerate(sorted(set(labels)))
            }

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # suppresses tokenizer warnings
        logging.getLogger(
            "transformers.tokenization_utils").setLevel(logging.FATAL)

        # special tokens for transformers
        # in the simplest case a [CLS] token is added in the beginning
        # and [SEP] token is added in the end of a piece of text
        # [CLS] <indexes text tokens> [SEP] .. <[PAD]>
        # The ids are taken from the tokenizer's own special-token attributes
        # instead of looking up hard-coded BERT strings in the vocabulary, so
        # tokenizers with differently spelled special tokens work as well.
        self.sep_vid = self.tokenizer.sep_token_id
        self.cls_vid = self.tokenizer.cls_token_id
        self.pad_vid = self.tokenizer.pad_token_id

    def __len__(self):
        """
        Returns:
            int: length of the dataset
        """
        return len(self.texts)

    def __getitem__(self, index) -> Mapping[str, torch.Tensor]:
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            dict with ``features`` (token ids, padded to ``max_seq_length``),
            ``attention_mask`` (int8, 1 for real tokens and 0 for padding)
            and, if labels are available, ``targets`` (scalar class id,
            -1 for a label missing from ``label_dict``)
        """

        # encoding the text; truncation is requested explicitly so that the
        # result never exceeds `max_seq_length` (otherwise the padding below
        # could not bring all items to the same length)
        token_ids = self.tokenizer.encode(
            self.texts[index],
            add_special_tokens=True,
            max_length=self.max_seq_length,
            truncation=True,
            return_tensors="pt",
        ).squeeze(0)

        # padding short texts
        true_seq_length = token_ids.size(0)
        pad_size = self.max_seq_length - true_seq_length
        pad_ids = torch.full((pad_size,), self.pad_vid, dtype=torch.long)
        x_tensor = torch.cat((token_ids, pad_ids))

        # attention mask: 1 for each input token, 0 for each padding
        # position; it is passed to the model so that attention scores are
        # computed only over the input data, ignoring padding
        mask = torch.cat((
            torch.ones(true_seq_length, dtype=torch.int8),
            torch.zeros(pad_size, dtype=torch.int8),
        ))

        output_dict = {
            "features": x_tensor,
            'attention_mask': mask
        }

        # encoding target; labels unknown to `label_dict` become -1
        if self.labels is not None:
            label = self.labels[index]
            output_dict["targets"] = torch.tensor(
                self.label_dict.get(label, -1), dtype=torch.long)

        return output_dict

Create Torch Datasets with train, validation, and test data.



In [37]:
# Training split: the label -> id mapping is derived from its labels.
train_dataset = TextClassificationDataset(
    texts=train_df['text'].tolist(),
    labels=train_df['label'].tolist(),
    max_seq_length=MAX_SEQ_LENGTH,
    model_name=MODEL_NAME,
)

# Validation split: reuses the training label mapping so class ids agree.
valid_dataset = TextClassificationDataset(
    texts=valid_df['text'].tolist(),
    labels=valid_df['label'].tolist(),
    label_dict=train_dataset.label_dict,
    max_seq_length=MAX_SEQ_LENGTH,
    model_name=MODEL_NAME,
)

# Test split: texts only, no labels.
test_dataset = TextClassificationDataset(
    texts=test_df['text'].tolist(),
    max_seq_length=MAX_SEQ_LENGTH,
    model_name=MODEL_NAME,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=483.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




We infer the number of classes from the training set.



In [38]:
NUM_CLASSES = len(train_dataset.label_dict)


In [39]:
train_df.loc[1]


text     The product is as it says. I keep an eye on it...
label                                                 dogs
Name: 1, dtype: object

In [40]:
pprint(train_dataset[1])


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.int8),
 'features': tensor([  101,  1996,  4031,  2003,  2004,  2009,  2758,  1012,  1045,  2562,
         2019,

Finally, we define standard PyTorch loaders. This dictionary will be fed to Catalyst.



In [42]:
# One loader per split; only the training data is reshuffled.
train_val_loaders = {
    split: DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=reshuffle)
    for split, dataset, reshuffle in (
        ("train", train_dataset, True),
        ("valid", valid_dataset, False),
    )
}

## The model
It's going to be a slightly simplified version of DistilBertForSequenceClassification by HuggingFace.<br> We only need the predicted class scores (logits) as output — we need neither the loss nor the hidden states or attentions (as in the original implementation).

A good overview of DistilBERT is done in this great post by Jay Alammar.

In [43]:
class DistilBertForSequenceClassification(nn.Module):
    """
    Pretrained transformer encoder followed by a small classification head.

    A simplified take on the HuggingFace class of the same name
    (see transformers/modeling_distilbert.py in the transformers repository):
    only the class logits are returned.
    """

    def __init__(self, pretrained_model_name: str, num_classes: int = None):
        """
        Args:
            pretrained_model_name (str): HuggingFace model name.
                See transformers/modeling_auto.py
            num_classes (int): the number of class labels
                in the classification task
        """
        super().__init__()

        config = AutoConfig.from_pretrained(
            pretrained_model_name, num_labels=num_classes)

        # pretrained encoder
        self.distilbert = AutoModel.from_pretrained(
            pretrained_model_name, config=config)
        # classification head: dim -> dim -> num_classes, dropout in between
        self.pre_classifier = nn.Linear(config.dim, config.dim)
        self.classifier = nn.Linear(config.dim, num_classes)
        self.dropout = nn.Dropout(config.seq_classif_dropout)

    def forward(self, features, attention_mask=None, head_mask=None):
        """Compute class logits for the input sequence.

        Args:
            features (torch.Tensor): ids of each token,
                size [bs, seq_length]
            attention_mask (torch.Tensor): binary tensor, used to select
                tokens which are used to compute attention scores
                in the self-attention heads, size [bs, seq_length]
            head_mask (torch.Tensor): 1.0 in head_mask indicates that
                we keep the head, size: [num_heads]
                or [num_hidden_layers x num_heads]
        Returns:
            PyTorch Tensor with unnormalized class scores (logits),
            size [bs, num_classes]
        Raises:
            AssertionError: if ``attention_mask`` is not provided
        """
        assert attention_mask is not None, "attention mask is none"
        encoder_outputs = self.distilbert(input_ids=features,
                                          attention_mask=attention_mask,
                                          head_mask=head_mask)
        # element 0 of the encoder output is the hidden state
        # (bs, seq_len, dim); position 0 along the sequence axis is the
        # [CLS] token, whose embedding represents the whole text -> (bs, dim)
        cls_embedding = encoder_outputs[0][:, 0]
        # head: linear -> ReLU -> dropout -> linear
        hidden = torch.relu(self.pre_classifier(cls_embedding))  # (bs, dim)
        return self.classifier(self.dropout(hidden))  # (bs, num_classes)

In [44]:
# Classifier built on the pretrained encoder, with one output per class.
model = DistilBertForSequenceClassification(
    pretrained_model_name=MODEL_NAME,
    num_classes=NUM_CLASSES,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Model training
First we specify optimizer and scheduler (pure PyTorch). Then Catalyst stuff.

In [46]:
# TextClassificationDataset encodes labels missing from its label_dict as -1.
# CrossEntropyLoss only skips targets equal to `ignore_index` (-100 by
# default), so a -1 target would fail as an out-of-range class id; skip such
# samples explicitly instead.
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARN_RATE)

#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

# replace the scheduler so that it stops complaining:



To run Deep Learning experiments, Catalyst resorts to the Runner abstraction, in particular, to SupervisedRunner.

SupervisedRunner implements the following methods:

train - starts the training process of the model
predict_loader - makes a prediction on the whole loader with the specified model
infer - makes the inference on the model
To train the model within this interface you pass the following to the train method:

model (torch.nn.Module) – PyTorch model to train
criterion (nn.Module) – PyTorch criterion function for training
optimizer (optim.Optimizer) – PyTorch optimizer for training
loaders (dict) – dictionary containing one or several torch.utils.data.DataLoader for training and validation
logdir (str) – path to output directory. There Catalyst will write logs, will dump the best model and the actual code to train the model
callbacks – list of Catalyst callbacks
scheduler (optim.lr_scheduler._LRScheduler) – PyTorch scheduler for training
...
In our case we'll pass the created DistilBertForSequenceClassification model, cross-entropy criterion, Adam optimizer, scheduler and data loaders that we created earlier. Also, we'll be tracking accuracy and thus will need AccuracyCallback. To perform batch accumulation, we'll be using OptimizerCallback.

There are many more useful callbacks implemented, also check out Catalyst examples.

In [47]:
# Restrict CUDA to device 0 (change this when several GPUs are onboard).
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
# Fix the global seed and request deterministic cuDNN, both for reproducibility.
set_global_seed(SEED)
prepare_cudnn(deterministic=True)

In [None]:
!!На этом замечательном моменте всё сломалось — из-за того, что не смогли заимпортить

from catalyst.dl.callbacks import AccuracyCallback, F1ScoreCallback, OptimizerCallback
from catalyst.dl.callbacks import CheckpointCallback, InferCallback

результат - ModuleNotFoundError: No module named 'catalyst.dl.callbacks'
видимо, код изменился настолько, что с этой частью нужно разбираться

In [52]:
import catalyst
from catalyst import dl, metrics, utils
catalyst.__version__

'21.11'

In [None]:
Запустили процесс на цпу

In [None]:
ВОПРОС: и все таки, куда пропали callbacks?

In [None]:
#!g1.1
%%time
# here we specify that we pass masks to the runner. So model's forward method will be called with
# these arguments passed to it. 
runner = SupervisedRunner(
    input_key=(
        "features",
        "attention_mask"
    )
)


# model training
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    loaders=train_val_loaders,
   
    logdir=LOG_DIR,
    num_epochs=NUM_EPOCHS,
    verbose=True
)

In [None]:
посмотрим, сможет ли цпу за разумное время что-то посчитать?

In [None]:
%%time
# here we specify that we pass masks to the runner. So model's forward method will be called with
# these arguments passed to it. 
runner = SupervisedRunner(
    input_key=(
        "features",
        "attention_mask"
    )
)


# model training
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    loaders=train_val_loaders,
   
    logdir=LOG_DIR,
    num_epochs=NUM_EPOCHS,
    verbose=True
)

HBox(children=(FloatProgress(value=0.0, description='1/3 * Epoch (train)', max=724.0, style=ProgressStyle(desc…

вывод - на цпу ждать можно бесконечно!

In [None]:
!nvidia-smi
