In [118]:
#!g1.1

!git status

In [1]:
#!g1.1
print('hello cudab')

# Работа состоит из нескольких частей
    - код
    - гипотезы
    - текст
    - обзор литературы

Мы хотим оценить влияние различных адверсариальных атак на современные NLP модели

В рамках данного исследования мы остановимся на задаче классификации

Следовательно: нам нужны
    - Датасеты для классификации
        - 2 на Английском (common domain, specific domain)
        - 2 на Русском (common domain, specific domain)
        
    - Модели для классификации
        - Мы можем использовать TF-IDF на log-reg
        - Bert и его разновидности
            - Английский Берт (bert-base-uncased, distilbert-base-uncased)
            - Мультиязычный Берт ?
            - Русский Берт (DeepPavlov, дистиллированная модель от Давида Дале)
            
    - Атаки:
        - BAE
        - TextFooler
        - Другие атаки из модуля TextAttack

Постановка эксперимента:
    - Пока что не трогаем лог-рег
    - Исследуем Берт
    
Задача1:
    - fine-tuning Берта под задачу классификации
    - Оценка качества на валидации
    - Подготовка адверсариальных примеров на основе валидационного датасета
    - Оценка качества на адверсариальных примерах

Этапы работы:
    1) Загрузка и препроцессинг данных
    2) Fine-tuning соответствующей модели
    3) Валидация
    4) Генерация адверсариальных примеров
    5) Оценка качества
        - Автоматическая валидация:
            - accuracy
            - semantic score
        - Human evaluation
            - Классификация примеров
            - Оценка "реалистичности и грамотности сгенерированных примеров"

## Step 1

построим пайплайн на основе ноутбука https://www.kaggle.com/kashnitsky/distillbert-catalyst-amazon-product-reviews

%pip install -U catalyst transformers


In [None]:
#!g1.1
#%pip install -U transformers


In [None]:
# %pip install git+https://github.com/catalyst-team/catalyst@master --upgrade


In [6]:
#!g1.1
# Python 
import os
import warnings
import logging
from typing import Mapping, List
from pprint import pprint

# Numpy and Pandas 
import numpy as np
import pandas as pd

# PyTorch 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Transformers 
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Catalyst
from catalyst.dl import SupervisedRunner
from catalyst import dl  # импорт вместо catalyst.dl.callbacks

#from catalyst.dl.callbacks import AccuracyCallback, F1ScoreCallback, OptimizerCallback
#from catalyst.dl.callbacks import CheckpointCallback, InferCallback
from catalyst.utils import set_global_seed, prepare_cudnn

In [7]:
#!g1.1
MODEL_NAME = 'distilbert-base-uncased' # pretrained model from Transformers
LOG_DIR = "./logdir_amazon_reviews"    # for training logs and tensorboard visualizations
NUM_EPOCHS = 3                         # smth around 2-6 epochs is typically fine when finetuning transformers
BATCH_SIZE = 72                        # depends on your available GPU memory (in combination with max seq length)
MAX_SEQ_LENGTH = 256                   # depends on your available GPU memory (in combination with batch size)
LEARN_RATE = 5e-5                      # learning rate is typically ~1e-5 for transformers
ACCUM_STEPS = 4                        # one optimization step for that many backward passes
SEED = 17                              # random seed for reproducibility

## Dataset


Amazon product reviews - competition. Given text of a review, we need to classify it into one of 6 categories: dogs, cats, fish aquatic pets, birds, and two others.

In [None]:
#!unzip data/amazon-pet-product-reviews-classification.zip


In [11]:
#!g1.1
# to reproduce, download the data and customize this path
PATH_TO_DATA = 'data/'


In [12]:
#!g1.1

#загрузим данные из csv
train_df = pd.read_csv(PATH_TO_DATA + 'train.csv', index_col='id').fillna('')
valid_df = pd.read_csv(PATH_TO_DATA + 'valid.csv', index_col='id').fillna('')
test_df = pd.read_csv(PATH_TO_DATA + 'test.csv', index_col='id').fillna('')

In [13]:
#!g1.1
train_df.shape, valid_df.shape, test_df.shape

In [None]:
#!g1.1
# 52к наблюдений в трейне, 17к в валидации, 17к в тесте; в тесте нет меток класса

In [None]:
#!g1.1

# Идея: в нашем случае можем проверить результат модели как на валидации, так и отправкой на Kaggle сабмита с предсказаниями для adv-примеров

In [14]:
#!g1.1
train_df.head()


In [16]:
#!g1.1
train_df.text[4]
# явно отзыв по собакам

In [64]:
#!g1.1
# target distribution
train_df['label'].value_counts(normalize=True)

In [65]:
#!g1.1
# statistics of text length (in words)
train_df['text'].apply(lambda s: len(s.split())).describe()

## Torch Dataset
This is left for the user to define. Catalyst will take care of the rest.

In [17]:
#!g1.1

# нужно разобраться, как под другой датасет переписать класс
class TextClassificationDataset(Dataset):
    """
    Wrapper around Torch Dataset to perform text classification.

    Every item is a dict with fixed-length ``features`` (token ids) and
    ``attention_mask`` tensors and, when labels were supplied, a scalar
    ``targets`` tensor with the encoded class id.
    """
    def __init__(self,
                 texts: List[str],
                 labels: List[str] = None,
                 label_dict: Mapping[str, int] = None,
                 max_seq_length: int = 512,  # BERT limits the sequence length to 512 tokens
                 model_name: str = 'distilbert-base-uncased'):
        """
        Args:
            texts (List[str]): a list with texts to classify or to train the
                classifier on
            labels List[str]: a list with classification labels (optional)
            label_dict (dict): a dictionary mapping class names to class ids,
                to be passed to the validation data (optional)
            max_seq_length (int): maximal sequence length in tokens,
                texts will be truncated/padded to this length
            model_name (str): transformer model name, needed to perform
                appropriate tokenization

        Raises:
            ValueError: if the tokenizer defines no padding token, since
                items could not be padded to a common length
        """

        self.texts = texts
        self.labels = labels
        self.label_dict = label_dict
        self.max_seq_length = max_seq_length

        if self.label_dict is None and labels is not None:
            # {'class1': 0, 'class2': 1, 'class3': 2, ...}
            # built by hand instead of `sklearn.preprocessing.LabelEncoder`
            # so that unknown target values can be handled in `__getitem__`
            self.label_dict = {
                label: class_id
                for class_id, label in enumerate(sorted(set(labels)))
            }

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # suppresses tokenizer warnings
        logging.getLogger(
            "transformers.tokenization_utils").setLevel(logging.FATAL)

        # special tokens for transformers
        # in the simplest case a [CLS] token is added in the beginning
        # and [SEP] token is added in the end of a piece of text
        # [CLS] <indexes text tokens> [SEP] .. <[PAD]>
        # The ids are taken from the tokenizer's own special-token attributes
        # rather than hard-coded "[SEP]"/"[CLS]"/"[PAD]" vocabulary keys, so
        # models whose vocabulary spells these tokens differently also work.
        self.sep_vid = self.tokenizer.sep_token_id
        self.cls_vid = self.tokenizer.cls_token_id
        self.pad_vid = self.tokenizer.pad_token_id
        if self.pad_vid is None:
            raise ValueError(
                f"Tokenizer for '{model_name}' defines no padding token"
            )

    def __len__(self):
        """
        Returns:
            int: length of the dataset
        """
        return len(self.texts)

    def __getitem__(self, index) -> Mapping[str, torch.Tensor]:
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            dict with ``features`` (token ids padded to ``max_seq_length``),
            ``attention_mask`` (int8; 1 for real tokens, 0 for padding) and,
            if labels are available, ``targets`` (scalar class id, -1 for a
            label missing from ``label_dict``)
        """

        # encoding the text; truncation is requested explicitly so that long
        # texts are cut to `max_seq_length` instead of relying on the
        # tokenizer's implicit fallback
        token_ids = self.tokenizer.encode(
            self.texts[index],
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_seq_length,
            return_tensors="pt",
        ).squeeze(0)

        # padding short texts up to the fixed length
        seq_length = token_ids.size(0)
        pad_size = max(self.max_seq_length - seq_length, 0)
        pad_ids = torch.full((pad_size,), self.pad_vid, dtype=torch.long)
        x_tensor = torch.cat((token_ids, pad_ids))

        # attention mask: 1 for each input token, 0 for each padding
        # position, so that attention scores ignore the padding
        mask = torch.cat((
            torch.ones(seq_length, dtype=torch.int8),
            torch.zeros(pad_size, dtype=torch.int8),
        ))

        output_dict = {
            "features": x_tensor,
            'attention_mask': mask
        }

        # encoding target
        if self.labels is not None:
            label = self.labels[index]
            output_dict["targets"] = torch.tensor(
                self.label_dict.get(label, -1), dtype=torch.long
            )

        return output_dict

Create Torch Datasets with train, validation, and test data.



In [18]:
#!g1.1
# Training split: the label -> id mapping is derived from its own labels.
train_dataset = TextClassificationDataset(
    texts=train_df['text'].tolist(),
    labels=train_df['label'].tolist(),
    max_seq_length=MAX_SEQ_LENGTH,
    model_name=MODEL_NAME,
)

# Validation split: reuses the training mapping so class ids agree.
valid_dataset = TextClassificationDataset(
    texts=valid_df['text'].tolist(),
    labels=valid_df['label'].tolist(),
    label_dict=train_dataset.label_dict,
    max_seq_length=MAX_SEQ_LENGTH,
    model_name=MODEL_NAME,
)

# Test split: no labels, so items carry no "targets" entry.
test_dataset = TextClassificationDataset(
    texts=test_df['text'].tolist(),
    max_seq_length=MAX_SEQ_LENGTH,
    model_name=MODEL_NAME,
)

We infer the number of classes from the training set.



In [19]:
#!g1.1
NUM_CLASSES = len(train_dataset.label_dict)


In [20]:
#!g1.1
train_df.loc[1]


In [None]:
#!g1.1
#мы видим, что в train_df - только описание и лейбл
# трайн датасет - выплевывает обработанную последовательность в виде словаря
#  attention_mask - 1 для токенов, 0 - для pad
# 'features' - это индексы токенов из словаря 101 - CLS, и так далее
# targets - метка класса (лейбл)

In [21]:
#!g1.1
pprint(train_dataset[1])


In [None]:
#!g1.1
# последовательность - Датасет => Даталоадер (видим, что в трейн даталоадаре и трейн и валидация)

Finally, we define standard PyTorch loaders. This dictionary will be fed to Catalyst.



In [22]:
#!g1.1
# Loaders keyed by stage name: training batches are shuffled,
# validation batches keep the dataset order.
train_val_loaders = dict(
    train=DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True),
    valid=DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False),
)

## The model¶
It's going to be a slightly simplified version of DistilBertForSequenceClassification by HuggingFace.<br> We need only the predicted class scores (logits) as output, nothing more - we need neither the loss nor the hidden states or attentions to be returned (as in the original implementation).

A good overview of DistilBERT is done in this great post by Jay Alammar.

In [23]:
#!g1.1
class DistilBertForSequenceClassification(nn.Module):
    """
    Simplified version of the same class by HuggingFace.
    See transformers/modeling_distilbert.py in the transformers repository.

    A pretrained encoder followed by a two-layer classification head that is
    applied to the hidden state of the first token. ``forward`` returns raw
    logits (no softmax is applied).
    """

    def __init__(self, pretrained_model_name: str, num_classes: int = None):
        """
        Args:
            pretrained_model_name (str): HuggingFace model name.
                See transformers/modeling_auto.py
            num_classes (int): the number of class labels
                in the classification task; when omitted, the number of
                labels stored in the pretrained config is used
        """
        super().__init__()

        # Forward `num_labels` only when it was given, so that the default
        # `num_classes=None` falls back to the config's own label count
        # instead of failing when the output layer is built.
        config_kwargs = {} if num_classes is None else {"num_labels": num_classes}
        config = AutoConfig.from_pretrained(pretrained_model_name, **config_kwargs)
        if num_classes is None:
            num_classes = config.num_labels

        self.distilbert = AutoModel.from_pretrained(pretrained_model_name,
                                                    config=config)
        self.pre_classifier = nn.Linear(config.dim, config.dim)
        self.classifier = nn.Linear(config.dim, num_classes)
        self.dropout = nn.Dropout(config.seq_classif_dropout)

    def forward(self, features, attention_mask=None, head_mask=None):
        """Compute class logits for the input sequence.

        Args:
            features (torch.Tensor): ids of each token,
                size [bs, seq_length]
            attention_mask (torch.Tensor): binary tensor, used to select
                tokens which are used to compute attention scores
                in the self-attention heads, size [bs, seq_length]
            head_mask (torch.Tensor): 1.0 in head_mask indicates that
                we keep the head, size: [num_heads]
                or [num_hidden_layers x num_heads]
        Returns:
            PyTorch Tensor with unnormalized class scores (logits),
            size [bs, num_classes]
        Raises:
            ValueError: if ``attention_mask`` is not provided
        """
        # explicit check instead of `assert`, which is skipped under `python -O`
        if attention_mask is None:
            raise ValueError("attention mask is none")

        encoder_output = self.distilbert(input_ids=features,
                                         attention_mask=attention_mask,
                                         head_mask=head_mask)
        # the first element of the encoder output is the last hidden state
        hidden_state = encoder_output[0]  # (bs, seq_len, dim)
        # we take embeddings from the [CLS] token, so again index 0
        pooled_output = hidden_state[:, 0]  # (bs, dim)
        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
        # functional ReLU avoids building a new nn.ReLU module on every call
        pooled_output = torch.relu(pooled_output)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        logits = self.classifier(pooled_output)  # (bs, num_classes)

        return logits

In [24]:
#!g1.1
model = DistilBertForSequenceClassification(pretrained_model_name=MODEL_NAME,
                                            num_classes=NUM_CLASSES)

Model training
First we specify optimizer and scheduler (pure PyTorch). Then Catalyst stuff.

In [None]:
#!g1.1
#как правильно задать шедулер?

In [25]:
#!g1.1
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARN_RATE)

#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

#заменим шидулер,  чтобы не ругался:



To run Deep Learning experiments, Catalyst resorts to the Runner abstraction, in particular, to SupervisedRunner.

SupervisedRunner implements the following methods:

train - starts the training process of the model
predict_loader - makes a prediction on the whole loader with the specified model
infer - makes the inference on the model
To train the model within this interface you pass the following to the train method:

model (torch.nn.Module) – PyTorch model to train
criterion (nn.Module) – PyTorch criterion function for training
optimizer (optim.Optimizer) – PyTorch optimizer for training
loaders (dict) – dictionary containing one or several torch.utils.data.DataLoader for training and validation
logdir (str) – path to output directory. There Catalyst will write logs, will dump the best model and the actual code to train the model
callbacks – list of Catalyst callbacks
scheduler (optim.lr_scheduler._LRScheduler) – PyTorch scheduler for training
...
In our case we'll pass the created DistilBertForSequenceClassification model, cross-entropy criterion, Adam optimizer, scheduler and data loaders that we created earlier. Also, we'll be tracking accuracy and thus will need AccuracyCallback. To perform batch accumulation, we'll be using OptimizerCallback.

There are many more useful callbacks implemented, also check out Catalyst examples.

In [26]:
#!g1.1
os.environ['CUDA_VISIBLE_DEVICES'] = "0"    # can be changed in case of multiple GPUs onboard
set_global_seed(SEED)                       # reproducibility
prepare_cudnn(deterministic=True)           # reproducibility

!!На этом замечательном моменте все сломалось - из-за того, что не смогли заимпортить

from catalyst.dl.callbacks import AccuracyCallback, F1ScoreCallback, OptimizerCallback
from catalyst.dl.callbacks import CheckpointCallback, InferCallback

результат - ModuleNotFoundError: No module named 'catalyst.dl.callbacks'
видимо, код изменился настолько, что с этой частью нужно разбираться

In [76]:
#!g1.1
import catalyst
from catalyst import dl, metrics, utils
catalyst.__version__

In [27]:
#!g1.1
from catalyst.dl import AccuracyCallback


In [None]:
#!g1.1
# ВОПРОС: и всё-таки, куда пропали callbacks?

In [37]:
#!g1.1

class BertRunner(SupervisedRunner):
    """SupervisedRunner that feeds only the ``input_key`` fields of a batch to the model."""

    def _handle_batch(self, batch):
        """Run the model on one batch and keep both the batch and the output on the runner.

        Args:
            batch: mapping from field name to batched value; only the fields
                listed in ``self.input_key`` are forwarded to the model.
        """
        # NOTE(review): recent Catalyst releases appear to dispatch to
        # `handle_batch` rather than `_handle_batch` - confirm that the
        # installed version actually invokes this override.
        self.input = batch
        # The model defined in this notebook has no `return_dict` parameter,
        # so only the selected batch fields are passed as keyword arguments.
        model_inputs = {key: batch[key] for key in self.input_key}
        self.output = self.model(**model_inputs)


runner = BertRunner(input_key=["features", "attention_mask"])

In [None]:
#!g1.1
# обучение начало выполняться в 3:13, закончилось в 3:35

In [38]:
#!g1.1


# Fine-tune the model. The epoch count comes from the NUM_EPOCHS config
# constant so that the configuration cell stays the single source of truth.
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    loaders=train_val_loaders,
    logdir=LOG_DIR,
    num_epochs=NUM_EPOCHS,
    verbose=True,
    callbacks=[
        AccuracyCallback(input_key="logits", target_key="targets", num_classes=NUM_CLASSES),
    ],
)

In [None]:
#!g1.1
# Loader over the unlabeled test set. It is defined here because the
# prediction cells that follow use it, and on a fresh kernel it would
# otherwise be undefined at this point.
test_loaders = {
    "test": DataLoader(dataset=test_dataset,
                       batch_size=BATCH_SIZE,
                       shuffle=False)
}
test_loaders

In [41]:
#!g1.1


pred1 = next(runner.predict_loader(loader=test_loaders["test"]))

In [48]:
#!g1.1

pred1['logits'].shape

In [None]:
#!g1.1

# пока что мы получили лишь 72 предсказания (один батч), а должны — 17353

In [67]:
#!g1.1

# Predicted class id for every test example: argmax over the class axis of
# each batch of logits, concatenated across batches. (A bare `argmax()`
# flattens the batch and yields a single index per batch.)
pred2 = np.concatenate([
    batch_output['logits'].cpu().numpy().argmax(axis=1)
    for batch_output in runner.predict_loader(loader=test_loaders["test"])
])

In [68]:
#!g1.1

type(pred2)

In [70]:
#!g1.1
len((pred2))

In [71]:
#!g1.1
type(pred2[0]), pred2[0].shape

In [72]:
#!g1.1
pred2[0]

In [80]:
#!g1.1

from catalyst.dl.utils import plot_metrics

utils.plot_metrics

In [None]:
#!g1.1


In [None]:
#!g1.1


# plot_metrics(
#     logdir=LOG_DIR,
#     step='batch',
#     metrics=['accuracy01']
# )

In [None]:
#!g1.1
1+1

In [None]:
#!g1.1


## Inference for the test set
Let's create a Torch loader for the test set and launch infer to actually make predictions for the test set.<br> First, we load the best model checkpoint, then make inference with this model.

In [40]:
#!g1.1
# Wrap the test dataset in a sequential (unshuffled) loader keyed by "test".
test_loaders = dict(
    test=DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False),
)


In [100]:
#!g1.1
features_batch = next(iter(test_loaders["test"]))
prediction_batch = runner.predict_batch(features_batch)

In [108]:
#!g1.1
prediction_batch['logits'].shape

In [109]:
#!g1.1
train_df.shape, valid_df.shape, test_df.shape

In [None]:
#!g1.1
#похоже метод infer так же безнадежно устарел()

In [98]:
#!g1.1
# Pass the single test DataLoader by keyword (not the whole dict of loaders),
# matching the other `predict_loader` calls in this notebook. The result is a
# generator and has to be iterated to obtain the per-batch predictions.
runner.predict_loader(loader=test_loaders["test"], model=model)

In [117]:
#!g1.1
# Sanity check on the validation loader: every batch of logits must have one
# column per class. Predictions are dicts, so the tensor is read from "logits".
for prediction in runner.predict_loader(loader=train_val_loaders["valid"]):
    assert prediction["logits"].detach().cpu().numpy().shape[-1] == NUM_CLASSES

In [None]:
#!g1.1

# model batch inference (batches from this notebook's loaders are dicts,
# so the whole batch is passed on)
features_batch = next(iter(train_val_loaders["valid"]))
prediction_batch = runner.predict_batch(features_batch)
# model loader inference: each prediction is a dict holding the batch "logits"
for prediction in runner.predict_loader(loader=train_val_loaders["valid"]):
    assert prediction["logits"].detach().cpu().numpy().shape[-1] == NUM_CLASSES

In [None]:
#!g1.1
# Collect the logits for the whole test set by iterating the test loader;
# no inference callback stores predictions on the runner in this notebook.
# The values are logits, and the name is kept for the cells that follow.
predicted_probs = np.concatenate([
    batch_output['logits'].detach().cpu().numpy()
    for batch_output in runner.predict_loader(loader=test_loaders["test"])
])


In [None]:
#!g1.1
sample_sub_df = pd.read_csv(PATH_TO_DATA + 'sample_submission.csv',
                           index_col='id')

In [None]:
#!g1.1
train_dataset.label_dict


In [None]:
#!g1.1
# Most likely class id per row, then translated back to the class name
# through the inverse of the training label mapping.
class_ids = predicted_probs.argmax(axis=1)
id_to_label = {class_id: name for name, class_id in train_dataset.label_dict.items()}
sample_sub_df['label'] = class_ids
sample_sub_df['label'] = sample_sub_df['label'].map(id_to_label)

In [None]:
#!g1.1
sample_sub_df.head()


In [None]:
#!g1.1
sample_sub_df.to_csv('distillbert_submission.csv')