<a href="https://colab.research.google.com/github/Droid008/Introduction-to-the-processing-of-a-natural-language/blob/lesson-%2313/HW_13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import torch

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction

In [4]:
dataset = load_dataset('merionum/ru_paraphraser')
dataset

Using custom data configuration merionum--ru_paraphraser-1a7592429d7be082


Downloading and preparing dataset json/merionum--ru_paraphraser to /root/.cache/huggingface/datasets/merionum___json/merionum--ru_paraphraser-1a7592429d7be082/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.17M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/605k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/merionum___json/merionum--ru_paraphraser-1a7592429d7be082/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'id_1', 'id_2', 'text_1', 'text_2', 'class'],
        num_rows: 7227
    })
    test: Dataset({
        features: ['id', 'id_1', 'id_2', 'text_1', 'text_2', 'class'],
        num_rows: 1924
    })
})

In [5]:
label_list = list(set(dataset['train']['class']))
label_list

['0', '-1', '1']

In [6]:
def one_hot_encoding(example):
    example_class = example['class']
    oh1 = example_class =='-1'
    oh2 = example_class =='0'
    oh3 = example_class =='1'
    return {'class_-1': oh1, 'class_0': oh2, 'class_1': oh3}

In [7]:
ohe_dataset = dataset.map(one_hot_encoding)



  0%|          | 0/7227 [00:00<?, ?ex/s]

  0%|          | 0/1924 [00:00<?, ?ex/s]

In [8]:
ohe_dataset['train'][0]

{'class': '0',
 'class_-1': False,
 'class_0': True,
 'class_1': False,
 'id': '1',
 'id_1': '201',
 'id_2': '8159',
 'text_1': 'Полицейским разрешат стрелять на поражение по гражданам с травматикой.',
 'text_2': 'Полиции могут разрешить стрелять по хулиганам с травматикой.'}

In [9]:
labels = ['class_-1', 'class_0', 'class_1']
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
id2label

{0: 'class_-1', 1: 'class_0', 2: 'class_1'}

In [10]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
    # take a batch of texts
    text_1 = examples["text_1"]
    text_2 = examples["text_2"]
    # encode them
    encoding = tokenizer(text_1, text_2, padding="max_length", truncation=True, max_length=128)
    # add labels
    labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
    # create numpy array of shape (batch_size, num_labels)
    labels_matrix = np.zeros((len(text_1), len(labels)))
    # fill numpy array
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]
        
    encoding["labels"] = labels_matrix.tolist()
    
    return encoding

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [11]:
encoded_dataset = ohe_dataset.map(preprocess_data, batched=True, remove_columns=ohe_dataset['train'].column_names)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [12]:
example = encoded_dataset['train'][0]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [13]:
example['labels']

[0.0, 1.0, 0.0]

In [14]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['class_0']

In [15]:
encoded_dataset.set_format("torch")

In [16]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [17]:
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [18]:
encoded_dataset['train']['input_ids'][0]

tensor([  101,  1194, 14150, 29436, 10325, 29751, 15290, 10325, 29747, 23925,
        10325, 29745,  1195, 10260, 29744, 16856, 15290, 29753, 10260, 22919,
         1196, 22919, 16856, 15290, 29436, 17432, 22919, 23742,  1192, 10260,
         1194, 14150, 16856, 10260, 29743, 15290, 18947, 10325, 15290,  1194,
        14150,  1183, 16856, 10260, 29743, 29742, 28995, 10260, 29745,  1196,
         1197, 16856, 10260, 25529, 29745, 10260, 22919, 10325, 23925, 14150,
        10325,  1012,   102,  1194, 14150, 29436, 10325, 29751, 15414,  1191,
        14150, 29741, 29748, 22919,  1195, 10260, 29744, 16856, 15290, 29753,
        10325, 22919, 23742,  1196, 22919, 16856, 15290, 29436, 17432, 22919,
        23742,  1194, 14150,  1200, 29748, 29436, 10325, 29741, 28995, 10260,
        29745,  1196,  1197, 16856, 10260, 25529, 29745, 10260, 22919, 10325,
        23925, 14150, 10325,  1012,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [19]:
#forward pass
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0),
                labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs

SequenceClassifierOutput([('loss',
                           tensor(0.8057, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)),
                          ('logits',
                           tensor([[ 0.1866, -0.1973,  0.2564]], grad_fn=<AddmmBackward0>))])

In [20]:
softmax = torch.nn.Softmax(dim=-1)

In [21]:
def predict(idx):
    print(f"Text_1: {dataset['test'][idx]['text_1']}")
    print(f"Text_2: {dataset['test'][idx]['text_2']}")
    print(f"Class: {dataset['test'][idx]['class']}")
    print(f"-------------------------------------------")
    encoding = tokenizer(dataset['test'][idx]['text_1'], dataset['test'][idx]['text_2'], return_tensors="pt")
    encoding = {k: v.to(model.device) for k,v in encoding.items()}
    outputs = model(**encoding)
    probs = softmax(outputs.logits.squeeze().cpu()).detach().numpy()
    print(f"Probs: {probs}")
    print(f"Label: {model.config.id2label[probs.argmax(axis=-1)]}")

In [22]:
predict(0)

Text_1: Цены на нефть восстанавливаются
Text_2: Парламент Словакии поблагодарил народы бывшего СССР за Победу
Class: -1
-------------------------------------------
Probs: [0.2506157  0.26115435 0.48823002]
Label: class_1


In [23]:
predict(5)

Text_1: Вертолет с 11 иностранцами на борту упал в Пакистане
Text_2: В Пакистане упал вертолет с 11 иностранцами
Class: 1
-------------------------------------------
Probs: [0.19551143 0.2549332  0.5495553 ]
Label: class_1


In [24]:
batch_size = 8
metric_name = "f1"

In [25]:
small_train_dataset = encoded_dataset["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = encoded_dataset["test"].shuffle(seed=42).select(range(200))

In [26]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    #sigmoid = torch.nn.Sigmoid()
    #probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    #y_pred = np.zeros(probs.shape)
    #y_pred[np.where(probs >= threshold)] = 1
    
    # first, apply softmax on predictions which are of shape (batch_size, num_labels)
    softmax = torch.nn.Softmax(dim=-1)
    probs = softmax(torch.Tensor(predictions))
    #print(f'probs:\n{probs}')
    #print(f'probs.argmax:\n{probs.argmax(axis=-1)}')

    # next, use argmax to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    for i in range(len(y_pred)):
        y_pred[i, probs[i].argmax(axis=-1)] = 1
    #print(f'y_pred:\n{y_pred}')
    
    # finally, compute metrics
    y_true = labels
    #print(f'y_true:\n{y_true}')

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [27]:
args = TrainingArguments(
    f"bert-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

In [28]:
trainer = Trainer(
    model,
    args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [29]:
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 625


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.646491,0.465,0.59875,0.465
2,No log,0.564519,0.495,0.62125,0.495
3,No log,0.577715,0.54,0.655,0.54


***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to bert-finetuned/checkpoint-125
Configuration saved in bert-finetuned/checkpoint-125/config.json
Model weights saved in bert-finetuned/checkpoint-125/pytorch_model.bin
tokenizer config file saved in bert-finetuned/checkpoint-125/tokenizer_config.json
Special tokens file saved in bert-finetuned/checkpoint-125/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to bert-finetuned/checkpoint-250
Configuration saved in bert-finetuned/checkpoint-250/config.json
Model weights saved in bert-finetuned/checkpoint-250/pytorch_model.bin
tokenizer config file saved in bert-finetuned/checkpoint-250/tokenizer_config.json
Special tokens file saved in bert-finetuned/checkpoint-250/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to bert-finetuned/checkpoint-375
Configuration save

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.646491,0.465,0.59875,0.465
2,No log,0.564519,0.495,0.62125,0.495
3,No log,0.577715,0.54,0.655,0.54
4,0.485900,0.652821,0.5,0.625,0.5
5,0.485900,0.635676,0.505,0.62875,0.505


***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to bert-finetuned/checkpoint-500
Configuration saved in bert-finetuned/checkpoint-500/config.json
Model weights saved in bert-finetuned/checkpoint-500/pytorch_model.bin
tokenizer config file saved in bert-finetuned/checkpoint-500/tokenizer_config.json
Special tokens file saved in bert-finetuned/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to bert-finetuned/checkpoint-625
Configuration saved in bert-finetuned/checkpoint-625/config.json
Model weights saved in bert-finetuned/checkpoint-625/pytorch_model.bin
tokenizer config file saved in bert-finetuned/checkpoint-625/tokenizer_config.json
Special tokens file saved in bert-finetuned/checkpoint-625/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from bert-finetuned/checkpoint-375 (sco

TrainOutput(global_step=625, training_loss=0.46155119018554686, metrics={'train_runtime': 7349.7047, 'train_samples_per_second': 0.68, 'train_steps_per_second': 0.085, 'total_flos': 328891772160000.0, 'train_loss': 0.46155119018554686, 'epoch': 5.0})

In [30]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 200
  Batch size = 8


{'epoch': 5.0,
 'eval_accuracy': 0.54,
 'eval_f1': 0.54,
 'eval_loss': 0.5777146816253662,
 'eval_roc_auc': 0.655,
 'eval_runtime': 84.8266,
 'eval_samples_per_second': 2.358,
 'eval_steps_per_second': 0.295}

In [31]:
predict(0)

Text_1: Цены на нефть восстанавливаются
Text_2: Парламент Словакии поблагодарил народы бывшего СССР за Победу
Class: -1
-------------------------------------------
Probs: [0.98150533 0.01278322 0.00571152]
Label: class_-1


In [32]:
predict(5)

Text_1: Вертолет с 11 иностранцами на борту упал в Пакистане
Text_2: В Пакистане упал вертолет с 11 иностранцами
Class: 1
-------------------------------------------
Probs: [0.04002372 0.5592385  0.4007378 ]
Label: class_0
