In [1]:
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: bmarcin. Use `wandb login --relogin` to force relogin


True

In [5]:
dev_ds = "../data/dev/"
test_ds = "../data/test/"
train_ds = "../data/train/"

notebook_path_prefix = "../models/roberta_lm"

In [6]:
special_tokens = [
    '<url>',
    '<email>',
    '<number>',
    '<date>', 
]

In [7]:
from transformers import RobertaConfig, RobertaTokenizerFast

In [8]:
tokenizer = RobertaTokenizerFast.from_pretrained(notebook_path_prefix, max_len=512, use_fast=True)

In [9]:
tokenizer.add_special_tokens({
    'additional_special_tokens': special_tokens
})

0

In [10]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['<url>', '<email>', '<number>', '<date>']}

In [11]:
import pandas as pd

In [12]:
data1 = pd.read_csv("../data/dev/in.tsv", delimiter='\t', header=None, encoding="utf8", quoting=0)
data2 = pd.read_csv("../data/test/in.tsv", delimiter='\t', header=None, encoding="utf8")
data3 = pd.read_csv("../data/train/in.tsv", delimiter='\t', header=None, encoding="utf8")

In [13]:
labels1 = pd.read_csv("../data/dev/expected.tsv", delimiter='\t', header=None, encoding="utf8", quoting=0)
labels2 = pd.read_csv("../data/test/expected.tsv", delimiter='\t', header=None, encoding="utf8")
labels3 = pd.read_csv("../data/train/expected.tsv", delimiter='\t', header=None, encoding="utf8")

In [14]:
print(len(data1), len(labels1))

5003 5003


In [15]:
print(len(data1), len(labels1))
print(len(data2), len(labels2))
print(len(data3), len(labels3))

5003 5003
10011 10011
85083 85083


In [16]:
import torch

In [17]:
unique_labels = [
    'economy',
    'law',
    'foreign policy',
    'agriculture',
    'environment',
    'social policy',
    'state',
    'public authorities',
    'taxes',
    'transport',
    'science',
    'research and technology',
    'european union',
    'work and employment',
    'health',
    'education',
    'industry',
    'sports'
]

In [18]:
labels = set([label.strip().lower() for label in labels1.iloc[1][0].split(',')]); labels

{'agriculture'}

In [19]:
class ClassificationDS(torch.utils.data.Dataset):
    def __init__(self, input_texts, input_labels, unique_labels, tokenizer):
        self.input_texts = input_texts
        self.input_labels = input_labels
        self.unique_labels = unique_labels
        self.tokenizer = tokenizer
        self.label2idx = {}
        
        for label in self.unique_labels:
            self.label2idx[label] = len(self.label2idx)
            
        print(self.label2idx)
        
    def __len__(self):
        return len(self.input_texts)
        
    def __getitem__(self, idx):
        tokenized = tokenizer(str(self.input_texts.iloc[idx][1]))
        labels = set([self.label2idx[label.strip().lower()] for label in self.input_labels.iloc[idx][0].split(',')])
        
        item = {
            'input_ids': torch.tensor(tokenized['input_ids']),
            'attention_mask': torch.tensor(tokenized['attention_mask']),
            'labels': torch.zeros([len(self.label2idx)]).index_fill_(0, torch.tensor(list(labels)), 1)
        }
        
        return item

In [20]:
dev_ds = ClassificationDS(data1, labels1, unique_labels, tokenizer)

{'economy': 0, 'law': 1, 'foreign policy': 2, 'agriculture': 3, 'environment': 4, 'social policy': 5, 'state': 6, 'public authorities': 7, 'taxes': 8, 'transport': 9, 'science': 10, 'research and technology': 11, 'european union': 12, 'work and employment': 13, 'health': 14, 'education': 15, 'industry': 16, 'sports': 17}


In [21]:
test_ds = ClassificationDS(data2, labels2, unique_labels, tokenizer)

{'economy': 0, 'law': 1, 'foreign policy': 2, 'agriculture': 3, 'environment': 4, 'social policy': 5, 'state': 6, 'public authorities': 7, 'taxes': 8, 'transport': 9, 'science': 10, 'research and technology': 11, 'european union': 12, 'work and employment': 13, 'health': 14, 'education': 15, 'industry': 16, 'sports': 17}


In [22]:
train_ds = ClassificationDS(data3, labels3, unique_labels, tokenizer)

{'economy': 0, 'law': 1, 'foreign policy': 2, 'agriculture': 3, 'environment': 4, 'social policy': 5, 'state': 6, 'public authorities': 7, 'taxes': 8, 'transport': 9, 'science': 10, 'research and technology': 11, 'european union': 12, 'work and employment': 13, 'health': 14, 'education': 15, 'industry': 16, 'sports': 17}


In [35]:
from transformers import RobertaForSequenceClassification, RobertaConfig, RobertaModel, RobertaPreTrainedModel
import torch.nn as nn

In [36]:
num_labels = len(unique_labels)

In [37]:
class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

In [42]:
class RobertaReccurentMemory(RobertaPreTrainedModel):
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.classifier = RobertaClassificationHead(config)

        self.post_init()
    
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

NameError: name 'Optional' is not defined

In [27]:
model = RobertaModel.from_pretrained(notebook_path_prefix, num_labels=num_labels)

Some weights of the model checkpoint at ../models/roberta_lm were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../models/roberta_lm and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = (pred.predictions >= 0.5).astype(int) #.argmax(-1)
    
    # print(labels, preds)
    
    # try:
    acc = accuracy_score(labels, preds)
    # except ValueError:
    
    
    return {
        'accuracy': acc,
        'f1': f1_score(y_true=labels, y_pred=preds, average='weighted'),
        'precision': precision_score(y_true=labels, y_pred=preds, average='weighted'),
        'recall': recall_score(y_true=labels, y_pred=preds, average='weighted')
    }

In [42]:
import numpy as np

In [43]:
from transformers import  Trainer, TrainingArguments

In [44]:
training_args = TrainingArguments(
    output_dir=notebook_path_prefix+"_classification",
    warmup_steps=500,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=17,
    per_device_eval_batch_size=30,
    save_steps=5_000,
    save_total_limit=3,
    do_train=True,
    do_eval=True,
    no_cuda=False,
    logging_steps=700,
    eval_steps=700,
    evaluation_strategy='steps',
    report_to="wandb",
    run_name="roberta-classification"
)

PyTorch: setting up devices


In [45]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_ds,         # training dataset
    eval_dataset=dev_ds,            # evaluation dataset
    compute_metrics=compute_metrics
)

In [46]:
trainer.evaluate(test_ds)

***** Running Evaluation *****
  Num examples = 6858
  Batch size = 30


  _warn_prf(average, modifier, msg_start, len(result))
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


{'eval_loss': 0.7199797034263611,
 'eval_accuracy': 0.00014581510644502772,
 'eval_f1': 0.0002490339082464446,
 'eval_precision': 0.0004896459450760735,
 'eval_recall': 0.00019462826002335538,
 'eval_runtime': 264.9451,
 'eval_samples_per_second': 25.885,
 'eval_steps_per_second': 0.864}

In [47]:
trainer.train()

***** Running training *****
  Num examples = 24003
  Num Epochs = 5
  Instantaneous batch size per device = 17
  Total train batch size (w. parallel, distributed & accumulation) = 17
  Gradient Accumulation steps = 1
  Total optimization steps = 7060


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
700,0.2616,0.104666,0.625547,0.678703,0.81345,0.599293
1400,0.0999,0.071193,0.718868,0.812302,0.915701,0.742358
2100,0.0712,0.060752,0.750948,0.840987,0.918501,0.785402
2800,0.063,0.053062,0.79061,0.8708,0.926771,0.826159
3500,0.0469,0.051354,0.801108,0.88088,0.925592,0.844874
4200,0.0432,0.048688,0.808107,0.887665,0.914598,0.865253
4900,0.0334,0.046776,0.821814,0.89414,0.925298,0.867332
5600,0.0312,0.046743,0.815981,0.892067,0.925757,0.863589
6300,0.0248,0.046144,0.823564,0.896569,0.931438,0.867748
7000,0.0237,0.045806,0.825022,0.89689,0.93083,0.868788


***** Running Evaluation *****
  Num examples = 3429
  Batch size = 30
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 3429
  Batch size = 30
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 3429
  Batch size = 30
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 3429
  Batch size = 30
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 3429
  Batch size = 30
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 3429
  Batch size = 30
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 3429
  Batch size = 30
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to roberta_lm_classification\checkpoint-5000
Configuration saved in roberta_lm_classification\

TrainOutput(global_step=7060, training_loss=0.06947880826320595, metrics={'train_runtime': 7899.7824, 'train_samples_per_second': 15.192, 'train_steps_per_second': 0.894, 'total_flos': 9395897647933440.0, 'train_loss': 0.06947880826320595, 'epoch': 5.0})

In [48]:
trainer.evaluate(test_ds)

***** Running Evaluation *****
  Num examples = 6858
  Batch size = 30


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.05427917465567589,
 'eval_accuracy': 0.794546515018956,
 'eval_f1': 0.8889033891770258,
 'eval_precision': 0.9228076499900962,
 'eval_recall': 0.8590891397430906,
 'eval_runtime': 303.3825,
 'eval_samples_per_second': 22.605,
 'eval_steps_per_second': 0.755,
 'epoch': 5.0}