## Loading the preprocessed dataset (serialized)

In [1]:
import pandas as pd

encoded_dataset = pd.read_pickle("../_4_data_preprocessing/output/serialized_encoded_review_dataset.pkl")

In [5]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 30476
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8127
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 16545
    })
})

In [6]:
labels = [
    'F',
    'BR',
    'AU',
    'FI',
    'IR',
    'A',
    'L',
    'LF',
    'MN',
    'O',
    'PE',
    'SC',
    'SE',
    'US',
    'PO'
]

id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

## Train the model!

We are going to train the model using HuggingFace's Trainer API. This requires us to define 2 things:

* `TrainingArguments`, which specify training hyperparameters. All options can be found in the [docs](https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments). Below, we for example specify that we want to evaluate after every epoch of training, we would like to save the model every epoch, we set the learning rate, the batch size to use for training/evaluation, how many epochs to train for, and so on.
* a `Trainer` object (docs can be found [here](https://huggingface.co/transformers/main_classes/trainer.html#id1)).

In [8]:
batch_size = 8
metric_name = "f1"

In [19]:
%pip install accelerate -U



In [20]:
output_dir = "./Model/BiGru-Bert/"

In [9]:
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction



### Define model architecture with `bidirectional GRU` and `CNN` layers on top of `BERT`

In [None]:
class CustomModel(AutoModelForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.gru = torch.nn.GRU(config.hidden_size,
                                config.hidden_size,
                                bidirectional=True,
                                batch_first=True)
        self.conv1d = torch.nn.Conv1d(config.hidden_size * 2,
                                      config.hidden_size,
                                      kernel_size=3,
                                      padding=1)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None):
        outputs = self.bert(input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )
        pooled_output = outputs.pooler_output
        
        # bidirectional GRU layer
        gru_output, _ = self.gru(outputs.last_hidden_state)
        
        # CNN layer
        cnn_output = torch.relu(self.conv1d(gru_output.transpose(1, 2)).transpose(1, 2))
        cnn_output = torch.max_pool1d(cnn_output, kernel_size=cnn_output.shape[1])
        cnn_output = cnn_output.squeeze(2)
        
        # Concatenate pooled_output with CNN output
        pooled_cnn_output = torch.cat([pooled_output, cnn_output], dim=1)
        pooled_cnn_output = self.dropout(pooled_cnn_output)
        
        # Classifier layer
        logits = self.classifier(pooled_cnn_output)
        loss = None
        if labels is not None:
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.float().view(-1, self.num_labels))
        if return_dict:
            return {"loss": loss, "logits": logits}
        return (loss, logits)

### Define multi-label metrics function

In [None]:
def multi_label_metrics(predictions, labels, threshold=0.5):
    sigmoid = torch.nn.Sigmoid() # igmoid on predictions which are of shape (batch_size, num_labels)
    probs = sigmoid(torch.Tensor(predictions)) # use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    
    # compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average='micro')
    accuracy = accuracy_score(y_true, y_pred)
    
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics # Return as dictionary

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result

## Define model

Here we define a model that includes a pre-trained base (i.e. the weights from bert-base-uncased) are loaded, with a random initialized classification head (linear layer) on top. One should fine-tune this head, together with the pre-trained base on a labeled dataset.

This is also printed by the warning.

We set the `problem_type` to be "multi_label_classification", as this will make sure the appropriate loss function is used (namely [`BCEWithLogitsLoss`](https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html)). We also make sure the output layer has `len(labels)` output neurons, and we set the id2label and label2id mappings.

### Instantiate the custom model

In [None]:
model = CustomModel.from_pretrained("bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

### Define TrainingArguments

In [None]:
args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

### Instantiate Trainer

In [21]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


We are also going to compute metrics while training. For this, we need to define a `compute_metrics` function, that returns a dictionary with the desired metric values.

Let's verify a batch as well as a forward pass:

In [22]:
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [23]:
encoded_dataset['train']['input_ids'][0]

tensor([  101,  1045,  1005,  2310,  2018, 10474,  2005,  1037,  2200,  2146,
         2051,  1998,  1045,  2293,  2009,  1010,  2021,  2144,  1996,  6745,
        10651,  1010,  1045,  2064,  1005,  1056,  2330,  2026, 10439,  4902,
         1012,  1045,  1005,  2310,  2701,  1996, 10439,  1998,  2699,  2128,
        26915,  2075,  2009,  1010,  1045,  1005,  2310, 17159,  1998, 19222,
         9080,  3709,  1996, 10439,  2004,  2092,  1010,  2021,  2009,  2180,
         1005,  1056,  2330,  1012,  3531,  8081,  2023,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

Let's start training!

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.0571,0.05389,0.856227,0.90096,0.805217
2,0.0411,0.052677,0.85888,0.910383,0.823428
3,0.028,0.057845,0.863898,0.917659,0.83844
4,0.0187,0.063232,0.861312,0.918233,0.840532
5,0.012,0.067345,0.859383,0.918299,0.841147


TrainOutput(global_step=19050, training_loss=0.03490537594622514, metrics={'train_runtime': 5347.2063, 'train_samples_per_second': 28.497, 'train_steps_per_second': 3.563, 'total_flos': 1.002438558125568e+16, 'train_loss': 0.03490537594622514, 'epoch': 5.0})

## Evaluate

After training, we evaluate our model on the validation set.

In [25]:
trainer.evaluate()

{'eval_loss': 0.057845212519168854,
 'eval_f1': 0.8638978443643719,
 'eval_roc_auc': 0.9176590937920462,
 'eval_accuracy': 0.8384397686723268,
 'eval_runtime': 292.4018,
 'eval_samples_per_second': 27.794,
 'eval_steps_per_second': 3.475,
 'epoch': 5.0}

## Inference

Let's test the model on a new sentence:

In [26]:
text = "this app is not available for my phone"

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

The logits that come out of the model are of shape (batch_size, num_labels). As we are only forwarding a single sentence through the model, the `batch_size` equals 1. The logits is a tensor that contains the (unnormalized) scores for every individual label.

In [27]:
logits = outputs.logits
logits.shape

torch.Size([1, 15])

To turn them into actual predicted labels, we first apply a sigmoid function independently to every score, such that every score is turned into a number between 0 and 1, that can be interpreted as a "probability" for how certain the model is that a given class belongs to the input text.

Next, we use a threshold (typically, 0.5) to turn every probability into either a 1 (which means, we predict the label for the given example) or a 0 (which means, we don't predict the label for the given example).

In [28]:
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
# turn predicted id's into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

['A']
