# CodeBERT Fine-tuning

In [1]:
%%capture
!pip install numpy
!pip install pandas
!pip install torch
!pip install transformers

In [1]:
import os
import numpy as np
import torch
import pandas as pd
import timeit
import math
import json
import torch.optim as optim
import random
from functools import reduce
from enum import Enum
from collections import Counter
from torch.utils.data import ConcatDataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, TensorDataset, SequentialSampler
from torch.nn import Module, Linear, CrossEntropyLoss
from transformers import AutoModel

# Detect whether the notebook is running inside Google Colab: the import below
# succeeds only where the `google.colab` package is available.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    # Only a failed import means "not in Colab". A bare `except` would also
    # swallow unrelated problems (e.g. a KeyboardInterrupt during the import).
    IN_COLAB = False

## **Setup**
* Definition of the dataset path to load the data
* Connection to the GPU (or TPU) made available for free by Google Colab

In [2]:
if not IN_COLAB:
    # Local machine or server: the root folder is two levels above the current
    # working directory. Adapt the path to the position of the dataset.
    d_path = os.path.join(os.getcwd(), "..", "..")
else:
    # Google Colab: mount the personal Google Drive (the mount asks for the
    # authorization to connect the notebook to the account), then point to the
    # project folder. Adapt the path to the position of the dataset in the drive.
    drive.mount('/content/drive')
    d_path = os.path.join("drive", "MyDrive", "Colab Notebooks", "asterix")

In [3]:
# Display the root path selected by the setup cell above
d_path

'/Users/davidemolinelli/Documents/phd/repositories/ASTERIX/Oracle/Implementation/ml-model/src/notebooks/../../oracle'

In [4]:
# Select the device used to run the model: the first GPU when CUDA is available,
# the CPU otherwise.
# When the notebook runs in Google Colab, the GPU is enabled from:
# Runtime --> Change runtime type --> hardware accelerator (gpu or tpu)
if torch.cuda.is_available():
    # Set the gpu as device to perform the training
    DEVICE = torch.device("cuda:0")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    # The call must be inside braces, otherwise the f-string prints the
    # expression text instead of the name of the GPU
    print(f'We will use the GPU: {torch.cuda.get_device_name(0)}')
else:
    # Set the cpu as device to perform the training
    DEVICE = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


In [5]:
# Hyper-parameters
TRAINING_RATIO = 0.8  # fraction of the dataset used for training (the remaining 20% is for validation)
BATCH_SIZE = 16       # number of sequences in each batch
NUM_EPOCHS = 1        # number of epochs to train the model
N_STEPS = 8           # number of steps after which the validation loss and accuracy are computed
LR = 1e-4             # learning rate for the gradient descent

## Data Preprocessing

Loads the dataset from the `dataset` folder located under `d_path`. Drops the `id` column (it is not relevant for training), fills null values with empty strings, and casts every column to an explicit type.

In [12]:
# Read the dataset: each JSON file in <d_path>/dataset contains a portion of it
dataset_dir = os.path.join(d_path, "dataset")
dfs = []

# The file names are sorted because os.listdir returns them in arbitrary order:
# a fixed order keeps the rows of the dataset (and therefore every later seeded
# shuffle) reproducible across runs and machines.
for file_name in sorted(os.listdir(dataset_dir)):
    # Skip the entries that are not JSON files (e.g. hidden files created by the
    # operating system), which pd.read_json would fail to parse
    if not file_name.lower().endswith(".json"):
        continue
    df = pd.read_json(os.path.join(dataset_dir, file_name))
    dfs.append(df)
df_dataset = pd.concat(dfs)
# drop column id (it is not relevant for training the model)
df_dataset = df_dataset.drop(['id'], axis=1)
# map empty cells to empty strings (a new dataframe is created instead of
# mutating in place, so that re-running the cell always gives the same result)
df_dataset = df_dataset.fillna('')
# specify the type of each column in the dataset
df_dataset = df_dataset.astype({
    'label': 'bool',
    'oracleId': 'int64',
    'oracleType': 'string',
    'projectName': 'string',
    'packageName': 'string',
    'className': 'string',
    'javadocTag': 'string',
    'methodJavadoc': 'string',
    'methodSourceCode': 'string',
    'classJavadoc': 'string',
    'classSourceCode': 'string',
    'oracleSoFar': 'string',
    'token': 'string',
    'tokenClass': 'string',
    'tokenInfo': 'string'
})

In [13]:
# Display the shape of the loaded dataset
df_dataset.shape

(4368, 15)

In [7]:
# Input features: delete the oracle ids, the oracle types, and the tgt labels
# from the input dataset
df_src = df_dataset.drop(['oracleId','oracleType','label'], axis=1)
# create a dataframe for the oracle ids
df_oracle_ids = df_dataset[['oracleId']]
# create a dataframe for the target labels, converting the boolean labels to
# integers (False -> 0, True -> 1). An explicit cast is used instead of
# replace({True: 1, False: 0}) because it guarantees an integer dtype, while
# replace leaves the resulting dtype to pandas inference; the labels are later
# used as integer indices to build the one-hot targets.
df_tgt = df_dataset[['label']].astype('int64')

## Tokenization

Loads the `microsoft/codebert-base` tokenizer through `AutoTokenizer`. Tokenizes inputs feature-wise and creates `DataLoader` objects for `input_ids`, `attention_masks`, and `labels`.

In [8]:
%%capture
# Load the pretrained tokenizer identified by "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

In [9]:
#
# The apply function maps each row of the src dataset with multiple columns, to
# a row with a single column containing the concatenation of the strings of each
# original column, using a token as a separator.
#
# For example:
#
#   1. Given the first row of the dataset:
#
#         projectName                                 "commons-collections4-4.1"
#         packageName                          "org.apache.commons.collections4"
#         className                                                    "Equator"
#         javadocTag                "@return whether the two objects are equal."
#         methodJavadoc      "/**\n     * Evaluates the two arguments for th..."
#         methodSourceCode                         "boolean equate(T o1, T o2);"
#         classJavadoc       "/**\n * An equation function, which determines..."
#         oracleSoFar                                                         ""
#         token                                                              "("
#         tokenClass                                        "OpeningParenthesis"
#         tokenInfo                                                           ""
#         notes                                                               ""
#
#   2. The statement gets the values of each column in an array (row.values)
#
#         ["commons-collections4-4.1","org.apache.commons.collections4",...,"",""]
#
#   3. The join method concatenates all the values in the array into a string,
#      using a special token (the *cls_token*) as separator.
#
#         commons-collections4-4.1<s>org.apache.commons.collections4<s>...<s>OpeningParenthesis<s><s>
#
# The result of step (3) represents the content of the unique column of the new
# map row. The process is repeated for each row in the src dataset.
df_src_concat = df_src.apply(lambda row: tokenizer.cls_token.join(row.values), axis=1)
# The pandas dataframe is transformed in a list of strings: each string is a input
# to the model
src_list = df_src_concat.to_numpy().tolist()
# We also transform the oracle ids to a flat list of integers. The ravel() call
# is required because the dataframe has shape (n, 1): without it every id would
# be wrapped in a one-element list, which is not hashable and therefore cannot
# be counted/grouped by oracle id.
oracle_ids_list = df_oracle_ids.to_numpy().ravel().tolist()
# The targets are transformed to one-hot vectors: row i has a 1 in the column
# given by the label of datapoint i, and 0 elsewhere
tgt_numpy = df_tgt.to_numpy().ravel()
# The labels are binary, so there are always at least two classes, even when
# the loaded data happens to contain only negative labels
tgt_classes = max(int(tgt_numpy.max()) + 1, 2)
one_hot_tgt = np.zeros((len(tgt_numpy), tgt_classes))
one_hot_tgt[np.arange(len(tgt_numpy)), tgt_numpy] = 1
tgt_list = one_hot_tgt.tolist()

In [10]:
# Helper enum class: defines the criteria available to sort the whole dataset
# before it is grouped in batches.
class BatchType(str, Enum):
    """Sorting criterion accepted by *DataProcessor.process_dataset*."""
    # Batches with as few datapoints as possible referring to the same oracle
    HETEROGENEOUS = "HETEROGENEOUS"
    # Batches with as many datapoints as possible referring to the same oracle
    OMOGENEOUS = "OMOGENEOUS"
    # Handled by *_sort_dataset_randomly* (presumably a random order - confirm
    # against the implementation)
    RANDOM = "RANDOM"
    # Handled by *_sort_dataset_by_input_length* (presumably ordered by the
    # length of the inputs - confirm against the implementation)
    SORTED_BY_LENGTH = "SORTED_BY_LENGTH"

# Helper enum class: defines the types of dataset to create (training and
# validation)
class DatasetType(str, Enum):
    """Selects which tokenized dataset *DataProcessor.get_tokenized_dataset* returns."""
    # The dataset built from the "b_train_tokenized" batches
    TRAINING = "TRAINING"
    # The dataset built from the "b_val_tokenized" batches
    VALIDATION = "VALIDATION"

## Data Processor

The **DataProcessor** class takes care of pre-processing the raw datasets to create the final datasets, efficiently sorted (with customizable criteria) and tokenized.

The datasets generated by the **DataProcessor** class can be fed to the **Dataset** and **DataLoader** PyTorch classes to easily load the data and prepare the batches to feed the model.

The tokenized datasets are sorted in a way that lets the **DataLoader** generate the final batches by processing the datasets sequentially (using the **SequentialSampler** PyTorch helper class).

Although already tokenized, the datasets contain elements of different sizes (implicitly grouped by the length of the batches, previously defined by the hyperparameter **BATCH_SIZE**): the reason is that, since we use a pretrained Hugging Face model and tokenizer, we cannot rely on our own vocabulary and the standard PyTorch classes (such as **Field**, **TranslationDataset**, and **Iterator**) to efficiently generate the tokenized batches in a transparent way.

The tokenizer that we have to use relies on the codeBERT prebuilt vocabulary, and it tokenizes the whole dataset padding the inputs according to several criteria (for example the maximum length of the input in the dataset). This would mean that even input datapoints with short length would be padded to the length of the maximum one, generating a lot of overhead and heavily reducing the performance of the training and validation processes.

The tokenizer allows truncating long inputs, but given our input datapoints, truncation should probably not be an option. Indeed, by cutting off information, we may be removing important context or information that the model needs to make accurate predictions. This can result in decreased performance and reduced quality of the model's output.

With the **DataProcessor** class we sort the data according to a given criterion (defined when the **DataProcessor** class is instantiated) and we already simulate the generation of batches so that we can tokenize batches of data instead of the whole dataset, reducing the padding to the longest input datapoint within each batch.

The *temporary* simulated batches are then flattened to a sorted list of datapoints so that when they will be processed by the **DataLoader** sequentially, it will build the real tensor batches in the same way.

In [11]:
class DataProcessor:
    '''
    The *DataProcessor* class takes care of pre-processing the raw datasets to create
    the final datasets, efficiently sorted (with customizable criteria) and tokenized.
    The datasets generated by the class can be feed to the *Dataset* and *DataLoader*
    PyTorch classes to easily load the data and prepare the batches to feed the model.

    Parameters
    ----------
    tokenizer : RobertaTokenizerFast
      The instance of tokenizer used to tokenize the input dataset
    src : list[str]
      The list of input datapoints (strings concatenated)
    o_ids : list[int]
      The list of oracle ids that identifies to which oracle each input datapoint refer
    tgt : list[int]
      The list of expected values (0 or 1) for each datapoint in input to the model
    batch_size: int
      The length of each batch of the training and validation dataset
    training_ratio: float
      A value in the interval [0.0,1.0] that represents the percentage of datapoints
      assigned to the training dataset (the remaining datapoints are assigned to
      the validation dataset)

    Attributes
    ------
    tokenizer : RobertaTokenizerFast
      The instance of tokenizer used to tokenize the input dataset
    src : list[str]
      The list of input datapoints (strings concatenated)
    o_ids : list[int]
      The list of oracle ids that identifies to which oracle each input datapoint refer
    tgt : list[int]
      The list of expected values (0 or 1) for each datapoint in input to the model
    batch_size: int
      The length of each batch of the training and validation dataset
    training_ratio: float
      A value in the interval [0.0,1.0] that represents the percentage of datapoints
      assigned to the training dataset (the remaining datapoints are assigned to
      the validation dataset)
    processed_dataset: dict
      Dictionary of the processed dataset:
          - d_sorted: contains the tuples of (input, oracle_id, target) datapoints
            after the original dataset has been sorted according to the selected criteria
          - b_train_tokenized: contains the list of batches for the training dataset,
            after the original dataset has been splitted (according to *training_ratio*
            value), grouped in batches, and tokenized
          - b_val_tokenized: contains the list of batches for the validation dataset,
            after the original dataset has been splitted (according to *training_ratio*
            value), grouped in batches, and tokenized
          - b_train: contains the list of batches for the training dataset, after
            the original dataset has been splitted (according to *training_ratio*
            value) and grouped in batches, but not tokenized yet
          - b_val: contains the list of batches for the validation dataset, after
            the original dataset has been splitted (according to *training_ratio*
            value) and grouped in batches, but not tokenized yet
          - b_short_id: given the number of datapoints in the original dataset,
            the division in batches could be with rest. This means that one of the
            batch will contains less datapoints than the others. Given that we
            shuffle the batches during the dataprocessing, the class have to keep
            track of the short batch so that it can be processed properly, during
            the generation of the training and validation datasets
          - rand_short_id: it is a boolean value that establish if in case of
            a short batch, it must be kept as the last of the batches in the list,
            (rand_short_id set to False) or it can have a random position within
            the list (rand_short_id set to True)
    '''
    def __init__(self, tokenizer, src, o_ids, tgt, batch_size, training_ratio):
        if not (len(src) == len(o_ids) and len(src) == len(tgt)):
            raise Exception("[DataProcessor] the src, o_ids, and tgt lists must have the same length")
        self.tokenizer = tokenizer
        self.src = src
        self.o_ids = o_ids
        self.tgt = tgt
        self.batch_size = batch_size
        self.training_ratio = training_ratio
        self.processed_dataset = {
            "d_sorted": [],
            "b_train_tokenized": [],
            "b_val_tokenized": [],
            "b_train": [],
            "b_val": [],
            "b_short_id": -1,
            "rand_short_id": True
        }

    def get_tokenized_dataset(self, d_type):
        """
        Builds the final (training or validation) dataset from the tokenized
        batches stored in the *processed_dataset* dictionary.

        Parameters
        ----------
        d_type: DatasetType
            The dataset type that the method must return
                - DatasetType.TRAINING for the training dataset
                - DatasetType.VALIDATION for the validation dataset

        Returns
        -------
        t_dataset : ConcatDataset
            The concatenation of one PyTorch TensorDataset per tokenized batch,
            in the same order of the batches. Each TensorDataset wraps the three
            tensor stacks of its batch:
                - the stack of the tokenized input datapoints
                - the stack of the attention masks (aligned index by index with
                  the inputs)
                - the stack of the expected target outputs (aligned index by
                  index with the inputs)

        Raises
        ------
        Exception
            If *d_type* is neither DatasetType.TRAINING nor DatasetType.VALIDATION
        """
        # Select the key of the tokenized batches to read
        if d_type == DatasetType.TRAINING:
            batches_key = "b_train_tokenized"
        elif d_type == DatasetType.VALIDATION:
            batches_key = "b_val_tokenized"
        else:
            raise Exception(f"Unrecognized DataType value: {d_type}")
        tokenized_batches = self.processed_dataset[batches_key]

        # Each tokenized batch is a tuple (inputs, attention_masks, targets),
        # where each element is a stack of n datapoints, with 1<=n<=*BATCH_SIZE*.
        # Every batch is wrapped in its own TensorDataset because the stacks of
        # different batches are padded to different lengths.
        per_batch_datasets = [
            TensorDataset(batch[0], batch[1], batch[2])
            for batch in tokenized_batches
        ]
        # Concatenate the datasets of the batches in a single dataset
        return ConcatDataset(per_batch_datasets)

    def process_dataset(self, batch_type):
        """
        Entry point of the class. It runs the whole pipeline in three steps:
        sorts the original dataset according to *batch_type*, groups the sorted
        datapoints in batches split among the training and validation datasets,
        and tokenizes the batches.

        Parameters
        ----------
        batch_type: BatchType
          The criterion type, according to which sort the input datapoints and generate
          the batches for the training and the validation datasets

        Raises
        ------
        Exception
          If *batch_type* is not one of the BatchType values
        """
        # Association between each criterion and the name of the method that
        # implements it. The method is looked up only for the matching criterion.
        sorters = (
            (BatchType.HETEROGENEOUS, "_sort_dataset_heterogeneously"),
            (BatchType.OMOGENEOUS, "_sort_dataset_omogeneously"),
            (BatchType.RANDOM, "_sort_dataset_randomly"),
            (BatchType.SORTED_BY_LENGTH, "_sort_dataset_by_input_length"),
        )
        # Step 1: sort the dataset with the first criterion matching *batch_type*
        for criterion, sorter_name in sorters:
            if batch_type == criterion:
                getattr(self, sorter_name)()
                break
        else:
            raise Exception("Batch type not recognized.")
        # Step 2: group the sorted dataset in batches, and split the batches in
        # training and validation datasets
        self._generate_train_val_batches()
        # Step 3: tokenize the batches of the training and validation datasets
        self._tokenize_batches()

    def _generate_train_val_batches(self):
        """
        The method splits the datapoints of the sorted dataset, and generate the batches
        for the training and the validation datasets.
        """
        # The number of datapoints that compose the entire dataset
        dp_len = len(self.processed_dataset["d_sorted"])
        # The number of datapoints that have to be assigned to the training dataset
        dp_train_len = math.floor(dp_len * self.training_ratio)
        # The number of datapoints that have to be assigned to the validation dataset
        dp_val_len = dp_len - dp_train_len
        # The total number of batches, given the total number of datapoints and the
        # batch size. The division is rounded to the smallest integer greater than or
        # equal to the result, because if there is a rest, the remaining datapoints
        # have to be grouped in an additional batch
        b_len = math.ceil(dp_len / self.batch_size)
        # The total number of batches, given the number of datapoints assigned to the
        # training dataset and the batch size.
        b_train_len = math.ceil(dp_train_len / self.batch_size)
        # The total number of batches, given the number of datapoints assigned to the
        # validation dataset and the batch size
        b_val_len = math.ceil(dp_val_len / self.batch_size)
        # the whole dataset is grouped in batches
        b_sorted_list = self._map_dataset_to_batches(b_len, dp_len)
        # The batches of the whole dataset are shuffled
        b_ids_shuffled_list = self._shuffle_sorted_batches_ids(b_len)

        # The list of batches that compose the training dataset
        b_train_list = []
        # The list of batches that compose the validation dataset
        b_val_list = []
        # The for loop assigns the *training_ratio* percentage of the shuffled batches
        # to the training dataset, while the remaining (1 - *training_ratio*) percentage
        # to the validation dataset. The number of batches assigned to the training
        # dataset is rounded up (for example if the number of batches is 22, and the
        # training ratio is 0.8 the number of batces assigned to the training is
        # 0.8 * 22 = 17.6 --> 18). The reason is that the number of batches assigned to
        # the training an validation datasets must be an integer. The number of batches
        # assigned to the validation dataset would be equal to (1 - 0.8) * 22 = 4.4 --> 5.
        # In principle, this seems to lead to an inconsistent result because (given
        # the example) the total number of batches would become:
        #
        #   b_train_len + b_val_len = 18 + 5 = 23 > 22
        #
        # But we can consider the fact that the "extra" batch is a batch shared between
        # both the training and the validation datasets. The 80% of its content would
        # be assigned to the training dataset, while the remaining 20% to the validation
        # dataset.
        # Moreover, the algorithm guarantees that the eventual batch splitted among the
        # training and the validation datasets will always be a full batch, composed of
        # BATCH_SIZE datapoints (if the whole dataset is composed of only one single
        # batch, this batch is used instead, even if it is not full).
        for idx, b_id in enumerate(b_ids_shuffled_list[:b_train_len],1):
            if not idx == b_train_len:
                b_train_list.append(b_sorted_list[b_id])
            else:
                # Check if the batch division has rest
                if dp_train_len % self.batch_size == 0:
                    # If the rest is 0 the whole batch is added to the training dataset
                    b_train_list.append(b_sorted_list[b_id])
                else:
                    # If the rest is not 0 the batch is splitted among the training
                    # and the validation datasets
                    rest = dp_train_len % self.batch_size
                    # The *training_ratio* percentage of the full batch is assigned to
                    # the training dataset
                    b_partial_train = b_sorted_list[b_id][:rest]
                    # The remaining datapoints of the full batch is assigned to the
                    # validation dataset
                    b_partial_val = b_sorted_list[b_id][rest:]
                    # Add the partially full batch to the training dataset batch list
                    b_train_list.append(b_partial_train)
                    # Add the remaining part of the batch to the validation dataset
                    # batch list
                    b_val_list.append(b_partial_val)
        # The for loop assigns the remaining batches to the validation dataset
        for b_id in b_ids_shuffled_list[b_train_len:]:
            b_val_list.append(b_sorted_list[b_id])
        # The generated lists of batches are stored in the *processed_dataset* dict
        # of the instance of the class
        self.processed_dataset["b_train"] = b_train_list
        self.processed_dataset["b_val"] = b_val_list

    def _map_dataset_to_batches(self, b_len, dp_len):
        """
        The method maps the datapoints of the sorted dataset into batches.

        Parameters
        ----------
        b_len: int
            The total number of batches within the whole dataset
        dp_len: int
            The total number of datapoints within the whole dataset
        """
        # The list of the whole sorted dataset
        d_sorted = self.processed_dataset["d_sorted"]
        # The rest of the division between the whole number of datapoints
        # and the batch size. The rest gives the number of datapoints in
        # the last batch (if the rest is 0 all the batches are full,
        # otherwise, there is a batch that will contain *rest* datapoints,
        # with rest < BATCH_SIZE
        rest = dp_len % self.batch_size
        # Boolean flag (True if there is a batch not full, False otherwise)
        rest_flag = not rest == 0

        # Check if there is a batch not full and the selected sorting
        # algorithm let's to have the partially full batch in a random
        # position
        if rest_flag and self.processed_dataset["rand_short_id"]:
            # Select the random index of the partially full batch
            short_batch_id = random.randrange(0,b_len)
            # Store the index of the partially full batch
            self.processed_dataset["b_short_id"] = short_batch_id
        else:
            # If the position of the partially full batch cannot be
            # random, get the position that it must have in the
            # list of batches
            short_batch_id = self.processed_dataset["b_short_id"]
        # Initialization of the batch list
        batches = []
        # Pointer of the position in the whole sorted dataset
        pointer = 0
        # The for cycle split the whole dataset into batches of size
        # *BATCH_SIZE*. If the index of the current batch is equal
        # to the index of the partially fulled batch (*short_batch_id*),
        # a batch of size *rest* is generated
        for i in range(b_len):
          new_pointer = pointer
          if rest_flag and i == short_batch_id:
            # Compute the end position of the current batch in the whole
            # dataset, if the current batch is the partially full batch
            # of size *rest*)
            new_pointer += rest
          else:
            # Compute the end position of the current batch in the whole
            # dataset, if the current batch is a full batch of size
            # *BATCH_SIZE*)
            new_pointer += self.batch_size
          # Extract batch datapoints from the whole sorted dataset
          batch = d_sorted[pointer: new_pointer]
          # Update pointer to the position of the whole sorted dataset
          # not yet splitted
          pointer = new_pointer
          # Add the batch to the list
          batches.append(batch)
        # Return the batch list
        return batches

    def _shuffle_sorted_batches_ids(self, b_len,seed=42):
        """
        The method shuffle the list of sorted batches. If there is a
        partially full batch within the list, it is always positioned
        at the end of the list (otherwise the following partition of
        the batches between the training and the validation)

        Parameters
        ----------
        b_len: int
            The total number of batches in the whole dataset
        seed: int
            The seed through which the batch indices are randomly shuffled.
            This parameter let to reproduce the same shuffled batches.
        """
        # If the list has a partially full batch, get the index
        # The value is -1 if there is not a short batch (all the
        # batches in the list are full)
        short_batch_id = self.processed_dataset["b_short_id"]
        # Generate a sorted list of indices
        b_ids = [i for i in range(b_len)]
        # Check if the partially full batch exists
        if not short_batch_id == -1:
            # Remove the index of the short batch
            b_ids.remove(short_batch_id)
        # Set the seed to reproduce the same shuffle in future
        random.seed(seed)
        random.shuffle(b_ids)
        # Check if the partially full batch exists
        if not short_batch_id == -1:
            # Add the index of the short batch at the end of the list
            b_ids.append(short_batch_id)
        # Return the shuffled list of batch indices
        return b_ids

    def _sort_dataset_heterogeneously(self):
        """
        The method sorts the original dataset distributing the datapoints heterogenously.
        This sorting will let the dataloader to produce batches that minimize the
        number of datapoints within each batch, referring to the same oracle (ideally
        only 0 or 1 datapoint referring to the same oracle, for each batch)
        """
        # Let's first create tuples of input, oracle ids and targets.
        # This operation is necessary to maintain the conformity between the inputs,
        # the oracle ids, and the targets that compose the dataset.
        # Otherwise, if we only sort the inputs sentences, without sorting
        # the corresponding oracle ids and targets, we lose the corrispondence between
        # the input, the oracle id, and expected output.
        # Therefore, given the list of the sentences, the list of the oracle ids, and
        # the list of targets:
        #
        #     self.src = [s_1,s_2,...,s_n], where s_i is the input i within the dataset
        #     self.o_ids = [o_1,o_2,...,o_n], where o_i is the oracle id i associated to the input i
        #     self.tgt = [t_1,t_2,...,t_n], where t_i is the label associated to the input i
        #
        # the zip statement produces a list of (s_i, o_i, t_i) tuples:
        #
        #     src_tgt_zip = [(s_1,o_1,t_1),(s_2,o_1,t_2),...,(s_n,o_1,t_n)]
        #
        s_o_t_zip = list(zip(self.src, self.o_ids, self.tgt))

        # Length of the whole dataset
        dp_len = len(self.src)
        # Number of total batches, given the whole dataset
        b_len = math.ceil(dp_len / self.batch_size)
        # Dictionary of the number of occurrences of each oracle id within the whole dataset.
        # The Counter class, given a list, computes the number of occurrences for each different
        # value in the list. For example, given the list
        #
        #       o_ids = [1,2,2,4,3,2,1,3,2,4]
        #
        # The Counter class returns the following dictionary:
        #
        #       occurrences_counter = {
        #           1: 2,
        #           2: 4,
        #           3: 2,
        #           4: 2
        #       }
        #
        occurrences_counter = Counter(self.o_ids)
        # Dictionary of empty lists, one for each oracle id in the dataset. each list, will be
        # filled with the datapoints that refer to the same oracle id. Given the previous
        # example of *o_ids* and *occurrences_counter*, the *occurrence* dictionary will be
        # initialized in this way:
        #
        #       occurrences = {
        #           1: [],
        #           2: [],
        #           3: [],
        #           4: []
        #       }
        #
        occurrences = { k: [] for k in occurrences_counter.keys()}
        # The list of the final sorted tuples (s_i,o_i,t_i)
        s_o_t_zip_sorted = []

        # The for cycle fills the occurrences of tuples of datapoints in the dataset that
        # refers to the same oracle id
        #
        #       occurrences = {
        #           1: [(s_1_1,o_1_1,t_1_1),(s_1_2,o_1_2,t_1_2)]
        #           2: [(s_2_1,o_2_1,t_2_1),...,(s_2_4,o_2_4,t_2_4)],
        #           3: [(s_3_1,o_3_1,t_3_1),(s_3_2,o_3_2,t_3_2)],
        #           4: [(s_4_1,o_4_1,t_4_1),(s_4_2,o_4_2,t_4_2)]
        #       }
        #
        for input, oracle_id, target in s_o_t_zip:
            occurrences[oracle_id].append((input,oracle_id,target))

        # The for cycle sorts the dataset simulating the filling of the batches of the
        # whole dataset, trying to create heterogeneous batches
        for i in range(b_len):
            #  The list of distinct oracle ids still available
            o_avail = list(occurrences_counter.keys())
            # Fills the batch of *BATCH_SIZE* length
            for j in range(self.batch_size):
                #  Check id the list of distinct oracle ids still available is empty
                if len(o_rand) == 0:
                    # If the list is empty try to refill it with the distinct oracle ids
                    # still available in the counter dictionary
                    o_avail = list(occurrences_counter.keys())
                    # Check if the list is empty even after the re-filling
                    if o_avail == 0:
                        # If the list is empty stops the algorithm
                        break
                o_id_rand = random.choice(o_avail)
                o_rand = random.choice(occurrences[o_id_rand])
                o_avail.remove(o_id_rand)
                occurrences[o_id_rand].remove(o_rand)
                occurrences_counter[o_rand] -= 1
                if occurrences_counter[o_rand] == 0:
                  assert len(occurrences[o_id_rand]) == 0, "Occurences should be empty."
                  del occurrences_counter[o_id_rand]
                s_o_t_zip_sorted.append(o_rand)
        self.processed_dataset["d_sorted"] = s_o_t_zip_sorted
        self.processed_dataset["b_short_id"]: b_len - 1
        self.processed_dataset["rand_short_id"]: False

    def _sort_dataset_omogeneously(self):
        """
        The method sorts the dataset in a way that let the dataloader to produce
        batches with the highest number of datapoints referring to the same oracle
        """
        # Let's first create tuples of input, oracle ids and targets.
        # This operation is necessary to maintain the conformity between the inputs,
        # the oracle ids, and the targets that compose the dataset.
        # Otherwise, if we only sort the inputs sentences, without sorting
        # the corresponding oracle ids and targets, we lose the corrispondence between
        # the input, the oracle id, and expected output.
        # Therefore, given the list of the sentences, the list of the oracle ids, and
        # the list of targets:
        #
        #     self.src = [s_1,s_2,...,s_n], where s_i is the input i within the dataset
        #     self.o_ids = [o_1,o_2,...,o_n], where o_i is the oracle id i associated to the input i
        #     self.tgt = [t_1,t_2,...,t_n], where t_i is the label associated to the input i
        #
        # the zip statement produces a list of (s_i, o_i, t_i) tuples:
        #
        #     src_tgt_zip = [(s_1,o_1,t_1),(s_2,o_1,t_2),...,(s_n,o_1,t_n)]
        #
        s_o_t_zip = list(zip(self.src, self.o_ids, self.tgt))
        # The tuples are sorted using as key the second element of each tuple, which means
        # the oracle id of each input.
        s_o_t_zip_sorted = sorted(s_o_t_zip, key=lambda t: t[1])
        self.processed_dataset["d_sorted"] = s_o_t_zip_sorted

    def _sort_dataset_randomly(self, seed=42):
        """
        The method shuffle the original dataset in a random way

        Parameters
        ----------
        seed: int
          The seed through which the datapoints of the dataset are randomly shuffled.
          This parameter let to reproduce the same shuffled batches.
        """
        # The batches are randomly sampled from the dataset
        s_o_t_zip = list(zip(self.src, self.o_ids, self.tgt))
        # Set the seed to reproduce the same shuffle in future
        random.seed(seed)
        # Shuffle the datasets tuples randomly
        random.shuffle(s_o_t_zip)
        self.processed_dataset["d_sorted"] = s_o_t_zip

    def _sort_dataset_by_input_length(self):
        """
        The method sorts the dataset in a way that let the dataloader to produce
        batches of data composed of inputs with similar length. In this way, when
        the class will tokenize and embed the batches, it will reduce the padding in each
        batch, improving the performance of the model training (padding is used to
        create sentences with the same length, but they do not contribute to the
        training of the model and they introduce overhead)
        """
        # The method sort the dataset in a way that let the dataloader to produce
        # batches of data composed of inputs with similar length. In this way, when
        # we will tokenize and embed the batches, we will reduce the padding in each
        # batch, improving the performance of the model training (padding is used to
        # create sentences with the same length, but they do not contribute to the
        # training of the model and they introduce overhead).
        #
        # Let's first create tuples of input, oracle ids and targets.
        # This operation is necessary to maintain the conformity between the inputs,
        # the oracle ids, and the targets that compose the dataset.
        # Otherwise, if we only sort the inputs sentences, without sorting
        # the corresponding oracle ids and targets, we lose the corrispondence between
        # the input, the oracle id, and expected output.
        # Therefore, given the list of the sentences, the list of the oracle ids, and
        # the list of targets:
        #
        #     self.src = [s_1,s_2,...,s_n], where s_i is the input i within the dataset
        #     self.o_ids = [o_1,o_2,...,o_n], where o_i is the oracle id i associated to the input i
        #     self.tgt = [t_1,t_2,...,t_n], where t_i is the label associated to the input i
        #
        # the zip statement produces a list of (s_i, o_i, t_i) tuples:
        #
        #     src_tgt_zip = [(s_1,o_1,t_1),(s_2,o_1,t_2),...,(s_n,o_1,t_n)]
        #
        s_o_t_zip = list(zip(self.src, self.o_ids, self.tgt))
        # The tuples are sorted using as key the first element of each tuple, which means
        # the oracle id of each input.
        s_o_t_zip_sorted = sorted(s_o_t_zip, key=lambda t: t[0])
        self.processed_dataset["d_sorted"] = s_o_t_zip_sorted

    def _tokenize_batches(self):
        """
        Tokenizes the input datapoints of every batch of the training and
        validation datasets.

        The batches are read from *self.processed_dataset["b_train"]* and
        *self.processed_dataset["b_val"]*. For each batch, a tuple
        (input ids, attention masks, targets) of tensors is appended to
        *self.processed_dataset["b_train_tokenized"]* or
        *self.processed_dataset["b_val_tokenized"]*, respectively.

        Raises
        ------
        Exception
            If *DatasetType* has a member that is neither TRAINING nor VALIDATION.
        """
        # The batch lists of the training and of the validation dataset
        train_batches = self.processed_dataset["b_train"]
        val_batches = self.processed_dataset["b_val"]
        for d_type in DatasetType:
            # Select the batches to tokenize and the list that collects the result
            if d_type == DatasetType.TRAINING:
                batches, tokenized_key = train_batches, "b_train_tokenized"
            elif d_type == DatasetType.VALIDATION:
                batches, tokenized_key = val_batches, "b_val_tokenized"
            else:
                raise Exception(f"Unrecognized DataType value: {d_type}")
            for batch in batches:
                # Each element of the batch is an (input, oracle id, target) tuple:
                # keep the inputs and the corresponding targets
                b_inputs = [datapoint[0] for datapoint in batch]
                b_targets = [datapoint[2] for datapoint in batch]
                # Length of the longest input datapoint within the batch (0 if the
                # batch is empty).
                # NOTE(review): this is len() of the raw input, presumably a
                # character count, used as the token budget - confirm this is intended.
                max_len = max(map(len, b_inputs), default=0)
                # Tokenize the inputs of the batch. The tokenizer returns a
                # dictionary with (at least) two keys:
                #
                #   encoding = {
                #       "input_ids": [[t_i_1_1,...,t_i_1_n],...,[t_i_k_1,...,t_i_k_n]],
                #       "attention_mask": [[m_1_1,...,m_1_n],...,[m_k_1,...,m_k_n]]
                #   }
                #
                # where t_i_x_y is the y-th token of the input datapoint x, and
                # m_x_y states if the token y is a real token or a padding token.
                # Every input is padded (or truncated) to *max_len* tokens.
                encoding = self.tokenizer(
                    b_inputs,
                    max_length=max_len,
                    padding='max_length',
                    truncation=True
                )
                # Stack the per-input lists into 2D tensors (one row per input)
                t_inputs = torch.stack(
                    [torch.tensor(ids) for ids in encoding['input_ids']]
                )
                t_attention_masks = torch.stack(
                    [torch.tensor(mask) for mask in encoding['attention_mask']]
                )
                # Transform the targets into a tensor
                targets_tensor = torch.tensor(b_targets)
                # Add the tuple representing the tokenized batch to the list of
                # the dataset it belongs to
                self.processed_dataset[tokenized_key].append(
                    (t_inputs, t_attention_masks, targets_tensor)
                )

In [12]:
# Create the DataProcessor instance from the tokenizer, the inputs, the oracle
# ids, the targets, the batch size, and the training ratio
data_processor = DataProcessor(tokenizer,src_list,oracle_ids_list,tgt_list,BATCH_SIZE,TRAINING_RATIO)
# Process the data, using the BatchType.RANDOM strategy
data_processor.process_dataset(BatchType.RANDOM)
# Get the tokenized training and validation datasets
train_dataset = data_processor.get_tokenized_dataset(DatasetType.TRAINING)
val_dataset = data_processor.get_tokenized_dataset(DatasetType.VALIDATION)


## Dataloader

DataLoader is a PyTorch class that takes care of shuffling/sampling/weighted
sampling, batching, and using multiprocessing to load the data, in an efficient
and transparent way.
We define a dataloader for both the training and the validation dataset.

The dataloader generates the real batches of datapoints that we will use to
feed the model.

We use a helper PyTorch class, **SequentialSampler**, to create the batches by
selecting the datapoints sequentially from the training and validation datasets.
Indeed, we used the **DataProcessor** class to sort the dataset in a specific way,
simulating the creation of batches of data before the **DataLoader**, minimizing
the padding (in the case of *BatchType.HOMOGENEOUS*) or maximizing the
diversity within the dataset (in the case of *BatchType.HETEROGENEOUS*). The
use of the **SequentialSampler** guarantees that this criterion is maintained
during the creation of the batches.


In [13]:
# Creation of the training and validation dataloaders.
# Both dataloaders are built in the same way: the datapoints are read
# sequentially (SequentialSampler), so the order produced by the DataProcessor
# is preserved, and they are grouped in batches of BATCH_SIZE elements.
dl_train, dl_val = (
    DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=BATCH_SIZE)
    for dataset in (train_dataset, val_dataset)
)

## Model
The **OracleClassifier** represents our fine-tuned model.

The architecture of the model is composed of:

1. The pre-trained codeBERT Transformer model
2. A fully-connected layer that takes as input the output of the codeBERT model (which represents our hidden state) and maps this vector to a vector of two elements (representing our 0 and 1 scores)
3. The softmax activation function that transforms the output vector of the fully-connected layer into a vector of n probabilities, given the n classes of the classification task (in our case, 2). The softmax activation function is computed implicitly, during the training phase, by the loss function (the PyTorch **CrossEntropy** class, in our case). Therefore, the softmax is not a visible layer of the model.

In [14]:
class OracleClassifier(Module):
    """
    The model is composed of:
        1.  The pretrained codeBERT model
        2.  A fully connected layer that takes as input the output of the
            codeBERT model (which represents our hidden state) and maps this
            vector to a vector of two elements (representing our 0 and 1 scores).
    N.B.: The output vector of our model is not normalized, therefore each
          element does not represent the probability that the input belongs
          to class 0 or 1 by itself. The normalization is performed by the
          cross entropy loss function, which performs the softmax.

    Parameters
    ----------
    max_input_len: int
        The length of the longest input in the whole dataset
    device:
        Currently unused. The parameter is kept for backward compatibility:
        the model is moved to a device by the caller, with *model.to(...)*.
    """
    def __init__(self, max_input_len=512, device=None):
        super(OracleClassifier, self).__init__()
        # First layer of the model: the pre-trained codeBERT model
        self.codebert_transformer = AutoModel.from_pretrained("microsoft/codebert-base")
        config = self.codebert_transformer.config
        embeddings = self.codebert_transformer.base_model.embeddings
        # Setup of the pre-trained model layer, to accept inputs of *max_input_len* tokens
        config.max_position_embeddings = max_input_len
        embeddings.position_ids = torch.arange(max_input_len).expand((1, -1))
        embeddings.token_type_ids = torch.zeros(max_input_len).expand((1, -1)).int()
        # Extend the table of the positional embeddings by tiling the pre-trained
        # table. The table is always at least doubled (as in the previous
        # implementation, so that existing checkpoints keep the same shape), and
        # it is tiled further when *max_input_len* would not fit otherwise.
        # NOTE(review): the required number of rows assumes that the embedding
        # layer offsets the position ids by pad_token_id + 1 (RoBERTa-style) -
        # confirm against the installed transformers version.
        orig_pos_emb = embeddings.position_embeddings.weight
        pad_token_id = config.pad_token_id if config.pad_token_id is not None else 0
        required_rows = max_input_len + pad_token_id + 1
        repeats = max(2, math.ceil(required_rows / orig_pos_emb.size(0)))
        embeddings.position_embeddings.weight = torch.nn.Parameter(
            torch.cat((orig_pos_emb,) * repeats)
        )

        # The size of the output of the pre-trained codeBERT model. It is
        # necessary to understand the size of the next fully-connected layer
        # (the output vector of the pre-trained codeBERT model represents the
        # input vector of the fully-connected layer)
        hidden_size = config.to_dict()['hidden_size']
        # Second layer of the model: the fully-connected layer.
        # The size of the input is equal to the dimension of the output vector
        # of the pre-trained codeBERT model, while the size of the output is
        # a vector of two elements (the two classes of our classifier)
        self.linear = Linear(hidden_size, 2)

    def forward(self, input_ids, input_masks):
        """
        The method feeds the model with the stack of inputs and the attention
        masks. It returns the stack of output vectors of the last fully-connected
        layer. Each output vector of the stack represents the non-normalized
        scores that the corresponding input belongs to each class of the
        classification task.

        For example:

            input_ids = tensor([
                            [t_i_1_1,...,t_i_1_n],
                                     ...
                            [t_i_k_1,...,t_i_k_n]
                        ])

        where each row of the stack is a tokenized input (t_i_1_1 is the first
        token of the first input of the batch).

            input_masks = tensor([
                            [m_1_1,...,m_1_n],
                                     ...
                            [m_k_1,...,m_k_n]
                          ])

        where each row of the stack is the corresponding attention mask of the
        input with the same index in the input_ids stack.

        The output is a stack of the form:

            output = tensor([
                        [p_1,p_2],
                           ...
                        [p_1,p_2]
                     ])

        where each row of the stack is the corresponding output of the input with
        the same index in the input_ids stack. The row is a vector composed of two
        elements (in our case) and p_1 and p_2 represent the non-normalized
        scores that the input belongs respectively to the first and the second
        class of the classifier (the softmax computed in the loss function,
        during the training phase, transforms p_1 and p_2 into normalized values,
        i.e. real probability values whose sum is 1).

        Parameters
        ----------
        input_ids: Tensor
            The tensor stack of the inputs within the batch passed to the model
            in the training or validation phase
        input_masks: Tensor
            The tensor stack of the attention masks within the batch passed to
            the model in the training or validation phase

        Returns
        -------
        output: Tensor
            The tensor stack of the outputs of the model
        """
        # The attention masks are passed as second positional argument of the
        # pre-trained model
        output = self.codebert_transformer(input_ids, input_masks)
        # Pooled representation of each input, mapped to the two class scores
        output = output.pooler_output
        output = self.linear(output)
        return output

In [15]:
%%capture
# Maximum length of the input datapoints, within the whole dataset (0 if the
# dataset is empty). It lets us guarantee that the model is able to process
# inputs of this length. The +2 accounts for the start token and the end token
# that are added to each input of the model.
max_input_len = max(map(len, src_list), default=0) + 2
# Creation of the instance of the model
model = OracleClassifier(max_input_len)
# The model is loaded on the gpu (or on the cpu, if the gpu is not available)
model.to(DEVICE)

## Training

The **OracleTrainer** class is a helper class that is used to perform the training
and the validation phases of the model. During the training phase, the model uses the
batches of data to compute the loss and update the weights to improve the accuracy of
the predictions. Instead, in the validation phase the trainer uses batches of the
validation dataset to evaluate how well the model is able to generalize on unseen data.
During the validation phase the weights of the model are not updated.

In [16]:
class OracleTrainer():
    """
    The *OracleTrainer* class is a helper class that, given the loss
    function, the optimizer, the model, and the training and validation
    dataloaders, performs the training of the model, computes the loss and
    the accuracy of the training and validation phases, collects the
    statistics of the training, and has auxiliary methods to print and plot
    the trend of the training and the validation.

    The class reads two names from the notebook scope: *DEVICE* (the device
    the batches are moved to) and *N_STEPS* (the number of training steps
    between two validation phases).

    Parameters
    ----------
    model: OracleClassifier
        The model to train
    loss_fn:
        The loss function to compute the loss, during the training and
        validation phases
    optimizer:
        The optimizer used to update the weights of the model
    dl_train: DataLoader
        The training dataloader which contains the batches of datapoints
        for the training phase
    dl_val: DataLoader
        The validation dataloader which contains the batches of datapoints
        for the validation phase
    """
    def __init__(self, model, loss_fn, optimizer, dl_train, dl_val):
        self.model = model
        self.dl_train = dl_train
        self.dl_val = dl_val
        self.loss_fn = loss_fn
        self.optimizer = optimizer

    def train(self, num_epochs, break_end = 99.9, best_time = math.inf):
        """
        The method performs the training and validation phases of the model.

        Parameters
        ----------
        num_epochs: int
            The number of epochs to train the model
        break_end:
            The validation accuracy threshold (percentage) to stop the training
        best_time:
            Time budget in seconds: the training stops when the time elapsed
            from the beginning of the current epoch exceeds it (or exceeds
            6000 seconds)

        Returns
        -------
        stats: dict
            The statistics of the training. The keys 't_loss', 'v_loss',
            't_accuracy' and 'v_accuracy' map to lists with one value every
            *N_STEPS* training steps. The keys 'time_90' and 'time_100' are
            present only if the corresponding breakpoints have been reached.
        """
        print("Start Training...")

        # Dictionary of the statistics
        stats = {
            't_loss': [],
            'v_loss': [],
            't_accuracy': [],
            'v_accuracy': []
        }
        # Global step counter: it is NOT reset at the beginning of each epoch
        steps = 0
        accumulation_steps = 8
        flag_90 = False
        flag_end = False
        time_over = False

        # In each epoch the trainer trains the model batch by batch,
        # with all the batches of the training dataset. The gradients are
        # accumulated and, after a given number of *accumulation_steps*,
        # the trainer updates the weights accordingly.
        # Moreover, every *N_STEPS* steps it computes the average accuracy
        # and loss of the training, and performs the validation to
        # understand how well the model generalizes on the validation data.
        for epoch in range(1, num_epochs + 1):
            total_loss = 0
            total_accuracy = 0
            trained_total = 0
            predicted_correct = 0

            # The elapsed time is measured from the beginning of the epoch
            start = timeit.default_timer()

            # model in training mode
            self.model.train()
            self.optimizer.zero_grad()

            for step, batch in enumerate(self.dl_train):
                print(f"processing step {step+1} of {len(self.dl_train)}")
                steps += 1

                # Extract the inputs, the attention masks and the expected
                # outputs from the batch
                src_input = batch[0].to(DEVICE)
                masks_input = batch[1].to(DEVICE)
                tgt_out = batch[2].to(DEVICE)

                # Train the model
                outputs = self.model(src_input, masks_input)

                # Compute the loss and accumulate the gradients
                loss = self.loss_fn(outputs, tgt_out)
                loss.backward()

                # Extract the predicted classes and the expected classes
                # (index of the highest score of each row)
                with torch.no_grad():
                    _, predicted = outputs.max(1)
                    _, expected_out = tgt_out.max(1)
                # Update the counters of the accuracy, given the predictions
                # of the batch
                trained_total += tgt_out.size(0)
                predicted_correct += (predicted == expected_out).sum().item()

                if (steps % accumulation_steps) == 0:
                    # Update the weights of the model
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                    # Update the total loss (loss of the last batch of the
                    # accumulation window)
                    total_loss += loss.item()
                    # Accumulate the accuracy of the model within the
                    # accumulation window: the sum is averaged over the
                    # number of windows when the statistics are computed
                    total_accuracy += 100 * predicted_correct / trained_total
                    # Reset the counters for the accuracy
                    trained_total = 0
                    predicted_correct = 0

                if (steps % N_STEPS) == 0:
                    # Compute average statistics for the loss and the accuracy
                    mean_t_loss = total_loss / (N_STEPS / accumulation_steps)
                    mean_t_accuracy = total_accuracy / (N_STEPS / accumulation_steps)

                    # Validation phase (the model is back in training mode
                    # when the method returns)
                    mean_v_loss, mean_v_accuracy = self.validation()

                    # Update the statistics
                    stats['t_loss'].append(mean_t_loss)
                    stats['t_accuracy'].append(mean_t_accuracy)
                    stats['v_loss'].append(mean_v_loss)
                    stats['v_accuracy'].append(mean_v_accuracy)

                    # Print the statistics
                    self.print_stats(
                        epoch,
                        num_epochs,
                        step + 1,
                        len(self.dl_train),
                        mean_t_loss,
                        mean_t_accuracy,
                        mean_v_loss,
                        mean_v_accuracy
                    )

                    # Reset the total loss and accuracy
                    total_loss = 0
                    total_accuracy = 0
                    interval = timeit.default_timer()

                    # Breakpoint validation accuracy (reported only once)
                    if mean_v_accuracy > 90 and (not flag_90):
                      flag_90 = not flag_90
                      print('-'*30)
                      print("BREAKPOINT 90% VALIDATION ACCURACY")
                      print('-'*30)
                      print(f"TIME: {int(interval - start)}seconds")
                      print('-'*30)
                      stats['time_90'] = int(interval - start)

                    # Breakpoint validation accuracy - stop the
                    # training to avoid overfitting
                    if mean_v_accuracy > break_end:
                      flag_end = not flag_end
                      interval = timeit.default_timer()
                      print('-'*30)
                      print("BREAKPOINT 100% VALIDATION ACCURACY")
                      print('-'*30)
                      print(f"TIME: {int(interval - start)} seconds")
                      print('-'*30)
                      print('-'*30)
                      print(f"FINAL SAMPLES")
                      print('-'*30)
                      stats['time_100'] = int(interval - start)
                      break

                    # If time is over, stop the training
                    if int(interval - start) > best_time or int(interval - start) > 6000:
                      time_over = True
                      stats['time_100'] = math.inf
                      break
            # Propagate the stop from the loop over the batches to the loop
            # over the epochs
            if flag_end or time_over:
                break
        return stats

    def print_stats(self,epoch,num_epochs, step, total_steps, mean_t_loss, mean_t_accuracy, mean_v_loss, mean_v_accuracy):
        """
        The method prints the statistics of the training and validation phases

        Parameters
        ----------
        epoch: int
            The current epoch of the training
        num_epochs: int
            The total number of epochs
        step: int
            The current step of the training phase, in the current epoch
        total_steps: int
            The total number of steps within an epoch
        mean_t_loss: float
            Average training loss
        mean_t_accuracy: float
            Average training accuracy
        mean_v_loss: float
            Average validation loss
        mean_v_accuracy:
            Average validation accuracy
        """
        print('-'*30)
        print("STATISTICS")
        print('-'*30)
        print(f"EPOCH: [{epoch} / {num_epochs}]")
        print(f"STEP: [{step} / {total_steps}]")
        print(f"TRAINING LOSS: {(mean_t_loss):.4f}")
        print(f"TRAINING ACCURACY: {(mean_t_accuracy):.2f}%")
        print('-'*30)
        print(f"VALIDATION LOSS: {(mean_v_loss):.4f}")
        print(f"VALIDATION ACCURACY: {(mean_v_accuracy):.2f}%")
        print('-'*30)

    def plot_loss_accuracy(self,steps, ax, stats):
        """
        The method plots the trend of the loss and the accuracy over the
        training steps, on a 3x2 grid of axes: the first row contains the
        training and validation loss, the second row the training and
        validation accuracy, and the third row the comparison between training
        and validation for the loss and for the accuracy.

        Parameters
        ----------
        steps: int
            The number of steps (currently unused, kept for backward
            compatibility)
        ax:
            The axes of the matplotlib figure, indexed as ax[row][column]
        stats: dict
            The dictionary of the statistics
        """
        # One point every N_STEPS training steps
        x_values = range(0, len(stats['t_loss']) * N_STEPS, N_STEPS)
        for i in range(2):
            metric = 'Loss' if i == 0 else 'Accuracy'
            for j in range(2):
                title = ('Training ' if j == 0 else 'Validation ') + metric
                dict_label = ('t_' if j == 0 else 'v_') + metric.lower()
                ax[i][j].set_title(title, fontsize=30)
                ax[i][j].set_xlabel("steps", fontsize=30)
                # The y axis is labeled with the plotted metric
                ax[i][j].set_ylabel(metric.lower(), fontsize=30)
                ax[i][j].plot(x_values, np.array(stats[dict_label])[:], '-', color='blue')
        for i in range(2):
            metric = 'Loss' if i == 0 else 'Accuracy'
            dict_label = metric.lower()
            ax[2][i].set_title('Training and Validation ' + metric, fontsize=30)
            ax[2][i].set_xlabel("steps", fontsize=30)
            ax[2][i].set_ylabel(dict_label, fontsize=30)
            ax[2][i].plot(x_values, np.array(stats['t_' + dict_label])[:], '-', color='blue')
            ax[2][i].plot(x_values, np.array(stats['v_' + dict_label])[:], '-', color='orange')

    def validation(self):
        """
        The method computes the validation phase, over all the batches of the
        validation dataloader. The model is put in evaluation mode during the
        validation, and its previous mode (training or evaluation) is restored
        before returning, so that a validation performed in the middle of an
        epoch does not leave the model in evaluation mode for the remaining
        training steps.

        Returns
        -------
        mean_v_loss: float
            Average loss of the validation phase (mean of the losses of the
            batches)
        mean_v_accuracy:
            Accuracy of the validation phase: percentage of datapoints of
            the whole validation dataset that are correctly classified

        Raises
        ------
        ZeroDivisionError
            If the validation dataloader is empty
        """
        # Remember the current mode of the model, to restore it at the end
        was_training = self.model.training
        # model in evaluation mode
        self.model.eval()

        total_loss = 0
        samples_total = 0
        predicted_total = 0
        try:
            # The validation phase is performed without tracking the
            # gradients and without updating the weights of the model
            with torch.no_grad():
                for batch in self.dl_val:
                    # Extract the inputs, the attention masks and the
                    # targets from the batch
                    src_input = batch[0].to(DEVICE)
                    masks_input = batch[1].to(DEVICE)
                    tgt_out = batch[2].to(DEVICE)
                    # Feed the model
                    outputs = self.model(src_input, masks_input)
                    # Compute the loss
                    loss = self.loss_fn(outputs, tgt_out)
                    total_loss += loss.item()
                    # Extract the predicted classes and the expected classes
                    _, predicted = outputs.max(1)
                    _, expected_out = tgt_out.max(1)
                    # Update the counters of the accuracy, given the
                    # predictions of the batch
                    samples_total += tgt_out.size(0)
                    predicted_total += (predicted == expected_out).sum().item()
        finally:
            # Restore the mode the model had before the validation
            self.model.train(was_training)
        # Compute the average validation loss
        mean_v_loss = total_loss / len(self.dl_val)
        # Compute the validation accuracy over all the validation datapoints
        mean_v_accuracy = 100 * predicted_total / samples_total
        return mean_v_loss, mean_v_accuracy

In [17]:
# The cross-entropy loss function is commonly used for classification tasks
loss_fn = CrossEntropyLoss()
# Adam optimizer, with the learning rate set to the value of the LR hyperparameter
optimizer = optim.Adam(model.parameters(), lr=LR)
# Instantiation of the trainer, with the model, the loss function, the
# optimizer, and the training and validation dataloaders
oracle_trainer = OracleTrainer(
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    dl_train=dl_train,
    dl_val=dl_val
)

In [18]:
stats = {}

try:
    # Train the model
    stats = oracle_trainer.train(NUM_EPOCHS)
except RuntimeError:
    # Free the cached GPU memory before propagating the error
    print("Runtime Exception...")
    torch.cuda.empty_cache()
    raise

# Directory where the statistics of the training are saved: it is created if
# it does not exist yet
output_dir = os.path.join(d_path, "output")
os.makedirs(output_dir, exist_ok=True)
# Save the statistics in json format. The "w" mode is an argument of open(),
# not a component of the path. The file is closed by the with statement.
stats_path = os.path.join(output_dir, f"loss_accuracy_{BATCH_SIZE}_{LR}_{NUM_EPOCHS}.json")
with open(stats_path, "w") as loss_file:
    data = {
        **stats,
        "batch_size": BATCH_SIZE,
        "lr": LR,
        "num_epochs": NUM_EPOCHS
    }
    json.dump(data, loss_file)

Start Training...
processing step 1 of 110
processing step 2 of 110
processing step 3 of 110
processing step 4 of 110
processing step 5 of 110
processing step 6 of 110



KeyboardInterrupt



## Save the statistics and the trained model

Save the statistics for future analysis, and the trained model for future use or improvements.
By saving the model, we save the values of all its weights. In other words, we create a snapshot of
the state of the model after the training.

In [None]:
# Save only the weights of the model (its state dictionary) in the output directory
checkpoint_path = os.path.join(output_dir, "tratto_model.pt")
torch.save(model.state_dict(), checkpoint_path)

## Load the model

Commands to load the model for future use.

In [None]:
# Load the saved weights. map_location remaps the tensors to the current DEVICE,
# so that a checkpoint saved on a gpu can also be loaded on a machine without gpu
state_dict = torch.load(os.path.join(output_dir, "tratto_model.pt"), map_location=DEVICE)
model.load_state_dict(state_dict)
# Put the model in eval mode to use it in predictions
model.eval()