In [1]:
from google.colab import drive
from tqdm.auto import tqdm
import torch
import os

drive.mount('/content/drive')
os.chdir("drive/MyDrive/IT_big_bird")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%%capture
# Remember to restart the kernel after the installation!
!pip install tokenizers
!pip install transformers
!pip install mlflow
!pip install datasets
!pip install sentencepiece

# Load OSCAR Dataset

We can download the OSCAR corpus dataset wichi contains around 90gb of data at the time. Since this is a very huge amount of data we can use the **streaming** parameter provided by the datasets library. Doing this operation we are able to stream the dataset without directly download all into our memory (in colab is limitated to 60gb).

In [2]:
import datasets

dataset = datasets.load_dataset('oscar', "unshuffled_deduplicated_it", split='train',
                                streaming=True, shuffled=True)

dataset = dataset.with_format("torch")

Using custom data configuration unshuffled_deduplicated_it-shuffled=True


# Create new tokenizer

To do this operation instead of creating a completly new BPE tokeizer with sentencepiece we can take advantage of "train_new_from_iterator" provide directly from huggingface tokenizer library. Doing this operation we are able to load previous pretrained tokenizer for the model that we are using (in this case one bigbird from google) and train a new one starting from it.

The main advantage of this process is that the tokenizer comes with all the necessary configuration for that particular model that we want to train. Since the main goal is to create a BigBird for italian sentence we can keep them without changing any of the google provided configuration. 

In [3]:
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")
f"vocab size from original bigbird {old_tokenizer.vocab_size}"

'vocab size from original bigbird 50358'

Since we are working with a streaming datasets wich is too big for our demostration purpose we can create an iterable function that allow us to limit the number of records that will be used for training our tokenizer.

the function *train_new_from_iteration* takes an iteration which is the function that we've just create and the vocabulary size. in this case we are keeping the same from the original bigbird tokenizer. 

In [None]:
import datetime as dt

LEN_TRAINIG = 2_000_000

def get_training_corpus():
    for idx, text in enumerate(dataset.take(LEN_TRAINIG)):
      if (idx % 100_000 == 0): 
        print(idx, "time: ", dt.datetime.now().strftime('%H:%M:%S'))
      yield text["text"]

tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), old_tokenizer.vocab_size)
tokenizer.save_pretrained('model/tokenizer')

0 time:  12:43:28
100000 time:  12:48:06
200000 time:  12:53:01
300000 time:  12:58:23
400000 time:  13:04:39
500000 time:  13:09:48
600000 time:  13:15:00
700000 time:  13:19:59
800000 time:  13:25:05
900000 time:  13:29:26
1000000 time:  13:35:35
1100000 time:  13:39:43
1200000 time:  13:44:28
1300000 time:  13:48:38
1400000 time:  13:53:15
1500000 time:  13:57:32
1600000 time:  14:01:23
1700000 time:  14:06:24
1800000 time:  14:10:40
1900000 time:  14:16:07


We can now load and test how our new tokenizer behave.

In [None]:
tokenizer = AutoTokenizer.from_pretrained('model/tokenizer')

for idx,val in enumerate(dataset):
  tokens = tokenizer.encode_plus(val['text'])
  print(tokens['input_ids'])
  print(tokenizer.decode(tokens['input_ids']))
  if idx > 10: break
  

# DataCollator for Masking tokens

Since we are training our model on the Masked Language task we have to mask some the tokens. Basically what we are doing consist of creating a trainining set where some word in the sentences will be randomly masked.

In our case we use the default *mlm_probability* which is equal to 0.15, this means that the 15% of the tokens will be masked during the training process.

Then we create the Torch DataLoader we recall that torch class Dataset stores the samples and their corresponding labels, while DataLoader wraps an iterable around the Dataset to enable easy access to the samples.

In [None]:
from transformers import DataCollatorForLanguageModeling

FREQ_OF_MASKING = 0.15

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=FREQ_OF_MASKING)
dataloader = torch.utils.data.DataLoader(dataset, collate_fn=data_collator, batch_size=8)

In [None]:
from transformers import (BigBirdConfig, BigBirdForMaskedLM)



config = BigBirdConfig()
  
model = BigBirdForMaskedLM(config=config)
tokenizer = BigBirdTokenizerFast()

ValueError: ignored

In [None]:
class CustomDataset(Dataset):
    sent_mapping = {'neg':0, 'neu':1, 'pos':2}
    def __init__(self, dataframe, tokenizer, max_len=140):
        self._tokenizer = tokenizer
        self.text = dataframe.text
        self.targets = dataframe.sentiment
        self.max_len = max_len

    def __len__(self):
        return len(self.targets)

    @property
    def tokenizer(self):
      return self._tokenizer

    @property
    def get_labels(self):
      return self.labels

    def pre_process_text(self,row):
      row = re.sub(r'@\S+|http\S*|\!|\?|#|RT+|\d+|\.*|\,*|\-*|@|\[|\]|\(|\)|\:|\;|\+|\*|\-|\_|\"', '', row).strip()
      row = row.lower()
      return row

    def __getitem__(self, index):
        comment_text = self.text[index]
        comment_text = self.pre_process_text(comment_text)
        
        inputs = self._tokenizer.encode_plus(
            comment_text,
            # None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=False,
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return ({
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'targets': torch.tensor(self.targets[index], dtype=torch.float)
                })
      


def create_dataset(train_raw, test_raw, tokenizer, tokenizer_lenght, BATCH_SIZE):
  training_set = CustomDataset(train_raw, tokenizer, tokenizer_lenght)
  eval_set = CustomDataset(test_raw, tokenizer, tokenizer_lenght)
  return (DataLoader(training_set, batch_size= BATCH_SIZE, shuffle=True), DataLoader(eval_set, batch_size= BATCH_SIZE, shuffle=True))

In [None]:
from sklearn.metrics import classification_report, accuracy_score

class Trainer:
  def __init__(self, model, loss_fn, epoch, path, optimizer, scheduler):
    self.model = model
    self.loss_fn = loss_fn
    self.epochs = epoch 
    self.optimizer = optimizer
    self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 
    self.scheduler = scheduler
    self.path = path

  def training_loop(self, training_loader, testing_loader, run_name, **kwargs):
    self.model.to(self.device)
    with mlflow.start_run(run_name=run_name) as run:  
      for key, value in kwargs.items():
        mlflow.log_param(key, value)
      self.train(training_loader, testing_loader)
      mlflow.pytorch.log_model(self.model, self.model.name)

  def handle_metrics_multiclass(self, metrics, output, truth):
    accuracy = accuracy_score(output.cpu(), truth.cpu())
    batch_metric = classification_report(output.cpu(), truth.cpu(), 
                                         target_names=['neg','neu','pos'],
                                         labels=[0, 1, 2], 
                                         output_dict=True, zero_division=0)
    metrics['accuracy'] += accuracy
    metrics['macro avg']["f1-score"] += batch_metric['macro avg']["f1-score"]
    metrics['macro avg']["precision"] += batch_metric['macro avg']["precision"]
    metrics['neg']["precision"] += batch_metric['neg']["precision"]
    metrics['neg']["f1-score"] += batch_metric['neg']["f1-score"]
    metrics['neg']["recall"] += batch_metric['neg']["recall"]
    metrics['neu']["precision"] += batch_metric['neu']["precision"]
    metrics['neu']["f1-score"] += batch_metric['neu']["f1-score"]
    metrics['neu']["recall"] += batch_metric['neu']["recall"]
    metrics['pos']["precision"] += batch_metric['pos']["precision"]
    metrics['pos']["f1-score"] += batch_metric['pos']["f1-score"]
    metrics['pos']["recall"] += batch_metric['pos']["recall"]

    metrics['step'] += 1

  
  def log_multiple_metrics(self, metrics, step, prefix='train'):
      for key, value in metrics.items():
        if key != 'step':
          if type(value) == dict:
            for key_inside, value_inside in metrics[key].items():
              mlflow.log_metric(f"{prefix}_{key}_{key_inside}", (value_inside/metrics['step']), step=step)
          elif key == 'loss':
           mlflow.log_metric(f"{prefix}_{key}", (value), step=step)
          else:
           mlflow.log_metric(f"{prefix}_{key}", (value/metrics['step']), step=step)

      
  def train(self, training_loader, testing_loader):
        step = 0
        for epoch in (range(self.epochs)):
          self.model.train()

          metrics_train = {
              "step":0, "accuracy":0, 
              "macro avg": {"f1-score":0, "precision":0},
              "neg": {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0},
              "neu": {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0},
              "pos": {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0}}

          total, correct = 0,0
          for idx, data in tqdm(enumerate(training_loader), total=len(training_loader)):
              ids = data['ids'].to(self.device, dtype = torch.long)
              mask = data['mask'].to(self.device, dtype = torch.long)
              targets = data['targets'].to(self.device, dtype = torch.long)

              outputs = self.model(ids, mask) 
              self.optimizer.zero_grad()
              loss = self.loss_fn(outputs, targets)

              _, predicted = torch.max(outputs.data, 1)
              total += targets.size(0)
              correct += (predicted == targets).sum().item()

              self.handle_metrics_multiclass(metrics_train, predicted, targets)

              if idx%1000 == 0:
                  metrics_train['loss'] = loss.item()
                  self.log_multiple_metrics(metrics_train, step=step, prefix='train')
                  step += 1

              if idx % 20_000 == 0 and idx != 0:
                self.scheduler.step()

              self.optimizer.zero_grad()
              loss.backward()
              self.optimizer.step()

          train_accuracy = correct/total
          mlflow.log_metric("train accuracy__",train_accuracy, step=epoch)

          self.log_multiple_metrics(metrics_train, step=step, prefix='train')
          self.validation(testing_loader, epoch)
        

  def validation(self, testing_loader, epoch):
    self.model.eval()

    metrics_test = {
    "step":0, "accuracy":0, 
    "macro avg": {"f1-score":0, "precision":0},
    "neg": {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0},
    "neu": {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0},
    "pos": {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0}}
    
    total, correct = 0,0
    with torch.no_grad():
        for idx, data in tqdm(enumerate(testing_loader), total=len(testing_loader)):
            ids = data['ids'].to(self.device, dtype = torch.long)
            mask = data['mask'].to(self.device, dtype = torch.long)
            targets = data['targets'].to(self.device, dtype = torch.long)

            outputs = self.model(ids, mask) 

            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()  

            self.handle_metrics_multiclass(metrics_test, predicted, targets)

    mlflow.log_metric("test accuracy__", correct/total, step=epoch)

    self.log_multiple_metrics(metrics_test, step=epoch, prefix='test')

    torch.save(self.model, f"{self.path}{self.model.name}.pt")

    torch.save({"optimizer": self.optimizer.state_dict(), 
                "scheduler": self.scheduler.state_dict(), "epoch": epoch}, 
                f"{self.path}{self.model.name}-re_train_args.pt")

In [None]:
train = True

if train:
  !databricks configure --host https://community.cloud.databricks.com/
  mlflow.set_tracking_uri('databricks')
  mlflow.set_experiment("/Users/gabriele.ghisleni01@gmail.com/Bertino_Tweets")

In [None]:
train = True
if train:
  PARAMS = {"epochs":4, "lr":1e-04, "batch_size":16, "model_name": 'eng_distilber', 'gamma':0.96,
            "tokenizer_max_lenght":25, 'trainable_layers':[], "train-test": (len(train_df), len(test_df))}

  tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)
  training_loader, testing_loader = create_dataset(train_df, test_df, tokenizer, PARAMS['tokenizer_max_lenght'], PARAMS['batch_size'])

  model = DistilBertTweet(name=PARAMS['model_name'])
  loss_fn = torch.nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(params=model.parameters(), lr=PARAMS['lr'])
  scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=PARAMS['gamma'])

  # model = torch.load('./model/bert_simple.pt')
  # optimizer.load_state_dict(torch.load('./model/bert_simple-optimizer.pt'))
  
  print(model, device)

  for name, param in model.named_parameters():
    if "bert" in name: param.requires_grad = False
    if param.requires_grad:
      PARAMS['trainable_layers'].append(name)
      print(f"Layer to train --> {name}")
  
  print(PARAMS)
  trainer = Trainer(model=model, loss_fn=loss_fn, optimizer=optimizer, scheduler=scheduler, epoch=PARAMS['epochs'],  path='./model/')
  trainer.training_loop(training_loader, testing_loader, run_name=PARAMS['model_name'], **PARAMS)