# IMPORT

In [1]:
import torch
import numpy as np
import pandas as pd
from typing import Dict
import torch
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from datasets import load_dataset
from datasets import load_metric
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    set_seed,
)

In [3]:
### Model Parameters
# we will use with Distil-BERT
language_model_name = "distilbert-base-uncased"

### Training Argurments

# this GPU should be enough for this task to handle 32 samples per batch
batch_size = 32

# optim
learning_rate = 1e-4
weight_decay = 0.001 # we could use e.g. 0.01 in case of very low and very high amount of data for regularization

# training
epochs = 1
device = "cuda" if torch.cuda.is_available() else "cpu"
set_seed(42)

nli_dataset = load_dataset("tommasobonomo/sem_augmented_fever_nli",trust_remote_code=True)
adversarial_set = load_dataset("iperbole/adversarial_fever_nli")



### METRIC DEFINITION

# Metrics
def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")

   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels,average="weighted")["f1"]
   return {"accuracy": accuracy, "f1": f1}








In [5]:
# MODEL
## Initialize the model
model = AutoModelForSequenceClassification.from_pretrained(language_model_name,
                                                                   ignore_mismatched_sizes=True,
                                                                   output_attentions=False, output_hidden_states=False,
                                                                   num_labels=3) # number of the classes to change to 3

tokenizer = AutoTokenizer.from_pretrained(language_model_name)

# padding with the most long sentence!
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # avoid to use can reduce the memory on GPU

#examples are batch!
def tokenize_function(examples):
    examples["label"] = [labels_mapping[label] for label in examples["label"]]
    return tokenizer(examples["premise"], examples["hypothesis"],padding = True, truncation=True)




# Tokenize the dataset ...
print("Tokenize the dataset ...")
labels_mapping = {"ENTAILMENT":0, "CONTRADICTION":1, "NEUTRAL":2 }
tokenized_datasets_nli = nli_dataset.map(tokenize_function, batched=True)
tokenized_adversarial_dataset = adversarial_set.map(tokenize_function, batched=True)

print(tokenized_datasets_nli["train"][0])


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenize the dataset ...


Map:   0%|          | 0/2287 [00:00<?, ? examples/s]

Map:   0%|          | 0/337 [00:00<?, ? examples/s]

{'id': '150448', 'premise': "Roman Atwood . He is best known for his vlogs , where he posts updates about his life on a daily basis . His vlogging channel , `` RomanAtwoodVlogs '' , has a total of 3.3 billion views and 11.9 million subscribers . He also has another YouTube channel called `` RomanAtwood '' , where he posts pranks .", 'hypothesis': 'Roman Atwood is a content creator.', 'label': 0, 'wsd': {'premise': [{'index': 0, 'text': 'Roman', 'pos': 'ADJ', 'lemma': 'roman', 'bnSynsetId': 'bn:00109913a', 'wnSynsetOffset': '2921569a', 'nltkSynset': 'roman.a.01'}, {'index': 1, 'text': 'Atwood', 'pos': 'PROPN', 'lemma': 'Atwood', 'bnSynsetId': 'O', 'wnSynsetOffset': 'O', 'nltkSynset': 'O'}, {'index': 2, 'text': '.', 'pos': 'PUNCT', 'lemma': '.', 'bnSynsetId': 'O', 'wnSynsetOffset': 'O', 'nltkSynset': 'O'}, {'index': 3, 'text': 'He', 'pos': 'PRON', 'lemma': 'he', 'bnSynsetId': 'O', 'wnSynsetOffset': 'O', 'nltkSynset': 'O'}, {'index': 4, 'text': 'is', 'pos': 'AUX', 'lemma': 'be', 'bnSynset

In [6]:

#MODEL TRAINING

training_args = TrainingArguments(
    output_dir="training_dir",                    # output directory [Mandatory]
    num_train_epochs=epochs,                      # total number of training epochs
    per_device_train_batch_size=batch_size,       # batch size per device during training
    warmup_steps=500,                             # number of warmup steps for learning rate scheduler
    weight_decay=weight_decay,                    # strength of weight decay
    save_strategy="no",                           # save the model
    learning_rate=learning_rate,                  # learning rate
    gradient_checkpointing = True                 # to reduce memory usage
    # fp16 = True                                 # to reduce more memory usage
)


trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_datasets_nli["train"],
   eval_dataset=tokenized_datasets_nli["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)


trainer.train()

  0%|          | 0/1597 [00:00<?, ?it/s]



{'loss': 0.6619, 'grad_norm': 3.862337350845337, 'learning_rate': 0.0001, 'epoch': 0.31}
{'loss': 0.4778, 'grad_norm': 3.6819708347320557, 'learning_rate': 5.44211485870556e-05, 'epoch': 0.63}
{'loss': 0.4284, 'grad_norm': 7.138038158416748, 'learning_rate': 8.842297174111212e-06, 'epoch': 0.94}
{'train_runtime': 1595.9478, 'train_samples_per_second': 32.01, 'train_steps_per_second': 1.001, 'train_loss': 0.5150247715978079, 'epoch': 1.0}


TrainOutput(global_step=1597, training_loss=0.5150247715978079, metrics={'train_runtime': 1595.9478, 'train_samples_per_second': 32.01, 'train_steps_per_second': 1.001, 'total_flos': 6753498842284992.0, 'train_loss': 0.5150247715978079, 'epoch': 1.0})

In [12]:

validation_results = trainer.evaluate()
test_results = trainer.evaluate(eval_dataset=tokenized_datasets_nli['test'])

print(f"validatioon => {validation_results}")
print(f"test => {test_results}")

  0%|          | 0/286 [00:00<?, ?it/s]

  0%|          | 0/286 [00:00<?, ?it/s]

validatioon => {'eval_loss': 0.7188450694084167, 'eval_accuracy': 0.7355769230769231, 'eval_f1': 0.7272669394266372, 'eval_runtime': 12.4568, 'eval_samples_per_second': 183.674, 'eval_steps_per_second': 22.959, 'epoch': 1.0}
test => {'eval_loss': 0.7704156637191772, 'eval_accuracy': 0.7039790118058592, 'eval_f1': 0.6946643382506382, 'eval_runtime': 12.0497, 'eval_samples_per_second': 189.798, 'eval_steps_per_second': 23.735, 'epoch': 1.0}


In [13]:
test_augmented_result = trainer.evaluate(tokenized_adversarial_dataset['test'])
print("adversarial test => ",test_augmented_result)

  0%|          | 0/43 [00:00<?, ?it/s]

adversarial test =>  {'eval_loss': 1.354268193244934, 'eval_accuracy': 0.49258160237388726, 'eval_f1': 0.4937813379136295, 'eval_runtime': 2.084, 'eval_samples_per_second': 161.705, 'eval_steps_per_second': 20.633, 'epoch': 1.0}
