# IMPORT

In [42]:
import torch
import numpy as np
import pandas as pd
from typing import Dict
import torch
from datasets import load_dataset, load_metric
from transformers import DataCollatorWithPadding
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    set_seed,
)

In [39]:
### Model Parameters
# we will use with Distil-BERT
language_model_name = "distilbert-base-uncased"
language_model_name2 = "bert-base-uncased"

### Training Argurments
# this GPU should be enough for this task to handle 32 samples per batch
batch_size = 32

# optim
learning_rate = 1e-4
weight_decay = 0.001 # we could use e.g. 0.01 in case of very low and very high amount of data for regularization

# training
epochs = 1
device = "cuda" if torch.cuda.is_available() else "cpu"
set_seed(42)
nli_dataset = load_dataset("tommasobonomo/sem_augmented_fever_nli",trust_remote_code=True)
adversarial_set = load_dataset("iperbole/adversarial_fever_nli")



### METRIC DEFINITION

# Metrics
def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")
    load_precision = load_metric("precision")
    load_recall = load_metric("recall")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    precision = load_precision.compute(predictions=predictions, references=labels, average="weighted")["precision"]
    recall = load_recall.compute(predictions=predictions, references=labels, average="weighted")["recall"]

    return {"accuracy": accuracy, "f1": f1, "precision": precision, "recall": recall}








In [21]:
# torch.save(model,'model1.pth')
# torch.save(model2,'model3.pth')

In [22]:
# del(model1)
# del(model2)

In [40]:
# MODEL
## Initialize the model
model = AutoModelForSequenceClassification.from_pretrained(language_model_name,
                                                                   ignore_mismatched_sizes=True,
                                                                   output_attentions=False, output_hidden_states=False,
                                                                   num_labels=3) # number of the classes to change to 3


model2 = AutoModelForSequenceClassification.from_pretrained(language_model_name2,
                                                                   ignore_mismatched_sizes=True,
                                                                   output_attentions=False, output_hidden_states=False,
                                                                   num_labels=3) # number of the classes to change to 3




tokenizer = AutoTokenizer.from_pretrained(language_model_name)
tokenizer2 = AutoTokenizer.from_pretrained(language_model_name2)

# padding with the most long sentence!
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # avoid to use can reduce the memory on GPU
data_collator2 = DataCollatorWithPadding(tokenizer=tokenizer2) # avoid to use can reduce the memory on GPU

#examples are batch!
def tokenize_function(examples):
    examples["label"] = [labels_mapping[label] for label in examples["label"]]
    return tokenizer(examples["premise"], examples["hypothesis"],padding = True, truncation=True)




# Tokenize the dataset ...
print("Tokenize the dataset ...")
labels_mapping = {"ENTAILMENT":0, "CONTRADICTION":1, "NEUTRAL":2 }
tokenized_datasets_nli = nli_dataset.map(tokenize_function, batched=True)
tokenized_datasets_nli_2 = nli_dataset.map(tokenize_function, batched=True)
tokenized_adversarial_dataset = adversarial_set.map(tokenize_function, batched=True)

print(tokenized_datasets_nli["train"][0])


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenize the dataset ...


Map:   0%|          | 0/2287 [00:00<?, ? examples/s]

{'id': '150448', 'premise': "Roman Atwood . He is best known for his vlogs , where he posts updates about his life on a daily basis . His vlogging channel , `` RomanAtwoodVlogs '' , has a total of 3.3 billion views and 11.9 million subscribers . He also has another YouTube channel called `` RomanAtwood '' , where he posts pranks .", 'hypothesis': 'Roman Atwood is a content creator.', 'label': 0, 'wsd': {'premise': [{'index': 0, 'text': 'Roman', 'pos': 'ADJ', 'lemma': 'roman', 'bnSynsetId': 'bn:00109913a', 'wnSynsetOffset': '2921569a', 'nltkSynset': 'roman.a.01'}, {'index': 1, 'text': 'Atwood', 'pos': 'PROPN', 'lemma': 'Atwood', 'bnSynsetId': 'O', 'wnSynsetOffset': 'O', 'nltkSynset': 'O'}, {'index': 2, 'text': '.', 'pos': 'PUNCT', 'lemma': '.', 'bnSynsetId': 'O', 'wnSynsetOffset': 'O', 'nltkSynset': 'O'}, {'index': 3, 'text': 'He', 'pos': 'PRON', 'lemma': 'he', 'bnSynsetId': 'O', 'wnSynsetOffset': 'O', 'nltkSynset': 'O'}, {'index': 4, 'text': 'is', 'pos': 'AUX', 'lemma': 'be', 'bnSynset

In [41]:

#MODELS TRAINING

training_args = TrainingArguments(
    output_dir="training_dir",                    # output directory [Mandatory]
    num_train_epochs=epochs,                      # total number of training epochs
    per_device_train_batch_size=batch_size,       # batch size per device during training
    warmup_steps=500,                             # number of warmup steps for learning rate scheduler
    weight_decay=weight_decay,                    # strength of weight decay
    save_strategy="no",                           # save the model
    learning_rate=learning_rate,                  # learning rate
    gradient_checkpointing = True                 # to reduce memory usage
    # fp16 = True                                 # to reduce more memory usage
)

training_args2 = TrainingArguments(
    output_dir="training_dir",                    # output directory [Mandatory]
    num_train_epochs=epochs,                      # total number of training epochs
    per_device_train_batch_size=batch_size,       # batch size per device during training
    warmup_steps=500,                             # number of warmup steps for learning rate scheduler
    weight_decay=weight_decay,                    # strength of weight decay
    save_strategy="no",                           # save the model
    learning_rate=learning_rate,                  # learning rate
    gradient_checkpointing = True,              # to reduce memory usage
    # fp16 = False                                 # to reduce more memory usage
)

# Train DistilBERT
trainer1 = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_nli["train"],
    eval_dataset=tokenized_datasets_nli["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer1.train()

# Train BERT
trainer3 = Trainer(
    model=model2,
    args=training_args2,
    train_dataset=tokenized_datasets_nli_2["train"],
    eval_dataset=tokenized_datasets_nli_2["validation"],
    tokenizer=tokenizer2,
    data_collator=data_collator2,
    compute_metrics=compute_metrics,
)
trainer3.train()




  0%|          | 0/1597 [00:00<?, ?it/s]



KeyboardInterrupt: 

In [35]:
# Get predictions from DistilBERT
# predictions1 = trainer1.predict(tokenized_datasets_nli["validation"]).predictions
# probs1 = torch.nn.functional.softmax(torch.tensor(predictions1), dim=-1).numpy()


# # Get predictions from BERT
# predictions3 = trainer3.predict(tokenized_datasets_nli_3["validation"]).predictions
# probs3 = torch.nn.functional.softmax(torch.tensor(predictions3), dim=-1).numpy()


# Get predictions from DistilBERT
predictions1 = trainer1.predict(tokenized_adversarial_dataset["test"]).predictions
probs1 = torch.nn.functional.softmax(torch.tensor(predictions1), dim=-1).numpy()


# Get predictions from BERT
predictions2 = trainer3.predict(tokenized_adversarial_dataset["test"]).predictions
probs2 = torch.nn.functional.softmax(torch.tensor(predictions2), dim=-1).numpy()


  0%|          | 0/43 [00:00<?, ?it/s]

  0%|          | 0/43 [00:00<?, ?it/s]

In [36]:
# print(predictions1)
print(probs1)

[[0.28671902 0.25789323 0.4553877 ]
 [0.2506978  0.20299646 0.5463058 ]
 [0.92324656 0.0273671  0.04938633]
 ...
 [0.06172665 0.6760523  0.26222104]
 [0.16637705 0.23884837 0.59477454]
 [0.24277243 0.5033937  0.25383392]]


In [28]:
print(probs2)


[[0.32372493 0.25584534 0.42042968]
 [0.98101085 0.00346965 0.01551953]
 [0.41557115 0.11741773 0.46701112]
 ...
 [0.96817666 0.01148273 0.02034066]
 [0.17438652 0.56347656 0.26213694]
 [0.06813377 0.3216241  0.6102421 ]]


In [37]:
average_probs = (probs1 + probs2) / 2
ensemble_predictions = np.argmax(average_probs, axis=-1)


In [38]:


# Load metrics
accuracy_metric = load_metric("accuracy")
f1_metric = load_metric("f1")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")

# Compute metrics
# labels = tokenized_datasets_nli["validation"]["label"]
labels = tokenized_adversarial_dataset["test"]["label"]

accuracy = accuracy_metric.compute(predictions=ensemble_predictions, references=labels)["accuracy"]
f1 = f1_metric.compute(predictions=ensemble_predictions, references=labels, average="weighted")["f1"]
precision = precision_metric.compute(predictions=ensemble_predictions, references=labels, average="weighted")["precision"]
recall = recall_metric.compute(predictions=ensemble_predictions, references=labels, average="weighted")["recall"]

print({"accuracy": accuracy, "f1": f1, "precision": precision, "recall": recall})


{'accuracy': 0.5370919881305638, 'f1': 0.5398555478694804, 'precision': 0.5553070753923801, 'recall': 0.5370919881305638}
