<a href="https://colab.research.google.com/github/DreRnc/ExplainingExplanations/blob/ModData/Base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset : **E-SNLI**. \
Model : **Base T5**.

In [2]:
%load_ext autoreload
%autoreload 2
colab = False

In [3]:
if colab:
    !git clone https://github.com/DreRnc/ExplainingExplanations.git
    %cd ExplainingExplanations
    !git checkout seq2seq
    %pip install -r requirements_colab.txt
    

# 1.0 Preparation


Set parameters for the experiments.

In [4]:
MODEL = 't5-small'
    
sizes = {
    'n_train' : 500000,
    'n_val' : 9842,
    'n_test' : 9824
}

# Whether to use the mnli prompt on which the model is pretrained or not
USE_MNLI_PROMPT = False
EXPLANATION_FIRST = False

## 1.1 Loading Tokenizer

In [5]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained(MODEL, truncation=True, padding=True)

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## 1.2 Loading and Tokenizing Dataset

In [6]:
from datasets import load_dataset
from src.preprocess import prepare_dataset
from functools import partial
from src.utils import tokenize_function

In [7]:
dataset = load_dataset("esnli", download_mode="force_redownload")

Downloading data: 100%|██████████| 39.3M/39.3M [00:02<00:00, 19.0MB/s]
Downloading data: 100%|██████████| 1.62M/1.62M [00:00<00:00, 9.14MB/s]
Downloading data: 100%|██████████| 1.61M/1.61M [00:00<00:00, 9.86MB/s]
Generating train split: 100%|██████████| 549367/549367 [00:00<00:00, 2068775.24 examples/s]
Generating validation split: 100%|██████████| 9842/9842 [00:00<00:00, 1223266.16 examples/s]
Generating test split: 100%|██████████| 9824/9824 [00:00<00:00, 1503936.14 examples/s]


In [8]:
tokenize_mapping = partial(tokenize_function, tokenizer=tokenizer, use_mnli_format = USE_MNLI_PROMPT)

In [9]:
train_tok, valid_tok, test_tok = prepare_dataset(dataset, tokenize_mapping=tokenize_mapping, sizes = sizes)

Map:   1%|          | 4000/500000 [00:00<00:28, 17577.89 examples/s]

Map: 100%|██████████| 500000/500000 [00:27<00:00, 18173.63 examples/s]
Map: 100%|██████████| 9842/9842 [00:00<00:00, 16844.71 examples/s]
Map: 100%|██████████| 9824/9824 [00:00<00:00, 17985.44 examples/s]


## 1.3 Loading SBERT for evaluating sentence similarity

In [10]:
from sentence_transformers import SentenceTransformer

In [11]:
sbert = SentenceTransformer('all-MiniLM-L6-v2')

# 2.0 Tasks

In [12]:
import torch
from functools import partial
import evaluate
from src.utils import compute_metrics, eval_pred_transform_accuracy
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, T5ForConditionalGeneration, DataCollatorForSeq2Seq


In [14]:
import os
if not os.path.exists("results.txt"):
      with open("results.txt", 'w') as file:
           file.write("Model :" + MODEL + '\n')
else:
      with open("results.txt", 'a') as file:
           file.write("Model :" + MODEL + '\n')

In [15]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
    
device

device(type='cuda')

In [None]:
transform_accuracy = partial(eval_pred_transform_accuracy, tokenizer = tokenizer)
compute_accuracy = partial(compute_metrics, pred_transforms=transform_accuracy, metrics = evaluate.load('accuracy'))

In [16]:
standard_args = {
    "save_strategy" : "steps",
    "save_steps" : 4688,

    "save_total_limit" : 1,
    "load_best_model_at_end" : True,
    "metric_for_best_model" : "accuracy",
    "greater_is_better" : True,

    "evaluation_strategy" : "steps",
    "eval_steps" : 4688,
    
    "predict_with_generate" : True,
    "per_device_train_batch_size" : 16,
    "per_device_eval_batch_size" : 16,
}

## 2.1 Task 1: Zero-shot evaluation

In [None]:
directory_1 = 'task1_' + MODEL 

In [16]:
with open("results.txt", 'a') as file:
    file.write('\n' + "Task 1 : Zero-shot" + '\n')

In [17]:
model = T5ForConditionalGeneration.from_pretrained(MODEL)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [18]:
training_args = Seq2SeqTrainingArguments(
    **standard_args,
    output_dir=directory_1,
    generation_max_length=32
)

In [19]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=valid_tok,
    compute_metrics=compute_accuracy,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [20]:
trainer.evaluate(test_tok)



Number of predictions not in [entailment, neutral, contradiction]: 724


{'eval_loss': 0.27994176745414734,
 'eval_accuracy': 0.6550285016286646,
 'eval_runtime': 23.3073,
 'eval_samples_per_second': 421.499,
 'eval_steps_per_second': 13.172}

## 2.2 Task 2: Fine tuning without explanations

In [None]:
directory_2 = 'task2_' + MODEL 

In [21]:
with open("results.txt", 'a') as file:
    file.write('\n' + "Task 2 : Fine-tune without explanations" + '\n')

In [22]:
NUM_EPOCHS = 3

In [23]:
model_ft = T5ForConditionalGeneration.from_pretrained(MODEL)
data_collator_ft = DataCollatorForSeq2Seq(tokenizer, model=model_ft)

In [24]:
training_args_ft = Seq2SeqTrainingArguments(
    **standard_args,
    num_train_epochs = NUM_EPOCHS,
    output_dir=directory_2,
    generation_max_length=32,
)

In [25]:
trainer_ft = Seq2SeqTrainer(
    model=model_ft,
    args=training_args_ft,
    train_dataset=train_tok,
    eval_dataset=valid_tok,
    compute_metrics=compute_accuracy,
    data_collator=data_collator_ft,
    tokenizer=tokenizer,
)

In [26]:
trainer_ft.train()



Step,Training Loss,Validation Loss,Accuracy
4688,0.1679,0.135591,0.846169
9376,0.1543,0.127669,0.85694
14064,0.1531,0.123013,0.862325
18752,0.1418,0.118088,0.864357
23440,0.1418,0.116261,0.866795
28128,0.1334,0.115114,0.869844
32816,0.1298,0.112538,0.872485
37504,0.1324,0.111761,0.8734
42192,0.1319,0.111921,0.874314


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=46875, training_loss=0.14511486083984376, metrics={'train_runtime': 3292.2195, 'train_samples_per_second': 455.62, 'train_steps_per_second': 14.238, 'total_flos': 2.284430883500851e+16, 'train_loss': 0.14511486083984376, 'epoch': 3.0})

In [None]:
best_model_dir = directory_2 + '/best_model'
trainer_ft.save_model(best_model_dir)

In [27]:
with open("results.txt", 'a') as file:
    file.write("Test: " + '\n')

In [None]:
trainer_ft.evaluate(test_tok)

## 2.3 Task 3: Fine Tuning with Explanations

In [None]:
if EXPLANATION_FIRST:
    directory_3 = "task3b" + MODEL
else:
    directory_3 = "task3" + MODEL

We need to give as labels the label and the explanation tokenized.

In [16]:
with open("results.txt", 'a') as file:
    file.write('\n' + "Task 3 : Fine-tune with explanations" + '\n')

### Preparing the dataset with labelled explanations

In [18]:
from src.utils import tokenize_function_ex

In [25]:
dataset_explanations = load_dataset("esnli", download_mode="force_redownload")

Downloading data: 100%|██████████| 39.3M/39.3M [00:01<00:00, 29.7MB/s]
Downloading data: 100%|██████████| 1.62M/1.62M [00:00<00:00, 10.1MB/s]
Downloading data: 100%|██████████| 1.61M/1.61M [00:00<00:00, 9.83MB/s]
Generating train split: 100%|██████████| 549367/549367 [00:00<00:00, 4133397.80 examples/s]
Generating validation split: 100%|██████████| 9842/9842 [00:00<00:00, 2834604.13 examples/s]
Generating test split: 100%|██████████| 9824/9824 [00:00<00:00, 3047244.68 examples/s]


In [26]:
tokenize_mapping_ex = partial(tokenize_function_ex, tokenizer=tokenizer, use_mnli_format = USE_MNLI_PROMPT, explanation_first = EXPLANATION_FIRST)

In [27]:
train_tok_ex, valid_tok_ex, test_tok_ex = prepare_dataset(dataset=dataset_explanations, tokenize_mapping=tokenize_mapping_ex, sizes=sizes)

Map:   1%|          | 6000/500000 [00:00<00:38, 12974.80 examples/s]

Map: 100%|██████████| 500000/500000 [00:38<00:00, 13006.54 examples/s]
Map: 100%|██████████| 9842/9842 [00:00<00:00, 12674.62 examples/s]
Map: 100%|██████████| 9824/9824 [00:00<00:00, 11851.08 examples/s]


In [28]:
train_tok_ex.features

{'premise': Value(dtype='string', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'explanation_1': Value(dtype='string', id=None),
 'explanation_2': Value(dtype='string', id=None),
 'explanation_3': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

### Defining the metrics: accuracy / similarity of explanations

In [35]:
from src.utils import eval_pred_transform_sbert
from src.sbert_metric import SbertMetric

In [36]:
transform_accuracy_ex = partial(eval_pred_transform_accuracy, tokenizer = tokenizer, remove_explanations_from_label = True, explanation_first = EXPLANATION_FIRST)
accuracy = evaluate.load('accuracy')

In [37]:
transform_sbert = partial(eval_pred_transform_sbert, tokenizer = tokenizer, explanation_first = EXPLANATION_FIRST)
sbert_similarity = SbertMetric(sbert)

In [38]:
transforms = [transform_accuracy_ex, transform_sbert]
metrics = [accuracy, sbert_similarity]

compute_metrics_ex = partial(compute_metrics, pred_transforms=transforms, metrics=metrics)

### Fine Tuning

In [62]:
NUM_EPOCHS = 3

In [63]:
model_ft_ex = T5ForConditionalGeneration.from_pretrained(MODEL)
data_collator_ft_ex = DataCollatorForSeq2Seq(tokenizer, model=model_ft_ex)

In [64]:
training_args_ft_ex = Seq2SeqTrainingArguments(
    **standard_args,
    num_train_epochs = NUM_EPOCHS,
    output_dir= directory_3,
    generation_max_length=128
)

In [65]:
trainer_ft_ex = Seq2SeqTrainer(
    model=model_ft_ex,
    args=training_args_ft_ex,
    train_dataset=train_tok_ex,
    eval_dataset=valid_tok_ex,
    compute_metrics=compute_metrics_ex,
    data_collator=data_collator_ft_ex,
    tokenizer=tokenizer,
)

In [66]:
trainer_ft_ex.train()



Step,Training Loss,Validation Loss,Accuracy,Explanation Average Similarity
4688,1.2357,1.152379,0.805832,0.642265
9376,1.1637,1.102244,0.812945,0.648976
14064,1.1317,1.076633,0.827677,0.655816
18752,1.1022,1.061733,0.835806,0.657609




: 

In [None]:
best_model_dir = directory_3 + '/best_model'
trainer_ft_ex.save_model(best_model_dir)

In [None]:
with open("results.txt", 'a') as file:
    file.write("Test: " + '\n')

In [None]:
trainer_ft_ex.evaluate(test_tok_ex)

Number of predictions not in [entailment, neutral, contradiction]: 1


{'eval_loss': 0.8937565684318542,
 'eval_accuracy': 0.8764250814332247,
 'eval_explanation_average_similarity': 0.6723642945289612,
 'eval_runtime': 199.5865,
 'eval_samples_per_second': 49.222,
 'eval_steps_per_second': 1.538,
 'epoch': 3.0}

## 2.4 Task 4: Fine Tuning with Shuffled Explanations

In [None]:
if EXPLANATION_FIRST:
    directory_4 = "task4b" + MODEL
else:
    directory_4 = "task4" + MODEL

In [26]:
with open("results.txt", 'a') as file:
    file.write('\n' + "Task 4 : Fine-tune with shuffled explanations" + '\n')

### Preparing the dataset with *wrong* labelled explanations

In [27]:
dataset_shex = load_dataset("esnli", download_mode="force_redownload")

Downloading data: 100%|██████████| 39.3M/39.3M [00:01<00:00, 29.0MB/s]
Downloading data: 100%|██████████| 1.62M/1.62M [00:00<00:00, 9.09MB/s]
Downloading data: 100%|██████████| 1.61M/1.61M [00:00<00:00, 9.08MB/s]
Generating train split: 100%|██████████| 549367/549367 [00:00<00:00, 4934464.20 examples/s]
Generating validation split: 100%|██████████| 9842/9842 [00:00<00:00, 2935803.99 examples/s]
Generating test split: 100%|██████████| 9824/9824 [00:00<00:00, 2971003.14 examples/s]


In [28]:
from src.preprocess import save_explanations, save_shuffled_explanations, retrieve_explanations

In [29]:
dirs = save_explanations(dataset_shex)

In [30]:
dir_train_shuffled = save_shuffled_explanations(dirs[0])

In [31]:
shuffled_explanations_train = retrieve_explanations(dir_train_shuffled)

In [32]:
from src.utils import tokenize_function_ex

tokenize_mapping_train = partial(tokenize_function_ex, tokenizer=tokenizer, explanations = shuffled_explanations_train, use_mnli_format = USE_MNLI_PROMPT, explanation_first = EXPLANATION_FIRST)
tokenize_mapping_val = partial(tokenize_function_ex, tokenizer=tokenizer, use_mnli_format = USE_MNLI_PROMPT, explanation_first = EXPLANATION_FIRST)
tokenize_mapping_test = partial(tokenize_function_ex, tokenizer=tokenizer, use_mnli_format = USE_MNLI_PROMPT, explanation_first = EXPLANATION_FIRST)

tokenize_mappings = (tokenize_mapping_train, tokenize_mapping_val, tokenize_mapping_test)

In [33]:
train_tok_shex, valid_tok_shex, test_tok_shex = prepare_dataset(dataset, tokenize_mapping=tokenize_mappings, sizes=sizes)

Map: 100%|██████████| 500000/500000 [00:38<00:00, 13150.97 examples/s]
Map: 100%|██████████| 9842/9842 [00:00<00:00, 12788.36 examples/s]
Map: 100%|██████████| 9824/9824 [00:00<00:00, 12852.82 examples/s]


In [34]:
train_tok_shex = train_tok_shex.remove_columns(["explanation_1", "explanation_2", "explanation_3"])
valid_tok_shex = valid_tok_shex.remove_columns(["explanation_1", "explanation_2", "explanation_3"])
test_tok_shex = test_tok_shex.remove_columns(["explanation_1", "explanation_2", "explanation_3"])

### Fine Tuning

In [35]:
NUM_EPOCHS = 3

In [37]:
model_ft_shex = T5ForConditionalGeneration.from_pretrained(MODEL)
data_collator_ft_shex = DataCollatorForSeq2Seq(tokenizer, model=model_ft_shex)

In [38]:
training_args_ft_shex = Seq2SeqTrainingArguments(
    **standard_args,
    num_train_epochs=NUM_EPOCHS,
    output_dir=directory_4,
    generation_max_length=128,
)

In [39]:
trainer_ft_shex = Seq2SeqTrainer(
    model=model_ft_shex,
    args=training_args_ft_shex,
    train_dataset=train_tok_shex,
    eval_dataset=valid_tok_shex,
    compute_metrics=compute_metrics_ex,
    data_collator=data_collator_ft_shex,
    tokenizer=tokenizer,
)

In [40]:
trainer_ft_shex.train(resume_from_checkpoint=True)

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].




Step,Training Loss,Validation Loss,Accuracy,Explanation Average Similarity
37504,1.9417,2.423472,0.849929,0.126843
42192,1.7083,2.548043,0.852063,0.126244


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=46875, training_loss=0.56545417578125, metrics={'train_runtime': 1333.7456, 'train_samples_per_second': 1124.652, 'train_steps_per_second': 35.145, 'total_flos': 2.283356607951667e+16, 'train_loss': 0.56545417578125, 'epoch': 3.0})

In [None]:
best_model_dir = directory_4 + '/best_model'
trainer_ft_shex.save_model(best_model_dir)

In [41]:
with open("results.txt", 'a') as file:
    file.write("Test: " + '\n')

In [42]:
trainer_ft_shex.evaluate(test_tok_shex)

{'eval_loss': 4.367889881134033,
 'eval_accuracy': 0.8480252442996743,
 'eval_explanation_average_similarity': 0.09273722767829895,
 'eval_runtime': 96.0412,
 'eval_samples_per_second': 102.289,
 'eval_steps_per_second': 3.197,
 'epoch': 3.0}

## 2.5 Task 5: Profiling-UD

In [39]:
if EXPLANATION_FIRST:
    directory_5 = "task5b" + MODEL
else:
    directory_5 = "task5" + MODEL

### Read the results of the automatic annotation stage performed over explanations with Profilind-UD.

1. **Token ID**: The token's position in the sentence.
2. **Token**: The actual token text.
3. **Lemma**: The lemma or base form of the token.
4. Universal part-of-speech tag.
5. Language-specific part-of-speech tag (optional).
6. Miscellaneous (misc) field, which can contain additional annotations.
7. Head: The ID of the token's syntactic head.
8. Dependency relation: The type of syntactic relation between the token and its head.
9. Secondary dependencies or additional annotations.

In [40]:
from src.profiling import distill_explanations

train_file_path = "ex_files/explanations_train.conllu"
val_file_path = "ex_files/explanations_val.conllu"
test_file_path = "ex_files/explanations_test.conllu"

train_outfile = "ex_files/explanations_task5_train.txt"
val_outfile = "ex_files/explanations_task5_val.txt"
test_outfile = "ex_files/explanations_task5_test.txt"

distill_explanations(train_file_path, ["NOUN", "VERB"], train_outfile)
distill_explanations(val_file_path, ["NOUN", "VERB"], val_outfile)
distill_explanations(test_file_path, ["NOUN", "VERB"], test_outfile)


### Prepare the dataset with modified explanations

In [42]:
from src.preprocess import retrieve_explanations

In [43]:
modified_explanations = {
    'train': retrieve_explanations(train_outfile),  
    'validation': retrieve_explanations(val_outfile),
    'test': retrieve_explanations(test_outfile)
}

In [44]:
tokenize_mapping_train = partial(tokenize_function_ex, tokenizer=tokenizer, explanations = modified_explanations['train'], use_mnli_format = USE_MNLI_PROMPT, explanation_first = EXPLANATION_FIRST)
tokenize_mapping_val = partial(tokenize_function_ex, tokenizer=tokenizer, explanations = modified_explanations['validation'], use_mnli_format = USE_MNLI_PROMPT, explanation_first = EXPLANATION_FIRST)
tokenize_mapping_test = partial(tokenize_function_ex, tokenizer=tokenizer, explanations = modified_explanations['test'],use_mnli_format = USE_MNLI_PROMPT, explanation_first = EXPLANATION_FIRST)

tokenize_mappings = (tokenize_mapping_train, tokenize_mapping_val, tokenize_mapping_test)

train_tok_5, valid_tok_5, test_tok_5 = prepare_dataset(dataset, tokenize_mapping=tokenize_mappings, sizes=sizes)

Map: 100%|██████████| 500000/500000 [00:33<00:00, 15111.25 examples/s]
Map: 100%|██████████| 9842/9842 [00:00<00:00, 14679.50 examples/s]
Map: 100%|██████████| 9824/9824 [00:00<00:00, 13463.96 examples/s]


In [45]:
train_tok_5 = train_tok_5.remove_columns(["explanation_1", "explanation_2", "explanation_3"])
valid_tok_5 = valid_tok_5.remove_columns(["explanation_1", "explanation_2", "explanation_3"])
test_tok_5 = test_tok_5.remove_columns(["explanation_1", "explanation_2", "explanation_3"])

### Fine-tuning

In [46]:
with open("results.txt", 'a') as file:
    file.write('\n' + "Task 5 : Fine-tune with only names and verbs in explanations" + '\n')

In [47]:
NUM_EPOCHS = 9

In [48]:
model_ft_5 = T5ForConditionalGeneration.from_pretrained(MODEL)
data_collator_ft_5 = DataCollatorForSeq2Seq(tokenizer, model=model_ft_5)

In [49]:
training_args_ft_5 = Seq2SeqTrainingArguments(
    **standard_args,
    num_train_epochs=NUM_EPOCHS,
    output_dir=directory_5,
    generation_max_length=32,
)

In [50]:
trainer_ft_5 = Seq2SeqTrainer(
    model=model_ft_5,
    args=training_args_ft_5,
    train_dataset=train_tok_5,
    eval_dataset=valid_tok_5,
    compute_metrics=compute_metrics_ex,
    data_collator=data_collator_ft_5,
    tokenizer=tokenizer,
)

In [52]:
trainer_ft_5.train(resume_from_checkpoint=True)

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].




Step,Training Loss,Validation Loss


In [None]:
best_model_dir = directory_5 + '/best_model'
trainer_ft_5.save_model(best_model_dir)

In [None]:
with open("results.txt", 'a') as file:
    file.write("Test: " + '\n')

trainer_ft_5.evaluate(test_tok_5)