In [1]:
import numpy as np
import random
import torch

seed = 42

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed) 
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [2]:
from datasets import load_dataset

# Load the e-SNLI dataset
dataset = load_dataset("esnli")

train_dataset = dataset['train']
eval_dataset = dataset['validation']
#test_dataset = dataset['test']

indices = list(range(0, len(train_dataset), 10))  # Select every 10th index
train_dataset = train_dataset.select(indices)

len(train_dataset), len(eval_dataset)#, len(test_dataset)

Reusing dataset esnli (/home/ec2-user/.cache/huggingface/datasets/esnli/plain_text/0.0.2/a160e6a02bbb8d828c738918dafec4e7d298782c334b5109af632fec6d779bbc)


  0%|          | 0/3 [00:00<?, ?it/s]

(54937, 9842)

In [3]:
label_dct = {0: "entailment", 1: "neutral", 2: "contradiction"}

In [4]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")

# Preprocessing function
def preprocess(example):
    # Prepare input and output text
    input_text = f"Premise: {example['premise']} Hypothesis: {example['hypothesis']} What is the relationship? Explain your answer."
    output_text = f"{label_dct[example['label']]}: {example['explanation_1']}. {example['explanation_2']}. {example['explanation_3']}."

    # Tokenize input and output
    input_encoding = tokenizer(input_text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
    output_encoding = tokenizer(output_text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

    # Create a dictionary to return
    return {
        "input_ids": input_encoding["input_ids"][0],  # Remove batch dimension
        "attention_mask": input_encoding["attention_mask"][0],  # Remove batch dimension
        "labels": output_encoding["input_ids"][0] # Remove batch dimension
    }


# Apply preprocessing
train_dataset = train_dataset.map(
    preprocess,
    remove_columns=['premise', 'hypothesis', 'label', 'explanation_1', 'explanation_2', 'explanation_3'],
)
eval_dataset = eval_dataset.map(
    preprocess,
    remove_columns=['premise', 'hypothesis', 'label', 'explanation_1', 'explanation_2', 'explanation_3'],
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/54937 [00:00<?, ?ex/s]

  0%|          | 0/9842 [00:00<?, ?ex/s]

In [5]:
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
print(train_dataset[0]['input_ids'].shape)  # Should show (512,)
print(train_dataset[0]['attention_mask'].shape)  # Should show (512,)
print(train_dataset[0]['labels'].shape)  # Should show (512,)

torch.Size([512])
torch.Size([512])
torch.Size([512])


In [6]:
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small").cuda()

In [7]:
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
    AdamW,
    get_scheduler,
)

# Define custom optimizer
learning_rate = 0.001
optimizer = AdamW(
    model.parameters(),
    lr=learning_rate,
    betas=(0.9, 0.999),
    eps=1e-08,
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./flan_t5_esnli",
    evaluation_strategy="epoch",  # Evaluate at the end of every epoch
    save_strategy="epoch",  # Save at the end of every epoch
    per_device_train_batch_size=8,  # Train batch size
    per_device_eval_batch_size=8,  # Evaluation batch size
    num_train_epochs=12,  # Number of epochs
    learning_rate=learning_rate,  # Learning rate
    lr_scheduler_type="linear",  # Linear learning rate scheduler
    warmup_ratio=0.05,  # Warmup ratio
    weight_decay=0.01,  # Weight decay
    save_total_limit=12,  # Keep only the last 2 checkpoints
    fp16=torch.cuda.is_available(),  # Use FP16 if a GPU is available
    seed=seed,
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="loss",  # Optimize for loss
    greater_is_better=False,
    report_to=[],
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
)



In [8]:
# Start training
trainer.train()

# Save the final model
model.save_pretrained("./final_flan_t5_esnli")
tokenizer.save_pretrained("./final_flan_t5_esnli")

# Evaluate on the test set
test_results = trainer.evaluate(eval_dataset=eval_dataset)
print("Test results:", test_results)



Epoch,Training Loss,Validation Loss
1,0.0534,0.365858
2,0.0478,0.355463
3,0.0433,0.366262
4,0.0402,0.36922
5,0.0374,0.384117
6,0.0353,0.387077
7,0.0332,0.392979
8,0.0312,0.396413
9,0.0294,0.412934
10,0.0277,0.413032


Checkpoint destination directory ./flan_t5_esnli/checkpoint-3434 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./flan_t5_esnli/checkpoint-6868 already exists and is non-empty.Saving will proceed but saved results may be invalid.
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Test results: {'eval_loss': 0.355462908744812, 'eval_runtime': 206.8785, 'eval_samples_per_second': 47.574, 'eval_steps_per_second': 1.489, 'epoch': 12.0}


In [9]:
foutputs = []
flabels = []
for i in range(9842):
    finputs = eval_dataset[i]
    # Get the model's output
    with torch.no_grad():
        outputs = model.generate(finputs['input_ids'].cuda().unsqueeze(0), attention_mask=finputs['attention_mask'].cuda().unsqueeze(0))
    # Decode the output (convert token IDs back to text)
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Print the result
    #print(f"Generated output: {decoded_output}")
    foutputs.append(decoded_output)
    flabels.append(dataset['validation'][i]['label'])
    if i%1000==0:
        print(i)
len(foutputs), len(flabels)



0
1000
2000
3000
4000
5000
6000
7000
8000
9000


(9842, 9842)

In [10]:
preds = []
dct = {"entailment": 0, "neutral": 1, "contradiction": 2}
for x in foutputs:
    preds.append(dct[x.split(':')[0]])
len(preds)

9842

In [11]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Calculate accuracy
accuracy = accuracy_score(flabels, preds)

# Calculate precision, recall, and F1-score for each class (macro-average)
precision = precision_score(flabels, preds, average='macro')
recall = recall_score(flabels, preds, average='macro')
f1 = f1_score(flabels, preds, average='macro')

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (Macro): {precision:.4f}")
print(f"Recall (Macro): {recall:.4f}")
print(f"F1-score (Macro): {f1:.4f}")

Accuracy: 0.8238
Precision (Macro): 0.8243
Recall (Macro): 0.8235
F1-score (Macro): 0.8237
