## Arguments

### Model Arguments

In [1]:
model_name_or_path = 't5-small'

### Data Arguments

In [None]:
data_args = {
    'train_file': 'train.json',
    'validation_file': 'test.json',
    'max_target_length': 128,
    'max_source_length': 512,
    'ignore_pad_token_for_loss': True,
    }

### Training Arguments

In [None]:
training_args = {
    'model_name_or_path': model_name_or_path,
    'output_dir': './output',
    'predict_with_generate': False,
    'do_train': True,
    'do_eval': True,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'gradient_accumulation_steps': 2,
    'learning_rate': 5e-4,
    'evaluation_strategy': 'steps',
    'num_train_epochs': 10,
    'save_total_limit': 4,
    'save_strategy': 'epoch',
    'seed': 42
    }

## Setting up the seed

In [None]:
from transformers import set_seed

set_seed(training_args['seed'])

## Load Dataset

In [12]:
from datasets import load_dataset
from transformers import T5Tokenizer

# load the tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Load your dataset
dataset = load_dataset('json', data_files={'train': 'train.json', 'test': 'test.json'})

# Define a function for preprocessing
def preprocess_function(examples):
    processed_examples = {
        'input_ids': [],
        'attention_mask': [],
        'labels': []
    }

    for i, t in zip(examples['inputs'], examples['targets']):
        tokenized_inputs = tokenizer(i, max_length=512, truncation=True, padding='max_length')
        tokenized_targets = tokenizer(t, max_length=38, truncation=True, padding='max_length')
        
        if len(tokenized_inputs['input_ids']) <= 512:
            processed_examples['input_ids'].append(tokenized_inputs['input_ids'])
            processed_examples['attention_mask'].append(tokenized_inputs['attention_mask'])
            processed_examples['labels'].append(tokenized_targets['input_ids'])

    return processed_examples

# Apply the preprocessing function to the dataset
encoded_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset['train'].column_names)

Found cached dataset json (C:/Users/digit/.cache/huggingface/datasets/json/default-474954548f6c3757/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/123655 [00:00<?, ? examples/s]

Map:   0%|          | 0/77891 [00:00<?, ? examples/s]

In [13]:
import numpy as np
all_tokenized_inputs = [(tokenizer.encode(input)) for input in encoded_dataset['train']['inputs']]
longest_input = all_tokenized_inputs[np.argmax([len(input) for input in all_tokenized_inputs])]
print(f'Longest input: {longest_input}')
#Print the actual input
print(f'Actual input: {tokenizer.decode(longest_input)}')

Token indices sequence length is longer than the specified maximum sequence length for this model (559 > 512). Running this sequence through the model will result in indexing errors


Longest input: [21972, 8, 9387, 3, 16042, 7, 3615, 12, 2, 96, 23728, 1265, 52, 40, 121, 10, 121, 5948, 7, 10, 2, 3, 87, 2, 3, 87, 1986, 5, 17193, 4067, 5, 287, 2, 3, 87, 8221, 7, 2, 3, 14785, 2, 3, 20376, 2, 3, 87, 196, 6399, 7, 210, 7059, 4554, 106, 5411, 102, 1725, 121, 976, 12911, 1265, 52, 40, 121, 10, 121, 5948, 7, 10, 2, 3, 87, 2, 3, 87, 1986, 5, 17193, 4067, 5, 287, 2, 3, 87, 8221, 7, 2, 3, 14785, 2, 3, 20376, 2, 3, 87, 196, 6399, 7, 210, 7059, 4554, 106, 5411, 102, 1725, 121, 976, 23728, 518, 23, 26, 189, 121, 10, 25991, 976, 23728, 3845, 2632, 121, 10, 5426, 976, 12911, 518, 23, 26, 189, 121, 10, 28212, 976, 12911, 3845, 2632, 121, 10, 3647, 25134, 5373, 2517, 3288, 2368, 6348, 2469, 976, 2176, 35, 7, 53, 121, 10, 121, 2, 3, 8481, 853, 2423, 2, 96, 51, 210, 18, 1893, 7, 49, 18, 670, 2562, 2, 96, 3155, 2, 3, 102, 3155, 196, 8307, 48, 18721, 13, 46, 3146, 2, 3, 29, 2, 3, 87, 102, 3155, 2, 3, 102, 3155, 434, 447, 5167, 10, 2, 3, 9, 8318, 2423, 2, 96, 29, 32, 25278, 2, 96, 853, 24

In [8]:
print(f'Actual input: {tokenizer.decode(longest_input)}')

Actual input: Slide the Block Messages switch to<unk> "smallUrl":"https:<unk> /<unk> /www.wikihow.com<unk> /images<unk> /1<unk> /15<unk> /Iphoneswitchonicon1.png","bigUrl":"https:<unk> /<unk> /www.wikihow.com<unk> /images<unk> /1<unk> /15<unk> /Iphoneswitchonicon1.png","smallWidth":460,"smallHeight":300,"bigWidth":760,"bigHeight":495.6521739130435,"licensing":"<unk> div class=<unk> "mw-parser-output<unk> "><unk> p>I edited this screenshot of an iPhone<unk> n<unk> /p><unk> p>License:<unk> a rel=<unk> "nofollow<unk> " class=<unk> "external text<unk> " href=<unk> "http:<unk> /<unk> /en.wikipedia.org<unk> /wiki<unk> /Fair_use<unk> ">Fair Use<unk> /a><unk> br><unk> n<unk> /p><unk> /div>"<unk> or<unk> "smallUrl":"https:<unk> /<unk> /www.wikihow.com<unk> /images<unk> /2<unk> /28<unk> /Android7switchon.png","bigUrl":"https:<unk> /<unk> /www.wikihow.com<unk> /images<unk> /2<unk> /28<unk> /Android7switchon.png","smallWidth":460,"smallHeight":394,"bigWidth":760,"bigHeight":651.4285714285714,"lice

In [3]:
from transformers import T5ForConditionalGeneration, TrainingArguments, Trainer

# Load the model
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_accumulation_steps=1, # Number of eval steps to keep in GPU (the higher, the more memory used)
    prediction_loss_only=True, # If I should only return the loss
    learning_rate=0.001,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir='./logs',
)

# Create the Trainer and train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
)

trainer.train()



  0%|          | 0/23187 [00:00<?, ?it/s]

ValueError: expected sequence of length 35 at dim 1 (got 277)

## Load pretrained model and tokenizer

In [None]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM

config = AutoConfig.from_pretrained(model_name_or_path)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast = True)

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name_or_path,
    config = config,
    from_tf=bool('.ckpt' in model_name_or_path)
)

## Tokenize the inputs and targets

In [None]:
def preprocess_function(examples):
    inputs = [ex for ex in examples['inputs']]
    targets = [ex for ex in examples['targets']]
    model_inputs = tokenizer(inputs, max_length= data_args['max_source_length'], padding = False, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length = data_args['max_target_length'], padding = False, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [None]:
train_dataset, eval_dataset = datasets['train'], datasets['validation']
column_names = train_dataset.column_names

train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=column_names,
    load_from_cache_file=True,
)

eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=column_names,
    load_from_cache_file=True,
)

### Data Collator

In [None]:
from transformers import DataCollatorForSeq2Seq

label_pad_token_id = -100 if data_args['ignore_pad_token_for_loss'] else tokenizer.pad_token_id

data_collator = DataCollatorForSeq2Seq(tokenizer, label_pad_token_id=label_pad_token_id)

### Initialise Trainer

In [None]:
from transfomers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=None,
)

## Training

In [None]:
import os

train_result = trainer.train(model_path=model_name_or_path if os.path.isdir(model_name_or_path) else None)
trainer.save_model()

output_train_file = os.path.join(training_args['output_dir'], 'train_results.txt')
if trainer.is_world_process_zero():
    with open(output_train_file, 'w') as writer:
        for key, value in sorted(train_result.metrics.items()):
            writer.write(f'{key} = {value}\n')

    # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
    trainer.state.save_to_json(os.path.join(training_args['output_dir'], 'trainer_state.json'))

## Evaluation

In [None]:
import json

def evaluate_predictions(pred_filename, gold_filename):
    with open(pred_filename, 'r') as pred_f, open(gold_filename) as gold_f:
        pred_lines = pred_f.readlines()
        gold_lines = gold_f.readlines()
    
        total = 0.0
        full_correct = 0.0
        first_correct = 0.0
        
        for i in range(len(pred_lines)):
            pred_line = pred_lines[i].strip()
            if gold_filename.endswith('.json'):
                gold_json = json.loads(gold_lines[i])
                gold_line = gold_json['translation']['tgt']
            else:  
                gold_line = gold_lines[i].strip().split('\t')[1]
            
            # remove space before period/question mark
            gold_line = gold_line.replace(' ?', '?').replace(' .', '.').replace(' ,', ',') 

            total +=1

            if pred_line == gold_line:
                full_correct += 1
                first_correct += 1
            else:
                pred_words = pred_line.split()
                gold_words = gold_line.split()
                if len(pred_words) > 0 and pred_words[0] == gold_words[0]:
                    first_correct += 1

  
    return  (first_correct / total), (full_correct / total)

In [None]:
basename = os.path.basename(data_args['validation_file']).replace('.json', '')

predictions = trainer.predict(test_dataset=eval_dataset, max_length=100)
output_pred_file = os.path.join(training_args['output_dir'], basename + '.eval_preds_seq2seq.txt')
if trainer.is_world_process_zero():
    with open(output_pred_file, 'w') as writer:
        for pred in tokenizer.batch_decode(predictions.predictions, skip_special_tokens=True):
            writer.write(pred + '\n')

output_eval_file = os.path.join(training_args['output_dir'], basename + '.eval_results_seq2seq.txt')
first_acc, full_acc = evaluate_predictions(output_pred_file, data_args['validation_file'])
if trainer.is_world_process_zero():
    with open(output_eval_file, 'w') as writer:
        writer.write(f'Exact match accuracy: {full_acc}\n')
        writer.write(f'First word accuracy: {first_acc}\n')