In [3]:
from utils.data import create_train_valid_whatsapp
from transformers import (
    AutoModelForSeq2SeqLM
    , DataCollatorForSeq2Seq
    , AutoTokenizer
    , Seq2SeqTrainer
    , Seq2SeqTrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType



# Checkpoint identifier shared by the model and the tokenizer below.
MODEL_ID = 't5-small'

# LoRA adapter settings: rank-16 adapters on the modules named "q" and "v",
# configured for a sequence-to-sequence language-modelling task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Load the pretrained seq2seq checkpoint and wrap it with the LoRA config.
model = get_peft_model(
    AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, device_map='auto'),
    lora_config,
)

# Print the trainable / total parameter counts of the wrapped model.
model.print_trainable_parameters()

# Tokenizer loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# We need to get the maximum input and output token lengths.

def tokenize_data(data_set, tokenizer, text_column='text', label_column='text'):
    """Tokenize an example into model inputs and attach ``labels``.

    Args:
        data_set: Mapping that holds the raw values to tokenize; it is
            indexed with ``text_column`` and ``label_column``.
        tokenizer: Callable invoked as
            ``tokenizer(value, truncation=True, padding=True)`` that returns a
            mutable mapping containing an ``'input_ids'`` entry.
        text_column: Key of the model input. Defaults to ``'text'``.
        label_column: Key of the training target. Defaults to ``'text'``,
            which preserves the previous behaviour of using the input text as
            its own target.

    Returns:
        The tokenizer output for ``text_column`` with an extra ``'labels'``
        entry set to the ``'input_ids'`` of the tokenized ``label_column``.

    Raises:
        KeyError: If ``data_set`` is a dict and one of the columns is missing.
    """
    # NOTE(review): with the default ``label_column`` the labels are an exact
    # copy of the inputs. The datasets mapped in this notebook also carry a
    # 'label' column (it is dropped right after mapping); if that column holds
    # the target text, callers should pass label_column='label' — TODO confirm.
    encode_options = {'truncation': True, 'padding': True}

    model_inputs = tokenizer(data_set[text_column], **encode_options)
    targets = tokenizer(data_set[label_column], **encode_options)

    model_inputs['labels'] = targets['input_ids']
    return model_inputs


# Build the dataset splits from the chat export via the utils.data helper.
data = create_train_valid_whatsapp(
    path='./raw-data/karamh chat.txt',
    train_sender_name='Conall',
    prompt_max_length=tokenizer.model_max_length,
)

# Tokenize each split one example at a time and drop the raw 'text' and
# 'label' columns afterwards.
tokenized_train = data['train'].map(
    lambda example: tokenize_data(example, tokenizer=tokenizer),
    batched=False,
    remove_columns=['text', 'label'],
)
tokenized_test = data['test_data'].map(
    lambda example: tokenize_data(example, tokenizer=tokenizer),
    batched=False,
    remove_columns=['text', 'label'],
)



trainable params: 589,824 || all params: 61,096,448 || trainable%: 0.9653981848502878


Map: 100%|██████████| 1112/1112 [01:04<00:00, 17.26 examples/s]
Map: 100%|██████████| 279/279 [00:16<00:00, 17.14 examples/s]


TypeError: DataCollatorForSeq2Seq.__init__() got an unexpected keyword argument 'truncation'

In [4]:



# Collator that pads every batch at collation time (to a multiple of 8).
# Labels are padded with -100 rather than the tokenizer's pad id: -100 is the
# ignore index of the loss, so padded target positions do not contribute to
# training. Padding labels with the real pad token id would make the model
# learn to predict padding.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8,
    padding=True,
)

# Root directory for this run; logs are written to a "logs" subdirectory.
output_dir = 'test'

training_args = Seq2SeqTrainingArguments(
    # --- output / reporting ---
    output_dir=output_dir,
    logging_dir=f"{output_dir}/logs",
    report_to="tensorboard",
    save_strategy="no",
    # --- optimisation ---
    learning_rate=1e-3,  # higher learning rate
    num_train_epochs=5,
    auto_find_batch_size=True,
    # --- logging cadence ---
    logging_strategy="steps",
    logging_steps=500,
    # --- hardware ---
    use_cpu=True,
)

# Bundle the LoRA-wrapped model, the training arguments, the collator and the
# two tokenized splits into a single trainer object.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
)



In [5]:
# Start fine-tuning with the trainer built in the previous cell.
# The recorded progress bars show 695 steps in total at roughly 20-40 s per
# step (the training arguments set use_cpu=True), so a full run takes hours.
trainer.train()

  2%|▏         | 16/695 [10:08<7:10:31, 38.04s/it]
  0%|          | 0/695 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  6%|▋         | 45/695 [16:57<4:07:17, 22.83s/it]