In [None]:
def _build_generation_config(tokenizer):
    """Return a greedy-search ``GenerationConfig`` wired to *tokenizer*'s special tokens."""
    generation_config = GenerationConfig()
    generation_config.remove_invalid_values = True
    generation_config.eos_token_id = tokenizer.eos_token_id
    generation_config.pad_token_id = tokenizer.pad_token_id
    # Decoding starts from the pad token (the T5 convention).
    generation_config.decoder_start_token_id = tokenizer.pad_token_id
    generation_config.max_new_tokens = 320
    # num_beams=1 together with do_sample=False means plain greedy search.
    generation_config.num_beams = 1
    generation_config.do_sample = False
    return generation_config


def _save_training_log(trainer, log_dir='./logs'):
    """Write ``trainer.state.log_history`` to a timestamped CSV file under *log_dir*."""
    # exist_ok avoids the check-then-create race and also creates missing parents.
    os.makedirs(log_dir, exist_ok=True)
    loss_log = pd.DataFrame(trainer.state.log_history)
    loss_log.to_csv(f"{log_dir}/pre_train_log_{time.strftime('%Y%m%d-%H%M')}.csv")


def pre_train(config: "TrainConfig") -> None:
    """Pre-train the text-to-text model and save the model and its training log.

    Args:
        config: Training configuration. The attributes read here are
            ``tokenizer_dir``, ``train_file``, ``output_dir``,
            ``batch_size_per_gpu``, ``gradient_accumulation_steps``,
            ``learn_rate``, ``logging_steps``, ``epochs``, ``save_steps``,
            ``mixed_precision`` (``'fp16'``, ``'bf16'`` or anything else for
            full precision), ``warmup_steps``, ``seed`` and ``max_seq_len``.

    Side effects:
        Writes checkpoints and the final model to ``config.output_dir`` and a
        CSV of the trainer's log history to ``./logs``.
    """
    # Step 1: load the tokenizer.
    tokenizer = PreTrainedTokenizer.from_pretrained(config.tokenizer_dir)

    # Step 2: build the model. The model configuration gets its own name so the
    # training configuration passed in by the caller is not shadowed; all the
    # training hyper-parameters below are read from ``config``.
    model_config = MyModelConfig()
    model = TexttoTextModel(model_config)

    # Step 3: load the training split.
    dataset = get_dataset(config.train_file, split='train', tokenizer=tokenizer)

    # Step 4: the model is a seq2seq (T5-style) model, hence Seq2SeqTrainer,
    # Seq2SeqTrainingArguments and DataCollatorForSeq2Seq are used.
    generation_config = _build_generation_config(tokenizer)

    # Step 5: training arguments.
    training_args = Seq2SeqTrainingArguments(
        output_dir=config.output_dir,
        per_device_train_batch_size=config.batch_size_per_gpu,
        auto_find_batch_size=True,  # prevent OOM
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        learning_rate=config.learn_rate,
        logging_steps=config.logging_steps,
        num_train_epochs=config.epochs,
        optim="adafactor",
        report_to='tensorboard',
        log_level='info',
        save_steps=config.save_steps,
        save_total_limit=3,
        fp16=config.mixed_precision == 'fp16',
        bf16=config.mixed_precision == 'bf16',
        logging_first_step=True,
        warmup_steps=config.warmup_steps,
        seed=config.seed,
        generation_config=generation_config,
    )

    # Step 6: data collator and trainer callback.
    collator = DataCollatorForSeq2Seq(tokenizer, max_length=config.max_seq_len)
    # NOTE(review): presumably this callback frees the CUDA cache during
    # training -- confirm against MyTrainerCallback's definition.
    empty_cuda_cache = MyTrainerCallback()

    # Step 7: define the trainer.
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
        data_collator=collator,
        callbacks=[empty_cuda_cache],
    )

    # Step 8: train. Pass resume_from_checkpoint=True to continue an
    # interrupted run from the latest checkpoint in output_dir.
    trainer.train()

    # Step 9: save the loss/metric log.
    _save_training_log(trainer)

    # Step 10: save the model.
    trainer.save_model(config.output_dir)


# Script entry point: build the training configuration and launch pre-training.
# ``config`` is bound at module level here, so it stays available afterwards.
if __name__ == '__main__':
    config = TrainConfig()
    pre_train(config)

In [4]:
! ls ./

config.py	  data_preprocessing  test.ipynb  utils.py
customized_model  pretrain	      tokenizer
