## Continuous Pre-training using LLM recipie

In [None]:
! pip install datasets transformers torch tensorboard

In [2]:
# ====================================================IMPORT=================================================================

from datasets import DatasetDict, load_dataset, load_from_disk
import os
import warnings
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)
import time
from torch.utils.tensorboard import SummaryWriter

warnings.filterwarnings("ignore")
os.environ.update(
    {
        # "NCCL_P2P_DISABLE": "1",
        # "NCCL_IB_DISABLE": "1",
        # "TOKENIZERS_PARALLELISM": "false",
        # "CUDA_VISIBLE_DEVICES": "3,2,1",
    }
)

from datasets import load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
    logging,
    DataCollatorForLanguageModeling,
)
import torch
from torch.utils.data import Dataset
import pandas as pd

In [3]:
sample_number = 10000  # Number of samples to be used for training and evaluation
learning_rate = 1e-6  # Learning rate for the optimizer
per_device_train_batch_size = 10  # Batch size for training per device
per_device_eval_batch_size = 10  # Batch size for evaluation per device
num_train_epochs = 5  # Number of epochs for training
weight_decay = 0.01  # Weight decay rate for regularization
MAX_LEN = 512  # Maximum sequence length for model inputs
warmup_ratio = 0.5
resume_from_checkpoint = False

model_name = (
    # "/home/llama/Personal_Directories/srb/binary_classfication/Llama-3.2-3B-Instruct"
    "/home/snt/llm_models/Llama-3.2-1B-Instruct"
)
val_dataset_path = (
    "data/fake_targets/flores_devtest_arrow"
)
train_dataset_path = "data/processed/dataset_merged_llama_fake_targets.jsonl"

In [7]:
if train_dataset_path.endswith(".jsonl"):
    train_dataset_df = pd.read_json(train_dataset_path, lines=True)
else:
    train_dataset_df = load_from_disk(train_dataset_path)

train_dataset = (
    train_dataset_df[["input", "translated_text"]]
    .rename(columns={
        "input": "Luxembourgish",
        "translated_text": "English",
    })
    .iloc[:sample_number]
)

val_dataset = (
    load_from_disk(val_dataset_path)
    .rename_columns(
        {
            "sentence_ltz_Latn": "Luxembourgish",
            "sentence_eng_Latn": "English",
        }
    )
    .select([i for i in range(100)])
)


tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


def create_prompt(
    sample, mode="train", src_lng="Luxembourgish", tgt_lng="English", tokenizer=None
):
    """
    Create a prompt using the model's EOS token.

    Args:
        sample (dict): A dictionary containing source and target text.
        mode (str): The mode, either 'train' or 'test'.
        src_lng (str): Source language name.
        tgt_lng (str): Target language name.
        tokenizer: The tokenizer associated with the model (required to fetch EOS token).

    Returns:
        dict: A dictionary with the constructed prompt.
    """
    # Validate the tokenizer input
    if tokenizer is None or tokenizer.eos_token is None:
        raise ValueError("A tokenizer with a defined EOS token is required.")

    # Define the system message template.
    system_message = f"Translate the {src_lng} input text into {tgt_lng}.".upper()
    input_text = sample[src_lng.capitalize()].strip()  # Extract the input text.
    response = (
        sample[tgt_lng.capitalize()].strip() if tgt_lng.capitalize() in sample else ""
    )  # Extract the target text.

    # Get the EOS token from the tokenizer.
    eos_token = tokenizer.eos_token

    # Construct the full prompt.
    full_prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        + system_message
        + "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
    )
    full_prompt += (
        input_text + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    )
    if mode == "train":
        full_prompt += response + eos_token
    return {"prompt_response": full_prompt}


train_dataset = train_dataset.map(
    lambda sample: {
        "prompt_response": create_prompt(sample, mode="train", tokenizer=tokenizer)[
            "prompt_response"
        ]
    }
).select_columns(["prompt_response"])

val_dataset = val_dataset.map(
    lambda sample: {
        "prompt_response": create_prompt(sample, mode="train", tokenizer=tokenizer)[
            "prompt_response"
        ]
    }
).select_columns(["prompt_response"])


dataset = DatasetDict({"train": train_dataset, "val": val_dataset})
data_collator = DataCollatorForLanguageModeling(
    tokenizer, mlm=False, return_tensors="pt"
)


def tokenize_function(examples):
    return tokenizer(
        examples["prompt_response"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    )


tokenized_train_dataset = train_dataset.map(
    tokenize_function, batched=True, remove_columns=["prompt_response"]
)
tokenized_val_dataset = val_dataset.map(
    tokenize_function, batched=True, remove_columns=["prompt_response"]
)

# from torch.utils.data import DataLoader

# train_dataloader = DataLoader(
#     tokenized_train_dataset, batch_size=5, collate_fn=data_collator
# )

# for i in train_dataloader:
#     for j in tokenizer.batch_decode(i["input_ids"]):
#         print(j)

# val_dataloader = DataLoader(
#     tokenized_val_dataset, batch_size=5, collate_fn=data_collator
# )

FileNotFoundError: Directory /home/snt/projects_lujun/mt_luxembourgish/data/fake_targets/flores_devtest_arrow not found

In [10]:
# ====================================================TRAINING=================================================================
def print_trainable_parameters(model):
    """Prints the number of trainable parameters in the model."""
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    return f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"

In [13]:
def train_ddp_accelerate():
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.config.use_cache = False
    current = time.time()
    output_dir = f"logs/fit_{current}"
    print(print_trainable_parameters(model))
    
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        warmup_ratio=warmup_ratio,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # eval_steps=500,
        logging_steps=25,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        fp16=True,
        max_grad_norm=0.3,
        group_by_length=True,
        lr_scheduler_type="cosine",
        report_to="tensorboard",
        ddp_find_unused_parameters=False,
        remove_unused_columns=False,
        disable_tqdm=False,
        load_best_model_at_end=True,
    )
    writer = SummaryWriter(log_dir=output_dir)
    # 定义 Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    # print("Starting evaluation before training...")
    # initial_eval_results = trainer.evaluate()
    # print(f"Initial evaluation results: {initial_eval_results}")

    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    # 保存模型
    model.save_pretrained(f"{output_dir}/llama2-parallel-trained")
    tokenizer.save_pretrained(f"{output_dir}/llama2-parallel-trained")

In [14]:
from accelerate import Accelerator


def main():
    accelerator = Accelerator()
    train_ddp_accelerate()


if __name__ == "__main__":
    main()

trainable params: 1235814400 || all params: 1235814400 || trainable%: 100.0


Epoch,Training Loss,Validation Loss
1,2.2555,2.479201
2,1.7344,2.126006
3,1.4445,2.107605
4,1.1748,2.178716
5,1.1215,2.23527


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
