In [1]:
# import torch._dynamo
# torch._dynamo.config.suppress_errors = True

In [2]:
import logging
import sys

import torch
import transformers
import datasets
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, get_peft_model_state_dict
from transformers import LlamaForCausalLM, LlamaTokenizer, LlamaTokenizerFast

In [3]:
BASE_MODEL = '/mnt/nfs/zsd_server/models/huggingface/llama-7b-hf_yahma'
DATA_PATH = '/mnt/nfs/zsd_server/data/origin/alpaca_data.json'
SAVE_PATH = '/mnt/nfs/zsd_server/models/my/llama-7b_save'

In [4]:
class ARGS:
    max_length = 512
    micro_batch_size = 1
    gradient_accumulation_steps = 64
    batch_size = micro_batch_size * gradient_accumulation_steps
    learning_rate = 3e-4
    epoch = 1
    train_steps = 52000 // batch_size * epoch
    lora_r = 16
    lora_alpha = 32
    lora_dropout = 0.05
    lora_target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ]

In [5]:
logger = logging.getLogger(__name__)

In [6]:
# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

# The default of training_args.log_level is passive, so we set log level at info here to have that default.
transformers.utils.logging.set_verbosity_info()

log_level = logging.INFO
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()


In [7]:
model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    # load_in_8bit=True,
    # torch_dtype=torch.float16,
    torch_dtype=torch.bfloat16,
    # device_map="auto",
    # device_map="cpu",
    device_map="cuda",
)

[INFO|configuration_utils.py:713] 2023-12-16 01:28:55,004 >> loading configuration file /mnt/nfs/zsd_server/models/huggingface/llama-7b-hf_yahma/config.json
[INFO|configuration_utils.py:775] 2023-12-16 01:28:55,005 >> Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.32.1",
  "use_cache": true,
  "vocab_size": 32000
}

[INFO|modeling_utils.py:2776] 2023-12-16 01:28:55,006 >> loading weights file /mnt/nfs/zsd_server/models/huggingface/llama-7b-hf_yahma/pytorch_model.bin.index.json
[INFO|modeling

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO|modeling_utils.py:3551] 2023-12-16 01:28:59,775 >> All model checkpoint weights were used when initializing LlamaForCausalLM.

[INFO|modeling_utils.py:3559] 2023-12-16 01:28:59,776 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at /mnt/nfs/zsd_server/models/huggingface/llama-7b-hf_yahma.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
[INFO|configuration_utils.py:728] 2023-12-16 01:28:59,778 >> loading configuration file /mnt/nfs/zsd_server/models/huggingface/llama-7b-hf_yahma/generation_config.json
[INFO|configuration_utils.py:768] 2023-12-16 01:28:59,778 >> Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 0,
  "transformers_version": "4.32.1"
}



In [8]:
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
# tokenizer = LlamaTokenizerFast.from_pretrained(BASE_MODEL, use_fast=False)
tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token
# tokenizer.bos_token_id = 1  # <s>
# tokenizer.eos_token_id = 2  # </s>
# print(tokenizer)
print(f"tokenizer.pad_token_id:{tokenizer.pad_token_id} tokenizer.pad_token:{tokenizer.pad_token}")
print(f"tokenizer.bos_token_id:{tokenizer.bos_token_id} tokenizer.bos_token:{tokenizer.bos_token}")
print(f"tokenizer.eos_token_id:{tokenizer.eos_token_id} tokenizer.eos_token:{tokenizer.eos_token}")

[INFO|tokenization_utils_base.py:1850] 2023-12-16 01:28:59,788 >> loading file tokenizer.model
[INFO|tokenization_utils_base.py:1850] 2023-12-16 01:28:59,788 >> loading file added_tokens.json
[INFO|tokenization_utils_base.py:1850] 2023-12-16 01:28:59,788 >> loading file special_tokens_map.json
[INFO|tokenization_utils_base.py:1850] 2023-12-16 01:28:59,788 >> loading file tokenizer_config.json


tokenizer.pad_token_id:0 tokenizer.pad_token:<unk>
tokenizer.bos_token_id:1 tokenizer.bos_token:<s>
tokenizer.eos_token_id:2 tokenizer.eos_token:</s>


In [9]:
data = load_dataset("json", data_files=DATA_PATH)
# print(data)
print(data['train'])

Using custom data configuration default-3b046fe65c5aa960


12/16/2023 01:29:00 - INFO - datasets.builder - Using custom data configuration default-3b046fe65c5aa960


Loading Dataset Infos from /home/zsd/miniconda3/envs/huggingface/lib/python3.11/site-packages/datasets/packaged_modules/json


12/16/2023 01:29:00 - INFO - datasets.info - Loading Dataset Infos from /home/zsd/miniconda3/envs/huggingface/lib/python3.11/site-packages/datasets/packaged_modules/json


Overwrite dataset info from restored data version if exists.


12/16/2023 01:29:00 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from /home/zsd/.cache/huggingface/datasets/json/default-3b046fe65c5aa960/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96


12/16/2023 01:29:00 - INFO - datasets.info - Loading Dataset info from /home/zsd/.cache/huggingface/datasets/json/default-3b046fe65c5aa960/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96


Found cached dataset json (/home/zsd/.cache/huggingface/datasets/json/default-3b046fe65c5aa960/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


12/16/2023 01:29:00 - INFO - datasets.builder - Found cached dataset json (/home/zsd/.cache/huggingface/datasets/json/default-3b046fe65c5aa960/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


Loading Dataset info from /home/zsd/.cache/huggingface/datasets/json/default-3b046fe65c5aa960/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96


12/16/2023 01:29:00 - INFO - datasets.info - Loading Dataset info from /home/zsd/.cache/huggingface/datasets/json/default-3b046fe65c5aa960/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
Dataset({
    features: ['instruction', 'output', 'input'],
    num_rows: 52002
})


In [10]:
def generate_prompt(data_point):
    prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{data_point['instruction']}"""

    if data_point['input']:
        prompt += f"""\n\n### Input:\n{data_point['input']}"""

    prompt += f"""\n\n### Response:\n{data_point['output']}"""

    return prompt

In [11]:
def tokenize(prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=ARGS.max_length,
        padding=False,
        return_tensors=None,
    )
    if (
            result["input_ids"][-1] != tokenizer.eos_token_id
            and len(result["input_ids"]) < ARGS.max_length
            and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

In [12]:
def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenize(full_prompt)
    return tokenized_full_prompt

In [13]:
a = generate_and_tokenize_prompt({'instruction': 'hello', 'input': '', 'output': 'Hello.'})
print(a)
print(type(a['input_ids']))
print(tokenizer.decode(a['input_ids']))

{'input_ids': [1, 13866, 338, 385, 15278, 393, 16612, 263, 3414, 29892, 3300, 2859, 411, 385, 1881, 393, 8128, 4340, 3030, 29889, 14350, 263, 2933, 393, 7128, 2486, 1614, 2167, 278, 2009, 29889, 13, 13, 2277, 29937, 2799, 4080, 29901, 13, 12199, 13, 13, 2277, 29937, 13291, 29901, 13, 10994, 29889, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [1, 13866, 338, 385, 15278, 393, 16612, 263, 3414, 29892, 3300, 2859, 411, 385, 1881, 393, 8128, 4340, 3030, 29889, 14350, 263, 2933, 393, 7128, 2486, 1614, 2167, 278, 2009, 29889, 13, 13, 2277, 29937, 2799, 4080, 29901, 13, 12199, 13, 13, 2277, 29937, 13291, 29901, 13, 10994, 29889, 2]}
<class 'list'>
<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
hello

### Response:
Hello.</s>


In [14]:
data = data["train"].train_test_split(test_size=200, shuffle=True, seed=42)
data["train"] = data["train"].map(generate_and_tokenize_prompt, remove_columns=data["train"].column_names)
data["test"] = data["test"].map(generate_and_tokenize_prompt, remove_columns=data["test"].column_names)
print(data)

Loading cached split indices for dataset at /home/zsd/.cache/huggingface/datasets/json/default-3b046fe65c5aa960/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-4ca01b4a35da0554.arrow and /home/zsd/.cache/huggingface/datasets/json/default-3b046fe65c5aa960/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-8880a9c7aab019ed.arrow


12/16/2023 01:29:00 - INFO - datasets.arrow_dataset - Loading cached split indices for dataset at /home/zsd/.cache/huggingface/datasets/json/default-3b046fe65c5aa960/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-4ca01b4a35da0554.arrow and /home/zsd/.cache/huggingface/datasets/json/default-3b046fe65c5aa960/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-8880a9c7aab019ed.arrow


Loading cached processed dataset at /home/zsd/.cache/huggingface/datasets/json/default-3b046fe65c5aa960/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-c3bcce630c169df7.arrow


12/16/2023 01:29:00 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /home/zsd/.cache/huggingface/datasets/json/default-3b046fe65c5aa960/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-c3bcce630c169df7.arrow


Loading cached processed dataset at /home/zsd/.cache/huggingface/datasets/json/default-3b046fe65c5aa960/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-b5e56414dbfe2789.arrow


12/16/2023 01:29:00 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /home/zsd/.cache/huggingface/datasets/json/default-3b046fe65c5aa960/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-b5e56414dbfe2789.arrow
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 51802
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})


In [15]:
config = LoraConfig(
    r=ARGS.lora_r,
    lora_alpha=ARGS.lora_alpha,
    target_modules=ARGS.lora_target_modules,
    lora_dropout=ARGS.lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 39,976,960 || all params: 6,778,392,576 || trainable%: 0.589770503135875


In [16]:
# model = torch.compile(model)

In [17]:
training_arguments = transformers.TrainingArguments(
    per_device_train_batch_size=ARGS.micro_batch_size,
    gradient_accumulation_steps=ARGS.gradient_accumulation_steps,
    # warmup_steps=100,
    warmup_ratio=0.1,
    max_steps=ARGS.train_steps,
    learning_rate=ARGS.learning_rate,
    # fp16=True,
    bf16=True,
    optim="adamw_torch",
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    output_dir=SAVE_PATH,
    save_total_limit=3,
    load_best_model_at_end=True,
    # report_to="tensorboard",
    logging_dir='logs',
    logging_steps=1,
    auto_find_batch_size=False,
    # torch_compile=True,
    do_train=True,
)
# training_arguments

[INFO|training_args.py:1327] 2023-12-16 01:29:01,049 >> Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
[INFO|training_args.py:1769] 2023-12-16 01:29:01,049 >> PyTorch: setting up devices
[INFO|training_args.py:1480] 2023-12-16 01:29:01,049 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [18]:
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, 
    return_tensors="pt", 
    padding=True,
    pad_to_multiple_of=8, 
    # pad_to_multiple_of=ARGS.max_length, 
)

In [19]:
data_collator

DataCollatorForSeq2Seq(tokenizer=LlamaTokenizer(name_or_path='/mnt/nfs/zsd_server/models/huggingface/llama-7b-hf_yahma', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False), model=None, padding=True, max_length=None, pad_to_multiple_of=8, label_pad_token_id=-100, return_tensors='pt')

In [20]:
batch = data_collator([data["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [21]:
# batch['input_ids'].shape

In [22]:
# batch['attention_mask'].shape

In [23]:
# batch['labels'].shape

In [24]:
# trainer = transformers.Trainer(
#     model=model,
#     train_dataset=data["train"],
#     eval_dataset=data["test"],
#     args=training_arguments,
#     data_collator=data_collator
# )

In [25]:
model.config.use_cache = False

# old_state_dict = model.state_dict
# model.state_dict = (
#     lambda self, *_, **__: get_peft_model_state_dict(
#         self, old_state_dict()
#     )
# ).__get__(model, type(model))

In [26]:
# model = torch.compile(model)

In [27]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    eval_dataset=data["test"],
    args=training_arguments,
    data_collator=data_collator
)

[INFO|trainer.py:393] 2023-12-16 01:29:01,091 >> You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
[INFO|trainer.py:565] 2023-12-16 01:29:01,092 >> max_steps is given, it will override any value given in num_train_epochs


In [28]:
# print(model)

In [None]:
trainer.train()

[INFO|trainer.py:1714] 2023-12-16 01:29:01,164 >> ***** Running training *****
[INFO|trainer.py:1715] 2023-12-16 01:29:01,165 >>   Num examples = 51,802
[INFO|trainer.py:1716] 2023-12-16 01:29:01,165 >>   Num Epochs = 2
[INFO|trainer.py:1717] 2023-12-16 01:29:01,165 >>   Instantaneous batch size per device = 1
[INFO|trainer.py:1720] 2023-12-16 01:29:01,165 >>   Total train batch size (w. parallel, distributed & accumulation) = 64
[INFO|trainer.py:1721] 2023-12-16 01:29:01,165 >>   Gradient Accumulation steps = 64
[INFO|trainer.py:1722] 2023-12-16 01:29:01,166 >>   Total optimization steps = 812
[INFO|trainer.py:1723] 2023-12-16 01:29:01,167 >>   Number of trainable parameters = 39,976,960


Step,Training Loss,Validation Loss
