In [1]:
import os

import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

device = "cuda"

# Model weights and tokenizer are both read from the same local directory;
# the model is requested with 8-bit loading.
model = AutoModelForCausalLM.from_pretrained(
    "/workspace/model/bloomz-3b",
    load_in_8bit=True,
)
tokenizer = AutoTokenizer.from_pretrained("/workspace/model/bloomz-3b")

  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /usr/local/conda/lib/python3.10/site-packages/bitsandbytes-0.37.2-py3.10.egg/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)
  warn(msg)
  warn(msg)


In [2]:
from peft import prepare_model_for_int8_training
# Run the 8-bit model through PEFT's preparation helper before any adapter is attached.
# NOTE(review): the exact adjustments the helper makes (e.g. freezing, dtype casts) depend on
# the installed peft version — confirm against its documentation.
# `model` is rebound here, so re-running this cell applies the helper to an already-prepared model.
model = prepare_model_for_int8_training(model)



In [3]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    The trainable share is reported as ``0.0`` when the model has no
    parameters at all, instead of failing with ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: a parameter-free model would otherwise divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

from peft import LoraConfig, get_peft_model

# LoRA adapter settings for causal language modelling; adapters are attached
# to the modules named "query_key_value".
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapter, place it on `device`, then report how many
# parameters remain trainable.
model = get_peft_model(model, config).to(device)
print_trainable_parameters(model)

trainable params: 4915200 || all params: 3007472640 || trainable%: 0.1634329082375293


In [4]:
import transformers
from datasets import load_dataset

# The dataset is read from a local directory
# (earlier variant, kept for reference: load_dataset("Abirate/english_quotes")).
# Each batch of "quote" strings is tokenized right after loading.
data = load_dataset(
    "/workspace/data/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96"
).map(lambda samples: tokenizer(samples["quote"]), batched=True)




In [5]:
from transformers import Seq2SeqTrainer, TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

class SavePeftModelCallback(TrainerCallback):
    """Trainer callback that writes the PEFT adapter into each checkpoint directory.

    On every save event the adapter is stored under
    ``<output_dir>/<PREFIX_CHECKPOINT_DIR>-<global_step>/adapter_model`` and a
    ``pytorch_model.bin`` located directly in the checkpoint directory is
    deleted if present.
    """

    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Save the adapter for the current step and remove ``pytorch_model.bin``.

        Args:
            args: Supplies ``output_dir``, the root under which checkpoints live.
            state: Supplies ``global_step``, used to name the checkpoint folder.
            control: Returned unchanged.
            **kwargs: Must contain ``"model"``, an object offering ``save_pretrained``.

        Returns:
            The ``control`` object that was passed in.
        """
        step_dir_name = "{}-{}".format(PREFIX_CHECKPOINT_DIR, state.global_step)
        checkpoint_dir = os.path.join(args.output_dir, step_dir_name)
        print("checkpoint folder: ", checkpoint_dir)

        adapter_dir = os.path.join(checkpoint_dir, "adapter_model")
        kwargs["model"].save_pretrained(adapter_dir)

        # Diagnostic listing of what is on disk after the adapter was written.
        print("checkpoint folder list: ", os.listdir(checkpoint_dir))
        print("checkpoint adapter folder list: ", os.listdir(adapter_dir))

        # Drop the pytorch_model.bin file from the checkpoint folder when one exists.
        full_weights_file = os.path.join(checkpoint_dir, "pytorch_model.bin")
        if os.path.exists(full_weights_file):
            os.remove(full_weights_file)
        return control

# Short demonstration run: 20 optimizer steps, logging every step and
# checkpointing every 10 steps into "outputs".
args = transformers.TrainingArguments(
    output_dir="outputs",
    # batch shape
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # optimisation schedule
    learning_rate=2e-4,
    warmup_steps=5,
    max_steps=20,
    fp16=True,
    # logging and checkpointing
    logging_steps=1,
    save_strategy="steps",
    save_steps=10,
)

trainer = transformers.Trainer(
    model=model,
    args=args,
    train_dataset=data["train"],
    # mlm=False selects the non-masked (causal) language-modelling collation
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    callbacks=[SavePeftModelCallback()],
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

trainer.train()

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,3.2114
2,3.3737
3,3.1703
4,3.2821
5,3.1871
6,3.1968
7,2.9529
8,3.2128
9,3.1687
10,3.1167


checkpoint folder:  outputs/checkpoint-10
checkpoint folder list:  ['README.md', 'adapter_config.json', 'adapter_model', 'adapter_model.safetensors', 'optimizer.pt', 'rng_state.pth', 'scheduler.pt', 'trainer_state.json', 'training_args.bin']
checkpoint adapter folder list:  ['README.md', 'adapter_config.json', 'adapter_model.bin']




checkpoint folder:  outputs/checkpoint-20
checkpoint folder list:  ['README.md', 'adapter_config.json', 'adapter_model', 'adapter_model.safetensors', 'optimizer.pt', 'rng_state.pth', 'scheduler.pt', 'trainer_state.json', 'training_args.bin']
checkpoint adapter folder list:  ['README.md', 'adapter_config.json', 'adapter_model.bin']


TrainOutput(global_step=20, training_loss=3.098853278160095, metrics={'train_runtime': 122.735, 'train_samples_per_second': 10.429, 'train_steps_per_second': 0.163, 'total_flos': 2954973215784960.0, 'train_loss': 3.098853278160095, 'epoch': 0.51})

In [6]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint directory written at the final training step; the adapter config
# and weights are read from here.
peft_model_id = "outputs/checkpoint-20/"
config = PeftConfig.from_pretrained(peft_model_id)
# Reload the base model named in the adapter config, again in 8-bit, and let
# `device_map="auto"` decide where its weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)
# Inference only: make sure dropout (the adapter was trained with lora_dropout)
# is disabled while generating.
model.eval()

# Tokenizer output is created on the CPU; move it to the device the model lives
# on so generation does not mix devices.
batch = tokenizer("Two things are infinite: ", return_tensors="pt").to(model.device)

with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, max_new_tokens=50)

print("output：\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=True))



output：

 Two things are infinite:  the universe and the number of ways you can screw up.
