# Giving up and training with HuggingFace :(

# Initialize

In [1]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from datasets import load_dataset

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7ffa72171930>>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [2]:
# Constants
# Model weights are read from the "tmp" subdirectory, tokenizer files from its parent.
llama_og_path, llama_token_path = (
    "./models/llama-7b-huggingface/tmp",
    "./models/llama-7b-huggingface",
)

# Both splits come from the same dataset directory (the file names gtrain_10k.csv and
# grammar_validation.csv appear to have been appended to these paths at some point).
train_dataset = test_dataset = "./llama_datasets/grammar_dataset/"

# Text markers placed around the input half and the output half of every prompt.
INPUT_START_LABEL, INPUT_END_LABEL = "[ISTART]", "[IEND]"
OUTPUT_START_LABEL, OUTPUT_END_LABEL = "[OSTART]", "[OEND]"

# Data Initialization

In [3]:
# `train_dataset` starts out as the dataset directory path (constants cell) and is
# rebound here to the loaded training split. Load only while it is still a path, so
# re-running this cell does not feed an already-loaded Dataset back into load_dataset.
if isinstance(train_dataset, str):
    train_dataset = load_dataset(train_dataset, split="train")
# `test_dataset` keeps holding the path, so the validation split can always be reloaded.
eval_dataset = load_dataset(test_dataset, split="validation")

In [4]:
# Training tokenizer: sequences are capped at 512 tokens, padding goes on the left,
# and the tokenizer is asked to append EOS to each encoded sequence.
tokenizer = AutoTokenizer.from_pretrained(
    llama_token_path,
    add_eos_token=True,
    padding_side="left",
    model_max_length=512,
)

# EOS doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [5]:
def tokenize(prompt):
    """Tokenize ``prompt`` to a fixed length and attach causal-LM labels.

    The text is truncated and padded to ``max_length=512`` by the module-level
    ``tokenizer``. ``labels`` is a copy of ``input_ids``, padding positions
    included.

    Args:
        prompt: Text handed to the module-level ``tokenizer``.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry.
    """
    encoded = tokenizer(prompt, max_length=512, truncation=True, padding="max_length")
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

In [6]:
def generate_and_tokenize_prompt(data_point):
    """Build the marker-delimited prompt for one dataset row and tokenize it.

    Args:
        data_point: Mapping with an ``'input'`` entry and a ``'target'`` entry.

    Returns:
        Whatever ``tokenize`` returns for the assembled prompt.
    """
    source_text = data_point['input']
    expected_text = data_point['target']

    # Input half first, output half second; each is wrapped in its start/end markers.
    input_part = f"{INPUT_START_LABEL}{source_text}{INPUT_END_LABEL}"
    output_part = f"{OUTPUT_START_LABEL}{expected_text}{OUTPUT_END_LABEL}"
    return tokenize(input_part + output_part)

In [7]:
# Convert every row of both splits into a tokenized, marker-delimited prompt.
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(generate_and_tokenize_prompt) for split in (train_dataset, eval_dataset)
)

Map:   0%|          | 0/2988 [00:00<?, ? examples/s]

# Prep Model For Training

In [8]:
# FSDP state-dict settings: model and optimizer state dicts use the same options
# (offload_to_cpu=True, rank0_only=False).
_fsdp_state_cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=False)
_fsdp_optim_cfg = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False)

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=_fsdp_state_cfg,
    optim_state_dict_config=_fsdp_optim_cfg,
)

# The accelerator built here is applied to the model after the LoRA adapters are attached.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [9]:
# Quantization settings: 4-bit NF4 weights with double quantization, bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the base model from the weights directory with the quantization applied.
model = AutoModelForCausalLM.from_pretrained(
    llama_og_path,
    quantization_config=bnb_config,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

In [10]:
# Baseline check before fine-tuning. A second tokenizer is created without the
# add_eos_token / padding options of the training tokenizer, so the prompt is
# encoded with a BOS token only.
eval_prompt = "Why is michigan better than  ohio state"
eval_tokenizer = AutoTokenizer.from_pretrained(llama_token_path, add_bos_token=True)

model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    # Generate up to 256 new tokens, then decode the single returned sequence.
    _generated = model.generate(**model_input, max_new_tokens=256)
    print(eval_tokenizer.decode(_generated[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Why is michigan better than  ohio state?
 nobody knows.
Why is michigan better than  ohio state?
Nobody knows.
Why is michigan better than  ohio state?
Nobody knows.
Why is michigan better than  ohio state?
Nobody knows.
Why is michigan better than  ohio state?
Nobody knows.
Why is michigan better than  ohio state?
Nobody knows.
Why is michigan better than  ohio state?
Nobody knows.
Why is michigan better than  ohio state?
Nobody knows.
Why is michigan better than  ohio state?
Nobody knows.
Why is michigan better than  ohio state?
Nobody knows.
Why is michigan better than  ohio state?
Nobody knows.
Why is michigan better than  ohio state?
Nobody knows.
Why is michigan better than  ohio state?
Nobody knows.
Why is michigan better than  ohio state?
Nobody knows.
Why is michigan better than  ohio state?
Nobody knows


In [11]:
# Turn on gradient checkpointing on the quantized base model before it is prepared below.
model.gradient_checkpointing_enable()
# PEFT helper applied to the quantized model before the LoRA adapters are attached.
# NOTE(review): this helper presumably enables gradient checkpointing itself by default,
# which would make the explicit call above redundant — confirm against the PEFT docs.
model = prepare_model_for_kbit_training(model)

In [12]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where every ``param`` provides
            ``numel()`` and ``requires_grad``.

    A model without any parameters is reported with ``trainable%: 0.0``
    instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: with zero parameters the division would fail.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )



In [18]:
# Names of the modules that receive LoRA adapters.
_lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "lm_head",
]

config = LoraConfig(
    task_type="CAUSAL_LM",
    r=4,
    lora_alpha=16,
    lora_dropout=0.05,  # Conventional
    bias="none",
    target_modules=_lora_target_modules,
)

# Wrap the prepared model with the adapters and report how much of it is trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

# Apply the accelerator. You can comment this out to remove the accelerator.
model = accelerator.prepare_model(model)


trainable params: 10138624 || all params: 3510551552 || trainable%: 0.28880430467468604


In [19]:
# When two or more GPUs are visible, flag the model as parallelizable / model-parallel.
# NOTE(review): presumably this keeps the Trainer from wrapping the model in
# DataParallel — confirm against the Trainer implementation.
if torch.cuda.device_count() >= 2:
    model.model_parallel = True
    model.is_parallelizable = True

In [20]:
import transformers
from datetime import datetime

project = "grammar"
base_model_name = "llama2"
run_name = f"{base_model_name}-{project}"
output_dir = f"./{run_name}"

tokenizer.pad_token = tokenizer.eos_token

_training_args = transformers.TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=output_dir,
    logging_dir="./logs",
    # Optimisation schedule: 20 sequences per device x 4 accumulation steps
    # = 80 sequences per device per optimizer step.
    per_device_train_batch_size=20,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    max_steps=1000,
    warmup_steps=5,
    learning_rate=2.5e-5,  # Want about 10x smaller than the Mistral learning rate
    optim="paged_adamw_8bit",
    bf16=True,
    # Log, evaluate and checkpoint on the same 50-step cadence.
    logging_steps=50,
    save_strategy="steps",
    save_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    do_eval=True,  # Run evaluation on eval_dataset
)

# Causal-LM collator (mlm=False).
# NOTE(review): with pad_token == eos_token this collator presumably drops real EOS
# positions from the loss together with the padding — confirm that this is intended.
_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=_training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=_collator,
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()


max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
50,2.2719,1.592156




KeyboardInterrupt: 