In [2]:
#Code origin
#Author: Alexander Valentini
#The preference data is transformed into the huggingface DPO format - and all pairs where the prompt is longer than 900 tokens
#or one of the answers are longer than 1024 tokens are removed. 


import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, DataCollatorWithPadding#, BitsAndBytesConfig 
from datasets import load_from_disk, load_dataset, Dataset
import numpy as np
#Using quantization, but this should probably not be in the final version:
from pathlib import Path
from typing import Dict, Optional

from trl import DPOTrainer

from peft import LoraConfig, AutoPeftModelForCausalLM


#bnb_config = BitsAndBytesConfig(
#    load_in_4bit=True,
#    llm_int8_threshold=6.0,
#    llm_int8_has_fp16_weight=False,
#    bnb_4bit_compute_dtype=torch.bfloat16,
#    bnb_4bit_use_double_quant=True,
#    bnb_4bit_quant_type="nf4",
#)

model_name = 'stabilityai/stablelm-zephyr-3b'
safetensors_path = 'AlexVal/sft-model'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
model = AutoModelForCausalLM.from_pretrained(safetensors_path, attn_implementation="sdpa",
    torch_dtype=torch.bfloat16, use_safetensors = True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
#model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation="sdpa",
#    torch_dtype=torch.bfloat16, use_safetensors = True, device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
model

StableLmForCausalLM(
  (model): StableLmModel(
    (embed_tokens): Embedding(50278, 2560)
    (layers): ModuleList(
      (0-31): 32 x StableLmDecoderLayer(
        (self_attn): StableLmSdpaAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (o_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
          (rotary_emb): StableLmRotaryEmbedding()
        )
        (mlp): StableLmMLP(
          (gate_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affin

In [4]:
prompt = [{'role': 'user', 'content': 'List 3 synonyms for the word "tiny"'}]
inputs = tokenizer.apply_chat_template(
    prompt,
    add_generation_prompt=True,
    return_tensors='pt'
)

tokens = model.generate(
    inputs.to(model.device),
    max_new_tokens=20,
    temperature=0.8,
    do_sample=True
)

print(tokenizer.decode(tokens[0], skip_special_tokens=False))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


<|user|>
List 3 synonyms for the word "tiny"<|endoftext|>
<|assistant|>
1.
2.
3.<|endoftext|>


In [5]:
device

device(type='cuda')

In [18]:
#Read in training and evaluation data:
import os

project_dir = os.path.dirname(os.path.abspath(os.getcwd()))
#data_dir = os.path.join(project_dir, 'datasets\dpo_preference_data_self_made')
#preference_data = load_from_disk(data_dir)
#train_dataset = preference_data['train']
#eval_dataset = preference_data['eval']

train_data_path = os.path.join(project_dir, 'datasets/dpo_preference_data_self_made/train_preference_data.jsonl')
vali_data_path = os.path.join(project_dir, 'datasets/dpo_preference_data_self_made/eval_preference_data.jsonl')

dataset=load_dataset('json', data_files={"train":train_data_path, "validation":vali_data_path})

train_dataset = load_dataset('json',data_files=train_data_path)
eval_dataset = load_dataset('json',data_files=vali_data_path)

#train_dataset
#print(f"Number of examples in the training set: {len(train_dataset)}")
#print(f"Number of examples in the evaluation set: {len(eval_dataset)}")

In [19]:
train_dataset['train'][0]

{'prompt': '<|user|>\nQuestion: Let $X$ denote the random variable associated to the plaintexts and $Y$ the random variable associated to the corresponding ciphertexts. If a cryptosystem achieves perfect secrecy, then we have that \\dots?\n\nOptions:\nA. $\\Pr [X=x|Y=y] = \\Pr[Y=y]$.\nB. $\\Pr [X=x|Y=y] = \\Pr[X=x]$.\nC. $\\Pr [X=x|Y=y] = \\Pr[X=x,Y=y]$.\nD. $\\Pr [X=x] = \\Pr[Y=y]$.<|endoftext|',
 'chosen': '<|assistant|>\nThe correct statement is: \\"$\\\\\\\\Pr [X=x|Y=y] = \\\\\\\\Pr[X=x]$.\\"\\n\\nPerfect secrecy in a cryptosystem means that observing the ciphertext does not give any information about the plaintext. Mathematically, this is formalized as $\\\\\\\\Pr [X=x|Y=y] = \\\\\\\\Pr[X=x]$, meaning that the probability of a certain plaintext being the true message given the ciphertext is the same as the probability of that plaintext being the true message without knowing the ciphertext.\\n\\nThe other statements are not necessarily true in the context of perfect secrecy. \\"$\\

In [21]:
#Setting up Lora:
peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.05,
        r=256,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)

In [22]:
#Training arguments for DPO:

args = TrainingArguments(
    output_dir="checkpoints/dpo_model_test",               # directory to save and repository id
    num_train_epochs=1,                     # number of training epochs
    per_device_train_batch_size=12,         # batch size per device during training
    per_device_eval_batch_size=4,           # batch size for evaluation
    gradient_accumulation_steps=1,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    learning_rate=5e-5,                     # 10x higher LR than QLoRA paper
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.1,                       # warmup ratio based on QLoRA paper
    lr_scheduler_type="cosine",             # use cosine learning rate scheduler
    logging_steps=25,                       # log every 25 steps
    save_steps=400,                         # when to save checkpoint
    save_total_limit=4,                     # limit the total amount of checkpoints
    evaluation_strategy="steps",            # evaluate every 1000 steps
    eval_steps=700,                         # when to evaluate
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    push_to_hub=False,                      # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
)
 
dpo_args = {
    "beta": 0.1,                            # The beta factor in DPO loss. Higher beta means less divergence
    "loss_type": "sigmoid"                  # The loss type for DPO.
}



In [24]:
#Finding 99th percentile of data length for padding:
prompt_length = int(np.percentile([len(tokenizer(x)["input_ids"]) for x in train_dataset['train']["prompt"]], 99))
rejected_length = int(np.percentile([len(tokenizer(x)["input_ids"]) for x in train_dataset['train']["rejected"]], 99))
chosen_length = int(np.percentile([len(tokenizer(x)["input_ids"]) for x in train_dataset['train']["chosen"]], 99))

print(prompt_length)
print(rejected_length)
print(chosen_length)






593
704
755


In [25]:
#This should be increased in later versions:
prompt_max_length = 900
max_seq_length = 1024

In [26]:
trainer = DPOTrainer(
    model,
    ref_model=None, # set to none since we use peft
    peft_config=peft_config,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    max_length=max_seq_length,
    max_prompt_length=prompt_max_length,
    beta=dpo_args["beta"],
    loss_type=dpo_args["loss_type"],
)



Map:   0%|          | 0/21120 [00:00<?, ? examples/s]

In [10]:
checkpoint_dir = Path(args.output_dir)
if checkpoint_dir.exists():
    checkpoints = list(checkpoint_dir.glob("checkpoint-*"))
    if checkpoints:
        latest_checkpoint = max(checkpoints, key=lambda path: int(path.name.split("-")[-1]))
        print(f"Loading from checkpoint {latest_checkpoint}")
    else:
        latest_checkpoint = None
else:
    latest_checkpoint = None

# Continue training from the latest checkpoint if available
if latest_checkpoint:
    trainer.train(resume_from_checkpoint=str(latest_checkpoint))
else:
    trainer.train()

# save model at the end of training
trainer.save_model()

  0%|          | 0/1783 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [None]:
peft_model = AutoPeftModelForCausalLM.from_pretrained(
    args.output_dir,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)

# Merge LoRA and base model and save
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained(args.output_dir, safe_serialization=True, max_shard_size="2GB")
tokenizer.save_pretrained(args.output_dir)