In [1]:
import torch
import os
import logging
# Logging: install the root handler at ERROR, then open this notebook's own
# logger up to DEBUG. The logger is created once and configured before first use.
logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Expose only GPUs 0-3 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'

# SECURITY: access tokens must never be written into the notebook source.
# Export HF_TOKEN in the shell (or log in with `huggingface-cli login`) before
# starting the kernel. The token that used to be hardcoded in this cell has
# been exposed and should be revoked.
if not os.environ.get('HF_TOKEN'):
    logger.warning(
        "HF_TOKEN is not set; downloads of gated or private Hugging Face "
        "repositories will fail until it is exported in the environment."
    )

# Report which CUDA devices are visible to torch.
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")
    for gpu_id in range(num_gpus):
        print(f"GPU {gpu_id}: {torch.cuda.get_device_name(gpu_id)}")
else:
    print("No GPU available.")

Number of available GPUs: 4
GPU 0: NVIDIA A40
GPU 1: NVIDIA A40
GPU 2: NVIDIA A40
GPU 3: NVIDIA A40


In [2]:
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)
from peft import LoraConfig, prepare_model_for_kbit_training
# Hub id of the checkpoint that is loaded below and handed to the trainer.
hg_legal_model = "Dhananjayg22/legal-triplet-extractor"

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)

# LoRA adapter settings: rank 8, applied to the attention projections
# (q/k/v/o) and the MLP projections (up/down/gate).
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
    r=8,
    # NOTE(review): the original comment here read "64 is expected" -- confirm
    # which alpha is actually intended.
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Load the quantized checkpoint (sharded across the visible GPUs via
# device_map="auto") and prepare it for k-bit training in one step.
# Per the original note, the preparation casts the layernorms to fp32, makes
# the output embedding layer require grads and upcasts the lm head to fp32.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        hg_legal_model,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        device_map="auto",
        quantization_config=bnb_config,
    )
)

  from .autonotebook import tqdm as notebook_tqdm
Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 4/4 [00:15<00:00,  3.79s/it]


In [3]:
from datasets import Dataset
from transformers import AutoTokenizer

# Dataset previously saved to disk under dataset/dpo.
dataset = Dataset.load_from_disk("dataset/dpo")

# The tokenizer is taken from the Gemma instruct checkpoint rather than from
# the fine-tuned repository.
# NOTE(review): presumably the fine-tuned model shares this vocabulary -- confirm.
base_model = "google/gemma-7b-it"
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [4]:
# Hold out 30% of the examples for evaluation. The split is seeded so that a
# kernel restart reproduces the same train/test partition; without a seed the
# reported metrics are computed on a different held-out set on every run.
raw_dataset = dataset.train_test_split(test_size=0.3, seed=42)
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 210
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 90
    })
})

In [5]:
from trl import DPOTrainer
from transformers import TrainingArguments


# Training hyper-parameters for the DPO run.
training_args = TrainingArguments(
    output_dir="dpo-output",  # It is handy to append `hub_model_revision` to keep track of your local experiments
    seed=42,
    # Precision and memory.
    bf16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    optim="paged_adamw_32bit",
    # Batching: 4 examples per step, accumulated over 4 steps.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=5.0e-6,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    # Evaluate and checkpoint once per epoch. `eval_steps` / `save_steps` are
    # only consulted by the "steps" strategies, so they are not set here.
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Keep the checkpoint with the lowest validation loss instead of whatever
    # the last epoch produced; earlier runs showed validation loss bottoming
    # out mid-training and rising afterwards.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # The DPO collator needs the raw prompt/chosen/rejected columns.
    remove_unused_columns=False,
    # Logging.
    log_level="info",
    logging_steps=10,
    report_to="wandb",
)

# NOTE(review): `max_length` / `max_prompt_length` are left at the trainer's
# defaults. An earlier draft of this cell attempted max_prompt_length=3500,
# so long prompts may be getting truncated -- confirm against the token
# lengths in the dataset and pass both arguments explicitly if needed.
trainer = DPOTrainer(
    model,
    # No separate reference model is supplied; with a peft_config the trainer
    # is expected to derive the reference from the same weights -- see the
    # TRL documentation.
    ref_model=None,
    args=training_args,
    beta=0.1,
    train_dataset=raw_dataset["train"],
    eval_dataset=raw_dataset["test"],
    tokenizer=tokenizer,
    peft_config=peft_config,
)


Map: 100%|██████████| 210/210 [00:04<00:00, 43.66 examples/s]
Map: 100%|██████████| 90/90 [00:02<00:00, 40.87 examples/s]
You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
Using auto half precision backend


In [6]:
# Run the DPO fine-tuning loop; the value returned by train() is displayed as the cell output.
trainer.train()

***** Running training *****
  Num examples = 210
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 130
  Number of trainable parameters = 25,001,984
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mdhananjay62-dg[0m ([33miitk-kgp[0m). Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Epoch,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen
0,0.6926,0.57025,-0.831964,-1.143021,0.697917,0.311057,-1332.475464,-1255.26062,107.523521,104.767555
1,0.4805,0.18932,-1.427116,-4.347171,0.927083,2.920054,-1364.516968,-1261.212036,103.366615,101.53183
2,0.1926,0.121212,-1.779926,-6.46535,0.9375,4.685424,-1385.698853,-1264.740112,101.163536,99.777122
4,0.0098,0.109007,-5.360107,-12.836131,0.958333,7.476023,-1449.406616,-1300.541992,95.027596,94.118263
5,0.0116,0.116111,-4.759645,-12.993365,0.947917,8.233719,-1450.978882,-1294.537231,94.20929,93.569023
6,0.0017,0.128513,-5.595091,-14.017463,0.9375,8.422371,-1461.219849,-1302.891724,92.929016,92.268959
8,0.0039,0.127205,-5.601404,-14.104871,0.947917,8.503467,-1462.094116,-1302.955078,92.759941,92.101097
9,0.0031,0.119863,-5.623674,-14.050708,0.947917,8.427034,-1461.552368,-1303.177612,92.735657,92.072075


***** Running Evaluation *****
  Num examples = 90
  Batch size = 8
Saving model checkpoint to dpo-output/checkpoint-13
loading configuration file config.json from cache at /DATA4/shared_cache/huggingface/hub/models--Dhananjayg22--legal-triplet-extractor/snapshots/70efd53e321ed568870fde74cfde7b297aa40cb2/config.json
Model config GemmaConfig {
  "_name_or_path": "google/gemma-7b-it",
  "architectures": [
    "GemmaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "eos_token_id": 1,
  "head_dim": 256,
  "hidden_act": "gelu",
  "hidden_activation": null,
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 24576,
  "max_position_embeddings": 8192,
  "model_type": "gemma",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 16,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.39.3",
  "use_ca

TrainOutput(global_step=130, training_loss=0.11711677548547204, metrics={'train_runtime': 12025.7207, 'train_samples_per_second': 0.175, 'train_steps_per_second': 0.011, 'total_flos': 0.0, 'train_loss': 0.11711677548547204, 'epoch': 9.81})