In [None]:
!pip install datasets bitsandbytes trl tensorboard -qq

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import DPOTrainer, DPOConfig, create_reference_model
from datasets import load_dataset
from pydantic import BaseModel
import gc

In [None]:
class Config(BaseModel):
    """Hyperparameters plus quantization / LoRA settings for the DPO run."""
    # Number of passes over the training split.
    num_epochs: int = 10
    # 4-bit NF4 weights with double quantization; compute dtype is bfloat16.
    bnb_config: BitsAndBytesConfig = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_compute_dtype=torch.bfloat16,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type='nf4'
    )
    # LoRA adapters on all four attention projections.
    # 'v_proj' replaces the former 'p_proj', which names no module in the model
    # and therefore silently left the value projection without an adapter.
    lora_config: LoraConfig = LoraConfig(
      r=16,
      lora_alpha=16,
      target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    )
    # Hub identifier of the base chat model.
    model_name: str = 'Qwen/Qwen2.5-0.5B-Instruct'
    # Evaluated once at class-definition time.
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Scale applied inside the DPO log-sigmoid.
    dpo_beta: float = 0.1
    batch_size: int = 2
    lr: float = 3e-5

config = Config()

In [None]:
# Load the 4-bit quantized base model directly onto the target device.
# `device_map` is used instead of a trailing `.to(...)`: moving an already-quantized
# bitsandbytes model with `.to()` is rejected by some transformers/bitsandbytes versions.
policy = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    trust_remote_code=True,
    quantization_config=config.bnb_config,
    device_map=config.device,
)
# Prepare the quantized model for training, then wrap it with the LoRA adapters.
policy = prepare_model_for_kbit_training(policy)
policy = get_peft_model(policy, config.lora_config)
tokenizer = AutoTokenizer.from_pretrained(config.model_name, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
# Load the preference-pair dataset by its Hub identifier (its 'train' split is used below).
dataset = load_dataset('argilla/distilabel-intel-orca-dpo-pairs')

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/79.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12859 [00:00<?, ? examples/s]

In [None]:
# Keep only the prompt fields and the chosen/rejected completions that preproc reads.
dataset = dataset.select_columns(['system', 'input', 'chosen', 'rejected'])

In [None]:
# Hold out 10% of the examples for evaluation.
# The seed is fixed so the same examples land in each split after a kernel restart.
dataset = dataset['train'].train_test_split(test_size=0.1, shuffle=True, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['system', 'input', 'chosen', 'rejected'],
        num_rows: 11573
    })
    test: Dataset({
        features: ['system', 'input', 'chosen', 'rejected'],
        num_rows: 1286
    })
})

In [None]:
## DON'T RUN — one-off scan for the longest tokenized example; kept commented out for reference
#raise
#
#max_len = 0
#
#for example in dataset['test']:
#  chosen_messages = [
#      {'role': 'system', 'content': example['system']},
#      {'role': 'user', 'content': example['input']},
#      {'role': 'assistant', 'content': example['chosen']}
#  ]
#  rejected_messages = [
#      {'role': 'system', 'content': example['system']},
#      {'role': 'user', 'content': example['input']},
#      {'role': 'assistant', 'content': example['rejected']}
#  ]
#
#  max_len = max(max_len, len(tokenizer.apply_chat_template(chosen_messages)))
#  max_len = max(max_len, len(tokenizer.apply_chat_template(rejected_messages)))
#
#print(max_len) # train - 2025, test - 1972

In [None]:
def _encode_with_answer_span(messages, max_length):
  """Tokenize one conversation and locate its assistant answer.

  Uses the notebook-level `tokenizer`. Returns a tuple
  (input_ids, attention_mask, assistant_idx, end_idx) where
  `assistant_idx` is the position of the `assistant` token of the final
  `<|im_start|>assistant` header and `end_idx` is the position just before
  the `<|im_end|>` that closes that turn.

  Raises ValueError if the header or the closing `<|im_end|>` is missing
  (e.g. because truncation cut the answer off).
  """
  encoded = tokenizer.apply_chat_template(
      messages, return_dict=True, max_length=max_length,
      padding='max_length', truncation=True
  )
  input_ids = encoded['input_ids']

  # Look the marker ids up from the tokenizer instead of hard-coding them.
  header_ids = tokenizer.encode('<|im_start|>assistant', add_special_tokens=False)
  im_end_id = tokenizer.convert_tokens_to_ids('<|im_end|>')

  # The assistant turn is the last message, so scan backwards for the full header.
  # Matching the whole header (not just the word "assistant") avoids false hits
  # on prompt text that happens to contain the same token.
  width = len(header_ids)
  assistant_idx = None
  for pos in range(len(input_ids) - width, -1, -1):
    if input_ids[pos:pos + width] == header_ids:
      assistant_idx = pos + width - 1
      break
  if assistant_idx is None:
    raise ValueError('assistant header not found in the tokenized conversation')

  try:
    eos_pos = input_ids.index(im_end_id, assistant_idx)
  except ValueError:
    raise ValueError(
        f'no <|im_end|> after the assistant header; answer was probably truncated at max_length={max_length}'
    ) from None

  return input_ids, encoded['attention_mask'], assistant_idx, eos_pos - 1


def preproc(example, max_length=2048):
  """Turn one preference pair into padded token ids plus answer-span indexes.

  `example` must provide 'system', 'input', 'chosen' and 'rejected' strings.
  Both conversations are rendered with the chat template, padded/truncated to
  `max_length`, and returned together with two indexes per side:

  * assistant_token_index_*: first logit position to score (logits at position
    t describe token t+1, so this is shifted one to the left of the targets);
  * eos_token_index_*: exclusive end of the logit span. Add +1 to both indexes
    to obtain the matching target-token span.
  """
  shared_prefix = [
      {'role': 'system', 'content': example['system']},
      {'role': 'user', 'content': example['input']},
  ]
  chosen_messages = shared_prefix + [{'role': 'assistant', 'content': example['chosen']}]
  rejected_messages = shared_prefix + [{'role': 'assistant', 'content': example['rejected']}]

  (tokenized_chosen_input_ids, tokenized_chosen_attn_mask,
   assistant_token_index_chosen, eos_token_index_chosen) = _encode_with_answer_span(chosen_messages, max_length)
  (tokenized_rejected_input_ids, tokenized_rejected_attn_mask,
   assistant_token_index_rejected, eos_token_index_rejected) = _encode_with_answer_span(rejected_messages, max_length)

  return {
      'tokenized_chosen_input_ids': tokenized_chosen_input_ids,
      'tokenized_chosen_attn_mask': tokenized_chosen_attn_mask,
      'tokenized_rejected_input_ids': tokenized_rejected_input_ids,
      'tokenized_rejected_attn_mask': tokenized_rejected_attn_mask,
      'assistant_token_index_chosen': assistant_token_index_chosen,
      'eos_token_index_chosen': eos_token_index_chosen,
      'assistant_token_index_rejected': assistant_token_index_rejected,
      'eos_token_index_rejected': eos_token_index_rejected
  }

In [None]:
# Run preproc over every example of both splits; the keys it returns are used as batch fields below.
dataset = dataset.map(preproc)

Map:   0%|          | 0/11573 [00:00<?, ? examples/s]

Map:   0%|          | 0/1286 [00:00<?, ? examples/s]

In [None]:
# Switch the dataset's output format to torch (modifies `dataset` in place).
dataset.set_format('torch')

In [None]:
# Build one loader per split; only the training split is reshuffled.
train_loader, test_loader = (
    torch.utils.data.DataLoader(dataset[split_name], batch_size=config.batch_size, shuffle=reshuffle)
    for split_name, reshuffle in (('train', True), ('test', False))
)

In [None]:
# Sanity check: print shape and dtype of every field of the first training batch.
for batch in train_loader:
  for column in (
      'tokenized_chosen_input_ids',
      'tokenized_chosen_attn_mask',
      'tokenized_rejected_input_ids',
      'tokenized_rejected_attn_mask',
      'assistant_token_index_chosen',
      'eos_token_index_chosen',
      'assistant_token_index_rejected',
      'eos_token_index_rejected',
  ):
    print(batch[column].shape, batch[column].dtype)
  break

torch.Size([2, 2048]) torch.int64
torch.Size([2, 2048]) torch.int64
torch.Size([2, 2048]) torch.int64
torch.Size([2, 2048]) torch.int64
torch.Size([2]) torch.int64
torch.Size([2]) torch.int64
torch.Size([2]) torch.int64
torch.Size([2]) torch.int64


In [None]:
def dpo_loss(chosen_logps, rejected_logps, chosen_logps_ref, rejected_logps_ref, beta=None):
  """Element-wise DPO loss.

  Computes -logsigmoid(beta * ((chosen - chosen_ref) - (rejected - rejected_ref))).
  The inputs are log-probabilities, so the policy/reference probability ratio
  is a *difference* of log-probs, not a quotient of them.

  Args:
    chosen_logps, rejected_logps: policy log-probs of the two answers.
    chosen_logps_ref, rejected_logps_ref: reference-model log-probs.
    beta: scale inside the sigmoid; defaults to `config.dpo_beta` when None.

  Returns:
    Tensor with the broadcast shape of the inputs (no reduction is applied).
  """
  if beta is None:
    beta = config.dpo_beta
  chosen_logratio = chosen_logps - chosen_logps_ref
  rejected_logratio = rejected_logps - rejected_logps_ref
  return -F.logsigmoid(beta * (chosen_logratio - rejected_logratio))

In [None]:
def calculate_logps(logits, targets):
  """Sum, per sequence, the log-probabilities assigned to the target tokens.

  logits:  [batch_size, assistant_answer_len, vocab_size]
  targets: [batch_size, assistant_answer_len]
  returns: [batch_size]
  """
  vocab_log_probs = logits.log_softmax(dim=-1)
  # Pick the log-prob of each target token along the vocab axis.
  picked = vocab_log_probs.gather(-1, targets[..., None])[..., 0] # [batch_size, assistant_answer_len]
  return picked.sum(dim=-1)

In [None]:
# Reference model for the DPO loss, built from the policy with trl's helper.
# The parameter count printed in the next cell shows it has no trainable parameters.
ref_policy = create_reference_model(policy)

In [None]:
# Trainable parameter counts: the policy first, then the reference model.
for _net in (policy, ref_policy):
  print(sum(weight.numel() for weight in _net.parameters() if weight.requires_grad))

1769472
0


In [None]:
# AdamW over the policy's parameters at the configured learning rate.
# NOTE(review): policy.parameters() also yields the frozen weights; presumably the
# optimizer skips parameters that never receive a gradient — confirm.
optim = torch.optim.AdamW(policy.parameters(), lr=config.lr)

In [None]:
def _answer_logps(model, input_ids, attn_mask, start_idx, end_idx):
  """Summed log-probability of each sample's assistant answer under `model`.

  input_ids / attn_mask: [batch_size, seq_len] tensors on the model's device.
  start_idx / end_idx: per-sample Python ints. Logits at positions
  [start, end) are scored against the tokens at [start+1, end+1).
  Returns a [batch_size] tensor.

  The full-vocabulary logits only live inside this function, so at most one
  [batch_size, seq_len, vocab_size] tensor is referenced at a time.
  """
  logits = model(input_ids=input_ids, attention_mask=attn_mask).logits # [batch_size, seq_len, vocab_size]
  per_sample = []
  for i, (start, end) in enumerate(zip(start_idx, end_idx)): # per sample: answer spans differ in length
    sliced_logits = logits[i, start:end, :] # [assistant_answer_len, vocab_size]
    sliced_targets = input_ids[i, start+1:end+1] # [assistant_answer_len]
    per_sample.append(calculate_logps(sliced_logits.unsqueeze(0), sliced_targets.unsqueeze(0))) # [1]
  return torch.cat(per_sample)


gc.collect()
torch.cuda.empty_cache()

writer = SummaryWriter(log_dir='runs/dpo_training')
global_step = 0

# Per-step batch means, kept for later inspection.
chosen_logps_stepwise = []
rejected_logps_stepwise = []
chosen_logps_ref_stepwise = []
rejected_logps_ref_stepwise = []
losses_stepwise = []
for epoch in range(config.num_epochs):
  running_loss = 0.0
  policy.train()
  ref_policy.eval()
  for batch in tqdm(train_loader):
    # Span indexes are only used for slicing, so keep them as Python ints.
    start_chosen = batch['assistant_token_index_chosen'].tolist()
    end_chosen = batch['eos_token_index_chosen'].tolist()
    start_rejected = batch['assistant_token_index_rejected'].tolist()
    end_rejected = batch['eos_token_index_rejected'].tolist()

    # With a causal LM, positions after the answer cannot affect the answer's logits,
    # so everything past the longest answer in the batch is dropped before the forward
    # pass. This avoids materialising logits for the padded tail of each sequence.
    keep_chosen = max(end_chosen) + 1
    keep_rejected = max(end_rejected) + 1
    tokenized_chosen_input_ids = batch['tokenized_chosen_input_ids'][:, :keep_chosen].to(config.device) # [batch_size, keep_chosen]
    tokenized_chosen_attn_mask = batch['tokenized_chosen_attn_mask'][:, :keep_chosen].to(config.device)
    tokenized_rejected_input_ids = batch['tokenized_rejected_input_ids'][:, :keep_rejected].to(config.device) # [batch_size, keep_rejected]
    tokenized_rejected_attn_mask = batch['tokenized_rejected_attn_mask'][:, :keep_rejected].to(config.device)

    # [batch_size] each; one forward pass at a time to bound peak memory.
    chosen_logps = _answer_logps(policy, tokenized_chosen_input_ids, tokenized_chosen_attn_mask, start_chosen, end_chosen)
    rejected_logps = _answer_logps(policy, tokenized_rejected_input_ids, tokenized_rejected_attn_mask, start_rejected, end_rejected)
    with torch.no_grad():
      chosen_logps_ref = _answer_logps(ref_policy, tokenized_chosen_input_ids, tokenized_chosen_attn_mask, start_chosen, end_chosen)
      rejected_logps_ref = _answer_logps(ref_policy, tokenized_rejected_input_ids, tokenized_rejected_attn_mask, start_rejected, end_rejected)

    # The loss is evaluated per sample and only then averaged; applying it to
    # batch-averaged log-probs would mix different preference pairs inside the sigmoid.
    loss = dpo_loss(chosen_logps, rejected_logps, chosen_logps_ref, rejected_logps_ref).mean()

    loss_value = loss.item()
    chosen_mean = chosen_logps.mean().item()
    rejected_mean = rejected_logps.mean().item()
    chosen_ref_mean = chosen_logps_ref.mean().item()
    rejected_ref_mean = rejected_logps_ref.mean().item()

    chosen_logps_stepwise.append(chosen_mean)
    rejected_logps_stepwise.append(rejected_mean)
    chosen_logps_ref_stepwise.append(chosen_ref_mean)
    rejected_logps_ref_stepwise.append(rejected_ref_mean)
    running_loss += loss_value
    losses_stepwise.append(loss_value)

    # Tag is 'Loss/Batch' so it groups with 'Loss/Epoch' in TensorBoard.
    writer.add_scalar('Loss/Batch', loss_value, global_step)
    writer.add_scalar('Log_Probs/Chosen', chosen_mean, global_step)
    writer.add_scalar('Log_Probs/Rejected', rejected_mean, global_step)
    writer.add_scalar('Log_Probs/Chosen_Ref', chosen_ref_mean, global_step)
    writer.add_scalar('Log_Probs/Rejected_Ref', rejected_ref_mean, global_step)

    optim.zero_grad()
    loss.backward()
    optim.step()

    global_step += 1

  epoch_loss = running_loss/len(train_loader)
  writer.add_scalar('Loss/Epoch', epoch_loss, epoch)
  print(f'Epoch: {epoch}, DPO Loss: {epoch_loss}')

writer.close()

  0%|          | 0/5787 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
  0%|          | 0/5787 [00:06<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.32 GiB. GPU 0 has a total capacity of 14.74 GiB of which 948.12 MiB is free. Process 8794 has 13.81 GiB memory in use. Of the allocated memory 13.31 GiB is allocated by PyTorch, and 391.77 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
%load_ext tensorboard
%tensorboard --logdir /content/runs

In [None]:
# Scratch inspection of a single training example: shows how the chat template is
# tokenized and which positions form the assistant-answer logits / targets.
# Uses `policy` and `config.device`, the model and device this notebook actually defines.
for example in dataset['train']:
  chosen_messages = [
      {'role': 'system', 'content': example['system']},
      {'role': 'user', 'content': example['input']},
      {'role': 'assistant', 'content': example['chosen']}
  ]
  chosen_only = [chosen_messages[2]]
  tokenized_chosen = tokenizer.apply_chat_template(chosen_messages, return_tensors='pt').to(config.device)
  chosen_only_tokenized = tokenizer.apply_chat_template(chosen_only, return_tensors='pt').to(config.device)
  print(tokenizer.decode(tokenized_chosen[0]))
  print(tokenizer.decode(chosen_only_tokenized[0]))
  print('---------')
  print(tokenizer.decode(chosen_only_tokenized[0][chosen_only_tokenized[0].tolist().index(77091)-1:]))
  print('----------')
  print(tokenizer.decode([151644, 77091]))
  print('==========')
  print(len(tokenized_chosen[0]))
  # Inspection only: no gradients are needed for this forward pass.
  with torch.no_grad():
    logits = policy(tokenized_chosen).logits
  ####### very imp
  # first_border: position of the `assistant` token (77091);
  # second_border: offset from there to the closing <|im_end|> (151645).
  chosen_ids = tokenized_chosen[0].tolist()
  first_border = chosen_ids.index(77091)
  second_border = chosen_ids[first_border:].index(151645)
  print(tokenizer.decode(
      tokenized_chosen[0][first_border:first_border+second_border-1]
      ))
  print('TARGETS:')
  print(tokenizer.decode(tokenized_chosen[0][first_border+1:first_border+second_border]))
  print('TARGETS^')
  print('CONVENIENT WAY:')
  print(tokenized_chosen[0][first_border:first_border+second_border-1])
  print(tokenized_chosen[0][first_border+1:first_border+second_border])
  print('CONVENIENT WAY^')
  ####### very imp
  print('=11111111111111=')
  #print(tokenized_chosen[0].tolist())
  print('=11111111111111=')
  chosen_logits = logits[:, first_border-2:, :]
  print('------======')
  print(chosen_logits)
  print('------======')
  print('=-=-=-=-=-=-=')
  print(tokenizer.decode(torch.argmax(chosen_logits, dim=-1)[0]))
  print('=-=-=-=-=-=-=')
  print(logits.shape)
  print('=====')
  print(tokenizer.decode([77091]))
  print(tokenizer.decode([151645]))
  break

<|im_start|>system
You are an AI assistant. You will be given a task. You must generate a detailed and long answer.<|im_end|>
<|im_start|>user
Tweet: Sunbathing &amp; Revising 
Predict the sentiment of this tweet.<|im_end|>
<|im_start|>assistant
As an AI text-based model, I am unable to directly predict sentiments like a sentiment analysis tool would. However, I can provide you with an analysis of the tweet which might help in determining the sentiment behind it.

The tweet appears to mention two activities: "Sunbathing" and "Revising." Sunbathing is often associated with relaxation, enjoyment, and taking a break from daily routines, as individuals spend time outdoors under the sun to get a tan or unwind. This could be an indication of a positive sentiment, as the person may be enjoying some leisure time or taking a break from work.

On the other hand, "Revising" typically refers to the activity of reviewing or modifying existing work like study materials, articles, or documents. Revis

In [None]:
# Quick qualitative check: generate a reply from the policy for one hand-written prompt.
# Uses `policy` and `config.device`, the model and device this notebook actually defines.
test = 'What is the right way to say "I love you"?'

messag = [
    {'role': 'system', 'content': 'You are a helpfull assistant'},
    {'role': 'user', 'content': test}
]
# add_generation_prompt=True is passed so the model is asked to produce the assistant turn.
tokenized = tokenizer.apply_chat_template(messag, return_tensors='pt', add_generation_prompt=True, return_dict=True).to(config.device)
tokenizer.decode(policy.generate(**tokenized, max_length=1024)[0])

'<|im_start|>system\nYou are a helpfull assistant<|im_end|>\n<|im_start|>user\nWhat is the right way to say "I love you"?<|im_end|>\n<|im_start|>assistant\nThe correct way to say "I love you" in English is:\n\n"I really, really like you."\n\nThis phrase conveys affection and deep emotion. It\'s commonly used in both casual and formal settings to express your feelings towards someone.<|im_end|>'