In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.cache_utils import DynamicCache
import torch
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

checkpoint = "HuggingFaceTB/SmolLM-135M"


tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint,torch_dtype=torch.bfloat16).to(device)

tokenizer_config.json:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

**We will use huggingface model pretrained on large amount of text via next token prediction (Casual language modelling)**


for ref: https://github.com/Ajax0564/VyomAI/blob/main/Examples/vyom-ai-decoder_clm.ipynb


In [2]:
import json
import os

with open('../input/sft_data.json', "r", encoding="utf-8") as file:
        sft_data = json.load(file)

In [3]:
print("Example entry:\n", sft_data[2])

Example entry:
 {'instruction': 'Describe the structure of an atom.', 'input': '', 'output': 'An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom.'}


In [4]:
def format_query(entry):
    instruction_text = (f"Write a response that appropriately completes the request."f"\n### Instruction:\n{entry['instruction']}")
    input_text = f"\n### Input:\n{entry['input']}" if entry["input"] else ""
    return instruction_text+input_text


In [5]:
model_input = format_query(sft_data[2])
desired_response = f"\n### Assistant:\n{sft_data[2]['output']}"

print(model_input + desired_response)

Write a response that appropriately completes the request.
### Instruction:
Describe the structure of an atom.
### Assistant:
An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom.


In [6]:
train_portion = int(len(sft_data) * 0.95)  # 95% for training
val_portion = len(sft_data) - train_portion 

train_data = sft_data[:train_portion]
val_data = sft_data[train_portion:]
print("Training set length:", len(train_data))
print("Validation set length:", len(val_data))

Training set length: 285
Validation set length: 15


we can use padding on every pair inside SFTDataset but for large and small pairs it will be insufficient


best way to use collate func to handle the batch and make each batch dynamic in length based on on max seq present in that batch

In [7]:
import torch
from torch.utils.data import Dataset


class SFTDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        # Store Pre-tokenize texts
        self.encoded_texts = []
        for entry in data:
            instruction_plus_input = format_query(entry)
            response_text = f"\n### Assistant:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
         return len(self.data)

In [8]:
print(tokenizer.encode("<|endoftext|>"))

[0]


In [9]:
tokenizer.pad_token_id = tokenizer.eos_token_id

In [10]:
def sft_collate(
    batch,
    pad_token_id=0,
    ignore_index=-100,
    allowed_max_length=None,
    device=device
):
    # Find the longest sequence in the batch
    batch_max_length = max(len(item)+1 for item in batch)

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst,mask_ = [], [],[]

    for item in batch:
        new_item = item.copy()
        
        # Pad sequences to max_length
        padded = (new_item + [pad_token_id] *(batch_max_length - len(item))) #right padding
        attn_mask = torch.ones(len(padded)) 
        attn_mask[len(item):] = 0
        inputs = torch.tensor(padded)  
        targets = torch.tensor(padded)  

        # Replace all but the first padding tokens in targets by ignore_index
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices] = ignore_index

        # Optionally truncate to maximum sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]
            attn_mask = attn_mask[:allowed_max_length]
            

        inputs_lst.append(inputs)
        targets_lst.append(targets)
        mask_.append(attn_mask)

    # Convert list of inputs and targets to tensors and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)
    attn_mask = torch.stack(mask_).to(device)

    return inputs_tensor,attn_mask, targets_tensor

In [11]:
from torch.utils.data import DataLoader


num_workers = 0
batch_size = 4

torch.manual_seed(123)

train_dataset = SFTDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=sft_collate,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

In [12]:
val_dataset = SFTDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=sft_collate,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

In [13]:
for i,(inputs,attn_mask, targets) in enumerate(train_loader):
    print(inputs.shape,attn_mask.shape, targets.shape)
    if i==5:
        break
    

torch.Size([4, 127]) torch.Size([4, 127]) torch.Size([4, 127])
torch.Size([4, 522]) torch.Size([4, 522]) torch.Size([4, 522])
torch.Size([4, 292]) torch.Size([4, 292]) torch.Size([4, 292])
torch.Size([4, 134]) torch.Size([4, 134]) torch.Size([4, 134])
torch.Size([4, 180]) torch.Size([4, 180]) torch.Size([4, 180])
torch.Size([4, 339]) torch.Size([4, 339]) torch.Size([4, 339])


In [14]:
def make_query(q):
    prompt = f"Write a response that appropriately completes the request."f"\n### Instruction:\n{q}"
    return prompt+f"\n### Assistant:\n"

**Model Without SFT**

In [15]:
messages = "Describe the structure of an atom"
input_text = make_query(messages)
input_text

'Write a response that appropriately completes the request.\n### Instruction:\nDescribe the structure of an atom\n### Assistant:\n'

In [16]:
messages = ["Describe the structure of an atom"]
input_text = make_query(messages[0])
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=20,temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Write a response that appropriately completes the request.
### Instruction:
Describe the structure of an atom
### Assistant:

### Instructions:

### 1. Read the following information about the atom.




In [17]:
messages = "Generate a list of ten items a person might need for a camping trip"
input_text = make_query(messages)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=20,temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Write a response that appropriately completes the request.
### Instruction:
Generate a list of ten items a person might need for a camping trip
### Assistant:

### Directions:

### Instructions:

### Directions:

### Instructions:



In [18]:
def sft_loss_fn(labels, prediction_scores,vocab_size=49152):
    prediction_scores = prediction_scores[:, :-1, :].contiguous()
    labels = labels[:, 1:].contiguous()
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
    lm_loss = loss_fct(
        prediction_scores.view(-1,vocab_size), labels.view(-1)
    )
    return lm_loss

In [19]:
import numpy as np
from tqdm.notebook import tqdm
def sft_train(model=model,train_loader=train_loader):
    model.train()
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)
    Epochs = 3
    
    epoch_check = len(train_loader)
    total_step = epoch_check * Epochs
    
    train_bar = tqdm(total=total_step, dynamic_ncols=True)
    
    for epoch in range( Epochs):
        loss_list = []
        for step, (inputs,attn_mask, targets) in enumerate(train_loader):
            train_bar.update(1)
            optimizer.zero_grad()
            pred = model(input_ids=inputs,attention_mask=attn_mask).logits
            loss = sft_loss_fn(targets, pred)
            loss.backward()
            optimizer.step()
            loss_list.append(loss.detach().cpu().item())
            # if step>0 and step%50==0:
            #      avg_loss = np.round(np.mean(loss_list), 4)
            #      print(f"Epoch--{epoch+1}--step--{step} ### Train loss---{avg_loss}")

        avg_loss = np.round(np.mean(loss_list), 4)
        print(f"Epoch--{epoch+1} ### Train loss---{avg_loss}")
        


In [20]:
sft_train()

  0%|          | 0/355 [00:00<?, ?it/s]

Epoch--1 ### Train loss---2.0443
Epoch--2 ### Train loss---1.7663
Epoch--3 ### Train loss---1.6916
Epoch--4 ### Train loss---1.6504
Epoch--5 ### Train loss---1.6256


**Model after SFT**

In [21]:
messages = "Describe the structure of an atom"
input_text = make_query(messages)
input_text

'Write a response that appropriately completes the request.\n### Instruction:\nDescribe the structure of an atom\n### Assistant:\n'

In [22]:
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 576)
    (layers): ModuleList(
      (0-29): 30 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=576, out_features=576, bias=False)
          (k_proj): Linear(in_features=576, out_features=192, bias=False)
          (v_proj): Linear(in_features=576, out_features=192, bias=False)
          (o_proj): Linear(in_features=576, out_features=576, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
          (up_proj): Linear(in_features=576, out_features=1536, bias=False)
          (down_proj): Linear(in_features=1536, out_features=576, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm

In [23]:
messages = "Describe the structure of an atom"
input_text = make_query(messages)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50,temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Write a response that appropriately completes the request.
### Instruction:
Describe the structure of an atom
### Assistant:
An atom is a subatomic particle that consists of a positively charged nucleus surrounded by a cloud of negatively charged electrons. The nucleus is made up of protons and neutrons, while the electrons orbit the nucleus in a circular path. The outermost shell of an atom


In [24]:
messages = "Give tips for staying healthy"
input_text = make_query(messages)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50,temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Write a response that appropriately completes the request.
### Instruction:
Give tips for staying healthy
### Assistant:
Eating a balanced diet is important for staying healthy. Eating a variety of fruits and vegetables, whole grains, lean proteins, and healthy fats can help you get the nutrients you need. It's also important to stay hydrated by drinking plenty of water throughout the


In [25]:
messages = 'Explain the use of word embeddings in Natural Language Processing'
input_text = make_query(messages)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50,temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Write a response that appropriately completes the request.
### Instruction:
Explain the use of word embeddings in Natural Language Processing
### Assistant:
Word embeddings are a type of word representation that allows words with similar meaning to have a similar representation. They are a type of word representation that is used to represent words in a vector space. Word embeddings are used in Natural Language Processing (NLP)


In [26]:
model.save_pretrained('./')

**DPO**

In [28]:
import json
import os

with open('../input/Neural-DPO.jsonl', "r", encoding="utf-8") as file:
        data = json.load(file)

In [29]:
data[1]

{'system': 'You are an A.I assistant that has access to a vast library of information about neural networks',
 'question': "What is the significance of the parameter-efficient expert Ai(x) formula presented in the paper 'Parameter-Efficient Sparsity Crafting from Dense to Mixture-of-Experts for Instruction Tuning on General Tasks'?",
 'chosen': 'The parameter-efficient expert Ai(x) formula presented in the paper introduces adapters with minimal parameters to the model, enabling differentiation between experts and scaling of model capacity while preserving sparsity.',
 'rejected': 'The Ai(x) formula increases the complexity of the model by adding a large number of parameters, contrary to what the paper suggests.'}

In [30]:
dpo_data = []
for entry in data:
    if entry.get('rejected',None):
        dpo_data.append(entry)
    

In [31]:
train_portion = int(len(dpo_data) * 0.95)  # 85% for training

val_portion = len(dpo_data) - train_portion  # Remaining 
train_data = dpo_data[:train_portion]
val_data = dpo_data[train_portion:]
print("Training set length:", len(train_data))
print("Validation set length:", len(val_data))

Training set length: 1008
Validation set length: 54


In [32]:
def format_query(entry):
    instruction_text = (f"Write a response that appropriately completes the request."f"\n### Instruction:\n{entry['question']}")

    return instruction_text 


In [33]:
import torch
from torch.utils.data import Dataset


class DPODataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        # Pre-tokenize texts
        self.encoded_texts = []
        for entry in data:
            prompt = format_query(entry)
            rejected_response = entry["rejected"]
            chosen_response = entry["chosen"]

            prompt_tokens = tokenizer.encode(prompt)
            
            chosen_full_text = prompt+f"\n### Assistant:\n{chosen_response}"
            rejected_full_text = prompt+f"\n### Assistant:\n{rejected_response}"
            chosen_full_tokens = tokenizer.encode(chosen_full_text)
            rejected_full_tokens = tokenizer.encode(rejected_full_text)

            self.encoded_texts.append({
                "prompt": prompt_tokens,
                "chosen": chosen_full_tokens,
                "rejected": rejected_full_tokens,
            })

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.data)

In [34]:
def dpo_collate(
    batch,
    pad_token_id=0,
    allowed_max_length=None,
    mask_prompt_tokens=True,
    device=device
):
    # Initialize lists to hold batch data
    batch_data = {
        "prompt": [],
        "chosen": [],
        "rejected": [],
        "rejected_mask": [],
        "chosen_mask": []

    }

    # Determine the longest sequence to set a common padding length
    max_length_common = 0
    if batch:
        for key in ["chosen", "rejected"]:
            current_max = max(len(item[key])+1 for item in batch)
            max_length_common = max(max_length_common, current_max)
    # Process each item in the batch
    for item in batch:
        prompt = torch.tensor(item["prompt"])
        batch_data["prompt"].append(prompt)

        for key in ["chosen", "rejected"]:
            # Adjust padding according to the common maximum length
            sequence = item[key]
            padded = sequence + [pad_token_id] * (max_length_common - len(sequence)) #right padding
            mask = torch.ones(len(padded))

            # Set mask for all padding tokens to False
            mask[len(sequence):] = 0

            # Set mask for all input tokens to False
            # +1 sets the 1 newline ("\n") tokens before "### Assistant" to False
            if mask_prompt_tokens:
                mask[:prompt.shape[0]+1] =0

            batch_data[key].append(torch.tensor(padded))
            batch_data[f"{key}_mask"].append(mask)

    # Final processing
    for key in ["chosen", "rejected", "chosen_mask", "rejected_mask"]:
        # Stack all sequences into a tensor for the given key
        tensor_stack = torch.stack(batch_data[key])

        # Optionally truncate to maximum sequence length
        if allowed_max_length is not None:
            tensor_stack = tensor_stack[:, :allowed_max_length]

        # Move to the specified device
        batch_data[key] = tensor_stack.to(device)

    return batch_data

In [35]:
# from functools import partial

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print("Device:", device)

# collate_fn = partial(
#     my_collate_fn,
#     device=device,            # Put the data directly on a GPU if available
#     mask_prompt_tokens=True,  # This is optional
#     allowed_max_length=768   # The supported context length of the model
# )

In [36]:
num_workers = 0
batch_size = 4

torch.manual_seed(123)

train_dataset =DPODataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=dpo_collate,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers)

In [37]:
val_dataset = DPODataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=dpo_collate,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

In [38]:
print("Train loader:")
for batch in train_loader:
    print(
        batch["chosen"].shape,
        batch["chosen_mask"].shape,
        batch["rejected"].shape,
    )
    break

Train loader:
torch.Size([4, 467]) torch.Size([4, 467]) torch.Size([4, 467])


- Now, we are almost ready to get to the DPO part
- As mentioned at the beginning of this notebook, DPO works with two LLMs: a policy model (the LLM that we want to optimize) and a reference model (the original model that we keep unchanged)
- Below, we rename the model as policy_model and instantiate a second instance of the model we refer to as the reference_model

In [40]:
base_model = AutoModelForCausalLM.from_pretrained('../working',torch_dtype=torch.bfloat16).to(device)


In [41]:
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 576)
    (layers): ModuleList(
      (0-29): 30 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=576, out_features=576, bias=False)
          (k_proj): Linear(in_features=576, out_features=192, bias=False)
          (v_proj): Linear(in_features=576, out_features=192, bias=False)
          (o_proj): Linear(in_features=576, out_features=576, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
          (up_proj): Linear(in_features=576, out_features=1536, bias=False)
          (down_proj): Linear(in_features=1536, out_features=576, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm

In [42]:
import torch.nn.functional as F

def compute_dpo_loss(
      model_chosen_logprobs,
      model_rejected_logprobs,
      reference_chosen_logprobs,
      reference_rejected_logprobs,
      beta=0.1,
    ):
    """Compute the DPO loss for a batch of policy and reference model log probabilities.

    Args:
        policy_chosen_logprobs: Log probabilities of the policy model for the chosen responses. Shape: (batch_size,)
        policy_rejected_logprobs: Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)
        reference_chosen_logprobs: Log probabilities of the reference model for the chosen responses. Shape: (batch_size,)
        reference_rejected_logprobs: Log probabilities of the reference model for the rejected responses. Shape: (batch_size,)
        beta: Temperature parameter for the DPO loss; typically something in the range of 0.1 to 0.5. We ignore the reference model as beta -> 0.
        label_smoothing: conservativeness for DPO loss.

    Returns:
        A tuple of three tensors: (loss, chosen_rewards, rejected_rewards).
    """

    model_logratios = model_chosen_logprobs - model_rejected_logprobs
    reference_logratios = reference_chosen_logprobs - reference_rejected_logprobs
    logits = model_logratios - reference_logratios

    # DPO (Eq. 7 of https://arxiv.org/pdf/2305.18290.pdf)
    losses = -F.logsigmoid(beta * logits)

    # Optional values to track progress during training
    chosen_rewards = (model_chosen_logprobs - reference_chosen_logprobs).detach()
    rejected_rewards = (model_rejected_logprobs - reference_rejected_logprobs).detach()

    # .mean() to average over the samples in the batch
    return losses.mean(), chosen_rewards.mean(), rejected_rewards.mean()

In [43]:
def compute_logprobs(logits, labels, selection_mask=None):
    """
    Compute log probabilities.

    Args:
      logits: Tensor of shape (batch_size, num_tokens, vocab_size)
      labels: Tensor of shape (batch_size, num_tokens)
      selection_mask: Tensor for shape (batch_size, num_tokens)

    Returns:
      mean_log_prob: Mean log probability excluding padding tokens.
    """

    # Labels are the inputs shifted by one
    labels = labels[:, 1:].clone()

    # Truncate logits to match the labels num_tokens
    logits = logits[:, :-1, :]

    log_probs = F.log_softmax(logits, dim=-1)

    # Gather the log probabilities for the actual labels
    selected_log_probs = torch.gather(
        input=log_probs,
        dim=-1,
        index=labels.unsqueeze(-1)
    ).squeeze(-1)

   
    mask = selection_mask[:, 1:].clone()

        # Apply the mask to filter out padding tokens
    selected_log_probs = selected_log_probs * mask

        # Calculate the average log probability excluding padding tokens
        # This averages over the tokens, so the shape is (batch_size, num_tokens) #where is value is probability of token
    avg_log_prob = selected_log_probs.sum(-1) / mask.sum(-1)

    return avg_log_prob

   

In [44]:
def compute_dpo_loss_batch(batch, model, ref_model, beta):
    """Compute the DPO loss on an input batch"""


    policy_chosen_log_probas = compute_logprobs(
        logits=model(input_ids=batch["chosen"]).logits,
        labels=batch["chosen"],
        selection_mask=batch["chosen_mask"]
    )
    policy_rejected_log_probas = compute_logprobs(
        logits=model(input_ids=batch["rejected"]).logits,
        labels=batch["rejected"],
        selection_mask=batch["rejected_mask"]
    )
    
    with torch.no_grad():#ref model will remain same
        ref_chosen_log_probas = compute_logprobs(
            logits=ref_model(input_ids=batch["chosen"]).logits,
            labels=batch["chosen"],
            selection_mask=batch["chosen_mask"]
        )
        ref_rejected_log_probas = compute_logprobs(
            logits=ref_model(input_ids=batch["rejected"]).logits,
            labels=batch["rejected"],
            selection_mask=batch["rejected_mask"]
        )
    loss, chosen_rewards, rejected_rewards = compute_dpo_loss(
        model_chosen_logprobs=policy_chosen_log_probas,
        model_rejected_logprobs=policy_rejected_log_probas,
        reference_chosen_logprobs=ref_chosen_log_probas,
        reference_rejected_logprobs=ref_rejected_log_probas,
        beta=beta
    )
    return loss, chosen_rewards, rejected_rewards



In [45]:
out = base_model(input_ids=batch["chosen"]).logits
out.size()

torch.Size([4, 467, 49152])

In [46]:
with torch.no_grad():
    loss = compute_dpo_loss_batch(batch, base_model, model, beta=0.1)
print(loss)

(tensor(0.6931, device='cuda:0'), tensor(0., device='cuda:0'), tensor(0., device='cuda:0'))


In [50]:
import numpy as np

def dpo_train(model=base_model,ref_model=model,train_loader=train_loader):
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6, weight_decay=0.01)
    EPOCHS = 1
    
    epoch_check = len(train_loader)
    total_step = epoch_check * EPOCHS
    train_bar = tqdm(total=total_step, dynamic_ncols=True)
    for epoch in range( EPOCHS):
        loss_list = []
        cr,rr = [],[]
        for step, batch in enumerate(train_loader):
            optimizer.zero_grad()
            train_bar.update(1)
            loss, chosen_rewards, rejected_rewards = compute_dpo_loss_batch(
                batch,
                model,
                ref_model,
                beta=0.1
            )
            loss.backward()
            optimizer.step()
            loss_list.append(loss.detach().cpu().item())
            cr.append(chosen_rewards.detach().cpu().item())
            rr.append(rejected_rewards.detach().cpu().item())
            # if step>0 and step%50==0:
            #      avg_loss = np.round(np.mean(loss_list), 4)
            #      avg_c = np.round(np.mean(cr), 4)
            #      avg_r = np.round(np.mean(rr), 4)
            #      print(f"Epoch--{epoch+1}--step--{step} ### Train loss---{avg_loss}")
            #      print(f"Epoch--{epoch+1}--step--{step} ### chosen---{avg_c} ### rejected---{avg_r}")

        avg_loss = np.round(np.mean(loss_list), 4)
        avg_c = np.round(np.mean(cr), 4)
        avg_r = np.round(np.mean(rr), 4)
        print(f"Epoch--{epoch+1} ### Train loss---{avg_loss}")
        print(f"Epoch--{epoch+1}--step--{step} ### chosen---{avg_c} ### rejected---{avg_r}")

In [51]:
dpo_train()

  0%|          | 0/252 [00:00<?, ?it/s]

Epoch--1--step--50 ### Train loss---0.693
Epoch--1--step--50 ### chosen----0.0026 ### rejected----0.0062
Epoch--1--step--100 ### Train loss---0.6927
Epoch--1--step--100 ### chosen----0.0053 ### rejected----0.0133
Epoch--1--step--150 ### Train loss---0.6925
Epoch--1--step--150 ### chosen----0.0116 ### rejected----0.0253
Epoch--1--step--200 ### Train loss---0.6921
Epoch--1--step--200 ### chosen----0.0188 ### rejected----0.0393
Epoch--1--step--250 ### Train loss---0.6916
Epoch--1--step--250 ### chosen----0.0288 ### rejected----0.059
Epoch--1 ### Train loss---0.6916
Epoch--1--step--251 ### chosen----0.0291 ### rejected----0.0594


In [52]:
base_model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 576)
    (layers): ModuleList(
      (0-29): 30 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=576, out_features=576, bias=False)
          (k_proj): Linear(in_features=576, out_features=192, bias=False)
          (v_proj): Linear(in_features=576, out_features=192, bias=False)
          (o_proj): Linear(in_features=576, out_features=576, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
          (up_proj): Linear(in_features=576, out_features=1536, bias=False)
          (down_proj): Linear(in_features=1536, out_features=576, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm

**base model is out finetunned model after DPO\
where model is previous SFT model**

In [53]:
messages = "Give tips for staying healthy"
input_text = make_query(messages)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50,temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Write a response that appropriately completes the request.
### Instruction:
Give tips for staying healthy
### Assistant:

Stay hydrated and eat a balanced diet.
### Assistant:

Drink plenty of water throughout the day.
### Assistant:

Eat a balanced diet with plenty of fruits, vegetables, whole grains, lean proteins, and healthy fats


In [54]:
messages = "Give tips for staying healthy"
input_text = make_query(messages)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = base_model.generate(inputs, max_new_tokens=50,temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Write a response that appropriately completes the request.
### Instruction:
Give tips for staying healthy
### Assistant:

* Eat a balanced diet rich in fruits, vegetables, lean proteins, and whole grains.
* Get regular exercise, such as walking, swimming, or playing sports.
* Practice stress management techniques like meditation, deep breathing, or yoga.


In [55]:
messages = "Describe the structure of an atom"
input_text = make_query(messages)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50,temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Write a response that appropriately completes the request.
### Instruction:
Describe the structure of an atom
### Assistant:
An atom is a subatomic particle that consists of a nucleus and electrons. The nucleus contains protons and neutrons, while the electrons orbit the nucleus. The nucleus is made up of protons and neutrons, while the electrons orbit the nucleus. The nucleus is made


In [56]:
messages = "Describe the structure of an atom"
input_text = make_query(messages)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = base_model.generate(inputs, max_new_tokens=50,temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Write a response that appropriately completes the request.
### Instruction:
Describe the structure of an atom
### Assistant:
An atom is a fundamental unit of matter. It consists of a nucleus surrounded by electrons, which are negatively charged particles. The nucleus is the center of an atom, and it contains protons and neutrons. Electrons are negatively charged particles, and they orbit


In [57]:
messages = "Explain the use of word embeddings in Natural Language Processing"
input_text = make_query(messages)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50,temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Write a response that appropriately completes the request.
### Instruction:
Explain the use of word embeddings in Natural Language Processing
### Assistant:
Word embeddings are a type of word representation that allows words with similar meaning to have a similar representation. They are a type of word representation that is a directed acyclic graph (DAG) of words. Word embeddings are used in Natural Language Processing


In [58]:
messages = "Explain the use of word embeddings in Natural Language Processing"
input_text = make_query(messages)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = base_model.generate(inputs, max_new_tokens=50,temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Write a response that appropriately completes the request.
### Instruction:
Explain the use of word embeddings in Natural Language Processing
### Assistant:
Word embeddings are a type of word representation that allows words with similar meaning to have a similar representation. They are a type of word representation that is used in Natural Language Processing (NLP) to represent words in a way that is easier to understand and
