In [None]:
# Install necessary packages
!pip install bitsandbytes peft trl --quiet
!pip install --upgrade datasets --quiet

In [None]:
# Import libraries
from datasets import Dataset, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline
from peft import LoraConfig, prepare_model_for_kbit_training
from trl import DPOTrainer
import torch
import wandb
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login

In [None]:
# Fetch the "train" split of the preference dataset
data = load_dataset(
    "August4293/Preference-Dataset",
    split="train",
)

In [None]:
# Preprocessing function
def preprocess(data):
    """Add instruction and end-of-sequence markers to one preference example.

    The prompt is enclosed between ``<s>[INST]`` and ``[/INST]``, and ``</s>``
    is appended to both the chosen and the rejected completion.

    Args:
        data: Mutable mapping holding string values under the keys
            ``'prompt'``, ``'chosen'`` and ``'rejected'``.

    Returns:
        The same mapping object, updated in place.
    """
    # NOTE(review): if the tokenizer also inserts its own BOS token, the literal
    # '<s>' below could end up duplicated — confirm against the trainer's
    # tokenization.
    data['prompt'] = ''.join(('<s>[INST]', data['prompt'], '[/INST]'))
    for completion_key in ('chosen', 'rejected'):
        data[completion_key] = data[completion_key] + '</s>'
    return data

In [None]:
# Apply the prompt template and hold out 10% of the examples for evaluation.
# The result is bound to a new name instead of rebinding `data`, so `data`
# remains the raw dataset and this cell stays re-runnable. The split is seeded
# so train/eval membership is the same on every run.
dataset_splits = data.map(preprocess).train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_splits['train']
eval_dataset = dataset_splits['test']

In [None]:
# Read the W&B and Hugging Face API tokens from the Kaggle secrets client
user_secrets = UserSecretsClient()
wandb_token, HF_token = (
    user_secrets.get_secret(secret_name)
    for secret_name in ("wandb_august", "HF_august")
)

In [None]:
# Authenticate this session with Hugging Face using the retrieved token
login(token=HF_token)

In [None]:
# Free-text description attached to the experiment-tracking run:
# records how many training examples and epochs this run uses
num_of_epochs = 1
dataset_size = len(train_dataset)
notes = f"Initial DPO test run on sample dataset of {dataset_size} and {num_of_epochs} epochs"

In [None]:
# Authenticate with Weights & Biases, then open a run for this training job
wandb.login(key=wandb_token)
run = wandb.init(
    name="test run with DPO",
    project='mistral self-alignment',
    notes=notes,
    job_type="training",
)

In [None]:
# Location of the base checkpoint; the tokenizer is loaded from the same path
base_model = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Pad on the right, reusing the end-of-sequence token as the padding token
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Quantization settings used when loading the base model:
# 4-bit NF4 weights, bfloat16 compute dtype, no double quantization
# NOTE(review): bfloat16 compute assumes the accelerator supports it — confirm
# for the GPU this notebook runs on.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# LoRA adapter settings: rank 16, alpha 16, 10% dropout, no bias terms,
# applied to the projection modules listed in `target_modules`
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

In [None]:
# Load the base checkpoint with the quantization config defined above
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)

# Turn off the generation cache on the model config
# (presumably because it is not useful while training — TODO confirm)
model.config.use_cache = False

# Let PEFT prepare the quantized model for training
model = prepare_model_for_kbit_training(model)

In [None]:
# Define training arguments
# NOTE(review): `evaluation_strategy` is the spelling this notebook was written
# against; newer transformers releases may expect `eval_strategy` — confirm
# against the installed version.
training_args = TrainingArguments(
    output_dir="/kaggle/working/checkpoints",
    # Reuse the epoch count that is quoted in the W&B run notes so the logged
    # description and the actual training length cannot drift apart.
    num_train_epochs=num_of_epochs,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    # Checkpoint once per epoch. No `save_steps` is passed: that setting only
    # applies when save_strategy="steps".
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=5e-5,
    warmup_ratio=0.03,
    # Run evaluation on the held-out split every `eval_steps` steps.
    evaluation_strategy="steps",
    eval_steps=10,
)

In [None]:
# Build the DPO trainer from the prepared model, the LoRA config, the
# train/eval splits and the training arguments.
# No explicit reference model is passed here.
trainer = DPOTrainer(
    model=model,
    args=training_args,
    beta=0.1,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
)

In [None]:
# Train the model, then close the W&B run.
# If training raises (or the cell is interrupted), the run is still closed —
# with a non-zero exit code so it is recorded as failed — rather than being
# left open in the kernel. The original error is re-raised unchanged.
try:
    trainer.train()
except BaseException:
    wandb.finish(exit_code=1)
    raise
else:
    # Finish WandB run
    wandb.finish()

In [None]:
# Write the fine-tuned model to a local directory, then upload the model and
# the tokenizer to the Hugging Face Hub under the same repository name
fine_tuned_model_name = "mistral_self_alignment_DPO"
commit_message = "Initial adapter with DPO on sample dataset and 1 epoch"

trainer.model.save_pretrained(fine_tuned_model_name)

# Model first, tokenizer second; both uploads share the same commit message
for artifact in (trainer.model, tokenizer):
    artifact.push_to_hub(fine_tuned_model_name, commit_message=commit_message, use_temp_dir=False)