In [None]:
!pip install -q datasets bitsandbytes einops wandb

In [None]:
!pip install -q -U trl

## Load Base Model

In [None]:
# Load phi-2 as a 4-bit quantized causal LM, plus its tokenizer
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# NF4 4-bit weight quantization with float16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    trust_remote_code=True,
    quantization_config=bnb_config,
)
# Switch off `use_cache` on the model config
# (NOTE(review): presumably because the cache is not useful while training
# with gradient checkpointing - confirm)
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/phi-2",
    padding_side="right",
    trust_remote_code=True,
)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

### Test Base Model

In [None]:
import torch
from transformers import pipeline

# Sanity-check the quantized base model with a single prompt before fine-tuning
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)
prompt = "What is the capital of India?"
generations = pipe(prompt)
print(generations[0]['generated_text'])

In [None]:
# Display the module tree of the loaded model (the quantized layers show up as Linear4bit)
model


PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_l

## Import Data

In [None]:
from datasets import load_dataset

# Alternative corpus kept for reference:
# dataset_name = "timdettmers/openassistant-guanaco"
dataset_name = "OpenAssistant/oasst1"

# Load the train split, then keep only its first 5000 rows
train_split = load_dataset(dataset_name, split="train")
dataset = train_split.select(range(5000))

In [None]:
# Display the dataset summary (column names and row count)
dataset


Dataset({
    features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
    num_rows: 5000
})



In [None]:
# Peek at the first record: the message text and the role of its author
first_example = next(iter(dataset))
print("Text >> ", first_example["text"])
print("Role >> ", first_example["role"])


Text >>  Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.
Role >>  prompter



## Fine-tune Model

### Prepare Data and Set SFT Config

In [None]:
# Preprocessing Function
def preprocess_function(examples):
    """Tokenize a batch of messages formatted as ``"### {role}: {text}"``.

    Args:
        examples: Batch dict with parallel ``'role'`` and ``'text'`` lists
            (the shape ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output for the non-empty rows, truncated to at most
        512 tokens and left unpadded (variable-length sequences).
    """
    # Combine role and text, skipping rows where either field is empty/None
    texts = [
        f"### {role}: {text}"
        for role, text in zip(examples['role'], examples['text'])
        if role and text
    ]

    # Tokenize WITHOUT padding. Padding here would pad every row to the
    # longest row of its map batch and bake those pad tokens into the stored
    # dataset: that is static, not dynamic, padding, and it gives every row
    # the same length, which defeats length-based batching
    # (`group_by_length`). Padding is left to the trainer's data collator,
    # which pads each training batch to its own longest sequence.
    # `return_tensors` is dropped as well: ragged sequences cannot form a
    # tensor, and `map` stores plain lists anyway.
    return tokenizer(
        texts,
        truncation=True,
        max_length=512,
    )

In [None]:
# Tokenize the whole dataset in batches; all original columns are dropped so
# only the fields returned by `preprocess_function` remain
map_options = dict(
    batched=True,
    remove_columns=dataset.column_names,
    desc="Preprocessing dataset",
)
processed_dataset = dataset.map(preprocess_function, **map_options)

In [None]:
from peft import LoraConfig

# LoRA hyperparameters
lora_r = 64
lora_alpha = 128
lora_dropout = 0.1

# Adapters go on the attention projections (q/k/v/dense) and both MLP
# linears (fc1/fc2) listed in the model printout
lora_target_modules = ["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"]

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
from transformers import TrainingArguments

# Relative directory: checkpoints are written to results/checkpoint-<step>,
# which is the location the Inference section loads the adapter from
# ("results/checkpoint-300"). The previous "/results" pointed at the
# filesystem root, so the two paths never matched.
output_dir = "results"
per_device_train_batch_size = 8
gradient_accumulation_steps = 4  # effective batch size per device = 8 * 4
optim = "paged_adamw_32bit"
save_steps = 20
logging_steps = 1
learning_rate = 1e-3
max_grad_norm = 0.3
max_steps = 500
warmup_ratio = 0.03
# The plain "constant" schedule has no warmup phase, so `warmup_ratio` was
# silently ignored; "constant_with_warmup" ramps up first, then holds the
# learning rate constant.
lr_scheduler_type = "constant_with_warmup"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps, # will change this later on
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    gradient_checkpointing=True,
    report_to="none",  # Disable wandb or other tracking
    log_level="info",  # Increase logging verbosity
)

### Finetuning

In [None]:
from trl import SFTTrainer

# Build the SFT trainer from the quantized model, the tokenized dataset,
# the LoRA configuration and the training arguments
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=processed_dataset,
    processing_class=tokenizer,
    peft_config=peft_config,
)

In [None]:
# Pull a single collated batch from the training dataloader and print it
train_loader = trainer.get_train_dataloader()
batch = next(iter(train_loader))
print(batch)


{'input_ids': tensor([[21017,  1552, 42104,  ..., 50256, 50256, 50256],
        [21017,  8796,    25,  ..., 50256, 50256, 50256],
        [21017,  8796,    25,  ..., 50256, 50256, 50256],
        ...,
        [21017,  1552, 42104,  ..., 50256, 50256, 50256],
        [21017,  8796,    25,  ..., 50256, 50256, 50256],
        [21017,  1552, 42104,  ..., 50256, 50256, 50256]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'), 'labels': tensor([[21017,  1552, 42104,  ...,  -100,  -100,  -100],
        [21017,  8796,    25,  ...,  -100,  -100,  -100],
        [21017,  8796,    25,  ...,  -100,  -100,  -100],
        ...,
        [21017,  1552, 42104,  ...,  -100,  -100,  -100],
        [21017,  8796,    25,  ...,  -100,  -100,  -100],
        [21017,  1552, 42104,  ...,  -

In [None]:
# Upcast every submodule whose qualified name contains "norm" to float32
# for more stable training. `Module.to` converts the module in place, so the
# result does not need to be stored.
norm_modules = [
    submodule
    for qualified_name, submodule in trainer.model.named_modules()
    if "norm" in qualified_name
]
for norm_module in norm_modules:
    norm_module.to(torch.float32)

In [None]:
# Run fine-tuning with the SFTTrainer built earlier; the returned value is shown as the cell output
trainer.train()

## Inference

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer
from peft import PeftModel, PeftConfig

In [None]:
# 4-bit NF4 quantization with float16 compute
quantization_settings = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
bnb_config = BitsAndBytesConfig(**quantization_settings)

# Reload a fresh quantized copy of the base model to attach the adapter to
base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

In [None]:
# Checkpoint directory containing the saved LoRA adapter
model_path = "results/checkpoint-300"

# Wrap the quantized base model with the adapter weights from that checkpoint
ft_model = PeftModel.from_pretrained(base_model, model_path, device_map="auto")

In [None]:
from transformers import AutoTokenizer, pipeline

# Load the tokenizer here so the Inference section works on a fresh kernel.
# It previously relied on the `tokenizer` variable left over from the
# training cells, and failed with a NameError when run on its own.
# Settings mirror the tokenizer used for training.
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/phi-2",
    trust_remote_code=True,
    padding_side="right",
)
tokenizer.pad_token = tokenizer.eos_token

prompt = "What is the square root of 81?"
# Same generation budget as the base-model test (max_length=200), so the
# outputs before and after fine-tuning are comparable.
pipe = pipeline(
    task="text-generation",
    model=ft_model,
    tokenizer=tokenizer,
    max_length=200,
)
print(pipe(prompt)[0]['generated_text'])