In [None]:
# Show which PyTorch version this kernel is running.
import torch
print(torch.__version__)

In [None]:
# Confirm PyTorch can see a CUDA device; later cells place weights on GPU index 0.
print(torch.cuda.is_available())

In [None]:
!pip install tensorboardX

In [None]:
!pip install -U transformers datasets accelerate peft trl bitsandbytes wandb

In [None]:
!pip install huggingface-cli

In [None]:
# Interactive Hugging Face Hub login; the stored token is what later lets this
# notebook download the base checkpoint and push the dataset to the Hub.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# Hub id of the pretrained checkpoint that is fine-tuned below.
base_model = "meta-llama/Meta-Llama-3-8B"
# Local directory name the trained adapter is saved under (and reloaded from when merging).
new_model = "DPOCodeLLama3-8B"
# Tokenizer matching the base checkpoint; its padding settings are adjusted in a later cell.
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [None]:
# Pull every split of the instruction dataset, shuffle it with a fixed seed and
# keep the first 1000 rows so the subset is reproducible.
dataset_name = "nickrosh/Evol-Instruct-Code-80k-v1"
dataset = (
    load_dataset(dataset_name, split="all")
    .shuffle(seed=42)
    .select(range(1000))
)

In [None]:
# Peek at the first two sampled rows to check the 'instruction' / 'output' columns.
dataset[:2]

In [None]:
def prep_data(samples):
    """Build the training text for one example, or for one batch of examples.

    Args:
        samples: Mapping with at least the keys ``'instruction'`` and
            ``'output'``. With a plain ``dataset.map(prep_data)`` each value is
            a single string; with ``batched=True`` each value is a list of
            strings of equal length.

    Returns:
        ``{"text": str}`` for a single example, or ``{"text": list[str]}`` for
        a batch, where every text is
        ``"<s>[INST] {instruction} [/INST] {output} </s>"``.

    Note:
        The previous implementation looped ``len(samples) - 1`` times, i.e.
        once per *column* minus one, so the text was only correct for rows
        with exactly two columns and was duplicated for anything wider.
    """
    insts = samples['instruction']
    outputs = samples['output']

    # Batched call: columns arrive as parallel lists, so emit one text per row.
    if isinstance(insts, (list, tuple)):
        return {
            "text": [
                f"<s>[INST] {inst} [/INST] {out} </s>"
                for inst, out in zip(insts, outputs)
            ]
        }

    # Single example: exactly one formatted string, regardless of how many
    # other columns the row carries.
    return {"text": f"<s>[INST] {insts} [/INST] {outputs} </s>"}

# Apply the formatter row by row; this adds a "text" column next to the originals.
transformed_dataset = dataset.map(prep_data)

In [None]:
# Inspect one formatted row to verify the generated "text" field.
transformed_dataset[1]

In [None]:
# Publish the formatted subset to the Hub under this repo id (needs an authenticated
# session). At this point the rows still carry 'instruction' and 'output' in
# addition to 'text'; those columns are only dropped in the following cell.
transformed_dataset.push_to_hub("EterG/Evol-Instruct-Code-1000-llama")

In [None]:
# Drop the raw columns now that "text" exists. Note that `dataset` is rebound
# here: from this cell on it refers to the training view, not the raw sample.
dataset = transformed_dataset.remove_columns(['instruction','output'])
# Display the resulting dataset summary.
dataset

In [None]:
# Quantisation settings: load the base weights in 4-bit NF4 with double
# quantisation and run the compute in fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    # Presumably needed because the device map in this notebook keeps some
    # weights on the CPU - confirm before removing.
    llm_int8_enable_fp32_cpu_offload=True,
)

# LoRA config: rank-16 adapters (alpha 32, dropout 0.08, no bias terms) on the
# MLP and attention projection layers of a causal LM.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.08,
    bias="none",
    target_modules=[
        'up_proj', 'down_proj', 'gate_proj',
        'k_proj', 'q_proj', 'v_proj', 'o_proj',
    ],
)

In [None]:
# Manual weight placement. For every decoder block the input layer norm and the
# three MLP projections are kept on the CPU, while the post-attention layer norm
# and the four attention projections go to GPU index 0.
# NOTE(review): 80 blocks are enumerated here - confirm this matches the number
# of decoder layers of the checkpoint named in `base_model`.
_per_layer_placement = {
    "input_layernorm.weight": "cpu",
    "mlp.down_proj.weight": "cpu",
    "mlp.gate_proj.weight": "cpu",
    "mlp.up_proj.weight": "cpu",
    "post_attention_layernorm.weight": 0,
    "self_attn.k_proj.weight": 0,
    "self_attn.o_proj.weight": 0,
    "self_attn.q_proj.weight": 0,
    "self_attn.v_proj.weight": 0,
}

device_map = {
    f"model.layers.{layer_num}.{suffix}": device
    for layer_num in range(80)
    for suffix, device in _per_layer_placement.items()
}

# Weights that live outside the decoder blocks.
device_map.update(
    {
        "model.embed_tokens.weight": 0,
        "lm_head.weight": "cpu",
        "model.norm.weight": "cpu",
    }
)

In [None]:
# Load the base checkpoint with the 4-bit quantisation config and the manual
# per-weight device map built in the previous cell.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map = device_map,
    # Check the CUDA compute capability and switch this to flash attention if it is above 8.
    attn_implementation = "eager",
    resume_download = True,
    # NOTE(review): presumably forwarded to the model config, making every forward
    # pass also return all hidden states - confirm this is wanted for training.
    output_hidden_states = True
)

In [None]:
# Run a garbage-collection pass, then ask PyTorch to release its cached CUDA
# memory before training starts.
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Reuse the end-of-sequence token as the padding token and pad on the right side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Hyper-parameters for the fine-tuning run, grouped by concern.
training_arguments = TrainingArguments(
    # Where checkpoints are written, and how often (in optimiser steps).
    output_dir="/outputs",
    save_steps=20,
    # Logging cadence and backend.
    logging_steps=20,
    report_to="tensorboard",
    # Run length: three epochs; max_steps=-1 leaves the epoch count in charge.
    num_train_epochs=3,
    max_steps=-1,
    # Batching.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # Optimiser and learning-rate schedule.
    optim="paged_adamw_32bit",
    learning_rate=5e-4,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.25,
    # Mixed precision is switched off for both half-precision formats.
    fp16=False,
    bf16=False,
)

In [None]:
# Supervised fine-tuning: hand the quantised model, the LoRA config and the
# prepared dataset to SFTTrainer, which reads the training text from the
# "text" column.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=None,  # no explicit cap is set here; the trainer's own default applies
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

# Run the training loop configured by `training_arguments`.
trainer.train()

# Save the trained model into the local directory named by `new_model`
# (with a PEFT-wrapped model this is presumably the adapter weights only - confirm).
trainer.model.save_pretrained(new_model)

In [None]:
%load_ext tensorboard
%tensorboard --logdir results/runs

In [None]:
# Reload the base checkpoint un-quantised in fp16 so the trained LoRA weights
# can be merged into it.
# The reloaded model gets its own name: `base_model` must remain the checkpoint
# id string, because the tokenizer below is loaded from it and because rebinding
# it to a model object would make this cell fail when re-run.
base_model_fp16 = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# Attach the adapter saved under `new_model`, then fold its weights into the
# base model so a plain (non-PEFT) model is left.
model = PeftModel.from_pretrained(base_model_fp16, new_model)
model = model.merge_and_unload()

# Fresh tokenizer for the merged model, with the same padding set-up as in training.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"