In [1]:
!pip install transformers peft accelerate datasets bitsandbytes torch trl -qU

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.0/225.0 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━

In [5]:
from datasets import load_dataset

# Load the "mosaicml/instruct-v3" dataset (all available splits).
instruct_tune_dataset = load_dataset("mosaicml/instruct-v3")
# Bare last expression so the notebook displays the loaded splits.
instruct_tune_dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'source'],
        num_rows: 56167
    })
    test: Dataset({
        features: ['prompt', 'response', 'source'],
        num_rows: 6807
    })
})

In [6]:
def _from_dolly_hhrlhf(example):
  """Return True when the example's `source` field equals "dolly_hhrlhf"."""
  return example["source"] == "dolly_hhrlhf"

# Keep only the dolly_hhrlhf examples in every split, then display the result.
instruct_tune_dataset = instruct_tune_dataset.filter(_from_dolly_hhrlhf)
instruct_tune_dataset

Filter:   0%|          | 0/56167 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6807 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'source'],
        num_rows: 34333
    })
    test: Dataset({
        features: ['prompt', 'response', 'source'],
        num_rows: 4771
    })
})

In [7]:
# Keep only the first 5,000 rows of the train split (no shuffling is applied).
instruct_tune_dataset['train'] = instruct_tune_dataset['train'].select(range(5_000))

In [8]:
# Keep only the first 200 rows of the test split (no shuffling is applied).
instruct_tune_dataset['test'] = instruct_tune_dataset['test'].select(range(200))

In [9]:
# Display the dataset again to confirm the reduced split sizes.
instruct_tune_dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'source'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['prompt', 'response', 'source'],
        num_rows: 200
    })
})

## Create formatted prompt

In [None]:
def create_prompt(sample):
  """Build one training string for the "reverse instruction" task.

  The roles of the dataset columns are swapped: the dataset's `response`
  becomes the model input, and the original instruction (extracted from the
  templated `prompt` column) becomes the target the model should produce.

  Args:
    sample: Mapping with string values under the keys "prompt" and "response".

  Returns:
    A single string: BOS token, the fixed task instruction, the input section,
    the response section holding the recovered instruction, and the EOS token.
  """
  bos_token = "<s>"
  eos_token = "</s>"
  system_message = "Use the provided input to create an instruction that could have been used to generate the response with an LLM."

  # Recover the bare instruction from the templated prompt by cutting at the
  # section markers. This drops whatever preamble precedes "### Instruction"
  # and the trailing "### Response" header, without depending on the exact
  # wording of the preamble. A prompt without markers is used as-is.
  instruction = sample["prompt"]
  _, marker, tail = instruction.partition("### Instruction")
  if marker:
    instruction = tail.lstrip(":")
  head, marker, _ = instruction.rpartition("### Response")
  if marker:
    instruction = head
  instruction = instruction.strip()

  # The dataset's response is what the model sees as its input.
  response_text = sample["response"]

  full_prompt = ""
  full_prompt += bos_token
  full_prompt += "### Instruction:"
  full_prompt += "\n" + system_message
  full_prompt += "\n\n ### Input"
  full_prompt += "\n" + response_text
  full_prompt += "\n\n ### Response:"
  full_prompt += "\n" + instruction
  full_prompt += eos_token

  return full_prompt

## Loading the base model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Hub id of the base checkpoint, shared by the model and the tokenizer load so
# the two can never drift apart.
BASE_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.1"

# 4-bit NF4 quantization with double quantization. The compute dtype is
# bfloat16 so it agrees with `bf16 = True` in the TrainingArguments used for
# fine-tuning (the previous float16 setting mixed two half-precision formats).
nf4_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype = torch.bfloat16
)

# Load the quantized base model; device placement is delegated to device_map.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map = 'auto',
    quantization_config = nf4_config,
    use_cache = False
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# Reuse the EOS token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

## Let's run an example with the base model

In [None]:
def generate_response(prompt):
  """Sample a completion for `prompt` from the notebook-level `model`.

  Relies on the notebook globals `tokenizer` and `model`.

  Args:
    prompt: Text to feed to the model.

  Returns:
    The first decoded sequence with every occurrence of `prompt` removed.
  """
  encoded_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
  # Follow the model's own device instead of assuming "cuda".
  model_inputs = encoded_input.to(model.device)

  generated_ids = model.generate(
    **model_inputs,
    max_new_tokens = 1000,
    do_sample = True,
    # generate() expects an integer token id here, not the tokenizer object.
    pad_token_id = tokenizer.eos_token_id,
  )
  decoded_output = tokenizer.batch_decode(generated_ids)

  return decoded_output[0].replace(prompt, "")

In [None]:
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter settings: rank 64, alpha 16, dropout 0.1, bias mode "none",
# configured for a causal language-modelling task.
lora_settings = dict(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
peft_config = LoraConfig(**lora_settings)

In [None]:
# Prepare the quantized base model for k-bit training, then wrap it with the
# LoRA adapters described by `peft_config`. Order matters: the preparation
# step must run on the base model before it is wrapped.
model = prepare_model_for_kbit_training(model)
lora_model = get_peft_model(model, peft_config)

In [None]:
# Report the trainable-parameter count of the LoRA-wrapped model.
lora_model.print_trainable_parameters()

In [None]:
from transformers import TrainingArguments

# Training hyperparameters for the LoRA fine-tuning run.
args = TrainingArguments(
    output_dir = "./mistral_instruct_generation",
    num_train_epochs = 10,
    # max_steps = 100, -> uncomment it if you do not want to train on epochs
    per_device_train_batch_size = 4,
    # 0.03 is a fraction of the run, so it belongs in warmup_ratio;
    # warmup_steps is an absolute number of optimizer steps.
    # NOTE(review): the "constant" scheduler below has no warmup phase; use
    # "constant_with_warmup" if the warmup is meant to take effect.
    warmup_ratio = 0.03,
    logging_steps = 10,
    # Valid strategy names are "no", "steps" and "epoch"; "epochs" is rejected.
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch',
    # evaluation_strategy = 'steps',
    # eval_steps = 20, -> uncomment it if you do not want to evaluate at the end of each epoch
    learning_rate = 2e-4,
    bf16 = True,
    lr_scheduler_type = "constant"
)

In [None]:
from trl import SFTTrainer

# Sequence length handed to the trainer.
max_seq_length = 2048

# Supervised fine-tuning trainer: trains `lora_model` on the train split and
# evaluates on the test split, turning each row into text with `create_prompt`
# and packing the resulting examples.
trainer = SFTTrainer(
    model = lora_model,
    # NOTE(review): `lora_model` was already wrapped by get_peft_model; confirm
    # that also passing `peft_config` here is intended and not redundant.
    peft_config = peft_config,
    max_seq_length = max_seq_length,
    tokenizer = tokenizer,
    packing = True,
    formatting_func = create_prompt,
    args = args,
    train_dataset = instruct_tune_dataset['train'],
    eval_dataset = instruct_tune_dataset['test']
)

In [None]:
# Run the fine-tuning loop configured above.
trainer.train()

In [None]:
# Save the trainer's model to the local "mistral_instruct_generation" directory.
trainer.save_model("mistral_instruct_generation")

## Push model to hub

In [None]:
from huggingface_hub import notebook_login

# Prompt for a Hugging Face token interactively, so no credential is hard-coded
# in the notebook.
notebook_login()

In [None]:
# Trainer.push_to_hub's first positional parameter is the commit message, not
# the repository, so the target repo is selected through hub_model_id and the
# commit gets a real message.
trainer.args.hub_model_id = "itsasjad/mistral_instruct_generation"
trainer.push_to_hub(commit_message="Add LoRA fine-tuned Mistral instruct generation adapter")

In [None]:
# merge and unload: fold the trained LoRA weights into the base weights.
# merge_and_unload() is provided by the PEFT wrapper (`lora_model`), not by the
# plain transformers model bound to `model`, so it is called on the wrapper.
# NOTE(review): the base model is loaded in 4-bit; confirm that merging into
# quantized weights gives acceptable precision for your use case.
merged_model = lora_model.merge_and_unload()