In [1]:
import os
from google.colab import userdata
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

In [2]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GemmaTokenizer

model_id = "google/gemma-2b-it"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'])
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0}, token=os.environ['HF_TOKEN'])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The Gemma model follows the template format as outlined below:
```<start_of_turn>
user How does the brain work?
<end_of_turn> <start_of_turn>model```

In [6]:
text = """<start_of_turn>user
what is purr-data?<end_of_turn>
<start_of_turn>model"""
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

user
what is purr-data?
modelSure, here's a breakdown of what purr-data is:

**Purr-data** is a collection of **1.5 million curated datasets** on cats and kittens, spanning over 100 different topics. These datasets are sourced from various online sources and curated by a team of data scientists at the University of California, Berkeley.

**Key features of purr-data:**

* **Large size:** It includes data on cats and kittens from 201


In [7]:
os.environ["WANDB_DISABLED"] = "true"

# LoRA config

In [8]:
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

### loading custom dataset: https://huggingface.co/datasets/ParZiVal04/Purr-Data_example_source_codes

In [9]:
from datasets import load_dataset

data = load_dataset("ParZiVal04/Purr-Data_example_source_codes")

In [10]:
data

DatasetDict({
    train: Dataset({
        features: ['Instruction', 'Response'],
        num_rows: 775
    })
})

In [11]:
print(f"""input: {data['train']['Instruction'][45]}\noutput:\n{data['train']['Response'][45]}""")


input: write Purr-Data source code to print a surprising fact about the economy on bang click
output:
#N canvas 761 0 768 809 10;
#X obj 260 170 bng 15 250 50 0 empty empty empty 17 7 0 10 #fcfcfc #000000 #000000;
#X msg 472 86 More Monopoly money is printed each year than real currency.;
#X obj 155 159 print;
#X connect 0 0 1 0;
#X connect 1 0 2 0;


In [12]:
def formatting_func(example):
    text = f"<start_of_turn>user\n{example['Instruction'][0]}<end_of_turn> <start_of_turn>model\n{example['Response'][0]}<end_of_turn>"
    return [text]

In [13]:
import transformers
from trl import SFTTrainer

In [14]:
trainer = SFTTrainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=150,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    peft_config=lora_config,
    formatting_func=formatting_func,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [15]:
trainer.train()

Step,Training Loss
1,1.3517
2,1.3517
3,1.2538
4,1.0916
5,0.9658
6,0.8621
7,0.7714
8,0.6938
9,0.6194
10,0.5468


TrainOutput(global_step=150, training_loss=0.10874262704203526, metrics={'train_runtime': 63.6231, 'train_samples_per_second': 9.431, 'train_steps_per_second': 2.358, 'total_flos': 281425830912000.0, 'train_loss': 0.10874262704203526, 'epoch': 150.0})

## Testing the model

In [16]:
text = """<start_of_turn>user
Write a purr-data patch to print out a message on bang click.<end_of_turn>
<start_of_turn>model"""
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=175)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

user
Write a purr-data patch to print out a message on bang click.
model
#N canvas 761 0 768 809 10;
#X obj 260 170 bng 15 250 50 0 empty empty empty 17 7 0 10 #fcfcfc #000000 #000000;
#X msg 40 43 Bang click!;
#X obj 234 107 print;
#X connect 0 0 1 0;
#X connect 1 0 2 0;
#X connect 13 4; fazia;lepiej;Zunanje;
#X connect 0 0 15 17;
#X connect 17 0
