In [2]:
!pip install -q -U bitsandbytes transformers peft accelerate datasets scipy einops evaluate trl rouge_score

In [3]:
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
from huggingface_hub import interpreter_login

interpreter_login()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

Enter your token (input will not be visible): ··········
Add token as git credential? [y/N]: y


In [4]:
import os
# disable Weights and Biases
os.environ['WANDB_DISABLED']="true"

In [5]:
huggingface_dataset_name = "tatsu-lab/alpaca"
dataset = load_dataset(huggingface_dataset_name)

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-a09b74b3ef9c3b(…):   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [6]:
dataset['train'][0]

{'instruction': 'Give three tips for staying healthy.',
 'input': '',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.',
 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}

In [7]:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )

In [8]:
model_name='google/gemma-2-2b'
device_map = {"": 0}
original_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                      device_map=device_map,
                                                      quantization_config=bnb_config,
                                                      trust_remote_code=True)

config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/288 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True,padding_side="left",add_eos_token=True,add_bos_token=True,use_fast=False)
print(f"Tokenizer pad_token (before assignment): {tokenizer.pad_token}")
print(f"Tokenizer pad_token_id (before assignment): {tokenizer.pad_token_id}")

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Tokenizer pad_token (before assignment): <pad>
Tokenizer pad_token_id (before assignment): 0


In [None]:
# tokenizer.pad_token = tokenizer.eos_token
# print(f"Tokenizer pad_token (after assignment): {tokenizer.pad_token}")
# print(f"Tokenizer pad_token_id (after assignment): {tokenizer.pad_token_id}")
# print(f"Tokenizer pad_token: {tokenizer.pad_token}")
# print(f"Tokenizer pad_token_id: {tokenizer.pad_token_id}")

In [10]:
def gen(model,formatted_prompt, maxlen=250, sample=True):
  toks = tokenizer(formatted_prompt, return_tensors="pt")
  # print(toks)
  res = model.generate(**toks.to("cuda"), max_new_tokens=maxlen, do_sample=sample,num_return_sequences=1,temperature=0.1,num_beams=1,top_p=0.95,).to('cpu')
  return tokenizer.batch_decode(res,skip_special_tokens=True)

In [28]:
%%time
from transformers import set_seed
seed = 42
set_seed(seed)

index = 20

# Alpaca fields
instruction = dataset['train'][index]['instruction']
input_text = dataset['train'][index]['input']        # may be empty string
expected_output = dataset['train'][index]['output']

# Alpaca prompt format (same as the 'text' field in dataset)
if input_text:
    formatted_prompt = f"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n"
else:
    formatted_prompt = f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n"

res = gen(original_model, formatted_prompt, 100)

# Split on '### Response:\n' to extract only the generated part
output = res[0].split('### Response:\n')[1]

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{formatted_prompt}')
print(dash_line)
print(f'BASELINE EXPECTED OUTPUT:\n{expected_output}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
Caching is incompatible with gradient checkpointing in Gemma2DecoderLayer. Setting `past_key_values=None`.


---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What does DNA stand for?

### Response:

---------------------------------------------------------------------------------------------------
BASELINE EXPECTED OUTPUT:
DNA stands for deoxyribonucleic acid.

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
DNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNADNA
CPU times: user 10.6 s, sys: 897 µs, total: 10.6 s
Wall time: 10.6 s


In [None]:


from functools import partial

def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length

def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )

def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset):
    """Format & tokenize it so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    """
    print("Preprocessing dataset...")

    # Directly tokenize — no need to create 'text', it already exists
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=['instruction', 'input', 'output', 'text'],  # remove all 4 columns
    )

    # Filter out samples that have input_ids exceeding max_length
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed=seed)
    return dataset

In [13]:
## Pre-process dataset
max_length = get_max_length(original_model)
print(max_length)

train_dataset = preprocess_dataset(tokenizer, max_length,seed, dataset['train'])
# eval_dataset = preprocess_dataset(tokenizer, max_length,seed, dataset['validation'])

Found max lenth: 8192
8192
Preprocessing dataset...


Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

Filter:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [14]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 2 - Using the prepare_model_for_kbit_training method from PEFT
# Preparing the Model for QLoRA
original_model = prepare_model_for_kbit_training(original_model)

In [None]:
# Print all named modules to find target_modules for Gemma
for name, module in original_model.named_modules():
    print(name)

In [15]:
print(original_model.config.hidden_size)

2304


In [16]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

config = LoraConfig(
    r=16, #Rank
    lora_alpha=32,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'o_proj'
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

# 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
original_model.gradient_checkpointing_enable()

peft_model = get_peft_model(original_model, config)

In [17]:
# Split dataset → for approx 46k train samples
split = train_dataset.train_test_split(test_size=0.2000, seed=42)
train_dataset = split['train']
eval_dataset  = split['test']   # ← this becomes your eval

print(f"Train size: {len(train_dataset)}")
print(f"Eval size:  {len(eval_dataset)}")

Train size: 41601
Eval size:  10401


In [20]:
import time
import transformers
from transformers import TrainingArguments

output_dir = f'./peft-General-instruction-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=500,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    weight_decay=0.01,
    optim="paged_adamw_8bit",
    gradient_checkpointing=True,
    bf16=True,
    max_grad_norm=0.3,

    logging_steps=25,
    logging_strategy="steps",
    logging_dir="./logs",

    save_strategy="steps",
    save_steps=25,

    # ❌ removed eval_steps
    # ❌ removed eval_strategy
    # ❌ removed do_eval
    # ❌ removed load_best_model_at_end
    # ❌ removed metric_for_best_model

    report_to="none",
)

peft_model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    # ❌ removed eval_dataset
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)


`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


In [21]:
peft_trainer.train()
# peft_trainer.evaluate()


# # Train
# peft_trainer.train()

# # View loss table after training
# import pandas as pd

# log_history = peft_trainer.state.log_history

# # Separate train and eval logs
# train_logs = [(x['step'], x['loss']) for x in log_history if 'loss' in x]
# eval_logs  = [(x['step'], x['eval_loss']) for x in log_history if 'eval_loss' in x]

# # Merge into DataFrame
# train_df = pd.DataFrame(train_logs, columns=['step', 'train_loss'])
# eval_df  = pd.DataFrame(eval_logs,  columns=['step', 'eval_loss'])

# result = pd.merge(train_df, eval_df, on='step')
# print(result.to_string(index=False))

Step,Training Loss
25,2.14958
50,1.374081
75,1.086088
100,1.085291
125,1.037944
150,1.076375
175,1.038104
200,1.020547
225,1.036275
250,1.111446


TrainOutput(global_step=500, training_loss=1.1128813705444336, metrics={'train_runtime': 3022.796, 'train_samples_per_second': 0.662, 'train_steps_per_second': 0.165, 'total_flos': 2644985069895168.0, 'train_loss': 1.1128813705444336, 'epoch': 0.048075767409437274})

In [22]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

base_model_id = "google/gemma-2-2b"
base_model = AutoModelForCausalLM.from_pretrained(base_model_id,
                                                      device_map='auto',
                                                      quantization_config=bnb_config,
                                                      trust_remote_code=True)
                                                      # use_auth_token=True)

Loading weights:   0%|          | 0/288 [00:00<?, ?it/s]

In [23]:
eval_tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True, trust_remote_code=True, use_fast=False)
eval_tokenizer.pad_token = eval_tokenizer.eos_token

In [24]:
from peft import PeftModel
import os

ft_model = PeftModel.from_pretrained(base_model, os.path.join(output_dir, "checkpoint-500"), torch_dtype=torch.float16, is_trainable=False)

In [27]:
%%time
from transformers import set_seed
seed = 42
set_seed(seed)

index = 20

# Alpaca fields
instruction = dataset['train'][index]['instruction']
input_text = dataset['train'][index]['input']        # may be empty string
expected_output = dataset['train'][index]['output']

# Alpaca prompt format (same as the 'text' field in dataset)
if input_text:
    formatted_prompt = f"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n"
else:
    formatted_prompt = f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n"

res = gen(ft_model, formatted_prompt, 100)

# Split on '### Response:\n' to extract only the generated part
output = res[0].split('### Response:\n')[1]

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{formatted_prompt}')
print(dash_line)
print(f'BASELINE EXPECTED OUTPUT:\n{expected_output}\n')
print(dash_line)
print(f'PEFT MODEL GENERATED :\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What does DNA stand for?

### Response:

---------------------------------------------------------------------------------------------------
BASELINE EXPECTED OUTPUT:
DNA stands for deoxyribonucleic acid.

---------------------------------------------------------------------------------------------------
PEFT MODEL GENERATED :
DNA stands for Deoxyribonucleic Acid.
CPU times: user 1.3 s, sys: 0 ns, total: 1.3 s
Wall time: 2.28 s


In [None]:
original_model = AutoModelForCausalLM.from_pretrained(base_model_id,
                                                      device_map='auto',
                                                      quantization_config=bnb_config,
                                                      trust_remote_code=True)

Loading weights:   0%|          | 0/453 [00:00<?, ?it/s]

In [None]:
import pandas as pd

dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for idx, dialogue in enumerate(dialogues):
    human_baseline_text_output = human_baseline_summaries[idx]
    prompt = f"Instruct: Summarize the following conversation.\n{dialogue}\nOutput:\n"

    original_model_res = gen(original_model,prompt,100,)
    original_model_text_output = original_model_res[0].split('Output:\n')[1]

    peft_model_res = gen(ft_model,prompt,100,)
    peft_model_output = peft_model_res[0].split('Output:\n')[1]
    print(peft_model_output)
    peft_model_text_output, success, result = peft_model_output.partition('###')

    original_model_summaries.append(original_model_text_output)
    peft_model_summaries.append(peft_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, peft_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'peft_model_summaries'])
df

In [None]:
import evaluate

rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('PEFT MODEL:')
print(peft_model_results)

print("Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL")

improvement = (np.array(list(peft_model_results.values())) - np.array(list(original_model_results.values())))
for key, value in zip(peft_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')