In [1]:
import torch
import json
import huggingface_hub
import datasets
from transformers import GemmaTokenizer, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from accelerate import Accelerator, DataLoaderConfiguration
from transformers import DataCollatorForLanguageModeling

In [2]:
# Load the tokenizer and model
gpu = torch.device('cuda:0')
model_id = "google/codegemma-1.1-2b"
tokenizer = GemmaTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=gpu, torch_dtype=torch.bfloat16)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu_pytorch_tanh`, edit the `model.config` to set `hidden_activation=gelu_pytorch_tanh`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
def generate(prompt, max_new_tokens=200):
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(gpu)
    outputs = model.generate(inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0])

In [4]:
print(generate("<|fim_prefix|>def add(x, y) -> int:\n\n    \"\"\"<|fim_suffix|>\"\"\"\n        return x + y<|fim_middle|>"))

<bos><|fim_prefix|>def add(x, y) -> int:

    """<|fim_suffix|>"""
        return x + y<|fim_middle|>
    This function adds two numbers and returns the result.

    Args:
        x (int): The first number to be added.
        y (int): The second number to be added.

    Returns:
        int: The sum of the two input numbers.
    <|file_separator|><eos>


In [5]:
ds = datasets.load_dataset('json', split='train', data_files='train_data.json')
#print(ds[0]['e'])

In [6]:
def tokenize(dataset_row):
    source = dataset_row['e']
    input_ids = tokenizer.encode(source) + [tokenizer.eos_token_id]
    labels = input_ids.copy()
    
    return {
        'input_ids': input_ids,
        'labels': labels
    }

In [7]:
tokenized_data = ds.map(tokenize, remove_columns=['l', 'd', 'c', 'e'])

In [8]:
print(tokenized_data)

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 40000
})


In [9]:
def block(data, block_size=512):

    blocked_ids = [[]]
    current_block = 0
    current_block_capacity = block_size
    
    for i in range(len(data['input_ids'])):
        example = data['input_ids'][i]
        if(len(example) > block_size):
            continue
        if(current_block_capacity - len(example) >= 0):
            blocked_ids[current_block] = blocked_ids[current_block] + example
            current_block_capacity -= len(example)
        else:
            while(len(blocked_ids[current_block]) < block_size):
                blocked_ids[current_block].append(tokenizer.eos_token_id)
            
            blocked_ids.append([])
            current_block += 1
            current_block_capacity = block_size
    while(len(blocked_ids[current_block]) < block_size):
            blocked_ids[current_block].append(tokenizer.eos_token_id)

    # concatenate all items
    #concatenated = sum(data['input_ids'], [])
    #length = len(concatenated)
           
    # shape "n / block_size" blocks
    #truncated_length = (length // block_size) * block_size
    #blocked_ids = [concatenated[i : i + block_size] for i in range(0, truncated_length, block_size)]

    # add last block with padding
    #pad_length = block_size - (length % block_size)  # remaining tokens to fill
    #if pad_length != block_size:
    #    blocked_ids += [concatenated[truncated_length:] + [tokenizer.eos_token_id] * pad_length]

    # format as transformers-friendly model input
    assert len(blocked_ids) > 0
    return {
        'input_ids': blocked_ids,
        'labels': blocked_ids.copy()}

In [10]:
split_dataset = tokenized_data.train_test_split(
    test_size = 0.1,
    shuffle = True,
    seed = 421337)
test_data = split_dataset['test']
train_data = split_dataset['train']

In [11]:
test_data_blocks = test_data.map(block, batched=True)
train_data_blocks = train_data.map(block, batched=True)

In [1]:
print(test_data_blocks)

NameError: name 'test_data_blocks' is not defined

In [12]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from transformers import get_linear_schedule_with_warmup, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader
from tqdm import tqdm

In [13]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # We want to predict the next token
    inference_mode=False,
    target_modules=['q_proj', 'v_proj'],   # this is specific to each model! Look up the exact names in the model
    r=8,  # rank
    lora_alpha=16,  # how strong does it override old values
    lora_dropout=0.05,  # to help with generalization, 5% of updates to the LoRA weights are discarded
    bias="all"
)

In [14]:
# This wraps our LLM into the adapter model
peft_model = get_peft_model(model, peft_config)

In [15]:
# Check how many paramaters we need to train (should be orders of magnitude fewer)
peft_model.print_trainable_parameters()

trainable params: 921,600 || all params: 2,507,094,016 || trainable%: 0.036759690467068624


In [16]:
batch_size = 4
num_epochs = 10

In [17]:
# some book-keeping to make sure any padding happens with <EOS> tokens (not all tokenizers are meant to be used in fine-tuning and don't always have this right)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

# special collator that takes care of correctly offsetting our data (so that token n predicts n+1) 
collator = DataCollatorForLanguageModeling(
            tokenizer,
            mlm=False,  # we could also "mask" random tokens to introduce some noise, but we don't need that here
            pad_to_multiple_of=8,
            return_tensors="pt")

In [18]:
train_dataloader = DataLoader(train_data_blocks, shuffle=True, collate_fn=collator, batch_size=batch_size)
eval_dataloader = DataLoader(test_data_blocks, shuffle=True, collate_fn=collator, batch_size=batch_size)

In [19]:
lr = 3e-4

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)  # The AdamW optimizer is well-suited for LLMs

# our schedule will decrease learning rate linearly with time
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [None]:
for epoch in range(num_epochs):
    model.train()  # set to training mode
    total_loss = 0
    
    for step, batch in enumerate(tqdm(train_dataloader)):
        outputs = model(**batch.to(gpu))  # run batch through model
        loss = outputs.loss
        total_loss += loss.detach().cpu().float()
        loss.backward()      # propagate loss back
        optimizer.step()     # update weights
        lr_scheduler.step()  # update learning rate
        optimizer.zero_grad()

    model.eval()  # set to evaluation mode
    eval_loss = 0
    for step, batch in enumerate(tqdm(eval_dataloader)):
        with torch.no_grad():
            outputs = model(**batch.to(gpu))
        loss = outputs.loss
        eval_loss += loss.detach().cpu().float()

    eval_epoch_loss = eval_loss / len(eval_dataloader)
    train_epoch_loss = total_loss / len(train_dataloader)
    print(f"{epoch=}: {train_epoch_loss=} {eval_epoch_loss=}")

    # save adapter to be loaded later
    peft_model.save_pretrained(f'./finetuning_checkpoints/checkpoint-ep{epoch}')

 16%|█▌        | 274/1766 [10:17<56:07,  2.26s/it]  

In [None]:
# print(generate("<|fim_prefix|>def add(x, y) -> int:\n\n    \"\"\"<|fim_suffix|>\"\"\"\n        return x + y<|fim_middle|>"))

In [24]:
checkpoint = "finetuning_checkpoints/checkpoint-ep9"
finetuned_model = AutoModelForCausalLM.from_pretrained(checkpoint).to(gpu)

def finetuned_generate(prompt, max_new_tokens=200):
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(gpu)
    outputs = finetuned_model.generate(inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0])

print(finetuned_generate("<|fim_prefix|>def add(x, y) -> int:\n\n    \"\"\"<|fim_suffix|>\"\"\"\n        return x + y<|fim_middle|>"))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<bos><|fim_prefix|>def add(x, y) -> int:

    """<|fim_suffix|>"""
        return x + y<|fim_middle|>Add two numbers.<|file_separator|><eos>
