Python notebook for fine-tuning our model, CodeGemma 1.1 2B (`google/codegemma-1.1-2b`), with LoRA adapters. Heavily inspired by the fine-tuning example presented in class (HPI, AI for Software Engineering, 14.05.2024).

In [None]:
#!pip install transformers
#!pip install accelerate

In [1]:
import torch
import json
import huggingface_hub
import datasets
from transformers import GemmaTokenizer, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from accelerate import Accelerator, DataLoaderConfiguration
from transformers import DataCollatorForLanguageModeling

In [None]:
# Base checkpoint and target device used by the later cells.
model_id = "google/codegemma-1.1-2b"
gpu = torch.device('cuda:0')

# Load the weights in bfloat16 directly onto the GPU; the tokenizer comes from the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=gpu,
    torch_dtype=torch.bfloat16,
)
tokenizer = GemmaTokenizer.from_pretrained(model_id)

In [3]:
def generate(prompt, max_new_tokens=200):
    """Continue `prompt` with the notebook-level `model` and return the decoded text.

    Args:
        prompt: Text to continue, e.g. a fill-in-the-middle prompt.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The first sequence returned by `model.generate`, decoded with the
        tokenizer's default settings.
    """
    prompt_ids = tokenizer.encode(prompt, return_tensors='pt')
    prompt_ids = prompt_ids.to(gpu)
    generated = model.generate(prompt_ids, max_new_tokens=max_new_tokens)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence)

In [4]:
# Baseline: let the freshly loaded model fill in the docstring of a small function (fill-in-the-middle prompt).
print(generate("<|fim_prefix|>def add(x, y) -> int:\n\n    \"\"\"<|fim_suffix|>\"\"\"\n        return x + y<|fim_middle|>"))

<bos><|fim_prefix|>def add(x, y) -> int:

    """<|fim_suffix|>"""
        return x + y<|fim_middle|>
    This function adds two numbers and returns the result.

    Args:
        x (int): The first number to be added.
        y (int): The second number to be added.

    Returns:
        int: The sum of the two input numbers.
    <|file_separator|><eos>


In [5]:
# Load the training examples from a local JSON file as a single 'train' split.
ds = datasets.load_dataset('json', split='train', data_files='train_data.json')
# Uncomment to inspect one raw example (column 'e' is the text that gets tokenized below):
#print(ds[0]['e'])

In [6]:
def tokenize(dataset_row):
    """Turn one dataset row into token ids for causal language modelling.

    Args:
        dataset_row: A row (mapping) whose 'e' field holds the text to encode.

    Returns:
        A dict with 'input_ids' (the encoded text followed by the EOS id) and
        'labels' (a separate list with the same contents).
    """
    token_ids = tokenizer.encode(dataset_row['e'])
    token_ids = token_ids + [tokenizer.eos_token_id]
    return {'input_ids': token_ids, 'labels': list(token_ids)}

In [7]:
# Tokenize every example and drop the raw columns so that only 'input_ids' and 'labels' remain.
tokenized_data = ds.map(tokenize, remove_columns=['l', 'd', 'c', 'e'])

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

In [8]:
# Sanity check: show the dataset's features and row count after tokenization.
print(tokenized_data)

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 40000
})


The blocking is adjusted to avoid splitting code examples in the middle: if an example does not fit into the current block, the current block is padded with EOS tokens and the example is put into the next block. The block size of 512 is larger than our longest example.

In [9]:
def block(data, block_size=512, pad_token_id=None):
    """Pack tokenized examples into fixed-size blocks without splitting any example.

    Examples are appended to the current block in order. When the next example
    does not fit, the current block is padded up to `block_size` and the example
    starts a new block. Examples longer than `block_size` are skipped.

    Args:
        data: Batch mapping whose 'input_ids' entry is a list of token-id lists.
        block_size: Length of every emitted block.
        pad_token_id: Token id used to fill up a block. Defaults to the EOS id
            of the notebook-level `tokenizer`.

    Returns:
        A dict with 'input_ids' and 'labels'; both hold the same blocks, each
        exactly `block_size` long, stored as independent lists. If no example
        fits, both lists are empty.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    def _padded(token_ids):
        # Fill the block up to the fixed size with the padding token.
        return token_ids + [pad_token_id] * (block_size - len(token_ids))

    blocks = []
    current = []
    for example in data['input_ids']:
        if len(example) > block_size:
            # Cannot fit into any block without being split.
            continue
        if len(current) + len(example) > block_size:
            # Close the full block; the example itself goes into the next block.
            # (Previously the example that triggered the overflow was dropped.)
            blocks.append(_padded(current))
            current = []
        current.extend(example)

    # Flush the last, partially filled block; never emit a padding-only block.
    if current:
        blocks.append(_padded(current))

    return {
        'input_ids': blocks,
        'labels': [list(token_ids) for token_ids in blocks]}

In [10]:
# Hold out 10% of the tokenized examples for evaluation; the fixed seed keeps the split repeatable.
split_dataset = tokenized_data.train_test_split(test_size=0.1, shuffle=True, seed=421337)
train_data = split_dataset['train']
test_data = split_dataset['test']

In [11]:
# Pack each split into fixed-size blocks; batched=True hands `block` many rows at once.
test_data_blocks = test_data.map(block, batched=True)
train_data_blocks = train_data.map(block, batched=True)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

In [13]:
# Inspect the first packed evaluation block.
print(test_data_blocks[0])

{'input_ids': [2, 67, 141, 1293, 4830, 235278, 1053, 235269, 24826, 1245, 109, 141, 1676, 69, 1676, 108, 149, 16045, 589, 2011, 235265, 1353, 235298, 21837, 235298, 4705, 649, 2011, 235265, 21837, 235298, 16045, 235278, 15399, 199872, 235248, 235276, 3013, 108, 145, 15399, 589, 24826, 963, 12568, 108, 145, 15399, 589, 24826, 235265, 67479, 235278, 235274, 235269, 728, 235274, 235275, 108, 145, 746, 517, 575, 2011, 235265, 16116, 235292, 108, 149, 15399, 589, 517, 235278, 15399, 235275, 109, 145, 15399, 589, 2011, 235265, 15842, 235278, 15399, 235265, 67479, 235278, 235274, 235269, 728, 235274, 1269, 108, 145, 773, 24826, 68, 235292, 1728, 24826, 235292, 892, 235305, 235269, 584, 235269, 640, 235307, 108, 235292, 773, 235292, 892, 235305, 235269, 584, 235269, 640, 235307, 70, 1, 2, 67, 1293, 5985, 235298, 1149, 235278, 235272, 1245, 109, 141, 1676, 69, 1676, 108, 145, 773, 1295, 235278, 235272, 235275, 68, 9954, 593, 685, 2067, 70, 1, 2, 67, 141, 1293, 9616, 235278, 1053, 235269, 89167,

In [12]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from transformers import get_linear_schedule_with_warmup, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader
from tqdm import tqdm

In [13]:
# LoRA adapter configuration for causal-LM fine-tuning.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,  # the adapters are going to be trained
    target_modules=['q_proj', 'v_proj'],  # only modules with these names get adapters
    r=8,  # LoRA rank
    lora_alpha=16,  # LoRA scaling factor
    lora_dropout=0.05,
    # NOTE(review): per the PEFT docs a bias setting other than "none" also trains bias terms,
    # so disabling the adapter would no longer reproduce the base model exactly - confirm this is intended.
    bias="all"
)

In [14]:
# Wrap the base model with the LoRA adapters described by `peft_config`.
# NOTE(review): PEFT presumably injects the adapter layers into `model` in place, so `model`
# and `peft_model` share the same weights from here on - confirm.
peft_model = get_peft_model(model, peft_config)

In [15]:
# Report how many of the model's parameters are trainable after adding the adapters.
peft_model.print_trainable_parameters()

trainable params: 921,600 || all params: 2,507,094,016 || trainable%: 0.036759690467068624


In [16]:
# Training hyperparameters shared by the data loaders, the LR schedule and the training loop.
batch_size = 4  # number of blocks per batch
num_epochs = 10  # full passes over the training blocks

In [17]:
# Make sure a pad token exists; fall back to EOS if the tokenizer does not define one.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

# mlm=False selects causal (next-token) language modelling instead of masked-LM batches.
# NOTE(review): the collator presumably rebuilds 'labels' from 'input_ids' itself - confirm
# against the DataCollatorForLanguageModeling documentation.
collator = DataCollatorForLanguageModeling(
            tokenizer,
            mlm=False,
            pad_to_multiple_of=8,
            return_tensors="pt")

In [18]:
# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(train_data_blocks, shuffle=True, collate_fn=collator, batch_size=batch_size)
# Evaluation only averages the per-batch loss, so it uses a fixed order: every epoch is then
# evaluated on exactly the same batches (including the partial last batch), which keeps the
# reported evaluation losses comparable across epochs.
eval_dataloader = DataLoader(test_data_blocks, shuffle=False, collate_fn=collator, batch_size=batch_size)

In [19]:
# Initial learning rate handed to AdamW.
lr = 3e-4

# NOTE(review): `model.parameters()` also yields the frozen base weights; presumably only the
# LoRA parameters receive gradients and get updated - confirm.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# Linear learning-rate schedule without warmup, spanning every optimizer step of the run
# (batches per epoch times number of epochs).
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [None]:
# Fine-tune for `num_epochs` epochs: each epoch trains on all training batches, evaluates on the
# held-out batches, prints both mean losses and saves a checkpoint.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    # Training pass: one optimizer step and one LR-scheduler step per batch.
    for step, batch in enumerate(tqdm(train_dataloader)):
        outputs = model(**batch.to(gpu))
        loss = outputs.loss
        # Accumulate a detached CPU copy so the running sum is not part of the autograd graph.
        total_loss += loss.detach().cpu().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # Evaluation pass on the held-out batches, without gradient tracking.
    model.eval()
    eval_loss = 0
    for step, batch in enumerate(tqdm(eval_dataloader)):
        with torch.no_grad():
            outputs = model(**batch.to(gpu))
        loss = outputs.loss
        eval_loss += loss.detach().cpu().float()

    # Mean per-batch loss of each pass.
    eval_epoch_loss = eval_loss / len(eval_dataloader)
    train_epoch_loss = total_loss / len(train_dataloader)
    print(f"{epoch=}: {train_epoch_loss=} {eval_epoch_loss=}")


    # Save a checkpoint after every epoch, named by the zero-based epoch index.
    # NOTE(review): presumably this stores only the adapter weights, not the full model - confirm.
    peft_model.save_pretrained(f'./finetuning_checkpoints/checkpoint-ep{epoch}')

In [None]:
# print(generate("<|fim_prefix|>def add(x, y) -> int:\n\n    \"\"\"<|fim_suffix|>\"\"\"\n        return x + y<|fim_middle|>"))

In [24]:
# Checkpoint written by the training loop after the last of the 10 epochs (epoch index 9).
checkpoint = "finetuning_checkpoints/checkpoint-ep9"
# Load in bfloat16, the same dtype as the base model above, so the before/after comparison
# runs at the same precision and the second model copy does not take twice the memory.
finetuned_model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(gpu)

def finetuned_generate(prompt, max_new_tokens=200):
    """Continue `prompt` with the fine-tuned checkpoint and return the decoded text.

    Args:
        prompt: Text to continue, e.g. a fill-in-the-middle prompt.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The first sequence returned by `finetuned_model.generate`, decoded with
        the tokenizer's default settings.
    """
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(gpu)
    outputs = finetuned_model.generate(inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0])

# Same prompt as the baseline cell, to compare the output before and after fine-tuning.
print(finetuned_generate("<|fim_prefix|>def add(x, y) -> int:\n\n    \"\"\"<|fim_suffix|>\"\"\"\n        return x + y<|fim_middle|>"))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<bos><|fim_prefix|>def add(x, y) -> int:

    """<|fim_suffix|>"""
        return x + y<|fim_middle|>Add two numbers.<|file_separator|><eos>
