# Training

### Load Dataset

In [1]:
import os
from datasets import load_from_disk, load_dataset, concatenate_datasets
# Alternative (disabled): stream the 'invariants-infillings' train/test splits of
# ASSERT-KTH/DISL through load_dataset instead of reading the on-disk copy below.
#dataset_train = load_dataset('ASSERT-KTH/DISL', 'invariants-infillings', cache_dir=os.environ.get('TMPDIR'), streaming=True, split='train')
#dataset_test = load_dataset('ASSERT-KTH/DISL', 'invariants-infillings', cache_dir=os.environ.get('TMPDIR'), streaming=True, split='test')

# Load the full dataset from its on-disk copy.
dataset = load_from_disk('/mimer/NOBACKUP/groups/naiss2024-23-121/morello/dataset_train_full')

Loading dataset from disk:   0%|          | 0/255 [00:00<?, ?it/s]

In [2]:
# Split into train/test with the default proportions. A fixed seed keeps the split
# identical across kernel restarts, so the test rows never leak into a later training run.
ds = dataset.train_test_split(seed=42)

In [3]:
# Display the resulting DatasetDict (features and row counts of both splits).
ds

DatasetDict({
    train: Dataset({
        features: ['comment', 'input', 'label', 'original_idx', 'predicate', 'len'],
        num_rows: 3964493
    })
    test: Dataset({
        features: ['comment', 'input', 'label', 'original_idx', 'predicate', 'len'],
        num_rows: 1321498
    })
})

In [4]:
# Turn both splits into iterable datasets, train first and test second.
dataset_train, dataset_test = (
    ds[split_name].to_iterable_dataset() for split_name in ("train", "test")
)

In [5]:
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig,
    LlamaForCausalLM,
)
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
    prepare_model_for_kbit_training
)
import torch

In [6]:
import os

base_model = "codellama/CodeLlama-7b-hf"

# 4-bit NF4 quantization with double quantization; matmuls are computed in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized base model; downloaded files are cached under $TMPDIR.
model = LlamaForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    cache_dir=os.environ.get('TMPDIR'),
)




config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [7]:
from peft import prepare_model_for_kbit_training

# Switch the quantized model to training mode, then run PEFT's k-bit training
# preparation on it. The order of these statements is intentional; keep it.
model.train()
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings: rank 8, alpha 16, dropout 0.05, no bias terms,
# adapters attached to the q_proj and v_proj modules only.
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj","v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
# Wrap the prepared model with the adapters described by `config`.
peft_model = get_peft_model(model, config)


In [8]:
# Maximum sequence length in tokens. Integer division keeps this an int:
# `16384/4` evaluates to the float 4096.0, which would then be passed on as
# `model_max_length` / `max_length`, both of which are token counts.
MAX_SEQ_LEN = 16384 // 4

tokenizer = AutoTokenizer.from_pretrained(base_model,
                                         truncation=False,
                                         model_max_length=MAX_SEQ_LEN,
                                         padding_side="left",
                                         use_fast=True,
                                         )

## Tokenization


In [9]:
# Ask the tokenizer to append the EOS token to every encoded sequence.
tokenizer.add_eos_token = True
# Pad with token id 0.
# NOTE(review): presumably this is the <unk> token of the Llama vocabulary — confirm.
tokenizer.pad_token_id = 0

def tokenize(text, tokenizer, max_seq_len=MAX_SEQ_LEN):
    """Encode ``text`` without truncation or padding, rejecting over-long results.

    Args:
        text: The string to encode.
        tokenizer: Callable tokenizer; its result must expose ``'input_ids'``.
        max_seq_len: Largest accepted number of input ids.

    Returns:
        The tokenizer output, or ``None`` when it holds more than
        ``max_seq_len`` input ids.
    """
    encoded = tokenizer(
        text,
        truncation=False,
        max_length=max_seq_len,
        padding=False,
        return_tensors=None,
    )
    # Over-long samples are signalled with None so the caller can drop them.
    fits = len(encoded['input_ids']) <= max_seq_len
    return encoded if fits else None
    
def generate_and_tokenize_prompt(sample):
    """Build one causal-LM training example from a dataset row.

    The prompt (``sample['input']``) and the target (``sample['label']``) are
    tokenized separately and concatenated. Prompt positions are masked with
    ``-100`` in ``labels`` so that only the target tokens contribute to the loss.

    Args:
        sample: Mapping with the string fields ``'input'`` and ``'label'``.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``. All three
        are ``None`` when either part, or the concatenated sequence, is longer
        than ``MAX_SEQ_LEN``; such rows are meant to be filtered out afterwards.
    """
    rejected = {"input_ids": None, "attention_mask": None, "labels": None}

    tokenized_input_text = tokenize(sample['input'], tokenizer, max_seq_len=MAX_SEQ_LEN)
    tokenized_target_text = tokenize(sample['label'], tokenizer, max_seq_len=MAX_SEQ_LEN)
    if tokenized_input_text is None or tokenized_target_text is None:
        return rejected

    # Drop the last prompt token (the EOS appended because add_eos_token is set)
    # and the first target token (presumably the BOS the tokenizer prepends),
    # so the two parts form one continuous sequence.
    prompt_ids = tokenized_input_text['input_ids'][:-1]
    target_ids = tokenized_target_text['input_ids'][1:]
    input_ids = prompt_ids + target_ids

    # Each part fitting on its own does not guarantee the combined sequence fits.
    if len(input_ids) > MAX_SEQ_LEN:
        return rejected

    return {
        'input_ids': input_ids,
        'attention_mask': [1] * len(input_ids),
        'labels': [-100] * len(prompt_ids) + target_ids,
    }

In [10]:
def _tokenize_split(split):
    """Map `generate_and_tokenize_prompt` over `split`, dropping its original columns."""
    return split.map(generate_and_tokenize_prompt, remove_columns=split.column_names)


train_tokenized = _tokenize_split(dataset_train)
test_tokenized = _tokenize_split(dataset_test)


In [11]:
# Drop the samples that tokenization rejected (marked with input_ids=None).
# Both splits must keep the valid rows; filtering the test split on `is None`
# would retain only the rejected samples.
train_tokenized = train_tokenized.filter(lambda sample: sample["input_ids"] is not None)
test_tokenized = test_tokenized.filter(lambda sample: sample["input_ids"] is not None)

In [14]:
# Pull the first tokenized training sample for manual inspection.
s = next(iter(train_tokenized))

In [17]:
# Decode the sample's token ids back to text to eyeball the prompt/target concatenation.
tokenizer.decode(s['input_ids'])



In [30]:
# Two short example sentences for a tokenizer sanity check.
input1 = "I have a big dog."
input2 = "I love it."
# Concatenate both sentences and terminate them with the tokenizer's EOS token.
full_text = "".join((input1, input2, tokenizer.eos_token))
full_text

'I have a big dog.I love it.</s>'

In [33]:
tokenized_full_text = tokenize(full_text, tokenizer)
tokenized_input_text = tokenize(input1, tokenizer)

# Count the tokens of the first sentence. len() on the tokenizer output itself
# would count its entries (input_ids, attention_mask), not the tokens.
input_len = len(tokenized_input_text['input_ids'])



TypeError: can't multiply sequence by non-int of type 'list'

{'input_ids': [1, 32007, 306, 505, 263, 29871, 32008, 11203, 29889, 29902, 5360, 372, 29889, 2, 32009, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [45]:
# Tokenize each line of `s` as one padded batch of PyTorch tensors.
# NOTE(review): the visible cells assign `s` a tokenized sample that is indexed as
# s['input_ids']; `.splitlines()` needs a string, so this cell presumably relied on a
# different `s` from a cell that is no longer in the notebook — confirm.
inputs = tokenizer(s.splitlines(), return_tensors="pt", padding=True)


In [47]:
from peft import replace_lora_weights_loftq

# Best (lowest) MSE observed so far; updated by my_callback.
current_mse = float("inf")


def my_callback(model, module_name):
    """Decide whether a LoftQ weight replacement for ``module_name`` is kept.

    Runs ``model`` on the module-level ``inputs``, compares the logits with
    ``logits_base`` through ``get_mse`` and accepts the replacement only when
    the result is lower than the best value recorded in ``current_mse``.

    NOTE(review): ``get_mse`` and ``logits_base`` are not defined in the
    visible cells — confirm they exist before using this callback.

    Returns:
        True when the MSE improved (and ``current_mse`` was updated), else False.
    """
    global current_mse

    candidate_logits = model(**inputs).logits
    candidate_mse = get_mse(logits_base, candidate_logits)
    if not candidate_mse < current_mse:
        print(f"MSE did not improve for module {module_name}")
        return False

    current_mse = candidate_mse
    print(f"MSE improved for module {module_name}")
    return True


# Re-initialise the LoRA weights of the PEFT model with LoftQ.
# NOTE(review): `my_callback` is defined in this cell but is not passed to this call,
# so it is never invoked — confirm whether `callback=my_callback` was intended.
replace_lora_weights_loftq(peft_model)

KeyboardInterrupt: 

In [48]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Walks ``model.named_parameters()`` once and prints the trainable count, the
    total count and the trainable share as a percentage. Returns nothing.
    """
    # (element count, requires_grad) for every parameter, gathered in one pass.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
    
# Report the trainable share of the model after the LoRA adapters were attached.
print_trainable_parameters(model)

trainable params: 4456448 || all params: 3505000448 || trainable%: 0.12714543310666077


In [49]:
# Collator that pads each batch (to a multiple of 8) and returns PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True)


In [50]:
import functools

# Evaluation metrics
def compute_metrics(eval_preds, tokenizer):
    """Compute the exact-match score between decoded predictions and labels.

    Args:
        eval_preds: ``(predictions, labels)`` pair. ``predictions`` may be a
            tuple whose first element holds the predictions, and may contain
            either token ids ``(batch, seq)`` or raw logits
            ``(batch, seq, vocab)``; logits are reduced with argmax.
        tokenizer: Object providing ``batch_decode`` and ``pad_token_id``.

    Returns:
        dict: ``{'exact_match': <score reported by the metric>}``.

    NOTE(review): for a causal LM the logits at position t predict token t+1,
    and prompt positions are masked only in the labels; the decoded strings may
    need a shift/mask before comparison — confirm against the evaluation setup.
    """
    # Imported here because neither module is imported at the top of the notebook.
    import numpy as np
    import evaluate

    metric = evaluate.load('exact_match')
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.asarray(preds)
    labels = np.asarray(labels)

    # Raw logits cannot be decoded; pick the most likely token per position.
    if preds.ndim == 3:
        preds = preds.argmax(axis=-1)

    # -100 marks ignored positions and cannot be decoded. np.where builds new
    # arrays, so the caller's prediction/label arrays are left untouched.
    preds = np.where(preds == -100, tokenizer.pad_token_id, preds)
    labels = np.where(labels == -100, tokenizer.pad_token_id, labels)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=False)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {'exact_match': result['exact_match']}

# Bind the tokenizer so the resulting callable only takes `eval_preds`.
compute_metrics_fn = functools.partial(compute_metrics, tokenizer=tokenizer)


In [54]:
from transformers import TrainingArguments, Trainer
from datetime import datetime

# Directory that receives checkpoints and the final model.
output_dir = "/mimer/NOBACKUP/groups/naiss2024-23-121/morello/training"


training_args = TrainingArguments(
    optim='adamw_torch',
    do_eval = False,
    # The base model is loaded in bfloat16 and its 4-bit layers compute in
    # bfloat16, so train with bf16 mixed precision rather than fp16.
    bf16 = True,
    num_train_epochs = 2,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 1,
    gradient_accumulation_steps = 1,
    # Evaluation is disabled; eval_steps only matters if a step-based strategy is enabled.
    evaluation_strategy = "no",
    eval_steps = 10,
    save_steps = 150,
    learning_rate = 5e-4,
    lr_scheduler_type = "cosine",
    logging_steps = 10,
    ddp_find_unused_parameters = False,
    # Reuse the variable so checkpoints and the final save target the same directory.
    output_dir = output_dir,
    # The training split is an iterable dataset, so the run length is set in steps.
    max_steps = 5000,
)

trainer = Trainer(
    model=peft_model,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    args=training_args,
    data_collator=data_collator
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [55]:
import sys 

# The key/value cache only helps generation. It is switched off for training,
# where it wastes memory and conflicts with gradient checkpointing (which PEFT's
# k-bit preparation turns on by default). Re-enable it before running inference.
model.config.use_cache = False

trainer.train()
# Persist the trainer state, the trained model and the tokenizer to output_dir.
trainer.save_state()
trainer.save_model(output_dir=output_dir)
tokenizer.save_pretrained(save_directory=output_dir)




Step,Training Loss




KeyboardInterrupt: 

In [None]:
# Never embed access tokens in the notebook: read the token from the HF_TOKEN
# environment variable (None falls back to the locally stored Hub login).
# The token that used to be written here has been exposed and should be revoked.
# NOTE(review): this pushes `model`; confirm whether `peft_model` (adapter only) was intended.
model.push_to_hub('GGmorello/FLAMES', token=os.environ.get('HF_TOKEN'))


In [None]:
not