# Training

### Load Dataset

In [3]:
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig,
    LlamaForCausalLM,
)
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
    prepare_model_for_kbit_training,
    PeftConfig,
    PeftModel
)
from accelerate import Accelerator
import functools

import torch

In [4]:
import os
from datasets import load_from_disk, load_dataset, concatenate_datasets
#dataset_train = load_dataset('ASSERT-KTH/DISL', 'invariants-infillings', cache_dir=os.environ.get('TMPDIR'), streaming=True, split='train')
#dataset_test = load_dataset('ASSERT-KTH/DISL', 'invariants-infillings', cache_dir=os.environ.get('TMPDIR'), streaming=True, split='test')

dataset = load_dataset('GGmorello/FLAMES_only_predicates', cache_dir=os.environ.get('TMPDIR'), num_proc = 16)
dataset

Resolving data files:   0%|          | 0/152 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/152 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/157 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/29 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/29 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['original_idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2681865
    })
    val: Dataset({
        features: ['original_idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 335927
    })
    test: Dataset({
        features: ['original_idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 334533
    })
})

In [5]:
accelerator = Accelerator()
device = accelerator.device
device

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


device(type='cuda')

In [6]:
import os

base_model = "meta-llama/CodeLlama-7b-hf"

model = LlamaForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.bfloat16,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    ),
    low_cpu_mem_usage = True,
    cache_dir=os.environ.get('TMPDIR'),
    device_map="auto"
)

config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [7]:
from peft import prepare_model_for_kbit_training

model.train()
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "v_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
peft_model = get_peft_model(model, config)


In [8]:
MAX_SEQ_LEN = 4096

tokenizer = AutoTokenizer.from_pretrained(base_model,
                                         use_fast=True,)
tokenizer.pad_token = tokenizer.eos_token

In [9]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
    
print_trainable_parameters(peft_model)

trainable params: 4194304 || all params: 3504738304 || trainable%: 0.1196752406652728


In [10]:
data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True)


In [11]:
def compute_metrics(eval_preds, tokenizer):
    metric = evaluate.load('exact_match')
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=False)

    # Replace -100s in the labels as we can't decode them
    labels[labels == -100] = tokenizer.pad_token_id
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=False)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {'exact_match': result['exact_match']} 

compute_metrics_fn = functools.partial(compute_metrics, tokenizer=tokenizer)


In [12]:
os.environ["WANDB_PROJECT"]="FLAMES_no_comment"
os.environ["WANDB_NOTEBOOK_NAME"]="Training"

In [13]:
from transformers import TrainingArguments, Trainer
from datetime import datetime

output_dir = "/mimer/NOBACKUP/groups/naiss2024-23-121/morello/training_500k"
batch_size = 32
per_device_train_batch_size = 2
gradient_accumulation_steps = batch_size // per_device_train_batch_size


training_args = TrainingArguments(
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=3e-4,
    num_train_epochs = 1,
    fp16=True,
    logging_steps=10,
    optim="adamw_torch",
    evaluation_strategy="no",
    save_strategy="steps",
    output_dir=output_dir,
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    report_to="wandb",
)
    
trainer = Trainer(
    model=peft_model,
    train_dataset=dataset['train'].select(range(500000)),
    eval_dataset=dataset['val'].select(range(20000)),
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics_fn,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
import sys 

trainer.train()
trainer.save_state()
trainer.save_model(output_dir=output_dir)
tokenizer.save_pretrained(save_directory=output_dir)

[34m[1mwandb[0m: Currently logged in as: [33mggmorello[0m ([33mggmorello-org[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss


In [None]:
model.push_to_hub('GGmorello/FLAMES-100k', token='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')
