In [None]:
from unsloth import FastLanguageModel
import os
import torch
# --- Base-model loading configuration -------------------------------------
# Maximum sequence length, reused later by the trainer.
# (Author's note: any value works because RoPE scaling is applied automatically.)
max_seq_length = 2048
# None requests automatic dtype detection.
dtype = None
# Load the model quantized to 4-bit.
load_in_4bit = True
# LoRA rank, reused when the adapters are attached. Larger rank = smarter, but slower.
lora_rank = 64

# Hugging Face access token read from the environment; HF_TOKEN takes
# precedence and HUGGINGFACE_HUB_TOKEN is the fallback. May be None.
hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")

# Collect the loader arguments first so the call itself stays a one-liner.
base_model_kwargs = dict(
    model_name="meta-llama/Llama-3.1-8B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=hf_token,
    max_lora_rank=lora_rank,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

In [None]:
# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

lora_kwargs = dict(
    r=lora_rank,                      # any number > 0; suggested 8, 16, 32, 64, 128
    target_modules=lora_target_modules,
    lora_alpha=lora_rank,             # alpha is tied to the rank
    lora_dropout=0,                   # supports any, but = 0 is optimized
    bias="none",                      # supports any, but = "none" is optimized
    use_gradient_checkpointing=True,
    random_state=3407,
    use_rslora=True,                  # rank-stabilized LoRA
    loftq_config=None,                # LoftQ disabled
)

# Wrap the loaded base model with LoRA adapters; `model` now refers to the PEFT model.
model = FastLanguageModel.get_peft_model(model, **lora_kwargs)

In [None]:
# Alpaca-style formatting using a math dataset (MATH_DATASET) and a code dataset
# (CODE_DATASET), N_PER_DATASET examples each, materialized to regular datasets.
from datasets import load_dataset, concatenate_datasets, Dataset

# Config
MATH_DATASET = "nvidia/OpenMathReasoning"  # Hub id of the math dataset
MATH_SPLIT = "cot"  # split of the math dataset that is loaded
CODE_DATASET = "TokenBender/code_instructions_122k_alpaca_style"  # Hub id of the code dataset
SEED = 42  # seed used when shuffling the merged dataset
N_PER_DATASET = 2_000  # number of examples taken from the start of each dataset

# Template (Alpaca-style). The three `{}` slots are filled positionally with
# instruction, input, and response; the same template is reused at inference
# time with an empty response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# 1) Stream the math dataset, keep only its first N_PER_DATASET rows, and
#    materialize them into a regular Dataset (avoids IterableColumn).
math_stream = load_dataset(MATH_DATASET, split=MATH_SPLIT, streaming=True).take(N_PER_DATASET)
math_ds = Dataset.from_list(list(math_stream))


def _math_row_to_alpaca(row):
    """Map one math row onto the Alpaca fields; missing keys become ""."""
    return {
        "instruction": row.get("problem", ""),
        "input": "",
        "output": row.get("generated_solution", ""),
    }


# Every source column other than the three Alpaca fields is dropped by the map.
math_columns_to_drop = [
    name
    for name in math_ds.column_names
    if name not in ("instruction", "input", "output")
]
math_ds = math_ds.map(_math_row_to_alpaca, remove_columns=math_columns_to_drop)

# 2) Stream the code dataset, keep only its first N_PER_DATASET rows, and
#    materialize them into a regular Dataset.
code_stream = load_dataset(CODE_DATASET, split="train", streaming=True).take(N_PER_DATASET)
code_ds = Dataset.from_list(list(code_stream))


def _code_row_to_alpaca(row):
    """Keep the Alpaca fields of one code row; a missing "input" becomes ""."""
    return {
        "instruction": row["instruction"],
        "input": row.get("input", ""),
        "output": row["output"],
    }


# Every source column other than the three Alpaca fields is dropped by the map.
code_columns_to_drop = [
    name
    for name in code_ds.column_names
    if name not in ("instruction", "input", "output")
]
code_ds = code_ds.map(_code_row_to_alpaca, remove_columns=code_columns_to_drop)

# 3) Concatenate the math and code datasets, then shuffle with a fixed seed.
combined = concatenate_datasets([math_ds, code_ds])
merged = combined.shuffle(seed=SEED)

# End-of-sequence token string taken from the tokenizer; it is appended to
# every formatted training example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of Alpaca records into training strings.

    Args:
        examples: Batched mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        A dict with a single "text" key holding one string per record: the
        module-level ``alpaca_prompt`` filled with the record's fields and
        followed by ``EOS_TOKEN``. A falsy input (e.g. ``None``) is rendered
        as an empty string.
    """
    records = zip(examples["instruction"], examples["input"], examples["output"])
    rendered = []
    for instruction, context, response in records:
        prompt = alpaca_prompt.format(instruction, context or "", response)
        rendered.append(prompt + EOS_TOKEN)
    return {"text": rendered}

# Render every merged record into a single "text" column; the original
# columns are removed so only the formatted text remains.
source_columns = merged.column_names
formatted = merged.map(formatting_prompts_func, batched=True, remove_columns=source_columns)

# 4) Preview the beginning of the first formatted example.
print("\nPreview (first 600 chars):\n")
first_text = formatted[0]["text"]
print(first_text[:600])

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Choose the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=4,   # training samples per batch per device (GPU/CPU)
    gradient_accumulation_steps=4,   # number of steps over which gradients are accumulated
    max_grad_norm=1.0,
    warmup_ratio=0.05,               # warmup expressed as a ratio, not a step count
    num_train_epochs=1,
    learning_rate=1e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=10,                # how often (in steps) training logs are emitted
    optim="adamw_8bit",              # optimizer to use
    weight_decay=0.01,
    lr_scheduler_type="cosine",      # learning-rate scheduler type
    seed=3407,
    output_dir="outputs",
    report_to="none",                # use this for WandB etc
)

# Supervised fine-tuning trainer over the formatted "text" dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted,
    dataset_text_field="text",       # name of the dataset field that holds the training text
    max_seq_length=max_seq_length,
    dataset_num_proc=2,              # number of processes used for dataset preprocessing
    packing=True,
    args=training_args,
)

In [None]:
#@title Show current memory stats
# Baseline figures captured before training; `start_gpu_memory` and
# `max_memory` are reused by the post-training report.
BYTES_PER_GIB = 1024 ** 3  # dividing by a power of two is exact, same as /1024/1024/1024

gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes = torch.cuda.max_memory_reserved()

start_gpu_memory = round(reserved_bytes / BYTES_PER_GIB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)

for line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(line)

In [None]:
# Run fine-tuning. The result is kept because its `metrics` dict
# (`train_runtime`) is read by the final stats report.
trainer_stats = trainer.train()

In [None]:
#@title Show final memory and time stats
# Compares peak reserved GPU memory after training with the baseline
# (`start_gpu_memory`) and the device total (`max_memory`) recorded earlier.
BYTES_PER_GIB = 1024 ** 3  # dividing by a power of two is exact, same as /1024/1024/1024

used_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']

report_lines = [
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
for line in report_lines:
    print(line)

In [None]:
# Inference smoke test on a coding task, using the same Alpaca template as
# training with the response slot left empty for the model to complete.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

code_prompt = alpaca_prompt.format(
    "Implement a algorithm in Python for sorting two large lists A and B. The algorithm should take the first element of list A and compare it with the first element of list B. If it is greater, then it should add the element to the result list and move the pointer of B to the next element. If the element of A is smaller, the pointer should move to the next element of A, and so on.", # instruction
    "A = [3, 6, 8, 10, 11] \nB = [2, 5, 7, 12]", # input
    "", # output
)
inputs = tokenizer([code_prompt], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)  # streams decoded tokens to stdout as they are generated

# Give generation an explicit budget. Without `max_new_tokens`, generate()
# falls back to the generation config's length limit; the library default
# `max_length` is 20 tokens *including the prompt* (unless the model's own
# generation config overrides it), which is far shorter than this prompt.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 512)

In [None]:
# Inference smoke test on a math word problem, using the same Alpaca template
# as training with empty input and response slots.
FastLanguageModel.for_inference(model)

math_prompt = alpaca_prompt.format(
    """For the natural number A, the quotient of A divided by 9 is 6 and the remainder is 5. What is the value of A?""", # instruction
    "", # input
    "", # output
)
inputs = tokenizer([math_prompt], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)  # streams decoded tokens to stdout as they are generated

# Give generation an explicit budget. Without `max_new_tokens`, generate()
# falls back to the generation config's length limit; the library default
# `max_length` is 20 tokens *including the prompt* (unless the model's own
# generation config overrides it), which is shorter than this prompt.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 512)

In [None]:
# Upload the fine-tuned model to the Hugging Face Hub using the
# "merged_16bit" save method.
output_repo = "Alelcv27/Llama3.1-8B-DataMerged"

# Authenticate with `hf_token`, the token read from the environment when the
# base model was loaded. This notebook defines no bare `token` variable, so
# passing `token = token` raised NameError before any upload started.
model.push_to_hub_merged(output_repo, tokenizer, save_method = "merged_16bit", token = hf_token)