In [None]:
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset

# for finetuning
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

In [None]:
# Context window used for loading and training. Any value may be chosen;
# RoPE scaling is handled internally by Unsloth.
max_seq_length = 2 * 2048

# None lets the loader auto-detect the dtype.
# Float16 suits Tesla T4 / V100, Bfloat16 suits Ampere and newer.
dtype = None

# 4-bit quantization lowers memory usage; may be set to False.
load_in_4bit = True

# Load model

In [None]:
# Load the base model together with its tokenizer.
# Alternative checkpoint: "unsloth/Llama-3.2-1B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model with LoRA adapters.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                  # any number > 0; suggested 8, 16, 32, 64, 128
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,        # any value is supported, 0 is the optimized path
    bias="none",           # any value is supported, "none" is the optimized path
    # True or "unsloth"; "unsloth" is advertised as using 30% less VRAM and
    # fitting 2x larger batch sizes, useful for very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,      # rank-stabilized LoRA is available but not used
    loftq_config=None,     # LoftQ is available but not used
)

# Attach the llama-3.1 chat template to the tokenizer.
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

# Process dataset

In [None]:
# System prompt prepended to every conversation that does not bring its own.
DEFAULT_SYSTEM_PROMPT = "Eres un asistente en español llamado RaMem, creado por DeepSphereAI y ayudas respondiendo con la mayor exactitud posible."


def formatting_prompts_func(examples, system_prompt = DEFAULT_SYSTEM_PROMPT):
    """Render a batch of conversations into chat-template text.

    For every conversation in ``examples["conversation"]``:

    * if it has no leading system message and an odd number of messages,
      the last message is dropped;
    * if it has no leading system message, one carrying ``system_prompt``
      is inserted at the front.

    The conversations are then rendered with the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with a ``"conversation"`` key holding a list
            of conversations, each a list of ``{"role", "content"}`` dicts.
        system_prompt: Content of the system message to insert when a
            conversation has none.

    Returns:
        ``{"text": [...]}`` with exactly one rendered string per input
        conversation, in the same order.
    """
    normalized = []
    for conv in examples["conversation"]:
        # Shallow copy: the batch handed in by the caller is never mutated.
        conv = list(conv)
        has_system = bool(conv) and conv[0]["role"] == "system"

        if not has_system:
            # Drop the dangling last *message* of this conversation. Popping
            # from the batch list instead would delete a whole conversation
            # and return fewer texts than input rows.
            if len(conv) % 2:
                conv.pop()
            conv.insert(0, {"role": "system", "content": system_prompt})

        normalized.append(conv)

    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in normalized]
    return { "text" : texts, }

# Fine-tuning

## Formatting dataset

In [None]:
# Read the local JSON file as the "train" split.
raw_dataset = load_dataset(
    "json",
    data_files="./RaMem_dataset.json",
    split="train",
)

# No ShareGPT standardization step is applied: formatting_prompts_func reads
# the "conversation" column directly and expects "role"/"content" messages.
# The batched map adds a "text" column with the rendered chat template.
dataset = raw_dataset.map(formatting_prompts_func, batched=True)

## Create Trainer

In [None]:
# Pick the mixed-precision mode once: bf16 when supported, fp16 otherwise.
bf16_available = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=not bf16_available,
    bf16=bf16_available,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="tensorboard",
)

# Supervised fine-tuning on the rendered "text" column, without packing.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

# Restrict training to the assistant responses; the two markers are the
# user and assistant headers of the chat template.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)

## Start training

In [None]:
# Run the fine-tuning loop; the value returned by train() is kept in
# trainer_stats.
trainer_stats = trainer.train()

## Test inference

In [None]:
from transformers import TextStreamer

In [None]:
# Switch the model to Unsloth's inference mode (native 2x faster inference).
FastLanguageModel.for_inference(model)

messages = [
    {
        "role": "user",
        "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,",
    },
]

# Tokenize the chat; the generation prompt is required so the model answers
# as the assistant.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Stream the generated tokens to the cell output, skipping the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
)


# Save model

In [None]:
# Output folders, one per exported artifact.
lora_folder_model  = "../base-RaMem-LoRA"
b16_folder_model = "../base-RaMem"
gguf_folder_model = "../base-RaMem-GGUF"
gguf_8bit_folder_model = "../base-RaMem-GGUF-8bit"
gguf_4bit_folder_model = "../base-RaMem-GGUF-4bit"

# Save the LoRA model together with its tokenizer.
model.save_pretrained(lora_folder_model)
tokenizer.save_pretrained(lora_folder_model)

# Save the merged 16-bit model together with its tokenizer.
model.save_pretrained_merged(b16_folder_model, tokenizer, save_method = "merged_16bit",)
tokenizer.save_pretrained(b16_folder_model)

# Save the GGUF exports. Each export gets the tokenizer written into its OWN
# folder; previously every GGUF step re-saved it into b16_folder_model.
gguf_exports = [
    (gguf_folder_model, "f16"),
    (gguf_8bit_folder_model, "q8_0"),
    (gguf_4bit_folder_model, "q4_k_m"),
]
for gguf_folder, quantization_method in gguf_exports:
    model.save_pretrained_gguf(gguf_folder, tokenizer, quantization_method = quantization_method)
    tokenizer.save_pretrained(gguf_folder)