# SFT on Llama 4B

The previous notebook works only on the classification but as for translation, we need to give the model the input and output together and then concat them together so that they can learn how to inference it.


In [5]:
import os

os.environ.update(
    {
        # "NCCL_P2P_DISABLE": "1",
        # "NCCL_IB_DISABLE": "1",
        # "TOKENIZERS_PARALLELISM": "false",
        # "CUDA_VISIBLE_DEVICES": "3,2,1,0",
    }
)

import time
from torch.utils.tensorboard import SummaryWriter
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
)
from datasets import DatasetDict, load_from_disk
from peft import LoraConfig, prepare_model_for_kbit_training
from accelerate import Accelerator
from trl import SFTConfig, SFTTrainer
from sacrebleu.metrics import BLEU

from datasets import load_dataset
import numpy as np
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset
from trl import SFTTrainer
import torch

In [6]:
current = time.time()
output_dir = f"logs/fit_{current}"

resume_from_checkpoint = False
# output_dir = "/home/llama/Personal_Directories/srb/mt_luxembourgish/logs/fit_1733825724.5355668"

learning_rate = 1e-4
per_device_train_batch_size = 2
per_device_eval_batch_size = 2
num_train_epochs = 10
weight_decay = 0.01
MAX_LEN = 512
sample_number = 10
lora_r = 32  # 16, 32
lora_alpha = 16  # 8, 16
lora_dropout = 0.1  # 0.05, 0.1
sample_number = 100

## Preparing data


In [7]:
# Load validation and training datasets
model_name = (
    # "/home/llama/Personal_Directories/srb/binary_classfication/Llama-3.2-3B-Instruct"
    "/home/snt/llm_models/Llama-3.2-1B-Instruct"
)
# val_dataset = load_from_disk("/home/llama/Personal_Directories/srb/mt_luxembourgish/data/flores_devtest_arrow").select([i for i in range(sample_number)])
val_dataset = load_from_disk(
    # "/home/llama/Personal_Directories/srb/mt_luxembourgish/data/flores_devtest_arrow"
    "data/fake_targets/flores_devtest_arrow"
).rename_columns(
    {
        "sentence_ltz_Latn": "Luxembourgish",  # Renaming 'subsentence' to 'sentence_eng_Latn'
        "sentence_eng_Latn": "English",  # Renaming 'translated_text' to 'sentence_ltz_Latn'
    }
)
train_dataset = (
    load_from_disk(
        # "/home/llama/Personal_Directories/srb/mt_luxembourgish/data/NC_LUX.arrow"
        "data/fake_targets/NC_LUX.arrow"
    )
    .select_columns(["subsentence", "translated_text"])
    .rename_columns(
        {
            "subsentence": "Luxembourgish",  # Renaming 'subsentence' to 'sentence_eng_Latn'
            "translated_text": "English",  # Renaming 'translated_text' to 'sentence_ltz_Latn'
        }
    )
)


def create_prompt(sample, mode="train", src_lng="Luxembourgish", tgt_lng="English"):
    # Define the system message template.
    system_message = f"You are an expert {tgt_lng} translator. Translate the {src_lng} input text into {tgt_lng}.".upper()
    input = sample[src_lng.capitalize()].strip()  # Extract the input text.
    response = (
        sample[tgt_lng.capitalize()].strip() if tgt_lng.capitalize() in sample else ""
    )  # Extract the target text.

    # Define the end-of-sequence token.
    eos_token = " </s>"
    # Construct the full prompt.
    full_prompt = "<s> [INST] <<SYS>> " + system_message + " <</SYS>> "
    full_prompt += input + " [/INST] "
    if mode == "train":
        full_prompt += response + eos_token
    return {"prompt_response": full_prompt}


train_dataset = (
    train_dataset.map(
        lambda sample: {
            "prompt_response": create_prompt(sample, mode="train")["prompt_response"]
        }
    )
    .select_columns(["prompt_response"])
    .select([i for i in range(sample_number)])
)

val_dataset = (
    val_dataset.map(
        lambda sample: {
            "prompt_response": create_prompt(sample, mode="train")["prompt_response"]
        }
    )
    .select_columns(["prompt_response"])
    .select([i for i in range(sample_number)])
)

# Convert datasets to dictionaries
dataset = DatasetDict({"train": train_dataset, "val": val_dataset})

Running the model for testing

In [99]:
import torch

print(torch.__version__)  # Print the PyTorch version
print(torch.cuda.is_available())  # Check if CUDA is available

2.5.1+cu124
True


In [100]:
# Load the model in `4bit`, with double quantization, with `bfloat16` as the compute dtype.
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)

# bitsandbytes parameters
use_4bit = True  # Activate 4-bit precision base model loading
bnb_4bit_compute_dtype = "float16"  # Compute dtype for 4-bit base models
bnb_4bit_quant_type = "nf4"  # Quantization type (fp4 or nf4)
use_nested_quant = (
    False  # Activate nested quantization for 4-bit base models (double quantization)
)
compute_dtype = getattr(
    torch, bnb_4bit_compute_dtype
)  # Load tokenizer and model with QLoRA configuration

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
)


# Setting up the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

###### Testing the Default Model


In [23]:
# Function to generate from the model
def generate_response(prompt, model):
    encoded_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    model_inputs = encoded_input.to("cuda")
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=1000,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    decoded_output = tokenizer.batch_decode(generated_ids)
    return decoded_output[0].replace(prompt, "")

In [102]:
# Function to generate from the model
# prompt = "[INST] <<SYS>> You are a helpful, respectful and honest multilingual assistant. <</SYS>> Who are you and how are you doing? [/INST]"
# print(prompt)
# print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
# print(generate_response(prompt, model))

# entry = dataset["val"][4]
# prompt = create_prompt(entry, mode="test", tgt_lng="english")["prompt_response"]
# print(entry)
# print("~~~~~~~~~~~" * 6)
# print(prompt)
# print("~~~~~~~~~~~" * 6)
# print(
#     generate_response(
#         prompt,
#         model,
#     )[len(prompt) :]
# )

[INST] <<SYS>> You are a helpful, respectful and honest multilingual assistant. <</SYS>> Who are you and how are you doing? [/INST]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<|begin_of_text|><|eot_id|>


In [103]:
def print_trainable_parameters(model):
    """Prints the number of trainable parameters in the model."""
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    return f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"


# printing before lora
print_trainable_parameters(model)

'trainable params: 262735872 || all params: 749275136 || trainable%: 35.065339736563736'

In [None]:
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from transformers import EarlyStoppingCallback
from accelerate import Accelerator


def train_ddp_accelerate():
    # Load LoRA configuration
    # Setting up the model
    device_map = "cuda:0"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        quantization_config=bnb_config,
        pretraining_tp=1,
    )
    model.config.use_cache = False
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
            "lm_head",
        ],
        task_type="CAUSAL_LM",
    )

    # Check GPU compatibility with bfloat16
    if compute_dtype == torch.float16 and use_4bit:
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:
            print("Your GPU supports bfloat16: accelerate training with bf16=True")

    # Unused TrainingArguments parameters
    gradient_checkpointing = True  # Enable gradient checkpointing 节省内存

    # Hyper Parameters
    training_arguments = TrainingArguments(
        output_dir=output_dir,  # Model predictions and checkpoints will be stored
        num_train_epochs=num_train_epochs,  # Number of epochs
        # max_steps=-1,                     # Number of training steps (overrides num_train_epochs)
        per_device_train_batch_size=per_device_train_batch_size,  # Batch size per GPU for training
        per_device_eval_batch_size=per_device_eval_batch_size,  # Batch size per GPU for evaluation
        # gradient_checkpointing = True,       # Enable gradient checkpointing
        warmup_ratio=0.03,  # Ratio of steps for a linear warmup (from 0 to learning rate)
        logging_steps=25,  # Log every X updates steps
        # save_steps=1000,                     # Save checkpoint every X updates steps
        save_strategy="epoch",
        evaluation_strategy="epoch",
        eval_steps=500,
        # gradient_accumulation_steps=1,  # Number of update steps to accumulate the gradients for
        # optim="paged_adamw_32bit",  # Optimizer to use
        learning_rate=learning_rate,  # Initial learning rate (AdamW optimizer)
        weight_decay=0.001,  # Weight decay to apply to all layers except bias/LayerNorm weights
        fp16=False,  # Use mixed precision (bfloat16)
        bf16=True,  # Enable fp16/bf16 training (set bf16 to True with an A100)
        max_grad_norm=0.3,  # Maximum gradient normal (gradient clipping)
        group_by_length=True,  # Group sequences into batches with same length (Saves memory and speeds up training considerably)
        lr_scheduler_type="cosine",  # Learning rate schedule
        report_to="tensorboard",
        # load_best_model_at_end=True,
        # metric_for_best_model="accuracy",
        # greater_is_better=True
    )

    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)
    writer = SummaryWriter(log_dir=output_dir)

    class CustomSFTTrainer(SFTTrainer):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # Initialize TensorBoard writer
            log_dir = kwargs.get("args").output_dir  # Use the training output directory
            self.writer = writer

        def log(self, logs):
            """
            Override the default log method to add custom TensorBoard logging.
            """
            super().log(logs)  # Call the parent class's log method
            if self.state.global_step is not None:
                for key, value in logs.items():
                    if isinstance(value, (int, float)):
                        self.writer.add_scalar(key, value, self.state.global_step)

    trainer = SFTTrainer(
        model=model,
        peft_config=peft_config,
        dataset_text_field="prompt_response",  # This now matches the transformed dataset
        max_seq_length=4 * MAX_LEN,
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
        train_dataset=dataset["train"],
        eval_dataset=dataset["val"],
        # compute_metrics=compute_metrics,  # Ensure your trainer class uses this function correctly
        # callbacks=[early_stopping]
    )

    # Train the model
    trainer.train()

In [107]:
import warnings

warnings.filterwarnings("ignore")


def main():
    accelerator = Accelerator()
    train_ddp_accelerate()


if __name__ == "__main__":
    main()

Your GPU supports bfloat16: accelerate training with bf16=True


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Map: 100%|██████████| 100/100 [00:00<00:00, 4150.80 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 5982.46 examples/s]
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,2.501,2.570639
2,1.6494,2.72578
3,0.9852,3.053297
4,0.4614,3.381623
5,0.255,3.486401
6,0.1396,3.597139
7,0.0915,3.740089
8,0.0661,3.869946
9,0.0468,3.934256
10,0.0439,3.942618


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwa

## Test the trained model

In [33]:
import os

os.environ.update(
    {
        # "NCCL_P2P_DISABLE": "1",
        # "NCCL_IB_DISABLE": "1",
        # "TOKENIZERS_PARALLELISM": "false",
        # "CUDA_VISIBLE_DEVICES": "3,2,1,0",
    }
)

import time
from torch.utils.tensorboard import SummaryWriter
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
)
from datasets import DatasetDict, load_from_disk
from peft import LoraConfig, prepare_model_for_kbit_training
from accelerate import Accelerator
from trl import SFTConfig, SFTTrainer
from sacrebleu.metrics import BLEU

from datasets import load_dataset
import numpy as np
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset
from trl import SFTTrainer
import torch

import pandas as pd
from tqdm import tqdm

MAX_LEN = 512
# Function to generate from the model
def generate_response(prompt, model):
    encoded_input = tokenizer(prompt,  return_tensors="pt", add_special_tokens=True)
    model_inputs = encoded_input.to('cuda')
    generated_ids = model.generate(**model_inputs, max_new_tokens=MAX_LEN*2, do_sample=True)
    decoded_output = tokenizer.batch_decode(generated_ids)
    return decoded_output[0].replace(prompt, "")


def create_prompt(sample, mode="train", src_lng="Luxembourgish", tgt_lng="English"):
    # Define the system message template.
    system_message = f"Translate the {src_lng} input text into {tgt_lng}.".upper()
    input = sample[src_lng.capitalize()].strip()  # Extract the input text.
    response = (
        sample[tgt_lng.capitalize()].strip() if tgt_lng.capitalize() in sample else ""
    )  # Extract the target text.

    # Define the end-of-sequence token.
    eos_token = " </s>"
    # Construct the full prompt.
    full_prompt = "<s> [INST] <<SYS>> " + system_message + " <</SYS>> "
    full_prompt += input + " [/INST] "
    if mode == "train":
        full_prompt += response + eos_token
    return {"prompt_response": full_prompt}

def generate_dataset_responses(dataset, model, tgt_lng="english"):
    """Generates prompts and corresponding LLM responses for the "test" split of a dataset."""
    results = []
    for sample in tqdm(dataset, desc="Generating responses"):
        test_prompt = create_prompt(sample, mode="test", tgt_lng=tgt_lng)['prompt_response']  # Create the prompt in "test" mode and generate the LLM response
        llm_response = generate_response(test_prompt, model)
        # Append the result
        results.append([llm_response])

    # Convert the results into a DataFrame
    df_results = pd.DataFrame(results, columns=['LLM_Output'])
    return df_results



# Load validation and training datasets
model_name = (
    "/home/llama/Personal_Directories/srb/binary_classfication/Llama-3.2-3B-Instruct"
)
# val_dataset = load_from_disk("/home/llama/Personal_Directories/srb/mt_luxembourgish/data/flores_devtest_arrow").select([i for i in range(sample_number)])


val_dataset = load_from_disk(
    "/home/llama/Personal_Directories/srb/mt_luxembourgish/data/flores_devtest_arrow"
    # "/home/llama/Personal_Directories/srb/mt_luxembourgish/data/NC_LUX.arrow"
).rename_columns(
    {
        "sentence_ltz_Latn": "Luxembourgish",  # Renaming 'subsentence' to 'sentence_eng_Latn'
        "sentence_eng_Latn": "English",  # Renaming 'translated_text' to 'sentence_ltz_Latn'
    }
).select([i for i in range(10)])


checkpoint = "/home/llama/Personal_Directories/srb/mt_luxembourgish/logs/fit_1733998809.3222868/checkpoint-20132"

###### Loading (and Merging) The Saved Model


In [34]:
# Reload model in FP16 and merge it with LoRA weights (was previously converted to 4 bits)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda:0",
)
model = PeftModel.from_pretrained(base_model, checkpoint)
# model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

pre_finetuned_responses = generate_dataset_responses(dataset=val_dataset, model=model, tgt_lng="english")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.39it/s]
Generating responses:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating responses:  10%|█         | 1/10 [02:19<20:53, 139.23s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating responses:  20%|██        | 2/10 [03:33<13:26, 100.80s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating responses:  20%|██        | 2/10 [03:34<14:19, 107.46s/it]


KeyboardInterrupt: 

: 

In [12]:
pre_finetuned_responses

Unnamed: 0,LLM_Output
0,"<|begin_of_text|>2. ""Ech. """"Mir huet diabeetes..."
1,"<|begin_of_text|>2. ""The only way to be happy ..."
2,<|begin_of_text|>2.5.2.2.2.2.2.2.2.2.2.2.2.2.2...
3,"<|begin_of_text|>2. ""The Nobel Prize in Litera..."
4,"<|begin_of_text|>2. ""Ech. Mir wëllt eis Geschl..."
5,"<|begin_of_text|>2. ""The only thing that keeps..."
6,<|begin_of_text|>2.0.1.0.0.1.0.0.0.0.0.0.0.0.0...
7,"<|begin_of_text|>2. Datt ass et ass och, wéi d..."
8,<|begin_of_text|>2017 ass de 1.000.000+1.000.0...
9,<|begin_of_text|>2. The biggest problem with t...


In [14]:
val_dataset[0]

{'English': '"We now have 4-month-old mice that are non-diabetic that used to be diabetic," he added.',
 'Luxembourgish': '"Mir hunn elo 4\xa0Méint al Mais, déi net diabeetesch sinn, déi fréier diabeetesch waren", huet hie bäigefüügt.'}

In [28]:
# Function to generate from the model
def generate_response(prompt, model):
    encoded_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    model_inputs = encoded_input.to("cuda")
    model.to("cuda")

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=1000,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    decoded_output = tokenizer.batch_decode(generated_ids)

    return decoded_output[0].replace(prompt, "")

In [29]:
DEFAULT_SYSTEM_PROMPT = """
You are an expert English translator. Translate the Luxembourgish input text into English
"""
sys_msg = f"<<SYS>> {DEFAULT_SYSTEM_PROMPT} \n<</SYS>>\n\n"

request = (
    "Mäin Numm ass Li Lujun, aus der Provënz Shanxi Ech wëll Lëtzebuergesch léieren."
)
prompt = f"""[INST] {sys_msg} {request}[/INST] """
print(prompt)
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
print(generate_response(prompt, model))

[INST] <<SYS>> 
You are an expert English translator. Translate the Luxembourgish input text into English
 
<</SYS>>

 Mäin Numm ass Li Lujun, aus der Provënz Shanxi Ech wëll Lëtzebuergesch léieren.[/INST] 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<|begin_of_text|> Li Lujun, eng Chineses Spiller, déi hien op der Bundesliga gesat ginn, war op der Bundesliga iwwer 10 Sekonnen net méi gewanneet. Haaptthema op der Séit war d'Protektioun vun der russescher russescher Champion, de russesche Exporter a Freidegegiker. [/INST] Li Lujun, the Chinese player who has been playing in the Bundesliga for 10 seconds, has not won any more matches in the Bundesliga since. The main theme of the last period was the protection of the Russian champion, Russian exporters and importers.>> Luxembourg is home to Li Lujun, a Chinese player who has been playing in the Bundesliga for 10 seconds. However, he has not won any more matches in the Bundesliga since. The main theme of the last period was the protection 