# LLAMA3 Fine-tuning for machine translation


In [1]:
%pip install torch tensorboard evaluate scikit-learn transformers peft nltk seaborn trl sacrebleu

Note: you may need to restart the kernel to use updated packages.


In [2]:
from huggingface_hub import login
import time
import pandas as pd
import torch
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
import numpy as np
import torch.nn.functional as F
import evaluate
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score
from transformers import AutoModelForCausalLM, AutoTokenizer
from scipy.stats import pearsonr
from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoTokenizer,
    BitsAndBytesConfig,
)

  from .autonotebook import tqdm as notebook_tqdm


# Load data and set up the environment

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_from_disk, load_dataset


# Timestamp identifying this run; the TensorBoard log directory below and the
# trainer's output_dir are both named after it.
current = time.time()
writer = SummaryWriter(log_dir=f"logs/fit_{current}/")

# Optimisation hyperparameters
learning_rate, weight_decay = 1e-4, 0.01
num_train_epochs = 5
per_device_train_batch_size = per_device_eval_batch_size = 1

# Data settings
MAX_LEN = 512  # token length every sentence is truncated/padded to
sample_size_training = 5  # number of rows kept from each split

# Base model checkpoint (local path)
model_name = "/home/lujun_li/projects/base_models/Llama-3.2-1B-Instruct"

# Both splits are truncated to their first `sample_size_training` rows.
val_dataset = load_from_disk(
    "/home/lujun_li/projects/mt_luxembourgish/data/fake_targets/flores_devtest_arrow"
).select(range(sample_size_training))

# Rename the training columns so they match the validation split's column names.
# NOTE(review): this assumes 'subsentence' holds the Luxembourgish text and
# 'translated_text' its English translation - confirm against the NC_LUX data.
train_dataset = (
    load_from_disk("/home/lujun_li/projects/mt_luxembourgish/data/fake_targets/NC_LUX.arrow")
    .select_columns(["subsentence", "translated_text"])
    .rename_columns({
        "subsentence": "sentence_ltz_Latn",  # 'subsentence' -> Luxembourgish column
        "translated_text": "sentence_eng_Latn",  # 'translated_text' -> English column
    })
    .select(range(sample_size_training))
)

# Bundle both splits into a single DatasetDict
dataset = DatasetDict({'train': train_dataset, 'val': val_dataset})
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence_ltz_Latn', 'sentence_eng_Latn'],
        num_rows: 5
    })
    val: Dataset({
        features: ['sentence_eng_Latn', 'sentence_ltz_Latn'],
        num_rows: 5
    })
})

## Load Model


In [4]:
# 4-bit quantization settings
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # enable 4-bit quantization
    bnb_4bit_quant_type='nf4',              # NF4 quantization data type
    bnb_4bit_use_double_quant=True,         # quantize the quantization constants as well
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
)

# LoRA adapter settings
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,               # rank of the low-rank update matrices
    lora_alpha=8,       # scaling factor applied to the LoRA update
    lora_dropout=0.05,  # dropout probability inside the LoRA layers
    bias='none',        # do not train bias weights
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],  # attention projections
)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_prefix_space=True,
)
# Reuse the EOS token for padding (both the token string and its id).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [6]:
# Pre-tokenize the dataset once with the Llama tokenizer so tokenization is not repeated during training
def llama_preprocessing_function(examples):
    """Tokenize a batch of sentence pairs.

    The Luxembourgish sentences provide ``input_ids`` and ``attention_mask``;
    the English sentences provide ``labels``. Each side is tokenized on its own
    and truncated/padded to ``MAX_LEN`` tokens, so the labels are not a
    continuation of the inputs.
    """
    # Identical tokenizer settings for the source and target sides.
    shared_kwargs = dict(
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt",
        padding='max_length',
    )
    source = tokenizer(examples['sentence_ltz_Latn'], **shared_kwargs)
    target = tokenizer(examples['sentence_eng_Latn'], **shared_kwargs)

    return dict(
        input_ids=source['input_ids'],
        attention_mask=source['attention_mask'],
        labels=target['input_ids'],
    )

# Tokenize every split in batches, then have the datasets return torch tensors.
tokenized_datasets = dataset.map(
    function=llama_preprocessing_function,
    batched=True,
)
tokenized_datasets.set_format(type="torch")

# Data collator and trainer


In [None]:
from accelerate import Accelerator
from transformers import DataCollatorWithPadding, TrainingArguments
from trl import SFTTrainer
from sacrebleu.metrics import BLEU
import sentencepiece as spm


# Module-level BLEU scorer shared by calculate_sp_bleu and compute_metrics below.
bleu = BLEU(tokenize="flores200", effective_order=True)

def calculate_sp_bleu(hypothesis, reference):
    """Return the sentence-level BLEU score of ``hypothesis`` against one ``reference``."""
    references = [reference]
    result = bleu.sentence_score(hypothesis, references)
    return result.score

def train_ddp_accelerate():
    """Fine-tune the base model with LoRA adapters using TRL's SFTTrainer.

    Meant to be handed to ``accelerate.notebook_launcher``. Reads the
    module-level ``model_name``, ``tokenizer``, ``tokenized_datasets``,
    ``lora_config``, ``writer``, ``bleu`` and the hyperparameter constants.
    Numeric values logged by the trainer are mirrored to the shared
    TensorBoard ``writer``, which is closed when the run ends, including when
    training is interrupted or raises.
    """
    # NOTE(review): `quantization_config` is defined in an earlier cell but is
    # not passed here, so the checkpoint is loaded without it even though the
    # model is then prepared for k-bit training - confirm this is intended.
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model = prepare_model_for_kbit_training(model)
    # The LoRA adapters are attached by the trainer through `peft_config`.
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    class CustomSFTTrainer(SFTTrainer):
        """SFTTrainer that mirrors numeric log entries to the shared TensorBoard writer."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.writer = writer

        def log(self, logs, *args, **kwargs):
            """Log as usual, then write every numeric entry as a TensorBoard scalar.

            Extra positional/keyword arguments are forwarded untouched so the
            override works whether or not the installed Trainer passes
            additional arguments (e.g. a start time) to ``log``.
            """
            super().log(logs, *args, **kwargs)
            step = self.state.global_step
            if step is not None:
                for key, value in logs.items():
                    if isinstance(value, (int, float)):
                        self.writer.add_scalar(key, value, step)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f"logs/fit_{current}",
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        evaluation_strategy='epoch',
        logging_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        report_to="tensorboard",
        ddp_find_unused_parameters=False,
        lr_scheduler_type="linear",
        warmup_steps=10
    )

    def compute_metrics(eval_pred):
        """Return the mean sentence-level BLEU of argmax predictions against the labels."""
        logits, labels = eval_pred
        if isinstance(logits, tuple):
            # Predictions may arrive as a tuple; the logits are taken to be the
            # first element - TODO confirm for this model.
            logits = logits[0]
        pred_ids = logits.argmax(axis=-1)
        # -100 marks label positions ignored by the loss; it is not a valid
        # token id, so swap it for the pad token before decoding.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

        bleu_scores = [
            bleu.sentence_score(hypothesis=pred, references=[ref]).score
            for pred, ref in zip(pred_texts, label_texts)
        ]
        if not bleu_scores:
            # Nothing to score; avoid dividing by zero.
            return {"bleu": 0.0}
        return {"bleu": sum(bleu_scores) / len(bleu_scores)}

    try:
        # Define the trainer
        trainer = CustomSFTTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets['train'],
            eval_dataset=tokenized_datasets['val'],
            tokenizer=tokenizer,
            data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
            peft_config=lora_config,
            compute_metrics=compute_metrics
        )

        # Start training
        trainer.train()
    finally:
        # Always flush and close the TensorBoard writer, even on interrupt or error.
        writer.close()



KeyError: <function tokenize_fn at 0x7f0e0071f490>

# Run trainer

In [None]:
from accelerate import notebook_launcher

# Run the training function in a single process through accelerate's notebook launcher.
notebook_launcher(train_ddp_accelerate, args=(), num_processes=1)


Launching training on one GPU.




Epoch,Training Loss,Validation Loss,Bleu
1,No log,9.514058,0.39892
2,No log,7.985652,0.725209


KeyboardInterrupt: 