In [25]:
# Installing necessary libraries for data processing and model fine-tuning
!pip install datasets
!pip install -U bitsandbytes
!pip install PEFT
!pip install wandb
!pip install evaluate
!pip install sacrebleu



In [26]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading the dataset


We will use the [datasets](http://) library to load the data and get the metric we need to use for evaluation.This can be easily done with the functions load_dataset and load_metric.

In [27]:
from datasets import load_dataset
# chargement des données
data = load_dataset("/content/drive/MyDrive/data/bidirection/json")


# Dataset reduction (optional)

We are trying to reduce our dataset here in order to carry out our experiments. This reduction allows us to conduct multiple experiments, save resources in terms of time and memory, and find the right hyperparameters. Once this is done, we will use these parameters to train the model on the entire dataset.

In [28]:
from datasets import DatasetDict

# Prendre 10% des données d'entraînement, de validation et de test
train_subset = data["train"].train_test_split(test_size=0.05)["test"]
validation_subset = data["validation"].train_test_split(test_size=0.1)["test"]
test_subset = data["test"].train_test_split(test_size=0.1)["test"]

# Créer un nouveau DatasetDict avec les sous-ensembles
subset_dataset = DatasetDict({
    "train": train_subset,
    "validation": validation_subset,
    "test": test_subset
})

# Afficher le nombre de lignes dans chaque sous-ensemble
print(subset_dataset)

DatasetDict({
    train: Dataset({
        features: ['translation', 'codes'],
        num_rows: 14500
    })
    validation: Dataset({
        features: ['translation', 'codes'],
        num_rows: 2800
    })
    test: Dataset({
        features: ['translation', 'codes'],
        num_rows: 1393
    })
})


In [29]:
data=subset_dataset

# LoRA (Quantized LoRA)

Here, we are using a method called QLoRA to reduce the size of the NLLB model in order to perform fine-tuning with minimal resources.

In [30]:
import wandb
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig
import torch



# Load model and tokenizer with 4-bit quantization
model = AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/nllb-200-3.3B")
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-3.3B")

# LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q_proj", "k_proj", "v_proj"]
)

# Wrap the model with LoRA
model = get_peft_model(model, lora_config)

config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/6.93G [00:00<?, ?B/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/8.55G [00:00<?, ?B/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/2.10G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]



# preprocessing

In [31]:
# Define the preprocess function to prepare translation data
max_length = 128
max_input_length   =  128
max_target_length =  128
source_lang =  "src"
target_lang =  "tgt"


def preprocess_function(examples):

    inputs = [ex[source_lang] for ex in examples["translation"]]
    target = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer( inputs , max_length = max_input_length , truncation = True , padding = True )

    # Configurer le tokenizer pour les cibles
    labels = tokenizer (target , max_length = max_target_length , truncation = True , padding = True )
    model_inputs [ "labels" ]  = labels [ "input_ids" ]
    return model_inputs



In [32]:
from transformers import AutoTokenizer

tokenized_data = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/14500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Map:   0%|          | 0/1393 [00:00<?, ? examples/s]

In [33]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['translation', 'codes', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14500
    })
    validation: Dataset({
        features: ['translation', 'codes', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2800
    })
    test: Dataset({
        features: ['translation', 'codes', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1393
    })
})

In [34]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# fonction evaluate model

In [35]:
import numpy as np
import evaluate

metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# function training model

In [36]:
# (Optionnel) Initialiser W&B
import wandb
wandb.init(project="BAAMTU", name="fr-wolof-epoch2-lr2e5")

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁
train/global_step,▁
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
train/epoch,0.13793
train/global_step,500.0
train/grad_norm,0.47119
train/learning_rate,1e-05
train/loss,11.6887


In [37]:

training_args = Seq2SeqTrainingArguments(
    output_dir="./output",
    evaluation_strategy="steps",
    save_steps=1000,
    eval_steps=1000,
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True,
    report_to="wandb",
    load_best_model_at_end=True
)

# Configurer le Trainer avec EarlyStoppingCallback
trainer = Seq2SeqTrainer(
    model=model,                        # Modèle à entraîner
    args=training_args,                 # Arguments d'entraînement
    train_dataset=tokenized_data["train"],  # Jeu de données d'entraînement
    eval_dataset=tokenized_data["validation"],  # Jeu de données de validation
    tokenizer=tokenizer,                # Tokenizer utilisé
    data_collator=data_collator,        # Collateur de données
    compute_metrics=compute_metrics,    # Fonction de calcul des métriques
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Arrêt anticipé
)

# Entraîner le modèle
trainer.train()

# Terminer la session Weights & Biases
wandb.finish()





  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 7.06 MiB is free. Process 2501 has 14.74 GiB memory in use. Of the allocated memory 14.31 GiB is allocated by PyTorch, and 304.99 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)