In [14]:
# Installing necessary libraries for data processing and model fine-tuning
!pip install datasets
!pip install -U bitsandbytes
!pip install PEFT
!pip install wandb
!pip install evaluate
!pip install sacrebleu



# Loading the dataset


We will use the [datasets](https://huggingface.co/docs/datasets) library to load the data and the [evaluate](https://huggingface.co/docs/evaluate) library to get the metric we need for evaluation. This can be easily done with the functions `load_dataset` and `evaluate.load`.

In [15]:
# Mount Google Drive at /content/drive; the dataset is later read from a
# folder under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
from datasets import load_dataset
# Load the data from the mounted Google Drive folder
data = load_dataset("/content/drive/MyDrive/data/bidirection/json")


# Dataset reduction (optional)

Here we reduce the dataset to a small random subset (5% of the training split, 10% of the validation and test splits) in order to carry out our experiments. This reduction allows us to run multiple experiments, save time and memory, and find the right hyperparameters. Once this is done, we will use these hyperparameters to train the model on the entire dataset.

In [17]:
from datasets import DatasetDict

# Fixed seed so the same rows are sampled on every run; train_test_split
# shuffles before splitting, so without a seed each run yields a different
# subset and experiments are not comparable.
SUBSET_SEED = 42

# Keep 5% of the training data and 10% of the validation and test data.
# train_test_split returns "train"/"test" parts; the "test" part holds
# `test_size` of the rows, which is the sample we keep.
train_subset = data["train"].train_test_split(test_size=0.05, seed=SUBSET_SEED)["test"]
validation_subset = data["validation"].train_test_split(test_size=0.1, seed=SUBSET_SEED)["test"]
test_subset = data["test"].train_test_split(test_size=0.1, seed=SUBSET_SEED)["test"]

# Build a new DatasetDict from the subsets
subset_dataset = DatasetDict({
    "train": train_subset,
    "validation": validation_subset,
    "test": test_subset
})

# Show the number of rows in each subset
print(subset_dataset)

DatasetDict({
    train: Dataset({
        features: ['translation', 'codes'],
        num_rows: 14500
    })
    validation: Dataset({
        features: ['translation', 'codes'],
        num_rows: 2800
    })
    test: Dataset({
        features: ['translation', 'codes'],
        num_rows: 1393
    })
})


In [18]:
# Rebind `data` to the reduced dataset so the cells below work on the subset.
# Caution: once this has run, re-running the reduction cell would sample from
# the already-reduced `data` rather than from the full dataset.
data=subset_dataset

# QLoRA 

Here, we use a method called QLoRA: the NLLB model is loaded with 4-bit quantization to reduce its memory footprint, and only small LoRA adapter weights are trained, so that fine-tuning can be performed with minimal resources.

In [19]:
import wandb
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
import torch

# Checkpoint shared by the model and its tokenizer
MODEL_NAME = "facebook/nllb-200-distilled-600M"

# BitsAndBytes quantization config for 4-bit model loading
# NOTE(review): the compute dtype is bfloat16 while the training cell enables
# fp16 mixed precision -- confirm the mismatch is intended for the target GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

# Load model and tokenizer with 4-bit quantization
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Prepare the quantized model for training before adding adapters, as the
# PEFT quantization guide recommends (freezes the base weights and upcasts the
# remaining half-precision parameters for stability).
# Gradient checkpointing is left off so the training compute path is
# otherwise the same as before.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

# LoRA configuration: rank-16 adapters on the attention query/key/value projections
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q_proj", "k_proj", "v_proj"]
)

# Wrap the model with LoRA
model = get_peft_model(model, lora_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]



# Preprocessing

In [20]:
# Define the preprocess function to prepare translation data
max_length = 128
max_input_length = 128
max_target_length = 128
source_lang = "src"
target_lang = "tgt"


def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: a batch whose "translation" entry is a list of dicts holding
            the source text under `source_lang` and the target text under
            `target_lang`.

    Returns:
        The tokenizer output for the source texts, with the token ids of the
        target texts added under the "labels" key. Sequences are truncated to
        `max_input_length` / `max_target_length` and left unpadded.
    """
    pairs = examples["translation"]
    inputs = [ex[source_lang] for ex in pairs]
    targets = [ex[target_lang] for ex in pairs]

    # Do not pad here. Padding at this stage fills the labels with the pad
    # token id, which is never turned into the ignore index (-100), so the
    # loss would also be computed on padding. The seq2seq data collator pads
    # each training batch instead and uses -100 for label padding.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the targets
    # NOTE(review): sources and targets go through the same tokenizer call and
    # the dataset's 'codes' column is not used -- confirm the NLLB language
    # tags are what the model should see for each translation direction.
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



In [21]:
from transformers import AutoTokenizer

# Tokenize every split; batched=True hands preprocess_function batches of
# examples instead of one example at a time.
tokenized_data = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/14500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Map:   0%|          | 0/1393 [00:00<?, ? examples/s]

In [22]:
# Display the tokenized splits and their columns
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['translation', 'codes', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14500
    })
    validation: Dataset({
        features: ['translation', 'codes', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2800
    })
    test: Dataset({
        features: ['translation', 'codes', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1393
    })
})

In [23]:
from transformers import DataCollatorForSeq2Seq
# Collator passed to the trainer to assemble batches; it is given both the
# tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Evaluation function

In [24]:
import numpy as np
import evaluate

# SacreBLEU metric; compute_metrics reads its "score" field as the BLEU value
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Returns:
        A tuple (predictions, references): predictions is a flat list of
        stripped strings, and references wraps each stripped label in its own
        single-element list.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_refs = []
    for text in labels:
        wrapped_refs.append([text.strip()])

    return cleaned_preds, wrapped_refs

def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation pass.

    Args:
        eval_preds: a (predictions, labels) pair of token-id arrays; when
            predictions is a tuple, its first element is used.

    Returns:
        A dict with "bleu" (the sacreBLEU score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks positions to ignore and is not a valid token id. Replace it
    # in the predictions as well as in the labels, so decoding cannot fail on
    # it and it is not counted as a generated token below.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Length of each prediction, not counting padding
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

# Model training

In [25]:
# (Optional) Initialize W&B
# The run name presumably mirrors the training settings (2 epochs, lr 2e-5).
import wandb
wandb.init(project="BAAMTU", name="fr-wolof-epoch2-lr2e5")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [35]:

# Training configuration: evaluate and checkpoint every 1000 steps, keep at
# most 3 checkpoints, generate during evaluation so BLEU can be computed, and
# report to Weights & Biases.
# NOTE(review): metric_for_best_model is not set, so best-model selection and
# early stopping presumably fall back to the Trainer default (evaluation loss)
# rather than BLEU -- confirm this is intended.
training_args = Seq2SeqTrainingArguments(
    output_dir="./output",
    evaluation_strategy="steps",
    save_steps=1000,
    eval_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True,
    report_to="wandb",
    load_best_model_at_end=True
)

# Configure the Trainer with EarlyStoppingCallback
trainer = Seq2SeqTrainer(
    model=model,                        # Model to train
    args=training_args,                 # Training arguments
    train_dataset=tokenized_data["train"],  # Training dataset
    eval_dataset=tokenized_data["validation"],  # Validation dataset
    tokenizer=tokenizer,                # Tokenizer in use
    data_collator=data_collator,        # Data collator
    compute_metrics=compute_metrics,    # Metric computation function
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Early stopping
)

# Train the model. The Weights & Biases run is closed in `finally` so it is
# also finished when training is interrupted or fails; the exception itself
# still propagates.
try:
    trainer.train()
finally:
    # End the Weights & Biases session
    wandb.finish()





  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss


KeyboardInterrupt: 