# **Modèle d'embedding : intfloat/multilingual-e5-large-instruct**

In [1]:
TASK = "Retrieve the definition or context of an administrative acronym or term."
QUERY_PREFIX = f"Instruct: {TASK}\nQuery: "
DOC_PREFIX = ""

## **Configuration 1 : K + Q (k_proj & q_proj) + stratégie in-batch negatives**

In [2]:
pip install -q sentence-transformers peft datasets torch accelerate transformers

Note: you may need to restart the kernel to use updated packages.


In [None]:
import torch
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    losses
)
from sentence_transformers.training_args import BatchSamplers
from peft import LoraConfig, TaskType, get_peft_model
import os

# 1. PARAMÈTRES ET CHEMINS
MODEL_ID = "intfloat/multilingual-e5-large-instruct"

# Chemins de sortie
OUTPUT_DIR_LORA = "adapters/e5_lora_config1_qk_bn_adapter"  # Dossier pour l'adaptateur seul
OUTPUT_DIR_MERGED = "final_models/e5_large_finetuned_config1_merged" # Dossier pour le modèle fusionné (Ragas)

# Hyperparamètres
BATCH_SIZE = 32
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512

# 2. PRÉPARATION DES DONNÉES

def prepare_dataset(filepath):
    print(f"Chargement des données depuis {filepath}...")
    dataset = load_dataset("json", data_files=filepath, split="train")

    def add_prefixes(example):
        # Query au format instruct
        example["anchor"]   = QUERY_PREFIX + example["anchor"]

        example["positive"] = DOC_PREFIX + example["positive"]

        if "negative" in example and example["negative"]:
            example["negative"] = DOC_PREFIX + example["negative"]

        return example

    dataset = dataset.map(add_prefixes)
    print(f"Exemple formaté (anchor): {dataset[0]['anchor']}")
    print(f"Exemple formaté (positive): {dataset[0]['positive']}")
    return dataset

train_dataset = prepare_dataset("bercy_train_90.jsonl")

# 3. MODÈLE & CONFIGURATION 1 (LoRA)

print("Chargement du modèle de base...")
model = SentenceTransformer(MODEL_ID, trust_remote_code=True)
model.max_seq_length = MAX_SEQ_LENGTH

# --- CONFIGURATION 1 : K + Q ---
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "key"], # Cible stricte Config 1
)

# Application de LoRA
model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Paramètres entraînables (Config 1) ---")
model[0].auto_model.print_trainable_parameters()

# 4. ENTRAÎNEMENT (MNRL)

# Loss "In-batch Negatives"
train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Démarrage du fine-tuning ---")
trainer.train()

# 5. SAUVEGARDE ET FUSION (MERGE)

print("\n--- Sauvegarde de l'adaptateur LoRA seul ---")
model.save_pretrained(OUTPUT_DIR_LORA)
print(f"Adaptateur sauvegardé dans : {OUTPUT_DIR_LORA}")

print("\n--- Fusion des poids (Merge) pour Ragas ---")
# 1. On fusionne les poids LoRA (A et B) dans le modèle de base (W) -> merge_and_unload()
model[0].auto_model = model[0].auto_model.merge_and_unload()

# 2. On sauvegarde le modèle complet fusionné
# Sauvegarde robuste : backbone HF + tokenizer + packaging ST
os.makedirs(OUTPUT_DIR_MERGED, exist_ok=True)

# Sauvegarde du backbone complet
model[0].auto_model.save_pretrained(OUTPUT_DIR_MERGED, safe_serialization=True)

# Sauvegarde du tokenizer
model.tokenizer.save_pretrained(OUTPUT_DIR_MERGED)

# Sauvegarde SentenceTransformer (config pooling, modules, etc.)
model.save(OUTPUT_DIR_MERGED)

print(f"Modèle FUSIONNÉ sauvegardé dans : {OUTPUT_DIR_MERGED}")

Chargement des données depuis bercy_train_90.jsonl...
Exemple formaté (anchor): Instruct: Retrieve the definition or context of an administrative acronym or term.
Query: SG
Exemple formaté (positive): Le SG est le Secrétariat général, structure de pilotage et de support d’un ministère (RH, finances, achats, SI, etc.).
Chargement du modèle de base...

--- Paramètres entraînables (Config 1) ---
trainable params: 1,572,864 || all params: 561,463,296 || trainable%: 0.2801


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Démarrage du fine-tuning ---


Column 'anchor' is at index 4, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,1.9051
100,1.6509



--- Sauvegarde de l'adaptateur LoRA seul ---
Adaptateur sauvegardé dans : adapters/e5_lora_config1_qk_bn_adapter

--- Fusion des poids (Merge) pour Ragas ---
Modèle FUSIONNÉ sauvegardé dans : final_models/e5_large_finetuned_config1_merged


## **Configuration 1 version 2 : R = 8 & alpha = 16**

In [None]:
# 1. PARAMÈTRES ET CHEMINS - Version 2
MODEL_ID = "intfloat/multilingual-e5-large-instruct"

# Chemins de sortie
OUTPUT_DIR_LORA = "adapters/e5_lora_config1_qk_bn_adapter_v2"  # Dossier pour l'adaptateur seul
OUTPUT_DIR_MERGED = "final_models/e5_large_finetuned_config1_merged_v2" # Dossier pour le modèle fusionné (Ragas)

# Hyperparamètres
BATCH_SIZE = 32
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512

# 2. PRÉPARATION DES DONNÉES

def prepare_dataset(filepath):
    print(f"Chargement des données depuis {filepath}...")
    dataset = load_dataset("json", data_files=filepath, split="train")

    def add_prefixes(example):
        # Query au format instruct
        example["anchor"]   = QUERY_PREFIX + example["anchor"]

        example["positive"] = DOC_PREFIX + example["positive"]

        if "negative" in example and example["negative"]:
            example["negative"] = DOC_PREFIX + example["negative"]

        return example

    dataset = dataset.map(add_prefixes)
    print(f"Exemple formaté (anchor): {dataset[0]['anchor']}")
    print(f"Exemple formaté (positive): {dataset[0]['positive']}")
    return dataset

train_dataset = prepare_dataset("bercy_train_90.jsonl")

# 3. MODÈLE & CONFIGURATION 1 (LoRA)

print("Chargement du modèle de base...")
model = SentenceTransformer(MODEL_ID, trust_remote_code=True)
model.max_seq_length = MAX_SEQ_LENGTH

# --- CONFIGURATION 1 : K + Q ---
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "key"], # Cible stricte Config 1
)

# Application de LoRA
model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Paramètres entraînables (Config 1) ---")
model[0].auto_model.print_trainable_parameters()

# 4. ENTRAÎNEMENT (MNRL)

# Loss "In-batch Negatives"
train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Démarrage du fine-tuning ---")
trainer.train()

# 5. SAUVEGARDE ET FUSION (MERGE)

print("\n--- Sauvegarde de l'adaptateur LoRA seul ---")
model.save_pretrained(OUTPUT_DIR_LORA)
print(f"Adaptateur sauvegardé dans : {OUTPUT_DIR_LORA}")

print("\n--- Fusion des poids (Merge) pour Ragas ---")
# 1. On fusionne les poids LoRA (A et B) dans le modèle de base (W) -> merge_and_unload()
model[0].auto_model = model[0].auto_model.merge_and_unload()

# 2. On sauvegarde le modèle complet fusionné
# Sauvegarde robuste : backbone HF + tokenizer + packaging ST
os.makedirs(OUTPUT_DIR_MERGED, exist_ok=True)

# Sauvegarde du backbone complet
model[0].auto_model.save_pretrained(OUTPUT_DIR_MERGED, safe_serialization=True)

# Sauvegarde du tokenizer
model.tokenizer.save_pretrained(OUTPUT_DIR_MERGED)

# Sauvegarde SentenceTransformer (config pooling, modules, etc.)
model.save(OUTPUT_DIR_MERGED)

print(f"Modèle FUSIONNÉ sauvegardé dans : {OUTPUT_DIR_MERGED}")

Chargement des données depuis bercy_train_90.jsonl...


Map:   0%|          | 0/4032 [00:00<?, ? examples/s]

Exemple formaté (anchor): query: SG
Exemple formaté (positive): passage: Le SG est le Secrétariat général, structure de pilotage et de support d’un ministère (RH, finances, achats, SI, etc.).
Chargement du modèle de base...

--- Paramètres entraînables (Config 1) ---
trainable params: 786,432 || all params: 560,676,864 || trainable%: 0.1403


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Démarrage du fine-tuning ---


Column 'anchor' is at index 4, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,2.2049
100,2.1129



--- Sauvegarde de l'adaptateur LoRA seul ---
Adaptateur sauvegardé dans : adapters/e5_lora_config1_qk_bn_adapter_v2

--- Fusion des poids (Merge) pour Ragas ---
Modèle FUSIONNÉ sauvegardé dans : final_models/e5_large_finetuned_config1_merged_v2


## **Configuration 2 : K + Q + V (k_proj & q_proj & v_proj) + stratégie in-batch negatives**

In [None]:
# 1. PARAMÈTRES ET CHEMINS (CONFIG 2)
MODEL_ID = "intfloat/multilingual-e5-large-instruct"

OUTPUT_DIR_LORA = "adapters/e5_lora_config2_qkv_bn_adapter"
OUTPUT_DIR_MERGED = "final_models/e5_large_finetuned_config2_merged"

# Hyperparamètres
BATCH_SIZE = 32
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512

# 2. PRÉPARATION DES DONNÉES

def prepare_dataset(filepath):
    print(f"Chargement des données depuis {filepath}...")
    dataset = load_dataset("json", data_files=filepath, split="train")

    def add_prefixes(example):
        # Query au format instruct
        example["anchor"]   = QUERY_PREFIX + example["anchor"]

        example["positive"] = DOC_PREFIX + example["positive"]

        if "negative" in example and example["negative"]:
            example["negative"] = DOC_PREFIX + example["negative"]

        return example

    dataset = dataset.map(add_prefixes)
    print(f"Exemple formaté (anchor): {dataset[0]['anchor']}")
    print(f"Exemple formaté (positive): {dataset[0]['positive']}")
    return dataset

train_dataset = prepare_dataset("bercy_train_90.jsonl")

# 3. MODÈLE & CONFIGURATION 2 (LoRA Q+K+V)

print("Chargement du modèle de base...")
model = SentenceTransformer(MODEL_ID, trust_remote_code=True)
model.max_seq_length = MAX_SEQ_LENGTH

# CONFIGURATION 2 : K + Q + V
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "key", "value"],
)

model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Paramètres entraînables (Config 2 : Q+K+V) ---")
model[0].auto_model.print_trainable_parameters()

# 4. ENTRAÎNEMENT (MNRL - In-batch Negatives)

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Démarrage du fine-tuning (Config 2) ---")
trainer.train()

# 5. SAUVEGARDE ET FUSION (MERGE)


print("\n--- Sauvegarde de l'adaptateur LoRA seul ---")
model.save_pretrained(OUTPUT_DIR_LORA)

print("\n--- Fusion des poids (Merge) pour Ragas ---")
# Fusion - Merge
model[0].auto_model = model[0].auto_model.merge_and_unload()
os.makedirs(OUTPUT_DIR_MERGED, exist_ok=True)

model[0].auto_model.save_pretrained(OUTPUT_DIR_MERGED, safe_serialization=True)
model.tokenizer.save_pretrained(OUTPUT_DIR_MERGED)
model.save(OUTPUT_DIR_MERGED)

print(f"Modèle Config 2 FUSIONNÉ sauvegardé dans : {OUTPUT_DIR_MERGED}")

Chargement des données depuis bercy_train_90.jsonl...
Exemple formaté (anchor): Instruct: Retrieve the definition or context of an administrative acronym or term.
Query: SG
Exemple formaté (positive): Le SG est le Secrétariat général, structure de pilotage et de support d’un ministère (RH, finances, achats, SI, etc.).
Chargement du modèle de base...

--- Paramètres entraînables (Config 2 : Q+K+V) ---
trainable params: 2,359,296 || all params: 562,249,728 || trainable%: 0.4196


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Démarrage du fine-tuning (Config 2) ---


Column 'anchor' is at index 4, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,1.5514
100,0.9061



--- Sauvegarde de l'adaptateur LoRA seul ---

--- Fusion des poids (Merge) pour Ragas ---
Modèle Config 2 FUSIONNÉ sauvegardé dans : final_models/e5_large_finetuned_config2_merged


## **Configuration 2 version 2 : R = 8 & alpha = 16**

In [11]:
# 1. PARAMÈTRES ET CHEMINS (CONFIG 2) - Version 2
MODEL_ID = "intfloat/multilingual-e5-large-instruct"

OUTPUT_DIR_LORA = "adapters/e5_lora_config2_qkv_bn_adapter_v2"
OUTPUT_DIR_MERGED = "final_models/e5_large_finetuned_config2_merged_v2"

# Hyperparamètres
BATCH_SIZE = 32
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512

# 2. PRÉPARATION DES DONNÉES

def prepare_dataset(filepath):
    print(f"Chargement des données depuis {filepath}...")
    dataset = load_dataset("json", data_files=filepath, split="train")

    def add_prefixes(example):
        # Query au format instruct
        example["anchor"]   = QUERY_PREFIX + example["anchor"]

        example["positive"] = DOC_PREFIX + example["positive"]

        if "negative" in example and example["negative"]:
            example["negative"] = DOC_PREFIX + example["negative"]

        return example

    dataset = dataset.map(add_prefixes)
    print(f"Exemple formaté (anchor): {dataset[0]['anchor']}")
    print(f"Exemple formaté (positive): {dataset[0]['positive']}")
    return dataset

train_dataset = prepare_dataset("bercy_train_90.jsonl")

# 3. MODÈLE & CONFIGURATION 2 (LoRA Q+K+V)

print("Chargement du modèle de base...")
model = SentenceTransformer(MODEL_ID, trust_remote_code=True)
model.max_seq_length = MAX_SEQ_LENGTH

# CONFIGURATION 2 : K + Q + V
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "key", "value"],
)

model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Paramètres entraînables (Config 2 : Q+K+V) ---")
model[0].auto_model.print_trainable_parameters()

# 4. ENTRAÎNEMENT (MNRL - In-batch Negatives)

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Démarrage du fine-tuning (Config 2) ---")
trainer.train()

# 5. SAUVEGARDE ET FUSION (MERGE)


print("\n--- Sauvegarde de l'adaptateur LoRA seul ---")
model.save_pretrained(OUTPUT_DIR_LORA)

print("\n--- Fusion des poids (Merge) pour Ragas ---")
# Fusion - Merge
model[0].auto_model = model[0].auto_model.merge_and_unload()
os.makedirs(OUTPUT_DIR_MERGED, exist_ok=True)

model[0].auto_model.save_pretrained(OUTPUT_DIR_MERGED, safe_serialization=True)
model.tokenizer.save_pretrained(OUTPUT_DIR_MERGED)
model.save(OUTPUT_DIR_MERGED)

print(f"Modèle Config 2 FUSIONNÉ sauvegardé dans : {OUTPUT_DIR_MERGED}")

Chargement des données depuis bercy_train_90.jsonl...
Exemple formaté (anchor): query: SG
Exemple formaté (positive): passage: Le SG est le Secrétariat général, structure de pilotage et de support d’un ministère (RH, finances, achats, SI, etc.).
Chargement du modèle de base...

--- Paramètres entraînables (Config 2 : Q+K+V) ---
trainable params: 1,179,648 || all params: 561,070,080 || trainable%: 0.2102


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Démarrage du fine-tuning (Config 2) ---


Column 'anchor' is at index 4, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,2.0198
100,1.5077



--- Sauvegarde de l'adaptateur LoRA seul ---

--- Fusion des poids (Merge) pour Ragas ---
Modèle Config 2 FUSIONNÉ sauvegardé dans : final_models/e5_large_finetuned_config2_merged_v2


## **Configuration 3 :  K + Q (k_proj & q_proj) + stratégie Batchs construits par similarité sémantique (hard negatives implicites)**

In [None]:
from torch.utils.data import DataLoader, SequentialSampler

# 1. PARAMÈTRES (CONFIG 3)
MODEL_ID = "intfloat/multilingual-e5-large-instruct"

OUTPUT_DIR_LORA = "adapters/e5_lora_config3_qk_hn_adapter"
OUTPUT_DIR_MERGED = "final_models/e5_large_finetuned_config3_merged"

# Hyperparamètres
BATCH_SIZE = 32
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512

# 2. PRÉPARATION ET TRI DES DONNÉES

def prepare_sorted_dataset(filepath):
    print(f"Chargement des données depuis {filepath}...")
    dataset = load_dataset("json", data_files=filepath, split="train")

    # 1. Ajout de l'instruction
    def add_prefixes(example):
        example["anchor"]   = QUERY_PREFIX + example["anchor"]
        example["positive"] = DOC_PREFIX + example["positive"]
        if "negative" in example and example["negative"]:
            example["negative"] = DOC_PREFIX + example["negative"]
        return example
    dataset = dataset.map(add_prefixes)

    seen = set()
    def keep_first(example):
        a = example["anchor"]
        if a in seen:
            return False
        seen.add(a)
        return True

    dataset = dataset.filter(keep_first)

    # 2. TRI PAR CLUSTER (C'est ici que se joue la stratégie Hard Negatives)
    # En triant par 'cluster_id', on regroupe les concepts proches (ex: tous les 'RH' ensemble)
    # Ainsi, un batch contiendra plusieurs exemples du même cluster, rendant la distinction difficile.
    print("--- Stratégie Config 3 : Tri du dataset par similarité (cluster_id) ---")
    dataset = dataset.sort("cluster_id")

    # Vérification visuelle
    print(f"Exemple 0 (Cluster {dataset[0]['cluster_id']}) : {dataset[0]['anchor']}")
    print(f"Exemple 1 (Cluster {dataset[1]['cluster_id']}) : {dataset[1]['anchor']}")

    return dataset

train_dataset = prepare_sorted_dataset("bercy_train_90.jsonl")

# 3. CLASS TRAINER SPÉCIFIQUE (NO SHUFFLE)

class NoShuffleTrainer(SentenceTransformerTrainer):
    """
    Surcharge du Trainer pour empêcher le mélange (shuffle) des données.
    Indispensable pour que la stratégie de batchs sémantiques fonctionne.
    """
    def get_train_dataloader(self):
        # On force un SequentialSampler au lieu du RandomSampler par défaut
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=SequentialSampler(self.train_dataset),
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )

# 4. MODÈLE & CONFIGURATION 3 (Q + K)

print("Chargement du modèle de base...")
model = SentenceTransformer(MODEL_ID, trust_remote_code=True)
model.max_seq_length = MAX_SEQ_LENGTH
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "key"],
)

model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Paramètres entraînables (Config 3 : Q+K + Hard Negs) ---")
model[0].auto_model.print_trainable_parameters()

# 5. ENTRAÎNEMENT

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
)

# Utilisation de notre Trainer personnalisé "NoShuffle"
trainer = NoShuffleTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Démarrage du fine-tuning (Config 3) ---")
trainer.train()

# 6. SAUVEGARDE ET FUSION

print("\n--- Sauvegarde et Fusion ---")
model.save_pretrained(OUTPUT_DIR_LORA)
model[0].auto_model = model[0].auto_model.merge_and_unload()
os.makedirs(OUTPUT_DIR_MERGED, exist_ok=True)

model[0].auto_model.save_pretrained(OUTPUT_DIR_MERGED, safe_serialization=True)
model.tokenizer.save_pretrained(OUTPUT_DIR_MERGED)
model.save(OUTPUT_DIR_MERGED)

print(f"Modèle Config 3 FUSIONNÉ sauvegardé dans : {OUTPUT_DIR_MERGED}")

Chargement des données depuis bercy_train_90.jsonl...


Map:   0%|          | 0/4032 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4032 [00:00<?, ? examples/s]

--- Stratégie Config 3 : Tri du dataset par similarité (cluster_id) ---
Exemple 0 (Cluster AAH) : Instruct: Retrieve the definition or context of an administrative acronym or term.
Query: AAH
Exemple 1 (Cluster AAH) : Instruct: Retrieve the definition or context of an administrative acronym or term.
Query: Quand mobiliser AAH dans un dossier ?
Chargement du modèle de base...

--- Paramètres entraînables (Config 3 : Q+K + Hard Negs) ---
trainable params: 1,572,864 || all params: 561,463,296 || trainable%: 0.2801


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Démarrage du fine-tuning (Config 3) ---


Column 'anchor' is at index 4, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,4.7253



--- Sauvegarde et Fusion ---
Modèle Config 3 FUSIONNÉ sauvegardé dans : final_models/e5_large_finetuned_config3_merged


## **Configuration 3 version 2 : R = 8 & alpha = 16**

In [12]:
# 1. PARAMÈTRES (CONFIG 3) - Version 2
MODEL_ID = "intfloat/multilingual-e5-large-instruct"

OUTPUT_DIR_LORA = "adapters/e5_lora_config3_qk_hn_adapter_v2"
OUTPUT_DIR_MERGED = "final_models/e5_large_finetuned_config3_merged_v2"

# Hyperparamètres
BATCH_SIZE = 32
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512

# 2. PRÉPARATION ET TRI DES DONNÉES

def prepare_sorted_dataset(filepath):
    print(f"Chargement des données depuis {filepath}...")
    dataset = load_dataset("json", data_files=filepath, split="train")

    # 1. Ajout de l'instruction
    def add_prefixes(example):
        example["anchor"]   = QUERY_PREFIX + example["anchor"]
        example["positive"] = DOC_PREFIX + example["positive"]
        if "negative" in example and example["negative"]:
            example["negative"] = DOC_PREFIX + example["negative"]
        return example
    dataset = dataset.map(add_prefixes)

    seen = set()
    def keep_first(example):
        a = example["anchor"]
        if a in seen:
            return False
        seen.add(a)
        return True

    dataset = dataset.filter(keep_first)

    # 2. TRI PAR CLUSTER (C'est ici que se joue la stratégie Hard Negatives)
    # En triant par 'cluster_id', on regroupe les concepts proches (ex: tous les 'RH' ensemble)
    # Ainsi, un batch contiendra plusieurs exemples du même cluster, rendant la distinction difficile.
    print("--- Stratégie Config 3 : Tri du dataset par similarité (cluster_id) ---")
    dataset = dataset.sort("cluster_id")

    # Vérification visuelle
    print(f"Exemple 0 (Cluster {dataset[0]['cluster_id']}) : {dataset[0]['anchor']}")
    print(f"Exemple 1 (Cluster {dataset[1]['cluster_id']}) : {dataset[1]['anchor']}")

    return dataset

train_dataset = prepare_sorted_dataset("bercy_train_90.jsonl")

# 3. CLASS TRAINER SPÉCIFIQUE (NO SHUFFLE)

class NoShuffleTrainer(SentenceTransformerTrainer):
    """
    Surcharge du Trainer pour empêcher le mélange (shuffle) des données.
    Indispensable pour que la stratégie de batchs sémantiques fonctionne.
    """
    def get_train_dataloader(self):
        # On force un SequentialSampler au lieu du RandomSampler par défaut
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=SequentialSampler(self.train_dataset),
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )

# 4. MODÈLE & CONFIGURATION 3 (Q + K)

print("Chargement du modèle de base...")
model = SentenceTransformer(MODEL_ID, trust_remote_code=True)
model.max_seq_length = MAX_SEQ_LENGTH
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "key"],
)

model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Paramètres entraînables (Config 3 : Q+K + Hard Negs) ---")
model[0].auto_model.print_trainable_parameters()

# 5. ENTRAÎNEMENT

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
)

# Utilisation de notre Trainer personnalisé "NoShuffle"
trainer = NoShuffleTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Démarrage du fine-tuning (Config 3) ---")
trainer.train()

# 6. SAUVEGARDE ET FUSION

print("\n--- Sauvegarde et Fusion ---")
model.save_pretrained(OUTPUT_DIR_LORA)
model[0].auto_model = model[0].auto_model.merge_and_unload()
os.makedirs(OUTPUT_DIR_MERGED, exist_ok=True)

model[0].auto_model.save_pretrained(OUTPUT_DIR_MERGED, safe_serialization=True)
model.tokenizer.save_pretrained(OUTPUT_DIR_MERGED)
model.save(OUTPUT_DIR_MERGED)

print(f"Modèle Config 3 FUSIONNÉ sauvegardé dans : {OUTPUT_DIR_MERGED}")

Chargement des données depuis bercy_train_90.jsonl...
--- Stratégie Config 3 : Tri du dataset par similarité (cluster_id) ---
Exemple 0 (Cluster AAH) : query: AAH
Exemple 1 (Cluster AAH) : query: Quand mobiliser AAH dans un dossier ?
Chargement du modèle de base...

--- Paramètres entraînables (Config 3 : Q+K + Hard Negs) ---
trainable params: 786,432 || all params: 560,676,864 || trainable%: 0.1403


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Démarrage du fine-tuning (Config 3) ---


Column 'anchor' is at index 4, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,4.9409



--- Sauvegarde et Fusion ---
Modèle Config 3 FUSIONNÉ sauvegardé dans : final_models/e5_large_finetuned_config3_merged_v2


## **Configuration 4 : K + Q + V (k_proj & q_proj & v_proj) + stratégie Batchs construits par similarité sémantique (hard negatives implicites)**

In [None]:
# 1. PARAMÈTRES (CONFIG 4)
MODEL_ID = "intfloat/multilingual-e5-large-instruct"

# Nouveaux dossiers de sortie
OUTPUT_DIR_LORA = "adapters/e5_lora_config4_qkv_hn_adapter"
OUTPUT_DIR_MERGED = "final_models/e5_large_finetuned_config4_merged"

# Hyperparamètres
BATCH_SIZE = 32
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512

# 2. PRÉPARATION ET TRI DES DONNÉES

def prepare_sorted_dataset(filepath):
    print(f"Chargement des données depuis {filepath}...")
    dataset = load_dataset("json", data_files=filepath, split="train")

    # 1. Ajout de l'instruction
    def add_prefixes(example):
        example["anchor"]   = QUERY_PREFIX + example["anchor"]
        example["positive"] = DOC_PREFIX + example["positive"]
        if "negative" in example and example["negative"]:
            example["negative"] = DOC_PREFIX + example["negative"]
        return example
    dataset = dataset.map(add_prefixes)

    seen = set()
    def keep_first(example):
        a = example["anchor"]
        if a in seen:
            return False
        seen.add(a)
        return True

    dataset = dataset.filter(keep_first)

    # 2. TRI PAR CLUSTER (C'est ici que se joue la stratégie Hard Negatives)
    # En triant par 'cluster_id', on regroupe les concepts proches (ex: tous les 'RH' ensemble)
    # Ainsi, un batch contiendra plusieurs exemples du même cluster, rendant la distinction difficile.
    print("--- Stratégie Config 3 : Tri du dataset par similarité (cluster_id) ---")
    dataset = dataset.sort("cluster_id")

    # Vérification visuelle
    print(f"Exemple 0 (Cluster {dataset[0]['cluster_id']}) : {dataset[0]['anchor']}")
    print(f"Exemple 1 (Cluster {dataset[1]['cluster_id']}) : {dataset[1]['anchor']}")

    return dataset

train_dataset = prepare_sorted_dataset("bercy_train_90.jsonl")

# 3. CLASS TRAINER SPÉCIFIQUE (NO SHUFFLE)

class NoShuffleTrainer(SentenceTransformerTrainer):
    """
    Indispensable pour la Config 4 :
    Empêche le mélange aléatoire pour conserver l'effet 'Hard Negatives' du tri.
    """
    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=SequentialSampler(self.train_dataset),
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )

# 4. MODÈLE & CONFIGURATION 4 (Q + K + V)

print("Chargement du modèle de base...")
model = SentenceTransformer(MODEL_ID, trust_remote_code=True)
model.max_seq_length = MAX_SEQ_LENGTH
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "key", "value"],
)

model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Paramètres entraînables (Config 4 : Q+K+V + Hard Negs) ---")
model[0].auto_model.print_trainable_parameters()

# 5. ENTRAÎNEMENT

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
    gradient_accumulation_steps=1
)

# Utilisation du Trainer NoShuffle
trainer = NoShuffleTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Démarrage du fine-tuning (Config 4) ---")
trainer.train()

# 6. SAUVEGARDE ET FUSION

print("\n--- Sauvegarde et Fusion ---")
model.save_pretrained(OUTPUT_DIR_LORA)
print("\n--- Fusion des poids (Merge) pour Ragas ---")
# Fusion - Merge
model[0].auto_model = model[0].auto_model.merge_and_unload()
os.makedirs(OUTPUT_DIR_MERGED, exist_ok=True)

model[0].auto_model.save_pretrained(OUTPUT_DIR_MERGED, safe_serialization=True)
model.tokenizer.save_pretrained(OUTPUT_DIR_MERGED)
model.save(OUTPUT_DIR_MERGED)

print(f"Modèle Config 4 FUSIONNÉ sauvegardé dans : {OUTPUT_DIR_MERGED}")

Chargement des données depuis bercy_train_90.jsonl...
--- Stratégie Config 3 : Tri du dataset par similarité (cluster_id) ---
Exemple 0 (Cluster AAH) : Instruct: Retrieve the definition or context of an administrative acronym or term.
Query: AAH
Exemple 1 (Cluster AAH) : Instruct: Retrieve the definition or context of an administrative acronym or term.
Query: Quand mobiliser AAH dans un dossier ?
Chargement du modèle de base...

--- Paramètres entraînables (Config 4 : Q+K+V + Hard Negs) ---
trainable params: 2,359,296 || all params: 562,249,728 || trainable%: 0.4196


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Démarrage du fine-tuning (Config 4) ---


Column 'anchor' is at index 4, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,4.271



--- Sauvegarde et Fusion ---

--- Fusion des poids (Merge) pour Ragas ---
Modèle Config 4 FUSIONNÉ sauvegardé dans : final_models/e5_large_finetuned_config4_merged


## **Configuration 4 version 2 : R = 8 & alpha = 16**

In [None]:
# 1. PARAMÈTRES (CONFIG 4) - Version 2
MODEL_ID = "intfloat/multilingual-e5-large-instruct"

# Nouveaux dossiers de sortie
OUTPUT_DIR_LORA = "adapters/e5_lora_config4_qkv_hn_adapter_v2"
OUTPUT_DIR_MERGED = "final_models/e5_large_finetuned_config4_merged_v2"

# Hyperparamètres
BATCH_SIZE = 32
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512

# 2. PRÉPARATION ET TRI DES DONNÉES

def prepare_sorted_dataset(filepath):
    print(f"Chargement des données depuis {filepath}...")
    dataset = load_dataset("json", data_files=filepath, split="train")

    # 1. Ajout de l'instruction
    def add_prefixes(example):
        example["anchor"]   = QUERY_PREFIX + example["anchor"]
        example["positive"] = DOC_PREFIX + example["positive"]
        if "negative" in example and example["negative"]:
            example["negative"] = DOC_PREFIX + example["negative"]
        return example
    dataset = dataset.map(add_prefixes)

    seen = set()
    def keep_first(example):
        a = example["anchor"]
        if a in seen:
            return False
        seen.add(a)
        return True

    dataset = dataset.filter(keep_first)

    # 2. TRI PAR CLUSTER (C'est ici que se joue la stratégie Hard Negatives)
    # En triant par 'cluster_id', on regroupe les concepts proches (ex: tous les 'RH' ensemble)
    # Ainsi, un batch contiendra plusieurs exemples du même cluster, rendant la distinction difficile.
    print("--- Stratégie Config 3 : Tri du dataset par similarité (cluster_id) ---")
    dataset = dataset.sort("cluster_id")

    # Vérification visuelle
    print(f"Exemple 0 (Cluster {dataset[0]['cluster_id']}) : {dataset[0]['anchor']}")
    print(f"Exemple 1 (Cluster {dataset[1]['cluster_id']}) : {dataset[1]['anchor']}")

    return dataset

train_dataset = prepare_sorted_dataset("bercy_train_90.jsonl")

# 3. CLASS TRAINER SPÉCIFIQUE (NO SHUFFLE)

class NoShuffleTrainer(SentenceTransformerTrainer):
    """
    Indispensable pour la Config 4 :
    Empêche le mélange aléatoire pour conserver l'effet 'Hard Negatives' du tri.
    """
    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=SequentialSampler(self.train_dataset),
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )

# 4. MODÈLE & CONFIGURATION 4 (Q + K + V)

print("Chargement du modèle de base...")
model = SentenceTransformer(MODEL_ID, trust_remote_code=True)
model.max_seq_length = MAX_SEQ_LENGTH
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "key", "value"],
)

model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Paramètres entraînables (Config 4 : Q+K+V + Hard Negs) ---")
model[0].auto_model.print_trainable_parameters()

# 5. ENTRAÎNEMENT

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
    gradient_accumulation_steps=1
)

# Utilisation du Trainer NoShuffle
trainer = NoShuffleTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Démarrage du fine-tuning (Config 4) ---")
trainer.train()

# 6. SAUVEGARDE ET FUSION

print("\n--- Sauvegarde et Fusion ---")
model.save_pretrained(OUTPUT_DIR_LORA)
print("\n--- Fusion des poids (Merge) pour Ragas ---")
# Fusion - Merge
model[0].auto_model = model[0].auto_model.merge_and_unload()
os.makedirs(OUTPUT_DIR_MERGED, exist_ok=True)

model[0].auto_model.save_pretrained(OUTPUT_DIR_MERGED, safe_serialization=True)
model.tokenizer.save_pretrained(OUTPUT_DIR_MERGED)
model.save(OUTPUT_DIR_MERGED)

print(f"Modèle Config 4 FUSIONNÉ sauvegardé dans : {OUTPUT_DIR_MERGED}")

Chargement des données depuis bercy_train_90.jsonl...


Map:   0%|          | 0/4032 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4032 [00:00<?, ? examples/s]

--- Stratégie Config 3 : Tri du dataset par similarité (cluster_id) ---
Exemple 0 (Cluster AAH) : query: AAH
Exemple 1 (Cluster AAH) : query: Quand mobiliser AAH dans un dossier ?
Chargement du modèle de base...

--- Paramètres entraînables (Config 4 : Q+K+V + Hard Negs) ---
trainable params: 1,179,648 || all params: 561,070,080 || trainable%: 0.2102


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Démarrage du fine-tuning (Config 4) ---


Column 'anchor' is at index 4, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,4.6609



--- Sauvegarde et Fusion ---

--- Fusion des poids (Merge) pour Ragas ---
Modèle Config 4 FUSIONNÉ sauvegardé dans : final_models/e5_large_finetuned_config4_merged_v2
