Configuration 1: K + Q (k_proj & q_proj) + stratégie in-batch negatives

In [None]:
pip install -q sentence-transformers peft datasets torch accelerate transformers

In [4]:
import torch
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    losses
)
from peft import LoraConfig, TaskType, get_peft_model
import os

# ==========================================
# 1. PARAMÈTRES ET CHEMINS
# ==========================================

MODEL_ID = "intfloat/multilingual-e5-large-instruct"

# Chemins de sortie
OUTPUT_DIR_LORA = "output/e5_lora_config1_qk_adapter"  # Dossier pour l'adaptateur seul
OUTPUT_DIR_MERGED = "final_models/e5_large_finetuned_merged" # Dossier pour le modèle fusionné (Ragas)

# Hyperparamètres
BATCH_SIZE = 32
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4
MAX_SEQ_LENGTH = 512

# Instruction E5
INSTRUCTION_PREFIX = "Instruct: Retrieve the meaning or context for the specific administrative acronym or question.\nQuery: "

# ==========================================
# 2. PRÉPARATION DES DONNÉES
# ==========================================

def prepare_dataset(filepath):
    print(f"Chargement des données depuis {filepath}...")
    dataset = load_dataset("json", data_files=filepath, split="train")

    def add_instruction(example):
        # On ajoute le préfixe E5 seulement à l'anchor
        example['anchor'] = INSTRUCTION_PREFIX + example['anchor']
        return example

    dataset = dataset.map(add_instruction)
    print(f"Exemple formaté : {dataset[0]['anchor']}")
    return dataset

# Assurez-vous d'avoir le fichier jsonl au bon endroit
train_dataset = prepare_dataset("bercy_train_90.jsonl")

# ==========================================
# 3. MODÈLE & CONFIGURATION 1 (LoRA)
# ==========================================

print("Chargement du modèle de base...")
model = SentenceTransformer(MODEL_ID, trust_remote_code=True)
model.max_seq_length = MAX_SEQ_LENGTH

# --- CONFIGURATION 1 : K + Q ---
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "key"], # Cible stricte Config 1
)

# Application de LoRA
# Corrected: Access the underlying model via model[0].auto_model
model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Paramètres entraînables (Config 1) ---")
model[0].auto_model.print_trainable_parameters()

# ==========================================
# 4. ENTRAÎNEMENT (MNRL)
# ==========================================

# Loss "In-batch Negatives"
train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Démarrage du fine-tuning ---")
trainer.train()

# ==========================================
# 5. SAUVEGARDE ET FUSION (MERGE)
# ==========================================

print("\n--- Sauvegarde de l'adaptateur LoRA seul ---")
model.save_pretrained(OUTPUT_DIR_LORA)
print(f"Adaptateur sauvegardé dans : {OUTPUT_DIR_LORA}")

print("\n--- Fusion des poids (Merge) pour Ragas ---")
# 1. On fusionne les poids LoRA (A et B) dans le modèle de base (W)
# merge_and_unload() est une méthode de PEFT qui fait l'opération mathématique finale
# Corrected: Access the underlying model via model[0].auto_model
model[0].auto_model = model[0].auto_model.merge_and_unload()

# 2. On sauvegarde le modèle complet "standard"
model.save_pretrained(OUTPUT_DIR_MERGED)

print(f"Modèle FUSIONNÉ sauvegardé dans : {OUTPUT_DIR_MERGED}")
print("Prêt pour l'évaluation Ragas !")

Chargement des données depuis bercy_train_90.jsonl...


Map:   0%|          | 0/4032 [00:00<?, ? examples/s]

Exemple formaté : Instruct: Retrieve the meaning or context for the specific administrative acronym or question.
Query: SG
Chargement du modèle de base...

--- Paramètres entraînables (Config 1) ---
trainable params: 1,572,864 || all params: 561,463,296 || trainable%: 0.2801


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:


--- Démarrage du fine-tuning ---
 y


[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"
[34m[1mwandb[0m: Using W&B in offline mode.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,4.2542
100,3.4564
150,3.2281
200,3.0903
250,3.0036
300,2.963
350,2.9375



--- Sauvegarde de l'adaptateur LoRA seul ---
Adaptateur sauvegardé dans : output/e5_lora_config1_qk_adapter

--- Fusion des poids (Merge) pour Ragas ---
Modèle FUSIONNÉ sauvegardé dans : final_models/e5_large_finetuned_merged
Prêt pour l'évaluation Ragas !


In [8]:
# Compresser le dossier du modèle fusionné
!zip -r modele_config1_merged.zip final_models/e5_large_finetuned_merged

  adding: final_models/e5_large_finetuned_merged/ (stored 0%)
  adding: final_models/e5_large_finetuned_merged/tokenizer.json (deflated 76%)
  adding: final_models/e5_large_finetuned_merged/tokenizer_config.json (deflated 76%)
  adding: final_models/e5_large_finetuned_merged/special_tokens_map.json (deflated 85%)
  adding: final_models/e5_large_finetuned_merged/config_sentence_transformers.json (deflated 40%)
  adding: final_models/e5_large_finetuned_merged/sentence_bert_config.json (deflated 9%)
  adding: final_models/e5_large_finetuned_merged/model.safetensors (deflated 39%)
  adding: final_models/e5_large_finetuned_merged/config.json (deflated 49%)
  adding: final_models/e5_large_finetuned_merged/README.md (deflated 69%)
  adding: final_models/e5_large_finetuned_merged/sentencepiece.bpe.model (deflated 49%)
  adding: final_models/e5_large_finetuned_merged/1_Pooling/ (stored 0%)
  adding: final_models/e5_large_finetuned_merged/1_Pooling/config.json (deflated 58%)
  adding: final_mode

Configuration 2: K + Q + V (k_proj & q_proj & v_proj) + stratégie in-batch negatives

In [1]:
import torch
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    losses
)
from peft import LoraConfig, TaskType, get_peft_model
import os

# ==========================================
# 1. PARAMÈTRES ET CHEMINS (CONFIG 2)
# ==========================================

MODEL_ID = "intfloat/multilingual-e5-large-instruct"

# Nouveaux dossiers de sortie pour ne pas écraser la Config 1
OUTPUT_DIR_LORA = "output/e5_lora_config2_qkv_adapter"
OUTPUT_DIR_MERGED = "final_models/e5_large_finetuned_config2_merged"

# Hyperparamètres (Identiques à Config 1 pour comparaison équitable)
BATCH_SIZE = 32
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4
MAX_SEQ_LENGTH = 512

# Instruction E5
INSTRUCTION_PREFIX = "Instruct: Retrieve the meaning or context for the specific administrative acronym or question.\nQuery: "

# ==========================================
# 2. PRÉPARATION DES DONNÉES
# ==========================================

def prepare_dataset(filepath):
    print(f"Chargement des données depuis {filepath}...")
    dataset = load_dataset("json", data_files=filepath, split="train")

    def add_instruction(example):
        example['anchor'] = INSTRUCTION_PREFIX + example['anchor']
        return example

    dataset = dataset.map(add_instruction)
    print(f"Exemple formaté : {dataset[0]['anchor']}")
    return dataset

train_dataset = prepare_dataset("bercy_train_90.jsonl")

# ==========================================
# 3. MODÈLE & CONFIGURATION 2 (LoRA Q+K+V)
# ==========================================

print("Chargement du modèle de base...")
model = SentenceTransformer(MODEL_ID, trust_remote_code=True)
model.max_seq_length = MAX_SEQ_LENGTH

# --- CONFIGURATION 2 : K + Q + V ---
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # AJOUT DE "v_proj" ICI
    target_modules=["query", "key", "value"],
)

# Corrected: Access the underlying model via model[0].auto_model
model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Paramètres entraînables (Config 2 : Q+K+V) ---")
model[0].auto_model.print_trainable_parameters()
# Vous devriez voir environ 50% de paramètres en plus que la Config 1 (~2.3M params)

# ==========================================
# 4. ENTRAÎNEMENT (MNRL - In-batch Negatives)
# ==========================================

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Démarrage du fine-tuning (Config 2) ---")
trainer.train()

# ==========================================
# 5. SAUVEGARDE ET FUSION (MERGE)
# ==========================================

print("\n--- Sauvegarde de l'adaptateur LoRA seul ---")
model.save_pretrained(OUTPUT_DIR_LORA)

print("\n--- Fusion des poids (Merge) pour Ragas ---")
# Fusion
# Corrected: Access the underlying model via model[0].auto_model
model[0].auto_model = model[0].auto_model.merge_and_unload()

# Sauvegarde finale
model.save_pretrained(OUTPUT_DIR_MERGED)

print(f"Modèle Config 2 FUSIONNÉ sauvegardé dans : {OUTPUT_DIR_MERGED}")

Chargement des données depuis bercy_train_90.jsonl...
Exemple formaté : Instruct: Retrieve the meaning or context for the specific administrative acronym or question.
Query: SG
Chargement du modèle de base...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



--- Paramètres entraînables (Config 2 : Q+K+V) ---
trainable params: 2,359,296 || all params: 562,249,728 || trainable%: 0.4196


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Démarrage du fine-tuning (Config 2) ---


[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"
[34m[1mwandb[0m: Using W&B in offline mode.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,3.8142
100,3.0075
150,2.8258
200,2.7333
250,2.6767
300,2.6202
350,2.5825



--- Sauvegarde de l'adaptateur LoRA seul ---

--- Fusion des poids (Merge) pour Ragas ---
Modèle Config 2 FUSIONNÉ sauvegardé dans : final_models/e5_large_finetuned_config2_merged


In [2]:
!zip -r modele_config2_merged.zip final_models/e5_large_finetuned_config2_merged

  adding: final_models/e5_large_finetuned_config2_merged/ (stored 0%)
  adding: final_models/e5_large_finetuned_config2_merged/tokenizer.json (deflated 76%)
  adding: final_models/e5_large_finetuned_config2_merged/tokenizer_config.json (deflated 76%)
  adding: final_models/e5_large_finetuned_config2_merged/special_tokens_map.json (deflated 85%)
  adding: final_models/e5_large_finetuned_config2_merged/config_sentence_transformers.json (deflated 40%)
  adding: final_models/e5_large_finetuned_config2_merged/sentence_bert_config.json (deflated 9%)
  adding: final_models/e5_large_finetuned_config2_merged/model.safetensors (deflated 37%)
  adding: final_models/e5_large_finetuned_config2_merged/config.json (deflated 49%)
  adding: final_models/e5_large_finetuned_config2_merged/README.md (deflated 69%)
  adding: final_models/e5_large_finetuned_config2_merged/sentencepiece.bpe.model (deflated 49%)
  adding: final_models/e5_large_finetuned_config2_merged/1_Pooling/ (stored 0%)
  adding: final_mo

Configuration 3:  K + Q (k_proj & q_proj) + stratégie Batchs construits par similarité sémantique (hard negatives implicites)

In [4]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader, SequentialSampler
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    losses
)
from peft import LoraConfig, TaskType, get_peft_model
import os

# ==========================================
# 1. PARAMÈTRES (CONFIG 3)
# ==========================================

MODEL_ID = "intfloat/multilingual-e5-large-instruct"
OUTPUT_DIR_LORA = "output/e5_lora_config3_hardnegs_adapter"
OUTPUT_DIR_MERGED = "final_models/e5_large_finetuned_config3_merged"

# Hyperparamètres
BATCH_SIZE = 32
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4
MAX_SEQ_LENGTH = 512

INSTRUCTION_PREFIX = "Instruct: Retrieve the meaning or context for the specific administrative acronym or question.\nQuery: "

# ==========================================
# 2. PRÉPARATION ET TRI DES DONNÉES
# ==========================================

def prepare_sorted_dataset(filepath):
    print(f"Chargement des données depuis {filepath}...")
    dataset = load_dataset("json", data_files=filepath, split="train")

    # 1. Ajout de l'instruction
    def add_instruction(example):
        example['anchor'] = INSTRUCTION_PREFIX + example['anchor']
        return example
    dataset = dataset.map(add_instruction)

    # 2. TRI PAR CLUSTER (C'est ici que se joue la stratégie Hard Negatives)
    # En triant par 'cluster_id', on regroupe les concepts proches (ex: tous les 'RH' ensemble)
    # Ainsi, un batch contiendra plusieurs exemples du même cluster, rendant la distinction difficile.
    print("--- Stratégie Config 3 : Tri du dataset par similarité (cluster_id) ---")
    dataset = dataset.sort("cluster_id")

    # Vérification visuelle
    print(f"Exemple 0 (Cluster {dataset[0]['cluster_id']}) : {dataset[0]['anchor']}")
    print(f"Exemple 1 (Cluster {dataset[1]['cluster_id']}) : {dataset[1]['anchor']}")

    return dataset

train_dataset = prepare_sorted_dataset("bercy_train_90.jsonl")

# ==========================================
# 3. CLASS TRAINER SPÉCIFIQUE (NO SHUFFLE)
# ==========================================

class NoShuffleTrainer(SentenceTransformerTrainer):
    """
    Surcharge du Trainer pour empêcher le mélange (shuffle) des données.
    Indispensable pour que la stratégie de batchs sémantiques fonctionne.
    """
    def get_train_dataloader(self):
        # On force un SequentialSampler au lieu du RandomSampler par défaut
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=SequentialSampler(self.train_dataset), # <--- Pas de Shuffle !
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )

# ==========================================
# 4. MODÈLE & CONFIGURATION 3 (Q + K)
# ==========================================

print("Chargement du modèle de base...")
model = SentenceTransformer(MODEL_ID, trust_remote_code=True)
model.max_seq_length = MAX_SEQ_LENGTH

# --- CONFIGURATION 3 : K + Q (Comme Config 1) ---
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # Retour aux cibles standards (Config 3 = Config 1 coté architecture)
    target_modules=["query", "key"],
)

# Corrected: Access the underlying model via model[0].auto_model
model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Paramètres entraînables (Config 3 : Q+K + Hard Negs) ---")
model[0].auto_model.print_trainable_parameters()

# ==========================================
# 5. ENTRAÎNEMENT
# ==========================================

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
)

# Utilisation de notre Trainer personnalisé "NoShuffle"
trainer = NoShuffleTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Démarrage du fine-tuning (Config 3) ---")
print("ATTENTION : La Loss devrait être plus élevée qu'en Config 1 car la tâche est plus dure !")
trainer.train()

# ==========================================
# 6. SAUVEGARDE ET FUSION
# ==========================================

print("\n--- Sauvegarde et Fusion ---")
model.save_pretrained(OUTPUT_DIR_LORA)
# Corrected: Access the underlying model via model[0].auto_model
model[0].auto_model = model[0].auto_model.merge_and_unload()
model.save_pretrained(OUTPUT_DIR_MERGED)

print(f"Modèle Config 3 FUSIONNÉ sauvegardé dans : {OUTPUT_DIR_MERGED}")

Chargement des données depuis bercy_train_90.jsonl...


Map:   0%|          | 0/4032 [00:00<?, ? examples/s]

--- Stratégie Config 3 : Tri du dataset par similarité (cluster_id) ---
Exemple 0 (Cluster AAH) : Instruct: Retrieve the meaning or context for the specific administrative acronym or question.
Query: AAH
Exemple 1 (Cluster AAH) : Instruct: Retrieve the meaning or context for the specific administrative acronym or question.
Query: AAH
Chargement du modèle de base...

--- Paramètres entraînables (Config 3 : Q+K + Hard Negs) ---
trainable params: 1,572,864 || all params: 561,463,296 || trainable%: 0.2801


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Démarrage du fine-tuning (Config 3) ---
ATTENTION : La Loss devrait être plus élevée qu'en Config 1 car la tâche est plus dure !


dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,4.4005
100,3.6324
150,3.4956
200,3.4805
250,3.393
300,3.4896
350,3.3611



--- Sauvegarde et Fusion ---
Modèle Config 3 FUSIONNÉ sauvegardé dans : final_models/e5_large_finetuned_config3_merged


In [7]:
!zip -r modele_config3_merged.zip final_models/e5_large_finetuned_config3_merged

  adding: final_models/e5_large_finetuned_config3_merged/ (stored 0%)
  adding: final_models/e5_large_finetuned_config3_merged/tokenizer.json (deflated 76%)
  adding: final_models/e5_large_finetuned_config3_merged/tokenizer_config.json (deflated 76%)
  adding: final_models/e5_large_finetuned_config3_merged/special_tokens_map.json (deflated 85%)
  adding: final_models/e5_large_finetuned_config3_merged/config_sentence_transformers.json (deflated 40%)
  adding: final_models/e5_large_finetuned_config3_merged/sentence_bert_config.json (deflated 9%)
  adding: final_models/e5_large_finetuned_config3_merged/model.safetensors (deflated 39%)
  adding: final_models/e5_large_finetuned_config3_merged/config.json (deflated 49%)
  adding: final_models/e5_large_finetuned_config3_merged/README.md (deflated 68%)
  adding: final_models/e5_large_finetuned_config3_merged/sentencepiece.bpe.model (deflated 49%)
  adding: final_models/e5_large_finetuned_config3_merged/1_Pooling/ (stored 0%)
  adding: final_mo

Configuration 4: K + Q + V (k_proj & q_proj & v_proj) + stratégie Batchs construits par similarité sémantique (hard negatives implicites)

In [2]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader, SequentialSampler
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    losses
)
from peft import LoraConfig, TaskType, get_peft_model
import os

# ==========================================
# 1. PARAMÈTRES (CONFIG 4)
# ==========================================

MODEL_ID = "intfloat/multilingual-e5-large-instruct"

# Nouveaux dossiers de sortie
OUTPUT_DIR_LORA = "output/e5_lora_config4_qkv_hardnegs_adapter"
OUTPUT_DIR_MERGED = "final_models/e5_large_finetuned_config4_merged"

# Hyperparamètres (Toujours identiques pour la comparaison)
BATCH_SIZE = 32 # Reduced to 16 to prevent OutOfMemoryError
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4
MAX_SEQ_LENGTH = 512

INSTRUCTION_PREFIX = "Instruct: Retrieve the meaning or context for the specific administrative acronym or question.\nQuery: "

# ==========================================
# 2. PRÉPARATION ET TRI DES DONNÉES
# ==========================================

def prepare_sorted_dataset(filepath):
    print(f"Chargement des données depuis {filepath}...")
    dataset = load_dataset("json", data_files=filepath, split="train")

    # 1. Ajout de l'instruction
    def add_instruction(example):
        example['anchor'] = INSTRUCTION_PREFIX + example['anchor']
        return example
    dataset = dataset.map(add_instruction)

    # 2. TRI PAR CLUSTER (STRATÉGIE HARD NEGATIVES)
    # On trie pour que les exemples d'un même domaine se retrouvent dans le même batch
    print("--- Stratégie Config 4 : Tri du dataset par similarité (cluster_id) ---")
    dataset = dataset.sort("cluster_id")

    return dataset

train_dataset = prepare_sorted_dataset("bercy_train_90.jsonl")

# ==========================================
# 3. CLASS TRAINER SPÉCIFIQUE (NO SHUFFLE)
# ==========================================

class NoShuffleTrainer(SentenceTransformerTrainer):
    """
    Indispensable pour la Config 4 :
    Empêche le mélange aléatoire pour conserver l'effet 'Hard Negatives' du tri.
    """
    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=SequentialSampler(self.train_dataset), # Ordre séquentiel forcé
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )

# ==========================================
# 4. MODÈLE & CONFIGURATION 4 (Q + K + V)
# ==========================================

print("Chargement du modèle de base...")
model = SentenceTransformer(MODEL_ID, trust_remote_code=True)
model.max_seq_length = MAX_SEQ_LENGTH

# --- CONFIGURATION 4 : K + Q + V (Comme Config 2) ---
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # CIBLES ÉTENDUES : Query, Key ET Value
    target_modules=["query", "key", "value"],
)

# Corrected: Access the underlying model via model[0].auto_model
model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Paramètres entraînables (Config 4 : Q+K+V + Hard Negs) ---")
model[0].auto_model.print_trainable_parameters()
# Doit afficher ~2.3M paramètres (0.69%)

# ==========================================
# 5. ENTRAÎNEMENT
# ==========================================

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
    gradient_accumulation_steps=2 # Added to compensate for smaller batch size
)

# Utilisation du Trainer NoShuffle
trainer = NoShuffleTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Démarrage du fine-tuning (Config 4) ---")
trainer.train()

# ==========================================
# 6. SAUVEGARDE ET FUSION
# ==========================================

print("\n--- Sauvegarde et Fusion ---")
model.save_pretrained(OUTPUT_DIR_LORA)
# Corrected: Access the underlying model via model[0].auto_model
model[0].auto_model = model[0].auto_model.merge_and_unload()
model.save_pretrained(OUTPUT_DIR_MERGED)

print(f"Modèle Config 4 FUSIONNÉ sauvegardé dans : {OUTPUT_DIR_MERGED}")

Chargement des données depuis bercy_train_90.jsonl...


Map:   0%|          | 0/4032 [00:00<?, ? examples/s]

--- Stratégie Config 4 : Tri du dataset par similarité (cluster_id) ---
Chargement du modèle de base...

--- Paramètres entraînables (Config 4 : Q+K+V + Hard Negs) ---
trainable params: 2,359,296 || all params: 562,249,728 || trainable%: 0.4196


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Démarrage du fine-tuning (Config 4) ---


[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"
[34m[1mwandb[0m: Using W&B in offline mode.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,3.9676
100,3.4174
150,3.2493



--- Sauvegarde et Fusion ---
Modèle Config 4 FUSIONNÉ sauvegardé dans : final_models/e5_large_finetuned_config4_merged


In [3]:
!zip -r modele_config4_merged.zip final_models/e5_large_finetuned_config4_merged

  adding: final_models/e5_large_finetuned_config4_merged/ (stored 0%)
  adding: final_models/e5_large_finetuned_config4_merged/tokenizer.json (deflated 76%)
  adding: final_models/e5_large_finetuned_config4_merged/tokenizer_config.json (deflated 76%)
  adding: final_models/e5_large_finetuned_config4_merged/special_tokens_map.json (deflated 85%)
  adding: final_models/e5_large_finetuned_config4_merged/config_sentence_transformers.json (deflated 40%)
  adding: final_models/e5_large_finetuned_config4_merged/sentence_bert_config.json (deflated 9%)
  adding: final_models/e5_large_finetuned_config4_merged/model.safetensors (deflated 37%)
  adding: final_models/e5_large_finetuned_config4_merged/config.json (deflated 49%)
  adding: final_models/e5_large_finetuned_config4_merged/README.md (deflated 68%)
  adding: final_models/e5_large_finetuned_config4_merged/sentencepiece.bpe.model (deflated 49%)
  adding: final_models/e5_large_finetuned_config4_merged/1_Pooling/ (stored 0%)
  adding: final_mo