# **Modèle d'embedding : OrdalieTech/Solon-embeddings-large-0.1**

In [5]:
TASK = "Retrieve the definition or context of an administrative acronym or term."
QUERY_PREFIX = "query : "
DOC_PREFIX = ""

In [6]:
import os
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader, SequentialSampler

from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    losses
)

from peft import LoraConfig, TaskType, get_peft_model
from sentence_transformers.training_args import BatchSamplers

## **Configuration 1 : K + Q (k_proj & q_proj) + stratégie in-batch negatives**

In [7]:
pip install -q sentence-transformers peft datasets torch accelerate transformers

Note: you may need to restart the kernel to use updated packages.


In [None]:
# CONFIG 1 (Solon)

MODEL_ID = "OrdalieTech/SOLON-embeddings-large-0.1"
TRAIN_FILE = "bercy_train_90.jsonl"

OUTPUT_DIR_LORA   = "adapters/solon_lora_config1_qk_bn_adapter"
OUTPUT_DIR_MERGED = "final_models/solon_large_finetuned_config1_merged"

BATCH_SIZE = 32
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512

LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = ["query", "key"]


def prepare_dataset(filepath: str):
    print(f"Loading dataset from {filepath} ...")
    dataset = load_dataset("json", data_files=filepath, split="train")

    def add_prefixes(ex):
        ex["anchor"] = QUERY_PREFIX + ex["anchor"]
        ex["positive"] = DOC_PREFIX + ex["positive"]
        if "negative" in ex and ex["negative"]:
            ex["negative"] = DOC_PREFIX + ex["negative"]
        return ex

    dataset = dataset.map(add_prefixes)
    print("Example anchor:", dataset[0]["anchor"][:120])
    print("Example positive:", dataset[0]["positive"][:120])
    return dataset


train_dataset = prepare_dataset(TRAIN_FILE)

print("Loading base model ...")
model = SentenceTransformer(MODEL_ID, trust_remote_code=False)
model.max_seq_length = MAX_SEQ_LENGTH

peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    target_modules=TARGET_MODULES,
)

model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Trainable parameters (Solon Config1 v1) ---")
model[0].auto_model.print_trainable_parameters()

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Starting fine-tuning (Solon Config1 v1) ---")
trainer.train()

print("\n--- Saving LoRA adapter ---")
model.save_pretrained(OUTPUT_DIR_LORA)
print("Saved adapter to:", OUTPUT_DIR_LORA)

print("\n--- Merging LoRA weights into base model ---")
model[0].auto_model = model[0].auto_model.merge_and_unload()

os.makedirs(OUTPUT_DIR_MERGED, exist_ok=True)
model[0].auto_model.save_pretrained(OUTPUT_DIR_MERGED, safe_serialization=True)
model.tokenizer.save_pretrained(OUTPUT_DIR_MERGED)
model.save(OUTPUT_DIR_MERGED)

print("\nMerged Solon model saved to:", OUTPUT_DIR_MERGED)

Loading dataset from bercy_train_90.jsonl ...
Example anchor: query : SG
Example positive: Le SG est le Secrétariat général, structure de pilotage et de support d’un ministère (RH, finances, achats, SI, etc.).
Loading base model ...

--- Trainable parameters (Solon Config1 v1) ---
trainable params: 1,572,864 || all params: 561,463,296 || trainable%: 0.2801


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Starting fine-tuning (Solon Config1 v1) ---


Column 'anchor' is at index 4, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,7.0721
100,6.3968



--- Saving LoRA adapter ---
Saved adapter to: adapters/solon_lora_config1_qk_bn_adapter

--- Merging LoRA weights into base model ---

✅ Merged Solon model saved to: final_models/solon_large_finetuned_config1_merged


## **Configuration 1 version 2 : R = 8 & alpha = 16**

In [None]:
# CONFIG 1 (Solon) - v2

MODEL_ID = "OrdalieTech/SOLON-embeddings-large-0.1"
TRAIN_FILE = "bercy_train_90.jsonl"

OUTPUT_DIR_LORA   = "adapters/solon_lora_config1_qk_bn_adapter_v2"
OUTPUT_DIR_MERGED = "final_models/solon_large_finetuned_config1_merged_v2"

BATCH_SIZE = 32
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512

LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
TARGET_MODULES = ["query", "key"]


def prepare_dataset(filepath: str):
    dataset = load_dataset("json", data_files=filepath, split="train")

    def add_prefixes(ex):
        ex["anchor"] = QUERY_PREFIX + ex["anchor"]
        ex["positive"] = DOC_PREFIX + ex["positive"]
        if "negative" in ex and ex["negative"]:
            ex["negative"] = DOC_PREFIX + ex["negative"]
        return ex

    return dataset.map(add_prefixes)


train_dataset = prepare_dataset(TRAIN_FILE)

model = SentenceTransformer(MODEL_ID, trust_remote_code=False)
model.max_seq_length = MAX_SEQ_LENGTH

peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    target_modules=TARGET_MODULES,
)

model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
model[0].auto_model.print_trainable_parameters()

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Starting fine-tuning (Solon Config1 v2) ---")
trainer.train()

model.save_pretrained(OUTPUT_DIR_LORA)
model[0].auto_model = model[0].auto_model.merge_and_unload()

os.makedirs(OUTPUT_DIR_MERGED, exist_ok=True)
model[0].auto_model.save_pretrained(OUTPUT_DIR_MERGED, safe_serialization=True)
model.tokenizer.save_pretrained(OUTPUT_DIR_MERGED)
model.save(OUTPUT_DIR_MERGED)

print("\nSaved:", OUTPUT_DIR_MERGED)

trainable params: 786,432 || all params: 560,676,864 || trainable%: 0.1403


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Starting fine-tuning (Solon Config1 v2) ---


Column 'anchor' is at index 4, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,7.2248
100,7.0665



✅ Saved: final_models/solon_large_finetuned_config1_merged_v2


## **Configuration 2 : K + Q + V (k_proj & q_proj & v_proj) + stratégie in-batch negatives**

In [None]:
# CONFIG 2 (Solon)

MODEL_ID = "OrdalieTech/SOLON-embeddings-large-0.1"
TRAIN_FILE = "bercy_train_90.jsonl"

OUTPUT_DIR_LORA   = "adapters/solon_lora_config2_qkv_bn_adapter"
OUTPUT_DIR_MERGED = "final_models/solon_large_finetuned_config2_merged"

BATCH_SIZE = 32
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512

LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = ["query", "key", "value"]


def prepare_dataset(filepath: str):
    dataset = load_dataset("json", data_files=filepath, split="train")

    def add_prefixes(ex):
        ex["anchor"] = QUERY_PREFIX + ex["anchor"]
        ex["positive"] = DOC_PREFIX + ex["positive"]
        if "negative" in ex and ex["negative"]:
            ex["negative"] = DOC_PREFIX + ex["negative"]
        return ex

    return dataset.map(add_prefixes)


train_dataset = prepare_dataset(TRAIN_FILE)

model = SentenceTransformer(MODEL_ID, trust_remote_code=False)
model.max_seq_length = MAX_SEQ_LENGTH

peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    target_modules=TARGET_MODULES,
)

model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Trainable parameters (Solon Config2 v1) ---")
model[0].auto_model.print_trainable_parameters()

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Starting fine-tuning (Solon Config2 v1) ---")
trainer.train()

model.save_pretrained(OUTPUT_DIR_LORA)
model[0].auto_model = model[0].auto_model.merge_and_unload()

os.makedirs(OUTPUT_DIR_MERGED, exist_ok=True)
model[0].auto_model.save_pretrained(OUTPUT_DIR_MERGED, safe_serialization=True)
model.tokenizer.save_pretrained(OUTPUT_DIR_MERGED)
model.save(OUTPUT_DIR_MERGED)

print("\nSaved:", OUTPUT_DIR_MERGED)


--- Trainable parameters (Solon Config2 v1) ---
trainable params: 2,359,296 || all params: 562,249,728 || trainable%: 0.4196


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Starting fine-tuning (Solon Config2 v1) ---


Column 'anchor' is at index 4, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,4.9045
100,1.0965



✅ Saved: final_models/solon_large_finetuned_config2_merged


## **Configuration 2 version 2 : R = 8 & alpha = 16**

In [None]:
# CONFIG 2 (Solon) - v2

MODEL_ID = "OrdalieTech/SOLON-embeddings-large-0.1"
TRAIN_FILE = "bercy_train_90.jsonl"

OUTPUT_DIR_LORA   = "adapters/solon_lora_config2_qkv_bn_adapter_v2"
OUTPUT_DIR_MERGED = "final_models/solon_large_finetuned_config2_merged_v2"

BATCH_SIZE = 32
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512

LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
TARGET_MODULES = ["query", "key", "value"]


def prepare_dataset(filepath: str):
    dataset = load_dataset("json", data_files=filepath, split="train")

    def add_prefixes(ex):
        ex["anchor"] = QUERY_PREFIX + ex["anchor"]
        ex["positive"] = DOC_PREFIX + ex["positive"]
        if "negative" in ex and ex["negative"]:
            ex["negative"] = DOC_PREFIX + ex["negative"]
        return ex

    return dataset.map(add_prefixes)


train_dataset = prepare_dataset(TRAIN_FILE)

model = SentenceTransformer(MODEL_ID, trust_remote_code=False)
model.max_seq_length = MAX_SEQ_LENGTH

peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    target_modules=TARGET_MODULES,
)

model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
model[0].auto_model.print_trainable_parameters()

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Starting fine-tuning (Solon Config2 v2) ---")
trainer.train()

model.save_pretrained(OUTPUT_DIR_LORA)
model[0].auto_model = model[0].auto_model.merge_and_unload()

os.makedirs(OUTPUT_DIR_MERGED, exist_ok=True)
model[0].auto_model.save_pretrained(OUTPUT_DIR_MERGED, safe_serialization=True)
model.tokenizer.save_pretrained(OUTPUT_DIR_MERGED)
model.save(OUTPUT_DIR_MERGED)

print("\nSaved:", OUTPUT_DIR_MERGED)

trainable params: 1,179,648 || all params: 561,070,080 || trainable%: 0.2102


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Starting fine-tuning (Solon Config2 v2) ---


Column 'anchor' is at index 4, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,6.0229
100,2.6555



✅ Saved: final_models/solon_large_finetuned_config2_merged_v2


## **Configuration 3 :  K + Q (k_proj & q_proj) + stratégie Batchs construits par similarité sémantique (hard negatives implicites)**

In [None]:
# CONFIG 3 (Solon)

MODEL_ID = "OrdalieTech/SOLON-embeddings-large-0.1"
TRAIN_FILE = "bercy_train_90.jsonl"

OUTPUT_DIR_LORA   = "adapters/solon_lora_config3_qk_hn_adapter"
OUTPUT_DIR_MERGED = "final_models/solon_large_finetuned_config3_merged"

BATCH_SIZE = 32
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512

LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = ["query", "key"]


def prepare_sorted_dataset(filepath: str):
    dataset = load_dataset("json", data_files=filepath, split="train")

    def add_prefixes(ex):
        ex["anchor"] = QUERY_PREFIX + ex["anchor"]
        ex["positive"] = DOC_PREFIX + ex["positive"]
        if "negative" in ex and ex["negative"]:
            ex["negative"] = DOC_PREFIX + ex["negative"]
        return ex

    dataset = dataset.map(add_prefixes)

    seen = set()
    def keep_first(ex):
        a = ex["anchor"]
        if a in seen:
            return False
        seen.add(a)
        return True

    dataset = dataset.filter(keep_first)

    if "cluster_id" in dataset.column_names:
        dataset = dataset.sort("cluster_id")
    else:
        dataset = dataset.sort("domain")

    return dataset


train_dataset = prepare_sorted_dataset(TRAIN_FILE)


class NoShuffleTrainer(SentenceTransformerTrainer):
    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=SequentialSampler(self.train_dataset),
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )


model = SentenceTransformer(MODEL_ID, trust_remote_code=False)
model.max_seq_length = MAX_SEQ_LENGTH

peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    target_modules=TARGET_MODULES,
)

model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Trainable parameters (Solon Config3 v1) ---")
model[0].auto_model.print_trainable_parameters()

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
)

trainer = NoShuffleTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Starting fine-tuning (Solon Config3 v1) ---")
trainer.train()

model.save_pretrained(OUTPUT_DIR_LORA)
model[0].auto_model = model[0].auto_model.merge_and_unload()

os.makedirs(OUTPUT_DIR_MERGED, exist_ok=True)
model[0].auto_model.save_pretrained(OUTPUT_DIR_MERGED, safe_serialization=True)
model.tokenizer.save_pretrained(OUTPUT_DIR_MERGED)
model.save(OUTPUT_DIR_MERGED)

print("\nSaved:", OUTPUT_DIR_MERGED)


--- Trainable parameters (Solon Config3 v1) ---
trainable params: 1,572,864 || all params: 561,463,296 || trainable%: 0.2801


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Starting fine-tuning (Solon Config3 v1) ---


Column 'anchor' is at index 4, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,9.3754



✅ Saved: final_models/solon_large_finetuned_config3_merged


## **Configuration 3 version 2 : R = 8 & alpha = 16**

In [None]:
# CONFIG 3 (Solon) - v2

MODEL_ID = "OrdalieTech/SOLON-embeddings-large-0.1"
TRAIN_FILE = "bercy_train_90.jsonl"

OUTPUT_DIR_LORA   = "adapters/solon_lora_config3_qk_hn_adapter_v2"
OUTPUT_DIR_MERGED = "final_models/solon_large_finetuned_config3_merged_v2"

BATCH_SIZE = 32
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512

LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
TARGET_MODULES = ["query", "key"]


def prepare_sorted_dataset(filepath: str):
    dataset = load_dataset("json", data_files=filepath, split="train")

    def add_prefixes(ex):
        ex["anchor"] = QUERY_PREFIX + ex["anchor"]
        ex["positive"] = DOC_PREFIX + ex["positive"]
        if "negative" in ex and ex["negative"]:
            ex["negative"] = DOC_PREFIX + ex["negative"]
        return ex

    dataset = dataset.map(add_prefixes)

    seen = set()
    def keep_first(ex):
        a = ex["anchor"]
        if a in seen:
            return False
        seen.add(a)
        return True

    dataset = dataset.filter(keep_first)

    if "cluster_id" in dataset.column_names:
        dataset = dataset.sort("cluster_id")
    else:
        dataset = dataset.sort("domain")

    return dataset


train_dataset = prepare_sorted_dataset(TRAIN_FILE)


class NoShuffleTrainer(SentenceTransformerTrainer):
    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=SequentialSampler(self.train_dataset),
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )


model = SentenceTransformer(MODEL_ID, trust_remote_code=False)
model.max_seq_length = MAX_SEQ_LENGTH

peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    target_modules=TARGET_MODULES,
)

model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
model[0].auto_model.print_trainable_parameters()

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
)

trainer = NoShuffleTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Starting fine-tuning (Solon Config3 v2) ---")
trainer.train()

model.save_pretrained(OUTPUT_DIR_LORA)
model[0].auto_model = model[0].auto_model.merge_and_unload()

os.makedirs(OUTPUT_DIR_MERGED, exist_ok=True)
model[0].auto_model.save_pretrained(OUTPUT_DIR_MERGED, safe_serialization=True)
model.tokenizer.save_pretrained(OUTPUT_DIR_MERGED)
model.save(OUTPUT_DIR_MERGED)

print("\nSaved:", OUTPUT_DIR_MERGED)

trainable params: 786,432 || all params: 560,676,864 || trainable%: 0.1403


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Starting fine-tuning (Solon Config3 v2) ---


Column 'anchor' is at index 4, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,9.6604



✅ Saved: final_models/solon_large_finetuned_config3_merged_v2


## **Configuration 4 : K + Q + V (k_proj & q_proj & v_proj) + stratégie Batchs construits par similarité sémantique (hard negatives implicites)**

In [None]:
# 1) Config 4

MODEL_ID = "OrdalieTech/SOLON-embeddings-large-0.1"

TRAIN_FILE = "bercy_train_90.jsonl"

OUTPUT_DIR_LORA   = "adapters/solon_lora_config4_qkv_hn_adapter"
OUTPUT_DIR_MERGED = "final_models/solon_large_finetuned_config4_merged"

BATCH_SIZE = 32
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512

LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

TARGET_MODULES = ["query", "key", "value"]

# 2) Dataset prep (prefix + dedup + sort)

def prepare_sorted_dataset(filepath: str):
    print(f"Loading dataset from {filepath} ...")
    dataset = load_dataset("json", data_files=filepath, split="train")

    def add_prefixes(ex):
        ex["anchor"] = QUERY_PREFIX + ex["anchor"]
        ex["positive"] = DOC_PREFIX + ex["positive"]
        if "negative" in ex and ex["negative"]:
            ex["negative"] = DOC_PREFIX + ex["negative"]
        return ex

    dataset = dataset.map(add_prefixes)

    seen = set()
    def keep_first(ex):
        a = ex["anchor"]
        if a in seen:
            return False
        seen.add(a)
        return True

    dataset = dataset.filter(keep_first)

    # HARD negatives implicites : group by cluster_id
    if "cluster_id" in dataset.column_names:
        print("Sorting by cluster_id (hard in-batch negatives).")
        dataset = dataset.sort("cluster_id")
        print("Example:", dataset[0]["cluster_id"], dataset[0]["anchor"][:120])
    else:
        print("WARNING: no cluster_id column found. Sorting by domain instead.")
        dataset = dataset.sort("domain")

    return dataset

train_dataset = prepare_sorted_dataset(TRAIN_FILE)

# 3) No-shuffle trainer

class NoShuffleTrainer(SentenceTransformerTrainer):
    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=SequentialSampler(self.train_dataset),
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )

# 4) Model + LoRA

print("Loading base model ...")
model = SentenceTransformer(MODEL_ID, trust_remote_code=False)
model.max_seq_length = MAX_SEQ_LENGTH

peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    target_modules=TARGET_MODULES,
)

model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Trainable parameters ---")
model[0].auto_model.print_trainable_parameters()

# 5) Train (MNRL / in-batch negatives)

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
    gradient_accumulation_steps=1,
)

trainer = NoShuffleTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Starting fine-tuning (Solon Config4) ---")
trainer.train()

# 6) Save adapter + merge + save merged model

print("\n--- Saving LoRA adapter ---")
model.save_pretrained(OUTPUT_DIR_LORA)
print("Saved adapter to:", OUTPUT_DIR_LORA)

print("\n--- Merging LoRA weights into base model ---")
model[0].auto_model = model[0].auto_model.merge_and_unload()

os.makedirs(OUTPUT_DIR_MERGED, exist_ok=True)

print("--- Saving merged backbone ---")
model[0].auto_model.save_pretrained(OUTPUT_DIR_MERGED, safe_serialization=True)

print("--- Saving tokenizer ---")
model.tokenizer.save_pretrained(OUTPUT_DIR_MERGED)

print("--- Saving SentenceTransformer packaging ---")
model.save(OUTPUT_DIR_MERGED)

print("\nMerged Solon model saved to:", OUTPUT_DIR_MERGED)

Loading dataset from bercy_train_90.jsonl ...
Sorting by cluster_id (hard in-batch negatives).
Example: AAH query : AAH
Loading base model ...

--- Trainable parameters ---
trainable params: 2,359,296 || all params: 562,249,728 || trainable%: 0.4196


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Starting fine-tuning (Solon Config4) ---


Column 'anchor' is at index 4, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,6.3241



--- Saving LoRA adapter ---
Saved adapter to: adapters/solon_lora_config4_qkv_hn_adapter

--- Merging LoRA weights into base model ---
--- Saving merged backbone ---
--- Saving tokenizer ---
--- Saving SentenceTransformer packaging ---

✅ Merged Solon model saved to: final_models/solon_large_finetuned_config4_merged


## **Configuration 4 version 2 : R = 8 & alpha = 16**

In [None]:
# 1) Config 4 - v2

MODEL_ID = "OrdalieTech/SOLON-embeddings-large-0.1"

TRAIN_FILE = "bercy_train_90.jsonl"

OUTPUT_DIR_LORA   = "adapters/solon_lora_config4_qkv_hn_adapter_v2"
OUTPUT_DIR_MERGED = "final_models/solon_large_finetuned_config4_merged_v2"

BATCH_SIZE = 32
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512

LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

TARGET_MODULES = ["query", "key", "value"]

# 2) Dataset prep (prefix + dedup + sort)

def prepare_sorted_dataset(filepath: str):
    print(f"Loading dataset from {filepath} ...")
    dataset = load_dataset("json", data_files=filepath, split="train")

    def add_prefixes(ex):
        ex["anchor"] = QUERY_PREFIX + ex["anchor"]
        ex["positive"] = DOC_PREFIX + ex["positive"]
        if "negative" in ex and ex["negative"]:
            ex["negative"] = DOC_PREFIX + ex["negative"]
        return ex

    dataset = dataset.map(add_prefixes)

    seen = set()
    def keep_first(ex):
        a = ex["anchor"]
        if a in seen:
            return False
        seen.add(a)
        return True

    dataset = dataset.filter(keep_first)

    # HARD negatives implicites: group by cluster_id
    if "cluster_id" in dataset.column_names:
        print("Sorting by cluster_id (hard in-batch negatives).")
        dataset = dataset.sort("cluster_id")
        print("Example:", dataset[0]["cluster_id"], dataset[0]["anchor"][:120])
    else:
        print("WARNING: no cluster_id column found. Sorting by domain instead.")
        dataset = dataset.sort("domain")

    return dataset

train_dataset = prepare_sorted_dataset(TRAIN_FILE)

# 3) No-shuffle trainer

class NoShuffleTrainer(SentenceTransformerTrainer):
    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=SequentialSampler(self.train_dataset),
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )

# 4) Model + LoRA

print("Loading base model ...")
model = SentenceTransformer(MODEL_ID, trust_remote_code=False)
model.max_seq_length = MAX_SEQ_LENGTH

peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    target_modules=TARGET_MODULES,
)

model[0].auto_model = get_peft_model(model[0].auto_model, peft_config)
print("\n--- Trainable parameters ---")
model[0].auto_model.print_trainable_parameters()

# 5) Train (MNRL / in-batch negatives)

train_loss = losses.MultipleNegativesRankingLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR_LORA,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
    gradient_accumulation_steps=1,
)

trainer = NoShuffleTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)

print("\n--- Starting fine-tuning (Solon Config4 v2) ---")
trainer.train()

# 6) Save adapter + merge + save merged model

print("\n--- Saving LoRA adapter ---")
model.save_pretrained(OUTPUT_DIR_LORA)
print("Saved adapter to:", OUTPUT_DIR_LORA)

print("\n--- Merging LoRA weights into base model ---")
model[0].auto_model = model[0].auto_model.merge_and_unload()

os.makedirs(OUTPUT_DIR_MERGED, exist_ok=True)

print("--- Saving merged backbone ---")
model[0].auto_model.save_pretrained(OUTPUT_DIR_MERGED, safe_serialization=True)

print("--- Saving tokenizer ---")
model.tokenizer.save_pretrained(OUTPUT_DIR_MERGED)

print("--- Saving SentenceTransformer packaging ---")
model.save(OUTPUT_DIR_MERGED)

print("\nMerged Solon model saved to:", OUTPUT_DIR_MERGED)


Loading dataset from bercy_train_90.jsonl ...
Sorting by cluster_id (hard in-batch negatives).
Example: AAH query : AAH
Loading base model ...

--- Trainable parameters ---
trainable params: 1,179,648 || all params: 561,070,080 || trainable%: 0.2102


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


--- Starting fine-tuning (Solon Config4 v2) ---


Column 'anchor' is at index 4, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Step,Training Loss
50,7.7172



--- Saving LoRA adapter ---
Saved adapter to: adapters/solon_lora_config4_qkv_hn_adapter_v2

--- Merging LoRA weights into base model ---
--- Saving merged backbone ---
--- Saving tokenizer ---
--- Saving SentenceTransformer packaging ---

✅ Merged Solon model saved to: final_models/solon_large_finetuned_config4_merged_v2
