In [9]:
!pip install -q sentence-transformers pandas torch datasets

In [1]:
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
    losses
)
from sentence_transformers.evaluation import TranslationEvaluator
from sentence_transformers.readers import InputExample
import argparse
from datetime import datetime
from datasets import load_dataset, Dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
train_path = '/content/train.csv'
test_path = '/content/test.csv'
output_dir = '/content/output/mon_modele_finetuned'

epochs = 25
batch_size = 128
max_seq_length = 128
evaluation_steps = 1000
use_cuda = True

In [3]:
train_df = pd.read_csv(train_path, sep=',', encoding='utf-8')
print(f"Nombre d'exemples d'entraînement: {len(train_df)}")
print("\nAperçu des données:")
display(train_df.head())

train_df['latin_length'] = train_df['Latin'].fillna('').apply(len)
train_df['french_length'] = train_df['Français'].fillna('').apply(len)

print(f"\nLongueur moyenne des textes latins: {train_df['latin_length'].mean():.2f} caractères")
print(f"Longueur moyenne des textes français: {train_df['french_length'].mean():.2f} caractères")
print(f"Longueur maximale des textes latins: {train_df['latin_length'].max()} caractères")
print(f"Longueur maximale des textes français: {train_df['french_length'].max()} caractères")

Nombre d'exemples d'entraînement: 1921

Aperçu des données:


Unnamed: 0,Latin,Français
0,"I. Gallia omnis est divisa in tres partes, qua...",La Gaule tout-entière est divisée en trois par...
1,"Omnes hi differunt inter se lingua, institutis...",Tous ceux-ci diffèrent entre eux par la langue...
2,"Flumen Garumna dividit Gallos ab Aquitanis, Ma...",Le fleuve de la Garonne sépare les Gaulois des...
3,"Belgæ sunt fortissimi omnium horum, propterea ...",Les Belges sont les plus braves de tous ceux-c...
4,que pertinent,qui tendent



Longueur moyenne des textes latins: 24.43 caractères
Longueur moyenne des textes français: 33.94 caractères
Longueur maximale des textes latins: 499 caractères
Longueur maximale des textes français: 685 caractères


In [4]:
train_df['Latin'] = train_df['Latin'].fillna('')
train_df['Français'] = train_df['Français'].fillna('')

print(f"Données d'entraînement chargées: {len(train_df)} paires")

# Créez une liste d'InputExample
train_examples = []
for _, row in train_df.iterrows():
    # Ensure both Latin and French are present and non-empty strings
    latin_text = str(row.get('Latin', '')).strip()
    french_text = str(row.get('Français', '')).strip()

    if latin_text and french_text: # Only add if both texts are non-empty
         train_examples.append(InputExample(texts=[latin_text, french_text]))
    else:
        # Optional: print a warning or log if a pair is skipped
        print(f"Skipping row with empty text: Latin='{row.get('Latin')}', Français='{row.get('Français')}'")


# Add a check to ensure all examples have exactly two texts
for i, example in enumerate(train_examples):
    if len(example.texts) != 2:
        raise ValueError(f"InputExample at index {i} does not have exactly 2 texts: {example.texts}")

print(f"Nombre d'InputExamples valides créés: {len(train_examples)}")

# Convertir la liste d'InputExample en datasets.Dataset
# train_dataset = Dataset.from_list([{'texts': example.texts} for example in train_examples])

# Créer le dataset avec des colonnes séparées pour le latin et le français
# Cela facilite le traitement par le trainer pour les paires de phrases
train_dataset = Dataset.from_dict({
    'sentence1': [example.texts[0] for example in train_examples],
    'sentence2': [example.texts[1] for example in train_examples]
})


print(f"Dataset créé avec {len(train_dataset)} exemples et les colonnes: {train_dataset.column_names}")

Données d'entraînement chargées: 1921 paires
Skipping row with empty text: Latin='', Français=''
Skipping row with empty text: Latin='receptos ad se,admis vers eux (dans leurs rangs)', Français=''
Skipping row with empty text: Latin='', Français=''
Skipping row with empty text: Latin='', Français=''
Skipping row with empty text: Latin='', Français=''
Skipping row with empty text: Latin='', Français=''
Skipping row with empty text: Latin='', Français=''
Skipping row with empty text: Latin='a castris corum:', Français=''
Skipping row with empty text: Latin='', Français=''
Skipping row with empty text: Latin='', Français=''
Skipping row with empty text: Latin='', Français=''
Skipping row with empty text: Latin='', Français=''
Skipping row with empty text: Latin='', Français=''
Skipping row with empty text: Latin='', Français=''
Nombre d'InputExamples valides créés: 1907
Dataset créé avec 1907 exemples et les colonnes: ['sentence1', 'sentence2']


In [5]:
args = SentenceTransformerTrainingArguments(
    num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    eval_steps = evaluation_steps,
)

In [6]:
model = SentenceTransformer('sentence-transformers/LaBSE')
model.max_seq_length = 128

train_loss = losses.MultipleNegativesRankingLoss(model)

output_dir = 'output/finetuned-labse'
os.makedirs(output_dir, exist_ok=True)

# evaluator = None
# if os.path.exists(dev_path):
#     dev_df, _ = load_data_as_examples(dev_path)

#     evaluator = EmbeddingSimilarityEvaluator(
#         sentences1=dev_df['greek'].tolist(),
#         sentences2=dev_df['french'].tolist(),
#         scores=[1.0] * len(dev_df),
#         name='dev-eval',
#         batch_size=batch_size,
#         show_progress_bar=True
#     )
#     print(f"Évaluateur créé avec {len(dev_df)} paires de validation")


print("Début du finetuning...")
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
)
trainer.train()

print(f"Finetuning terminé. Modèle sauvegardé dans: {output_dir}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Début du finetuning...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[34m[1mwandb[0m: Currently logged in as: [33metienne-ferrandi[0m ([33metienne-ferrandi-ens-de-lyon[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 26.12 MiB is free. Process 56099 has 14.71 GiB memory in use. Of the allocated memory 13.72 GiB is allocated by PyTorch, and 882.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
def load_data_as_examples(file_path):
    df = pd.read_csv(file_path)
    print(f"Données chargées depuis {file_path}: {len(df)} paires")

    examples = []
    for _, row in df.iterrows():
        examples.append(InputExample(texts=[row['greek'], row['french']]))

    return df, examples

In [None]:
def evaluate_cross_lingual_retrieval(model, test_df, k_values=[1, 5, 10]):

    print("Démarrage de l'évaluation par récupération bilingue...")

    print("Encodage des phrases grecques...")
    source_embeddings = model.encode(test_df['greek'].tolist(), show_progress_bar=True,
                                     batch_size=32, convert_to_numpy=True)

    print("Encodage des phrases françaises...")
    target_embeddings = model.encode(test_df['french'].tolist(), show_progress_bar=True,
                                     batch_size=32, convert_to_numpy=True)

    print("Calcul des similarités cosinus...")
    similarities = cosine_similarity(source_embeddings, target_embeddings)

    results = {}

    print("\nRésultats de l'évaluation:")
    for k in k_values:
        hits = 0
        for i in range(len(similarities)):
            top_indices = np.argsort(similarities[i])[::-1][:k]
            if i in top_indices:
                hits += 1

        accuracy = hits / len(similarities)
        results[f'top_{k}_accuracy'] = accuracy
        print(f"Top-{k} Accuracy: {accuracy:.4f}")

    diagonal_similarities = np.diagonal(similarities)
    mean_similarity = np.mean(diagonal_similarities)
    results['mean_diagonal_similarity'] = mean_similarity
    print(f"Similarité moyenne des paires correctes: {mean_similarity:.4f}")

    reciprocal_ranks = []
    for i in range(len(similarities)):
        ranks = np.argsort(np.argsort(-similarities[i]))
        reciprocal_ranks.append(1.0 / (ranks[i] + 1))

    mrr = np.mean(reciprocal_ranks)
    results['mean_reciprocal_rank'] = mrr
    print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")

    return results

In [None]:
def show_retrieval_examples(model, test_df, num_examples=5):

    sample_indices = np.random.choice(len(test_df), min(num_examples, len(test_df)), replace=False)

    target_sentences = test_df['french'].tolist()
    target_embeddings = model.encode(target_sentences, show_progress_bar=True,
                                     batch_size=32, convert_to_numpy=True)

    print("\nExemples de recherche de traduction:")
    for idx in sample_indices:
        greek_sentence = test_df.iloc[idx]['greek']
        correct_french = test_df.iloc[idx]['french']

        query_embedding = model.encode([greek_sentence], convert_to_numpy=True)[0]

        similarities = cosine_similarity([query_embedding], target_embeddings)[0]

        top3_indices = np.argsort(-similarities)[:3]

        print(f"\nPhrase grecque: {greek_sentence}")
        print(f"Traduction correcte: {correct_french}")
        print("Top 3 des traductions candidates:")
        for i, top_idx in enumerate(top3_indices):
            candidate = test_df.iloc[top_idx]['french']
            sim_score = similarities[top_idx]
            correct_mark = " ✓" if top_idx == idx else ""
            print(f"  {i+1}. ({sim_score:.4f}) {candidate}{correct_mark}")

In [None]:
if os.path.exists(test_path):
    best_model = SentenceTransformer(output_dir)

    test_df, _ = load_data_as_examples(test_path)

    print("\n" + "="*50)
    print("ÉVALUATION DU MODÈLE")
    print("="*50)

    evaluation_results = evaluate_cross_lingual_retrieval(best_model, test_df)

    show_retrieval_examples(best_model, test_df, num_examples=3)

Données chargées depuis /content/train/test.csv: 2422 paires

ÉVALUATION DU MODÈLE
Démarrage de l'évaluation par récupération bilingue...
Encodage des phrases grecques...


Batches:   0%|          | 0/76 [00:00<?, ?it/s]

Encodage des phrases françaises...


Batches:   0%|          | 0/76 [00:00<?, ?it/s]

Calcul des similarités cosinus...

Résultats de l'évaluation:
Top-1 Accuracy: 0.6763
Top-5 Accuracy: 0.8258
Top-10 Accuracy: 0.8699
Similarité moyenne des paires correctes: 0.6814
Mean Reciprocal Rank (MRR): 0.7442


Batches:   0%|          | 0/76 [00:00<?, ?it/s]


Exemples de recherche de traduction:

Phrase grecque: μὴ ἐξείποι κατιδὼν
Traduction correcte: qu'il ne le dise, le sachant,
Top 3 des traductions candidates:
  1. (0.5003) tu ne sortiras point de là,
  2. (0.4960) ne pouvant naviguer
  3. (0.4677) de-peur-qu’il ne dérobe

Phrase grecque: τὸν κίνδυνον κατειληφότα
Traduction correcte: le danger ayant-surpris
Top 3 des traductions candidates:
  1. (0.7065) le danger ayant-surpris ✓
  2. (0.6531) celui devant racheter
  3. (0.5960) le compagnon amoureux

Phrase grecque: ἐστὶ πεπραγμένον αὐτῷ.
Traduction correcte: n’a été fait à lui.
Top 3 des traductions candidates:
  1. (0.6874) n’a été fait à lui. ✓
  2. (0.6272) elle sera remise à lui ;
  3. (0.5885) était sur lui.
