<a href="https://colab.research.google.com/github/OdysseusPolymetis/pyOdysseus/blob/main/labse_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util
from sentence_transformers.evaluation import TranslationEvaluator
from sentence_transformers.readers import InputExample
import argparse
from datetime import datetime
from datasets import Dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
def load_data(data_path):
    """Charge les données depuis un fichier CSV ou TSV."""
    file_extension = os.path.splitext(data_path)[1].lower()

    if file_extension == '.csv':
        return pd.read_csv(data_path)
    elif file_extension == '.tsv':
        return pd.read_csv(data_path, sep='\t')
    else:
        raise ValueError(f"Format de fichier non pris en charge: {file_extension}")

In [32]:
def load_data_as_examples(file_path):
    df = pd.read_csv(file_path)
    print(f"Données chargées depuis {file_path}: {len(df)} paires")

    examples = []
    for _, row in df.iterrows():
        examples.append(InputExample(texts=[row['greek'], row['french']]))

    return df, examples

In [None]:
!pip install sentence-transformers pandas torch datasets

In [36]:
train_path = '/content/train/train.csv'
dev_path = '/content/train/dev.csv'
test_path = '/content/train/test.csv'
output_dir = '/content/output/mon_modele_finetuned'

epochs = 25
batch_size = 128
max_seq_length = 128
evaluation_steps = 1000
use_cuda = True

In [None]:
train_df = load_data(train_path)
print(f"Nombre d'exemples d'entraînement: {len(train_df)}")
print("\nAperçu des données:")
display(train_df.head())

train_df['greek_length'] = train_df['greek'].apply(len)
train_df['french_length'] = train_df['french'].apply(len)

print(f"\nLongueur moyenne des textes grecs: {train_df['greek_length'].mean():.2f} caractères")
print(f"Longueur moyenne des textes français: {train_df['french_length'].mean():.2f} caractères")
print(f"Longueur maximale des textes grecs: {train_df['greek_length'].max()} caractères")
print(f"Longueur maximale des textes français: {train_df['french_length'].max()} caractères")

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
plt.subplots(figsize=(8, 8))
df_2dhist = pd.DataFrame({
    x_label: grp['french'].value_counts()
    for x_label, grp in _df_7.groupby('greek')
})
sns.heatmap(df_2dhist, cmap='viridis')
plt.xlabel('greek')
_ = plt.ylabel('french')

In [None]:
import wandb
wandb.init(mode="disabled")

In [39]:
model = SentenceTransformer('sentence-transformers/LaBSE')
model.max_seq_length = 128

train_df = pd.read_csv(train_path)
print(f"Données d'entraînement chargées: {len(train_df)} paires")

train_examples = []
for _, row in train_df.iterrows():
    train_examples.append(InputExample(texts=[row['greek'], row['french']]))

output_dir = 'output/finetuned-labse'
os.makedirs(output_dir, exist_ok=True)

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model)

evaluator = None
if os.path.exists(dev_path):
    dev_df, _ = load_data_as_examples(dev_path)

    evaluator = EmbeddingSimilarityEvaluator(
        sentences1=dev_df['greek'].tolist(),
        sentences2=dev_df['french'].tolist(),
        scores=[1.0] * len(dev_df),
        name='dev-eval',
        batch_size=batch_size,
        show_progress_bar=True
    )
    print(f"Évaluateur créé avec {len(dev_df)} paires de validation")


print("Début du finetuning...")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=epochs,
    warmup_steps=100,
    output_path=output_dir,
    show_progress_bar=True
)

print(f"Finetuning terminé. Modèle sauvegardé dans: {output_dir}")

Données d'entraînement chargées: 19371 paires
Données chargées depuis /content/train/dev.csv: 2421 paires
Évaluateur créé avec 2421 paires de validation
Début du finetuning...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.7437
1000,0.3293
1500,0.2338
2000,0.1831
2500,0.1569
3000,0.1433
3500,0.1349


Finetuning terminé. Modèle sauvegardé dans: output/finetuned-labse


In [40]:
def load_data_as_examples(file_path):
    df = pd.read_csv(file_path)
    print(f"Données chargées depuis {file_path}: {len(df)} paires")

    examples = []
    for _, row in df.iterrows():
        examples.append(InputExample(texts=[row['greek'], row['french']]))

    return df, examples

In [41]:
def evaluate_cross_lingual_retrieval(model, test_df, k_values=[1, 5, 10]):

    print("Démarrage de l'évaluation par récupération bilingue...")

    print("Encodage des phrases grecques...")
    source_embeddings = model.encode(test_df['greek'].tolist(), show_progress_bar=True,
                                     batch_size=32, convert_to_numpy=True)

    print("Encodage des phrases françaises...")
    target_embeddings = model.encode(test_df['french'].tolist(), show_progress_bar=True,
                                     batch_size=32, convert_to_numpy=True)

    print("Calcul des similarités cosinus...")
    similarities = cosine_similarity(source_embeddings, target_embeddings)

    results = {}

    print("\nRésultats de l'évaluation:")
    for k in k_values:
        hits = 0
        for i in range(len(similarities)):
            top_indices = np.argsort(similarities[i])[::-1][:k]
            if i in top_indices:
                hits += 1

        accuracy = hits / len(similarities)
        results[f'top_{k}_accuracy'] = accuracy
        print(f"Top-{k} Accuracy: {accuracy:.4f}")

    diagonal_similarities = np.diagonal(similarities)
    mean_similarity = np.mean(diagonal_similarities)
    results['mean_diagonal_similarity'] = mean_similarity
    print(f"Similarité moyenne des paires correctes: {mean_similarity:.4f}")

    reciprocal_ranks = []
    for i in range(len(similarities)):
        ranks = np.argsort(np.argsort(-similarities[i]))
        reciprocal_ranks.append(1.0 / (ranks[i] + 1))

    mrr = np.mean(reciprocal_ranks)
    results['mean_reciprocal_rank'] = mrr
    print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")

    return results

In [42]:
def show_retrieval_examples(model, test_df, num_examples=5):

    sample_indices = np.random.choice(len(test_df), min(num_examples, len(test_df)), replace=False)

    target_sentences = test_df['french'].tolist()
    target_embeddings = model.encode(target_sentences, show_progress_bar=True,
                                     batch_size=32, convert_to_numpy=True)

    print("\nExemples de recherche de traduction:")
    for idx in sample_indices:
        greek_sentence = test_df.iloc[idx]['greek']
        correct_french = test_df.iloc[idx]['french']

        query_embedding = model.encode([greek_sentence], convert_to_numpy=True)[0]

        similarities = cosine_similarity([query_embedding], target_embeddings)[0]

        top3_indices = np.argsort(-similarities)[:3]

        print(f"\nPhrase grecque: {greek_sentence}")
        print(f"Traduction correcte: {correct_french}")
        print("Top 3 des traductions candidates:")
        for i, top_idx in enumerate(top3_indices):
            candidate = test_df.iloc[top_idx]['french']
            sim_score = similarities[top_idx]
            correct_mark = " ✓" if top_idx == idx else ""
            print(f"  {i+1}. ({sim_score:.4f}) {candidate}{correct_mark}")

In [43]:
if os.path.exists(test_path):
    best_model = SentenceTransformer(output_dir)

    test_df, _ = load_data_as_examples(test_path)

    print("\n" + "="*50)
    print("ÉVALUATION DU MODÈLE")
    print("="*50)

    evaluation_results = evaluate_cross_lingual_retrieval(best_model, test_df)

    show_retrieval_examples(best_model, test_df, num_examples=3)

Données chargées depuis /content/train/test.csv: 2422 paires

ÉVALUATION DU MODÈLE
Démarrage de l'évaluation par récupération bilingue...
Encodage des phrases grecques...


Batches:   0%|          | 0/76 [00:00<?, ?it/s]

Encodage des phrases françaises...


Batches:   0%|          | 0/76 [00:00<?, ?it/s]

Calcul des similarités cosinus...

Résultats de l'évaluation:
Top-1 Accuracy: 0.6763
Top-5 Accuracy: 0.8258
Top-10 Accuracy: 0.8699
Similarité moyenne des paires correctes: 0.6814
Mean Reciprocal Rank (MRR): 0.7442


Batches:   0%|          | 0/76 [00:00<?, ?it/s]


Exemples de recherche de traduction:

Phrase grecque: μὴ ἐξείποι κατιδὼν
Traduction correcte: qu'il ne le dise, le sachant,
Top 3 des traductions candidates:
  1. (0.5003) tu ne sortiras point de là,
  2. (0.4960) ne pouvant naviguer
  3. (0.4677) de-peur-qu’il ne dérobe

Phrase grecque: τὸν κίνδυνον κατειληφότα
Traduction correcte: le danger ayant-surpris
Top 3 des traductions candidates:
  1. (0.7065) le danger ayant-surpris ✓
  2. (0.6531) celui devant racheter
  3. (0.5960) le compagnon amoureux

Phrase grecque: ἐστὶ πεπραγμένον αὐτῷ.
Traduction correcte: n’a été fait à lui.
Top 3 des traductions candidates:
  1. (0.6874) n’a été fait à lui. ✓
  2. (0.6272) elle sera remise à lui ;
  3. (0.5885) était sur lui.
