# IMPORTS & SEEDS

In [1]:
!pip install sentence-transformers==2.7



In [2]:
!pip install datasets



In [3]:
!pip install python-Levenshtein



In [4]:
import json
import pandas as pd
import torch
import random
import numpy as np
import os
import random


from datasets import Dataset, load_dataset, concatenate_datasets
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer
from google.colab import files

# Set before any training code runs; presumably this turns off Weights & Biases
# logging in the libraries used below -- TODO confirm against their docs.
os.environ["WANDB_DISABLED"] = "true"

# Single seed shared by every random number generator used in this notebook.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
# Only seed the CUDA generators when a GPU is actually available.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

# Datasets Loading

## SIB (Topic Classification)

In [5]:
from datasets import load_dataset

# Languages whose SIB-200 splits are downloaded.
languages = ["fra_Latn", "eng_Latn", "deu_Latn", "jpn_Jpan", "zho_Hans", "rus_Cyrl", "ltz_Latn"]

# Luxembourgish is held out: it contributes no training data and is the only
# language used for testing.
HELD_OUT_LANGUAGE = "ltz_Latn"

# Load language-specific splits
train_datasets = {}
test_datasets = {}

for lang in languages:
    lang_dataset = load_dataset("mteb/sib200", lang)
    train_datasets[lang] = lang_dataset["train"]
    test_datasets[lang] = lang_dataset["test"]

# Train on every language except the held-out one; test on the held-out one only.
sib_train_dataset = concatenate_datasets(
    [train_datasets[lang] for lang in languages if lang != HELD_OUT_LANGUAGE]
)
sib_test_dataset = test_datasets[HELD_OUT_LANGUAGE]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/205 [00:00<?, ?it/s]

In [6]:
zero_shot_template_labels = ["An dësem Beispill geet et em ", "D'Thema vun dësem Text ass ", "Dëst Dokument beschäftegt sech mat "] # just add label at the end.

# Luxembourgish label (with trailing full stop) for each SIB category.
category_to_lux_map = {
    "science/technology": "Technologie.",
    "travel": "Reesen.",
    "politics": "Politik.",
    "health": "Gesondheet.",
    "entertainment": "Ennerhalung.",
    "geography": "Geographie.",
    "sports": "Sport."
}

# category -> list of 5 prompt templates, in a fixed order:
#   0-2: prefix template + label (with full stop)
#   3:   the bare label, no full stop
#   4:   a sentence with the label in the middle
# Categories are visited in sorted order so the dictionary (and therefore the
# category order used downstream) is identical on every kernel restart; plain
# set iteration order for strings changes between Python processes.
class_to_templates = {}
for category in sorted(set(sib_train_dataset['category'])):
  label_with_stop = category_to_lux_map[category]
  bare_label = label_with_stop[:-1]
  class_to_templates[category] = (
      [template + label_with_stop for template in zero_shot_template_labels]
      + [bare_label, "Hei gëtt iwwer " + bare_label + " geschwat."]
  )

In [7]:
print(class_to_templates)

{'geography': ['An dësem Beispill geet et em Geographie.', "D'Thema vun dësem Text ass Geographie.", 'Dëst Dokument beschäftegt sech mat Geographie.', 'Geographie', 'Hei gëtt iwwer Geographie geschwat.'], 'entertainment': ['An dësem Beispill geet et em Ennerhalung.', "D'Thema vun dësem Text ass Ennerhalung.", 'Dëst Dokument beschäftegt sech mat Ennerhalung.', 'Ennerhalung', 'Hei gëtt iwwer Ennerhalung geschwat.'], 'science/technology': ['An dësem Beispill geet et em Technologie.', "D'Thema vun dësem Text ass Technologie.", 'Dëst Dokument beschäftegt sech mat Technologie.', 'Technologie', 'Hei gëtt iwwer Technologie geschwat.'], 'health': ['An dësem Beispill geet et em Gesondheet.', "D'Thema vun dësem Text ass Gesondheet.", 'Dëst Dokument beschäftegt sech mat Gesondheet.', 'Gesondheet', 'Hei gëtt iwwer Gesondheet geschwat.'], 'travel': ['An dësem Beispill geet et em Reesen.', "D'Thema vun dësem Text ass Reesen.", 'Dëst Dokument beschäftegt sech mat Reesen.', 'Reesen', 'Hei gëtt iwwer Re

## Paralux ("Paraphrase Identification")

In [8]:
# Load the ParaLux test dataset
paralux_test = load_dataset("fredxlpy/ParaLux", split="test")

# Re-key the test split's columns into the anchor / positive / negative layout
# used by the evaluation code.
paralux_data = {
    'anchor': paralux_test['anchor'],
    'positive': paralux_test['paraphrase'],
    'negative': paralux_test['not_paraphrase'],
}

## (LUX <-> DE/FR/EN ) Historical Bitext Mining

### Dataset Preparation (Split and Utilities)

In [9]:
import json
import random
import matplotlib.pyplot as plt
from sentence_transformers import InputExample

import re  # imported here so this helper does not rely on an import made in a later cell


def clean_text_from_punctuation(text):
    """
    Normalises text for length checks and fuzzy comparison.

    Every run of characters other than ASCII letters, ASCII digits and
    whitespace is deleted, leading/trailing whitespace is stripped and the
    result is lower-cased. Note that:
      * non-ASCII letters (for example 'ë' or 'é') are removed as well,
        because the pattern only keeps a-z, A-Z and 0-9;
      * whitespace inside the text is kept as is (it is not collapsed).

    Args:
        text (str): The text to clean.

    Returns:
        str: Lower-cased text containing only ASCII alphanumerics and whitespace.
    """
    return re.sub(r'[^a-zA-Z0-9\s]+', '', text).strip().lower()

def set_random_seed(seed):
    """
    Seeds the global generator of Python's built-in `random` module.

    Only `random` is seeded here; no other library's generator is touched.

    Args:
        seed (int): Value handed to `random.seed`.
    """
    random.seed(a=seed)

def parse_parallel_sentences(file_path):
    """
    Parses a JSONL file containing parallel sentences, one JSON document per line.

    The file is read as UTF-8 so that non-ASCII text is decoded the same way on
    every platform. Lines that are empty or contain only whitespace (for example
    a trailing blank line) are skipped instead of raising a JSON decode error.

    Args:
        file_path (str): Path to the JSONL file containing parallel sentences.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    lines = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines between or after records
            lines.append(json.loads(line))
    return lines

def split_dataset_by_custom_ids_reproducible(lines, test_ratio=0.1, seed=42, training_ids=None, test_ids=None):
    """
    Splits the dataset into training and test sets by shuffling identifiers deterministically.
    Produces the same split as long as the set of custom_ids present is the same.

    The split is made on *unique* custom_ids, so all lines sharing a custom_id
    land on the same side and the two sides never share an id.

    Side effect: when `seed` is not None the global `random` generator is
    re-seeded, and the id list is always shuffled with that global generator
    (even when explicit ids are supplied).

    Args:
        lines (list): A list of items, where each item is a dictionary containing a 'custom_id'.
        test_ratio (float, optional): Proportion of unique custom_ids used for the test set.
            Ignored when `training_ids` and/or `test_ids` are supplied. Defaults to 0.1.
        seed (int, optional): Seed value for random shuffling. Ensures reproducibility. Defaults to 42.
        training_ids (collection, optional): custom_ids to place in the training set.
        test_ids (collection, optional): custom_ids to place in the test set.
            If only one of `training_ids` / `test_ids` is given (the other is None),
            the other side receives every remaining custom_id found in `lines`.

    Returns:
        tuple: Two lists - training set and test set (lines keep their input order).
    """
    # Unique ids in sorted order: the starting point of the shuffle must not
    # depend on the order or multiplicity of the input lines.
    custom_ids = sorted({line['custom_id'] for line in lines})

    # Set the random seed for reproducibility
    if seed is not None:
        random.seed(seed)

    # Shuffle the sorted custom_ids deterministically
    random.shuffle(custom_ids)

    if not training_ids and not test_ids:
        # No explicit assignment: the first `test_ratio` share of the shuffled
        # unique ids becomes the test side, the rest the training side.
        split_index = int(len(custom_ids) * test_ratio)
        test_ids = set(custom_ids[:split_index])
        training_ids = set(custom_ids[split_index:])
    elif training_ids is None:
        # Only the test side was specified: train on everything else.
        test_ids = set(test_ids)
        training_ids = set(custom_ids) - test_ids
    elif test_ids is None:
        # Only the training side was specified: test on everything else.
        training_ids = set(training_ids)
        test_ids = set(custom_ids) - training_ids

    # Split the lines into test and training sets based on custom_ids
    training_set = [line for line in lines if line['custom_id'] in training_ids]
    test_set = [line for line in lines if line['custom_id'] in test_ids]

    return training_set, test_set

def validate_dataset_splits_across_languages(training_sets, test_sets):
    """
    Checks that several train/test splits contain the same custom_ids.

    The first training set and the first test set act as the reference. Every
    following (training set, test set) pair is compared against them in two steps:
      1. the *sets* of custom_ids must be equal;
      2. the *sorted lists* of custom_ids must be equal. Because the lists are
         sorted before comparison, this step can only fail when an id occurs a
         different number of times, even though the printed message speaks of
         "order".
    A message is printed for the first failing check.

    Args:
        training_sets (list of list): List of training sets.
        test_sets (list of list): List of test sets.

    Returns:
        bool: True if every pair matches the reference, False otherwise
        (also False when either argument is empty).
    """
    if not (training_sets and test_sets):
        return False

    def _sorted_ids(dataset):
        # custom_ids of one dataset, sorted
        return sorted(item['custom_id'] for item in dataset)

    reference_training_ids = _sorted_ids(training_sets[0])
    reference_test_ids = _sorted_ids(test_sets[0])

    # Index 0 is the reference itself; stop at the shorter of the two lists.
    for i in range(1, min(len(training_sets), len(test_sets))):
        training_ids = _sorted_ids(training_sets[i])
        test_ids = _sorted_ids(test_sets[i])

        training_sets_equal = set(training_ids) == set(reference_training_ids)
        test_sets_equal = set(test_ids) == set(reference_test_ids)
        if not (training_sets_equal and test_sets_equal):
            print(f"Mismatch in custom_id sets for training/test sets at index {i}. There are {len(set(training_ids)- set(reference_training_ids))} different ids ")
            return False

        if training_ids != reference_training_ids:
            print(f"Order of custom_ids in training set at index {i} is different.")
            return False

        if test_ids != reference_test_ids:
            print(f"Order of custom_ids in test set at index {i} is different.")
            return False

    return True


def save_dataset(dataset, file_path):
    """
    Writes a dataset to disk in JSONL form: one JSON document per line.

    An existing file at `file_path` is overwritten.

    Args:
        dataset (list): List of dictionaries to save.
        file_path (str): Path to save the JSONL file.
    """
    with open(file_path, "w") as out:
        # One serialised record plus newline per entry.
        out.writelines(json.dumps(record) + "\n" for record in dataset)

def load_dataset_local(file_path):
    """
    Loads a dataset (list of dictionaries) from a JSONL file.

    The file is read as UTF-8 regardless of the platform default, and lines
    that are empty or whitespace-only are skipped rather than raising a JSON
    decode error.

    Args:
        file_path (str): Path to the JSONL file.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return [json.loads(line) for line in file if line.strip()]

def extract_parallel_sentences(lines, src_col, tgt_col):
    """
    Collects (source, target, custom_id) sentence triples from parsed JSONL lines.

    A pair is kept only if:
      * it is a dictionary,
      * neither side is a list (handles the special case of ['']),
      * both sides are non-empty, and
      * both sides still have at least 5 characters after
        `clean_text_from_punctuation`.

    Args:
        lines (list): A list of dictionaries, where each dictionary represents a line from the file.
        src_col (str): Key of the source-language sentence inside each pair.
        tgt_col (str): Key of the target-language sentence inside each pair.

    Returns:
        list: A list of tuples (src_sentence, tgt_sentence, custom_id).
    """
    parallel_sentences = []
    for data in lines:
        custom_id = data.get('custom_id', None)
        for sentence_pair in data.get('translation', {}):
            if not isinstance(sentence_pair, dict):
                continue

            src_sentence = sentence_pair.get(src_col)
            tgt_sentence = sentence_pair.get(tgt_col)

            # to handle a special case of ['']
            if type(src_sentence) == list or type(tgt_sentence) == list:
                continue
            if not (src_sentence and tgt_sentence):
                continue
            # must contain at least 5 characters after cleaning
            if len(clean_text_from_punctuation(src_sentence)) < 5 or len(clean_text_from_punctuation(tgt_sentence)) < 5:
                continue

            parallel_sentences.append((src_sentence, tgt_sentence, custom_id))
    return parallel_sentences

### Load datasets

### Parallel Sentences to Historical Bitext Mining Format Utilities

In [10]:
import re
import Levenshtein

def filter_candidates(source_sentence, candidates, similarity_threshold=0.85):
    """
    Filters candidates to remove those that are too close to the source sentence.

    A candidate is dropped when it is identical to the source sentence, or when
    the Levenshtein ratio between the two sentences (both passed through
    `clean_text_from_punctuation` first) is at least `similarity_threshold`.

    Args:
        source_sentence (str): The source sentence to compare against.
        candidates (list): A list of candidate sentences.
        similarity_threshold (float, optional): Minimum Levenshtein ratio at which
            a candidate counts as too similar. Defaults to 0.85.

    Returns:
        list: The candidates that were kept, in their original order.
    """
    # The source does not change inside the loop, so clean it once instead of
    # once per candidate.
    clean_source = clean_text_from_punctuation(source_sentence)

    def is_too_similar(candidate):
        """Returns True if `candidate` is an exact or near duplicate of the source."""
        if candidate == source_sentence:
            return True  # Exact match
        similarity = Levenshtein.ratio(clean_source, clean_text_from_punctuation(candidate))
        return similarity >= similarity_threshold

    return [candidate for candidate in candidates if not is_too_similar(candidate)]

def create_bidirectional_bitext_mining_task(test_set, src_col="lb", tgt_col="de", candidates_from_same_article=False, filter_candidates_by_heuristics=True):
    """
    Creates a bidirectional bitext-mining evaluation set from the test set of parallel sentences.

    For every valid sentence pair two task entries are produced, one per
    direction. Each entry holds a source sentence and a candidate list whose
    first element is the true translation, followed by the negative candidates.

    A pair is valid when it is a dictionary, neither side is a list, both sides
    are non-empty and both still have at least 5 characters after
    `clean_text_from_punctuation`.

    Side effect: for every pair, the number of candidates removed from the
    negative pool is appended to the module-level lists `src_differences` and
    `tgt_differences`. The lists are created if they do not exist yet.

    Args:
        test_set (list): A list of dictionaries, where each dictionary represents a line from the file.
        src_col (str): Key of the source-language sentence inside each pair.
        tgt_col (str): Key of the target-language sentence inside each pair.
        candidates_from_same_article (bool): If True, sentences of the same article
            (same custom_id) may be used as negatives; otherwise only sentences
            from other articles are used.
        filter_candidates_by_heuristics (bool): If True, negatives are passed through
            `filter_candidates`, which drops exact and near duplicates of the
            sentence they are compared with. If False, negatives are used as they
            are, except that the pair's own sentence is never kept as a negative.

    Returns:
        tuple: Two lists of dictionaries (src->tgt tasks, tgt->src tasks), each
               dictionary having "source_sentence" and "candidates" keys.
    """
    min_clean_chars = 5  # minimum length after cleaning for a sentence to count

    # Group the valid sentence pairs by article (custom_id).
    sentences_by_article = {}
    for entry in test_set:
        custom_id = entry.get("custom_id")
        for pair in entry.get("translation", []):
            if not isinstance(pair, dict):
                continue
            src_sentence = pair.get(src_col)
            tgt_sentence = pair.get(tgt_col)
            # Same special case as in extract_parallel_sentences: values like ['']
            if isinstance(src_sentence, list) or isinstance(tgt_sentence, list):
                continue
            if not (src_sentence and tgt_sentence):
                continue
            if (len(clean_text_from_punctuation(src_sentence)) < min_clean_chars
                    or len(clean_text_from_punctuation(tgt_sentence)) < min_clean_chars):
                continue
            sentences_by_article.setdefault(custom_id, []).append((src_sentence, tgt_sentence))

    # Flat list in the same order in which the pairs are visited below, so a
    # running counter gives each pair's index in this list.
    all_sentences = [(custom_id, sentence_pair) for custom_id, pairs in sentences_by_article.items() for sentence_pair in pairs]

    # Module-level accumulators read by the reporting cells; looked up at call
    # time and created on demand so the function also works on a fresh kernel.
    src_removed_log = globals().setdefault("src_differences", [])
    tgt_removed_log = globals().setdefault("tgt_differences", [])

    bitext_mining_tasks_src_to_tgt = []
    bitext_mining_tasks_tgt_to_src = []
    position = 0  # index of the current pair inside all_sentences

    for custom_id, pairs in sentences_by_article.items():
        # The negative pool depends only on the article, so build it once per
        # article rather than once per sentence pair.
        if candidates_from_same_article:
            pool = all_sentences
        else:
            pool = [item for item in all_sentences if item[0] != custom_id]
        pool_src_sentences = [sentence_pair[0] for _, sentence_pair in pool]
        pool_tgt_sentences = [sentence_pair[1] for _, sentence_pair in pool]

        for src_sentence, tgt_sentence in pairs:
            if filter_candidates_by_heuristics:
                # Each pool is filtered against the same-language sentence of the pair.
                src_final_candidates = filter_candidates(source_sentence=src_sentence, candidates=pool_src_sentences)
                tgt_final_candidates = filter_candidates(source_sentence=tgt_sentence, candidates=pool_tgt_sentences)
            elif candidates_from_same_article:
                # The pool contains the pair itself; without filtering it must be
                # removed explicitly, otherwise the true translation would also
                # appear among its own negatives.
                src_final_candidates = pool_src_sentences[:position] + pool_src_sentences[position + 1:]
                tgt_final_candidates = pool_tgt_sentences[:position] + pool_tgt_sentences[position + 1:]
            else:
                src_final_candidates = pool_src_sentences
                tgt_final_candidates = pool_tgt_sentences

            src_removed_log.append(len(pool_src_sentences) - len(src_final_candidates))
            tgt_removed_log.append(len(pool_tgt_sentences) - len(tgt_final_candidates))

            # True translation first, negatives after it (list concatenation
            # creates a new list, so entries never share a candidate list).
            bitext_mining_tasks_src_to_tgt.append({
                "source_sentence": src_sentence,
                "candidates": [tgt_sentence] + tgt_final_candidates
            })
            bitext_mining_tasks_tgt_to_src.append({
                "source_sentence": tgt_sentence,
                "candidates": [src_sentence] + src_final_candidates
            })
            position += 1

    return bitext_mining_tasks_src_to_tgt, bitext_mining_tasks_tgt_to_src

### German

In [11]:
# Module-level accumulators: create_bidirectional_bitext_mining_task appends one
# number per sentence pair to each list (how many candidates were removed from
# the negative pool). They are reset here so the report below covers German only.
src_differences = [] # source differences
tgt_differences = [] # tgt differences

# Seed Python's `random` module before splitting.
seed_value = 42  # Set your desired seed value
set_random_seed(seed_value)

lb_de_file_path = "hist_lux_de_translations.jsonl"
lb_de_parallel_sentences = parse_parallel_sentences(lb_de_file_path)

# Split into test and training sets by custom_id (10% of the ids go to test).
# The French and English cells reuse the ids of this split.
lb_de_training_set, lb_de_test_set = split_dataset_by_custom_ids_reproducible(lb_de_parallel_sentences, test_ratio=0.1)

# Persist both sides of the split as JSONL files.
save_dataset(lb_de_test_set, "lb_de_test_set.jsonl")
save_dataset(lb_de_training_set, "lb_de_training_set.jsonl")

In [12]:
# Generate the simplified bitext-mining task datasets
bitext_mining_data_lb_to_de, bitext_mining_data_de_to_lb = create_bidirectional_bitext_mining_task(lb_de_test_set, src_col="lb", tgt_col="de", candidates_from_same_article=True, filter_candidates_by_heuristics=True)

# Save the bitext-mining data to separate JSONL files
save_dataset(bitext_mining_data_de_to_lb, "bitext_mining_task_de_to_lb.jsonl")
save_dataset(bitext_mining_data_lb_to_de, "bitext_mining_task_lb_to_de.jsonl")

In [13]:
# Each entry of src_differences / tgt_differences is the number of candidates
# removed for one sentence pair. The task above was built with
# candidates_from_same_article=True, so the pool contains the pair's own sentence,
# which the filter always removes as an exact match. Subtracting len(...) discounts
# that one removal per pair. The percentage is relative to the number of source
# sentences, not to the number of candidates.
if len(src_differences) > 0:

  more_than_one_filter = sum(src_differences) - len(src_differences)
  print(f" There are {len(src_differences)} Luxembourgish source sentences. {more_than_one_filter} ({round((more_than_one_filter / len(src_differences) * 100), 2)}%) harmful candidates were filtered with heuristics" )

if len(tgt_differences) > 0:

  more_than_one_filter = sum(tgt_differences) - len(tgt_differences)
  print(f" There are {len(tgt_differences)} German source sentences. {more_than_one_filter} ({round((more_than_one_filter / len(tgt_differences) * 100), 2)}%) harmful candidates were filtered with heuristics" )

 There are 2127 Luxembourgish source sentences. 56 (2.63%) harmful candidates were filtered with heuristics
 There are 2127 German source sentences. 58 (2.73%) harmful candidates were filtered with heuristics


In [14]:
# # Analyze and plot the distribution of candidates per source sentence
# def plot_candidate_sentences_distribution(bitext_data, title):
#     candidate_counts = [len(entry["candidates"]) for entry in bitext_data]
#     print("TOTAL Source Sentences : " + str(len(bitext_data)))
#     plt.hist(candidate_counts, bins=range(2160, 2180, 2), edgecolor="black")
#     plt.title(title)
#     plt.xlabel("Cardinality of Candidate Sentences")
#     plt.ylabel("Source Sentences")
#     plt.show()

# # Plot distributions for both tasks
# plot_candidate_sentences_distribution(bitext_mining_data_de_to_lb, "Candidate Sentences for Historical Bitext Mining DE -> LB")
# plot_candidate_sentences_distribution(bitext_mining_data_lb_to_de, "Candidate Sentences for Historical Bitext Mining LB -> DE")

### French

In [15]:
# Reset the module-level accumulators (filled by
# create_bidirectional_bitext_mining_task) so the French report starts from zero.
src_differences = [] # source differences
tgt_differences = [] # tgt differences

lb_fr_file_path = "hist_lux_fr_translations.jsonl"
lb_fr_parallel_sentences = parse_parallel_sentences(lb_fr_file_path)

# Reuse the custom_ids of the German split so both languages share the same
# train/test articles. Because explicit ids are passed, test_ratio has no effect;
# lines whose custom_id is in neither id set end up in neither output.
de_training_ids = set([line['custom_id'] for line in lb_de_training_set])
de_test_ids = set([line['custom_id'] for line in lb_de_test_set])
lb_fr_training_set, lb_fr_test_set = split_dataset_by_custom_ids_reproducible(lb_fr_parallel_sentences, test_ratio=0.1, training_ids=de_training_ids, test_ids=de_test_ids)

save_dataset(lb_fr_training_set, "lb_fr_training_set.jsonl")
save_dataset(lb_fr_test_set, "lb_fr_test_set.jsonl")

In [16]:
print(len(lb_fr_parallel_sentences))
print(len(lb_de_parallel_sentences))

2340
2338


In [17]:
# Generate the simplified bitext-mining task datasets
bitext_mining_data_lb_to_fr, bitext_mining_data_fr_to_lb = create_bidirectional_bitext_mining_task(lb_fr_test_set, src_col="lb", tgt_col="fr", candidates_from_same_article=True, filter_candidates_by_heuristics=True)

# Save the bitext-mining data to separate JSONL files
save_dataset(bitext_mining_data_lb_to_fr, "bitext_mining_task_lb_to_fr.jsonl")
save_dataset(bitext_mining_data_fr_to_lb, "bitext_mining_task_fr_to_lb.jsonl")

In [18]:
# Each list entry is the number of candidates removed for one sentence pair.
# With candidates_from_same_article=True the pair's own sentence is in the pool
# and is always removed as an exact match, so len(...) is subtracted to discount
# that one removal per pair. The percentage is relative to the number of source
# sentences, not to the number of candidates.
if len(src_differences) > 0:

  more_than_one_filter = sum(src_differences) - len(src_differences)
  print(f" There are {len(src_differences)} Luxembourgish source sentences. {more_than_one_filter} ({round((more_than_one_filter / len(src_differences) * 100), 2)}%) harmful candidates were filtered with heuristics" )

if len(tgt_differences) > 0:

  more_than_one_filter = sum(tgt_differences) - len(tgt_differences)
  print(f" There are {len(tgt_differences)} French source sentences. {more_than_one_filter} ({round((more_than_one_filter / len(tgt_differences) * 100), 2)}%) harmful candidates were filtered with heuristics" )

 There are 2157 Luxembourgish source sentences. 64 (2.97%) harmful candidates were filtered with heuristics
 There are 2157 French source sentences. 66 (3.06%) harmful candidates were filtered with heuristics


In [19]:
print("Datasets of the two languages are split in the same method: " +str(validate_dataset_splits_across_languages(training_sets = [lb_de_training_set, lb_fr_training_set], test_sets = [lb_de_test_set, lb_fr_test_set])))

Datasets of the two languages are split in the same method: True


### English

In [20]:
# Reset the module-level accumulators (filled by
# create_bidirectional_bitext_mining_task) so the English report starts from zero.
src_differences = [] # source differences
tgt_differences = [] # tgt differences

lb_en_file_path = "hist_lux_en_translations.jsonl"
lb_en_parallel_sentences = parse_parallel_sentences(lb_en_file_path)

# Reuse the custom_ids of the German split so both languages share the same
# train/test articles. Because explicit ids are passed, test_ratio has no effect;
# lines whose custom_id is in neither id set end up in neither output.
de_training_ids = set([line['custom_id'] for line in lb_de_training_set])
de_test_ids = set([line['custom_id'] for line in lb_de_test_set])
lb_en_training_set, lb_en_test_set = split_dataset_by_custom_ids_reproducible(lb_en_parallel_sentences, test_ratio=0.1, training_ids=de_training_ids, test_ids=de_test_ids)

save_dataset(lb_en_training_set, "lb_en_training_set.jsonl")
save_dataset(lb_en_test_set, "lb_en_test_set.jsonl")

In [21]:
print(len(lb_fr_parallel_sentences))
print(len(lb_de_parallel_sentences))
print(len(lb_en_parallel_sentences))

2340
2338
2340


In [22]:
# Generate the simplified bitext-mining task datasets
bitext_mining_data_lb_to_en, bitext_mining_data_en_to_lb = create_bidirectional_bitext_mining_task(lb_en_test_set, src_col="lb", tgt_col="en", candidates_from_same_article=True, filter_candidates_by_heuristics=True)

# Save the bitext-mining data to separate JSONL files
save_dataset(bitext_mining_data_lb_to_en, "bitext_mining_task_lb_to_en.jsonl")
save_dataset(bitext_mining_data_en_to_lb, "bitext_mining_task_en_to_lb.jsonl")

In [23]:
# Each list entry is the number of candidates removed for one sentence pair.
# With candidates_from_same_article=True the pair's own sentence is in the pool
# and is always removed as an exact match, so len(...) is subtracted to discount
# that one removal per pair. The percentage is relative to the number of source
# sentences, not to the number of candidates.
if len(src_differences) > 0:

  more_than_one_filter = sum(src_differences) - len(src_differences)
  print(f" There are {len(src_differences)} Luxembourgish source sentences. {more_than_one_filter} ({round((more_than_one_filter / len(src_differences) * 100), 2)}%) harmful candidates were filtered with heuristics" )

if len(tgt_differences) > 0:

  more_than_one_filter = sum(tgt_differences) - len(tgt_differences)
  print(f" There are {len(tgt_differences)} English source sentences. {more_than_one_filter} ({round((more_than_one_filter / len(tgt_differences) * 100), 2)}%) harmful candidates were filtered with heuristics" )

 There are 2105 Luxembourgish source sentences. 70 (3.33%) harmful candidates were filtered with heuristics
 There are 2105 English source sentences. 82 (3.9%) harmful candidates were filtered with heuristics


In [24]:
print("Datasets of the two languages are split in the same method: " +str(validate_dataset_splits_across_languages(training_sets = [lb_de_training_set, lb_en_training_set], test_sets = [lb_de_test_set, lb_en_test_set])))

Datasets of the two languages are split in the same method: True


# Evaluation Functions

In [25]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist

def numpy_cosine_similarity(v1, v2):
    """
    Cosine similarity of two vectors, computed with NumPy only.

    Singleton dimensions are squeezed away first, so inputs such as a list
    holding one vector are accepted.
    """
    left = np.squeeze(v1)
    right = np.squeeze(v2)
    norm_product = np.linalg.norm(left) * np.linalg.norm(right)
    return np.dot(left, right) / norm_product

def evaluate_sib_topic_classification_with_templates(model, dataset, class_to_templates):
    """
    Zero-shot topic classification: for each template index, every text is
    assigned the category whose template has the highest cosine similarity to
    the text embedding, and the accuracy over the dataset is recorded.

    Each text is encoded exactly once and the embedding is reused for every
    template index (this assumes `model.encode` returns the same embedding when
    called again with the same text).

    Args:
        model: A model object with an `.encode()` method for generating embeddings.
        dataset: The dataset containing examples with 'text' and 'category' fields.
        class_to_templates: A dictionary mapping categories to their list of templates.
            The number of templates is taken from the first category; every
            category is indexed up to that count.

    Returns:
        template_performance: A list of accuracies (0-1), one per template index.
        average_performance: The mean accuracy across all templates.
    """
    # Precompute template embeddings (one encode call per category)
    precomputed_template_embeddings = {
        category: model.encode(templates) for category, templates in class_to_templates.items()
    }

    # Category order defines how argmax indices map back to category names.
    categories = list(class_to_templates.keys())
    num_templates = len(next(iter(class_to_templates.values())))

    # For each template index, stack that template's embedding of every
    # category into one matrix (rows follow `categories`). Built once, not once
    # per example.
    template_matrices = [
        np.array([
            precomputed_template_embeddings[category][template_idx]
            for category in categories
        ])
        for template_idx in range(num_templates)
    ]

    # Encode every text once; the embeddings do not depend on the template.
    true_categories = []
    text_embeddings = []
    for example in dataset:
        true_categories.append(example["category"])
        text_embeddings.append(model.encode([example["text"]]))

    template_performance = []
    for template_embeddings in template_matrices:
        correct_count = 0
        for true_category, text_embedding in zip(true_categories, text_embeddings):
            # Similarity of this text to every category's template
            similarities = cosine_similarity(text_embedding, template_embeddings).flatten()
            if categories[np.argmax(similarities)] == true_category:
                correct_count += 1

        # Accuracy for this template
        template_performance.append(correct_count / len(true_categories))

    # Average accuracy across templates
    average_performance = np.mean(template_performance)

    return template_performance, average_performance


def evaluate_paralux_performance(anchor, positive, negative, model):
    """
    Computes how often the anchor is closer (by cosine similarity) to its
    paraphrase than to its non-paraphrase.

    The three lists are aligned by position: item i of `positive` and item i of
    `negative` both belong to item i of `anchor`.

    Parameters:
        anchor (list): List of anchor text strings.
        positive (list): List of positive/paraphrase text strings.
        negative (list): List of negative/not paraphrase text strings.
        model: A model with an `encode` method to generate text embeddings.

    Returns:
        float: Percentage (0-100, rounded to 2 decimals) of positions where the
        anchor-positive similarity is strictly higher than the anchor-negative one.
    """
    # Encode in the order anchor, positive, negative.
    anchor_vecs, positive_vecs, negative_vecs = (
        model.encode(texts) for texts in (anchor, positive, negative)
    )

    # Only the diagonal of each pairwise matrix is needed: entry (i, i) is the
    # similarity of anchor i with its own positive / negative.
    sim_to_positive = np.diagonal(cosine_similarity(anchor_vecs, positive_vecs))
    sim_to_negative = np.diagonal(cosine_similarity(anchor_vecs, negative_vecs))

    positive_wins = sim_to_positive > sim_to_negative
    return round(np.mean(positive_wins) * 100, 2)

def evaluate_historical_bitext_mining_task(bitext_data, model, sentence_embeddings = None):
    """
    Evaluates an embedding model on the bitext-mining task.

    An entry counts as correct when the cosine similarity between its source
    sentence and the true translation (first candidate) is strictly greater than
    the similarity between the source and every other candidate. An entry with
    no other candidates has no competitor and counts as correct.

    Args:
        bitext_data (list): A list of dictionaries with "source_sentence" and
            "candidates" keys; the first candidate is the true translation.
        model (object): A model with a `.encode` method to compute sentence embeddings.
            Only used when `sentence_embeddings` is None.
        sentence_embeddings (dict, optional): Mapping sentence -> embedding to reuse.
            When None, every unique sentence in `bitext_data` is encoded.

    Returns:
        tuple: (accuracy, sentence_embeddings) where accuracy is a percentage
        (0-100, rounded to 2 decimals) and sentence_embeddings is the mapping
        that was used, so it can be passed to a later call.
    """
    # Precompute embeddings for all unique sentences
    if sentence_embeddings is None:
        unique_sentences = set(
            sentence for entry in bitext_data for sentence in [entry["source_sentence"]] + entry["candidates"]
        )
        sentence_embeddings = {sentence: model.encode(sentence) for sentence in unique_sentences}

    correct_count = 0

    for entry in bitext_data:
        source_embedding = sentence_embeddings[entry["source_sentence"]]
        parallel_embedding = sentence_embeddings[entry["candidates"][0]]

        # Cosine similarity with the true translation (same formula as
        # numpy_cosine_similarity: squeeze, dot product, divide by the norms).
        source_vector = np.squeeze(source_embedding)
        parallel_vector = np.squeeze(parallel_embedding)
        parallel_similarity = np.dot(source_vector, parallel_vector) / (
            np.linalg.norm(source_vector) * np.linalg.norm(parallel_vector)
        )

        negative_candidates = entry["candidates"][1:]
        if not negative_candidates:
            # Nothing to compete with; np.max over an empty array would raise.
            correct_count += 1
            continue

        # Prepare the embeddings for all negative candidates
        candidate_embeddings = [sentence_embeddings[candidate] for candidate in negative_candidates]

        # Use cdist for one-to-many cosine similarity computation
        similarities = 1 - cdist(candidate_embeddings, source_embedding.reshape(1, -1), metric="cosine").flatten()

        # Correct only if the best negative is strictly below the true translation
        if np.max(similarities) < parallel_similarity:
            correct_count += 1

    accuracy = round(correct_count / len(bitext_data) * 100,2)
    return accuracy, sentence_embeddings

def run_lux_embeddings_evaluations(
    model,
    model_name,
    sib200_dataset,
    sib200_class_to_templates,
    similarity_data,
    bitext_mining_datasets_de_lb,
    bitext_mining_datasets_fr_lb=None,
    bitext_mining_datasets_en_lb=None,
    run_sib_clf=True,
    run_paralux_clf=True,
    run_bitext_mining=True
):
    """
    Runs the selected evaluations for one embedding model and collects the scores.

    Up to three evaluations are run: template-based zero-shot SIB topic
    classification, the PARALux similarity check, and historical bitext mining
    in both directions for each provided language pair.

    Parameters:
        model: A model object with an `.encode()` method for generating embeddings.
        model_name (str): Name of the model. Accepted for caller convenience;
            it does not influence the computation.
        sib200_dataset: Dataset passed to the SIB topic-classification evaluation
            (used when `run_sib_clf` is True).
        sib200_class_to_templates: Mapping from category to its list of templates,
            passed to the SIB topic-classification evaluation.
        similarity_data (dict): A dictionary containing 'anchor', 'positive', and
            'negative' lists (required when `run_paralux_clf` is True).
        bitext_mining_datasets_de_lb: Two-element sequence
            `[de_to_lb_data, lb_to_de_data]`.
        bitext_mining_datasets_fr_lb: Same layout for FR <-> LB. Skipped when None.
        bitext_mining_datasets_en_lb: Same layout for EN <-> LB. Skipped when None.
        run_sib_clf (bool): Whether to run the SIB topic classification.
        run_paralux_clf (bool): Whether to run the PARALux similarity check.
        run_bitext_mining (bool): Whether to run the bitext-mining evaluations.

    Returns:
        dict: Metric name -> score for every evaluation that was run.

    Raises:
        ValueError: If `run_paralux_clf` is True and `similarity_data` is empty or
            lacks one of the 'anchor', 'positive', 'negative' keys.
    """
    results = {}

    if run_sib_clf:
        # Use the templates handed in by the caller, not a notebook-level global.
        _, average_performance = evaluate_sib_topic_classification_with_templates(
            model, sib200_dataset, sib200_class_to_templates
        )
        # The average is scaled by 100 so it is reported as a percentage.
        results["Zero Shot SIB"] = round(average_performance * 100,2)

    if run_paralux_clf:
        required_keys = ("anchor", "positive", "negative")
        if not similarity_data or any(key not in similarity_data for key in required_keys):
            raise ValueError("A dictionary containing 'anchor', 'positive', and 'negative' lists must be provided for similarity check.")

        results["PARALux Accuracy"] = evaluate_paralux_performance(
            similarity_data["anchor"],
            similarity_data["positive"],
            similarity_data["negative"],
            model
        )

    if run_bitext_mining:
        language_pairs = (
            ("DE", bitext_mining_datasets_de_lb),
            ("FR", bitext_mining_datasets_fr_lb),
            ("EN", bitext_mining_datasets_en_lb),
        )
        for language, datasets in language_pairs:
            # Optional pairs default to None; skip them instead of indexing None.
            if datasets is None:
                continue
            to_lb_data, from_lb_data = datasets[0], datasets[1]

            # The first direction starts from a fresh embedding cache; the
            # reverse direction reuses the cache returned by the first call.
            to_lb_accuracy, sentence_embeddings = evaluate_historical_bitext_mining_task(to_lb_data, model)
            results[f"Historical Bitext Mining {language} -> LB"] = to_lb_accuracy

            from_lb_accuracy, sentence_embeddings = evaluate_historical_bitext_mining_task(
                from_lb_data, model, sentence_embeddings
            )
            results[f"Historical Bitext Mining LB -> {language}"] = from_lb_accuracy

    return results

# Evaluate Models

## Off-the-shelf Models

In [26]:
# Off-the-shelf baseline: load the LaBSE checkpoint by its model identifier.
# `model` and `model_name` are rebound here and consumed by the evaluation cell below.
model_name = "sentence-transformers/LaBSE"
model = SentenceTransformer(model_name)
print(model_name)

  torch.load(os.path.join(input_path, "pytorch_model.bin"), map_location=torch.device("cpu"))


sentence-transformers/LaBSE


In [27]:
# Run every evaluation for the currently loaded model and display the scores.
print(model_name)
results_detailed = run_lux_embeddings_evaluations(
    model,
    model_name,
    sib_test_dataset,
    class_to_templates,
    similarity_data=paralux_data,
    bitext_mining_datasets_de_lb=[bitext_mining_data_de_to_lb, bitext_mining_data_lb_to_de],
    bitext_mining_datasets_fr_lb=[bitext_mining_data_fr_to_lb, bitext_mining_data_lb_to_fr],
    bitext_mining_datasets_en_lb=[bitext_mining_data_en_to_lb, bitext_mining_data_lb_to_en],
)
results_detailed

sentence-transformers/LaBSE


{'Zero Shot SIB': 43.24,
 'PARALux Accuracy': 38.14,
 'Historical Bitext Mining DE -> LB': 92.1,
 'Historical Bitext Mining LB -> DE': 95.91,
 'Historical Bitext Mining FR -> LB': 90.77,
 'Historical Bitext Mining LB -> FR': 95.46,
 'Historical Bitext Mining EN -> LB': 94.63,
 'Historical Bitext Mining LB -> EN': 95.91}

In [28]:
# Off-the-shelf baseline: load the LuxEmbedder checkpoint by its model identifier.
# `model` and `model_name` are rebound here and consumed by the evaluation cell below.
model_name = "fredxlpy/LuxEmbedder"
model = SentenceTransformer(model_name)
print(model_name)

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/170 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.47k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/850 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

  torch.load(os.path.join(input_path, "pytorch_model.bin"), map_location=torch.device("cpu"))


fredxlpy/LuxEmbedder


In [29]:
# Run every evaluation for the currently loaded model and display the scores.
print(model_name)
results_detailed = run_lux_embeddings_evaluations(
    model,
    model_name,
    sib_test_dataset,
    class_to_templates,
    similarity_data=paralux_data,
    bitext_mining_datasets_de_lb=[bitext_mining_data_de_to_lb, bitext_mining_data_lb_to_de],
    bitext_mining_datasets_fr_lb=[bitext_mining_data_fr_to_lb, bitext_mining_data_lb_to_fr],
    bitext_mining_datasets_en_lb=[bitext_mining_data_en_to_lb, bitext_mining_data_lb_to_en],
)
results_detailed

fredxlpy/LuxEmbedder


{'Zero Shot SIB': 65.59,
 'PARALux Accuracy': 52.24,
 'Historical Bitext Mining DE -> LB': 85.52,
 'Historical Bitext Mining LB -> DE': 85.43,
 'Historical Bitext Mining FR -> LB': 84.7,
 'Historical Bitext Mining LB -> FR': 84.28,
 'Historical Bitext Mining EN -> LB': 84.99,
 'Historical Bitext Mining LB -> EN': 85.18}

In [30]:
# Off-the-shelf baseline: load gte-multilingual-base by its model identifier.
# NOTE: trust_remote_code=True lets the checkpoint's own configuration/modeling
# code be downloaded and executed; the loader output recommends reviewing that
# code and pinning a revision.
model_name = "Alibaba-NLP/gte-multilingual-base"
model = SentenceTransformer(model_name, trust_remote_code=True)
print(model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/123k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/55.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/59.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/611M [00:00<?, ?B/s]

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: {'classifier.bias', 'classifier.weight'}
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Alibaba-NLP/gte-multilingual-base


In [31]:
# Run every evaluation for the currently loaded model and display the scores.
print(model_name)
results_detailed = run_lux_embeddings_evaluations(
    model,
    model_name,
    sib_test_dataset,
    class_to_templates,
    similarity_data=paralux_data,
    bitext_mining_datasets_de_lb=[bitext_mining_data_de_to_lb, bitext_mining_data_lb_to_de],
    bitext_mining_datasets_fr_lb=[bitext_mining_data_fr_to_lb, bitext_mining_data_lb_to_fr],
    bitext_mining_datasets_en_lb=[bitext_mining_data_en_to_lb, bitext_mining_data_lb_to_en],
)
results_detailed

Alibaba-NLP/gte-multilingual-base


{'Zero Shot SIB': 55.88,
 'PARALux Accuracy': 70.51,
 'Historical Bitext Mining DE -> LB': 87.64,
 'Historical Bitext Mining LB -> DE': 87.45,
 'Historical Bitext Mining FR -> LB': 83.96,
 'Historical Bitext Mining LB -> FR': 83.4,
 'Historical Bitext Mining EN -> LB': 81.05,
 'Historical Bitext Mining LB -> EN': 79.19}

In [32]:
# Off-the-shelf baseline: load paraphrase-multilingual-mpnet-base-v2 by its model identifier.
# `model` and `model_name` are rebound here and consumed by the evaluation cell below.
model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer(model_name)
print(model_name)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

sentence-transformers/paraphrase-multilingual-mpnet-base-v2


In [33]:
# Run every evaluation for the currently loaded model and display the scores.
print(model_name)
results_detailed = run_lux_embeddings_evaluations(
    model,
    model_name,
    sib_test_dataset,
    class_to_templates,
    similarity_data=paralux_data,
    bitext_mining_datasets_de_lb=[bitext_mining_data_de_to_lb, bitext_mining_data_lb_to_de],
    bitext_mining_datasets_fr_lb=[bitext_mining_data_fr_to_lb, bitext_mining_data_lb_to_fr],
    bitext_mining_datasets_en_lb=[bitext_mining_data_en_to_lb, bitext_mining_data_lb_to_en],
)
results_detailed

sentence-transformers/paraphrase-multilingual-mpnet-base-v2


{'Zero Shot SIB': 24.71,
 'PARALux Accuracy': 26.6,
 'Historical Bitext Mining DE -> LB': 50.49,
 'Historical Bitext Mining LB -> DE': 42.6,
 'Historical Bitext Mining FR -> LB': 50.21,
 'Historical Bitext Mining LB -> FR': 42.42,
 'Historical Bitext Mining EN -> LB': 49.55,
 'Historical Bitext Mining LB -> EN': 40.52}

## Our Adapted Models

In [37]:
# Adapted model: load histlux-gte-multilingual-base by its model identifier.
# NOTE: trust_remote_code=True allows code shipped with the checkpoint to be
# executed when the model is loaded.
model_name = "impresso-project/histlux-gte-multilingual-base"
model = SentenceTransformer(model_name, trust_remote_code=True)
print(model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/199 [00:00<?, ?B/s]






README.md:   0%|          | 0.00/7.48k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

impresso-project/histlux-gte-multilingual-base


In [38]:
# Run every evaluation for the currently loaded model and display the scores.
results_detailed = run_lux_embeddings_evaluations(
    model,
    model_name,
    sib_test_dataset,
    class_to_templates,
    similarity_data=paralux_data,
    bitext_mining_datasets_de_lb=[bitext_mining_data_de_to_lb, bitext_mining_data_lb_to_de],
    bitext_mining_datasets_fr_lb=[bitext_mining_data_fr_to_lb, bitext_mining_data_lb_to_fr],
    bitext_mining_datasets_en_lb=[bitext_mining_data_en_to_lb, bitext_mining_data_lb_to_en],
)
results_detailed

{'Zero Shot SIB': 62.16,
 'PARALux Accuracy': 62.82,
 'Historical Bitext Mining DE -> LB': 97.98,
 'Historical Bitext Mining LB -> DE': 97.88,
 'Historical Bitext Mining FR -> LB': 96.8,
 'Historical Bitext Mining LB -> FR': 96.85,
 'Historical Bitext Mining EN -> LB': 97.1,
 'Historical Bitext Mining LB -> EN': 97.1}

In [35]:
# Adapted model: load histlux-paraphrase-multilingual-mpnet-base-v2 by its model identifier.
# `model` and `model_name` are rebound here and consumed by the evaluation cell below.
model_name = "impresso-project/histlux-paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer(model_name)
print(model_name)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/199 [00:00<?, ?B/s]






README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/788 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

impresso-project/histlux-paraphrase-multilingual-mpnet-base-v2


In [36]:
# Run every evaluation for the currently loaded model and display the scores.
print(model_name)
results_detailed = run_lux_embeddings_evaluations(
    model,
    model_name,
    sib_test_dataset,
    class_to_templates,
    similarity_data=paralux_data,
    bitext_mining_datasets_de_lb=[bitext_mining_data_de_to_lb, bitext_mining_data_lb_to_de],
    bitext_mining_datasets_fr_lb=[bitext_mining_data_fr_to_lb, bitext_mining_data_lb_to_fr],
    bitext_mining_datasets_en_lb=[bitext_mining_data_en_to_lb, bitext_mining_data_lb_to_en],
)
results_detailed

impresso-project/histlux-paraphrase-multilingual-mpnet-base-v2


{'Zero Shot SIB': 59.41,
 'PARALux Accuracy': 80.45,
 'Historical Bitext Mining DE -> LB': 91.77,
 'Historical Bitext Mining LB -> DE': 91.11,
 'Historical Bitext Mining FR -> LB': 90.03,
 'Historical Bitext Mining LB -> FR': 88.6,
 'Historical Bitext Mining EN -> LB': 90.36,
 'Historical Bitext Mining LB -> EN': 88.74}