In [2]:
!pip install transformers torch datasets sentencepiece

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [None]:
import tarfile

files_to_extract = ["sentences.tar.bz2", "links.tar.bz2"]

for file in files_to_extract:
    print(f"Extracting {file}...")
    try:
        with tarfile.open(file, "r:bz2") as tar:
            tar.extractall()
        print(f" Successfully extracted {file}!")
    except Exception as e:
        print(f" Failed to extract {file}: {e}")


Extracting sentences.tar.bz2...
 Successfully extracted sentences.tar.bz2!
Extracting links.tar.bz2...
 Successfully extracted links.tar.bz2!


In [1]:
import pandas as pd
target_languages = ["kaz", "tel", "kat", "jav", "tgl", "swa", "mal", "mar"]

sentences = pd.read_csv("sentences.csv", sep="\t", names=["sentence_id", "lang", "text"], quoting=3)
links = pd.read_csv("links.csv", sep="\t", names=["source_id", "target_id"])

eng_sentences = sentences[sentences["lang"] == "eng"]

target_sentences = sentences[sentences["lang"].isin(target_languages)]

merged = links.merge(eng_sentences, left_on="source_id", right_on="sentence_id")
merged = merged.merge(target_sentences, left_on="target_id", right_on="sentence_id", suffixes=("_eng", "_target"))

final_dataset = merged[["text_eng", "text_target", "lang_target"]]
final_dataset.columns = ["source_sentence", "target_sentence", "target_lang"]

final_dataset.to_csv("tatoeba_low_resource.csv", index=False)
print("Dataset saved as 'tatoeba_low_resource.csv'")


Loading sentences.csv...
Loading links.csv...
Filtering English sentences...
Filtering target language sentences...
Merging English sentences with translation links...
Merging target language sentences...
                                    source_sentence  \
0                              Let's try something.   
1                              Let's try something.   
2                            I have to go to sleep.   
3                            I have to go to sleep.   
4  Today is June 18th and it is Muiriel's birthday!   

                              target_sentence target_lang  
0                           Subukan natin to.         tgl  
1                               რაღაც ვცადოთ!         kat  
2                  Kailangan ko nang matulog.         tgl  
3                           Matutulog na ako.         tgl  
4  आज १८ जून आहे व आज म्यूरिएलचा वाढदिवस आहे!         mar  
Dataset saved as 'tatoeba_low_resource.csv'


In [None]:
import pandas as pd

final_dataset = pd.read_csv("tatoeba_low_resource.csv")
final_dataset.head(10)

Unnamed: 0,source_sentence,target_sentence,target_lang
0,Let's try something.,Subukan natin to.,tgl
1,Let's try something.,რაღაც ვცადოთ!,kat
2,I have to go to sleep.,Kailangan ko nang matulog.,tgl
3,I have to go to sleep.,Matutulog na ako.,tgl
4,Today is June 18th and it is Muiriel's birthday!,आज १८ जून आहे व आज म्यूरिएलचा वाढदिवस आहे!,mar
5,Muiriel is 20 now.,म्यूरिएल आता २० वर्षांची आहे.,mar
6,"The password is ""Muiriel"".","पासवर्ड ""Muiriel"" आहे.",mar
7,I will be back soon.,मी लवकरच परत येईन.,mar
8,I will be back soon.,मी लवकरच परतेन.,mar
9,I will be back soon.,მალე დავბრუნდები.,kat


In [None]:
!pip install spacy torch



In [None]:
import pandas as pd
import os

df = pd.read_csv("tatoeba_low_resource.csv")

input_dir = "simalign_inputs"
os.makedirs(input_dir, exist_ok=True)

for lang in df["target_lang"].unique():
    file_path = os.path.join(input_dir, f"eng-{lang}.txt")

    lang_pairs = df[df["target_lang"] == lang][["source_sentence", "target_sentence"]]

    lang_pairs.to_csv(file_path, sep="\t", index=False, header=False)
    print(f" Created input file: {file_path}")

 Created input file: simalign_inputs/eng-tgl.txt
 Created input file: simalign_inputs/eng-kat.txt
 Created input file: simalign_inputs/eng-mar.txt
 Created input file: simalign_inputs/eng-kaz.txt
 Created input file: simalign_inputs/eng-jav.txt
 Created input file: simalign_inputs/eng-mal.txt
 Created input file: simalign_inputs/eng-tel.txt


In [3]:
from transformers import AutoModel, AutoTokenizer

model_name = "xlm-roberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

print("XLM-R model loaded successfully!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

XLM-R model loaded successfully!


In [None]:
alignment_file = "simalign_inputs/eng-kaz.txt"

with open(alignment_file, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        print(line.strip())
        if i == 4:
            break


I miss you.	Мен сені сағындым.
I'll call them tomorrow when I come back.	Мен ертең үйге келген соң, оларға соғамын.
Hurry up.	Тездет.
I love you.	Мен сені сүйемін.
Congratulations!	Құттықтаймын!


In [4]:
!pip install SimAlign

Collecting SimAlign
  Downloading simalign-0.4-py3-none-any.whl.metadata (6.2 kB)
Downloading simalign-0.4-py3-none-any.whl (8.1 kB)
Installing collected packages: SimAlign
Successfully installed SimAlign-0.4


In [None]:
import os

output_dir = "simalign_outputs"
os.makedirs(output_dir, exist_ok=True)

In [None]:
from simalign import SentenceAligner
import os

aligner = SentenceAligner(model="xlm-roberta-base", matching_methods="mai") 

input_dir = "simalign_inputs"
output_dir = "simalign_outputs"
os.makedirs(output_dir, exist_ok=True)

for file_name in os.listdir(input_dir):
    if file_name.endswith(".txt"):
        src_lang, tgt_lang = file_name.replace(".txt", "").split("-")  # Extract language pair
        input_path = os.path.join(input_dir, file_name)
        output_path = os.path.join(output_dir, f"{src_lang}-{tgt_lang}-alignments.txt")

        print(f"\nProcessing {src_lang} → {tgt_lang}...")

        with open(input_path, "r", encoding="utf-8") as f_in, open(output_path, "w", encoding="utf-8") as f_out:
            for line in f_in:
                parts = line.strip().split("\t")
                if len(parts) != 2:
                    continue

                src_text, tgt_text = parts

                src_tokens = src_text.split()
                tgt_tokens = tgt_text.split()

                alignments = aligner.get_word_aligns(src_tokens, tgt_tokens)

                for src_idx, tgt_idx in alignments["inter"]:
                    f_out.write(f"{src_tokens[src_idx]}\t{tgt_tokens[tgt_idx]}\n")

        print(f"Alignment saved: {output_path}")

2025-03-19 18:56:23,582 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: xlm-roberta-base
INFO:simalign.simalign:Initialized the EmbeddingLoader with model: xlm-roberta-base



Processing eng → kat...
Alignment saved: simalign_outputs/eng-kat-alignments.txt

Processing eng → tgl...
Alignment saved: simalign_outputs/eng-tgl-alignments.txt

Processing eng → kaz...
Alignment saved: simalign_outputs/eng-kaz-alignments.txt

Processing eng → tel...
Alignment saved: simalign_outputs/eng-tel-alignments.txt

Processing eng → mal...
Alignment saved: simalign_outputs/eng-mal-alignments.txt

Processing eng → jav...
Alignment saved: simalign_outputs/eng-jav-alignments.txt

Processing eng → mar...
Alignment saved: simalign_outputs/eng-mar-alignments.txt


In [None]:
alignment_file = "simalign_outputs/eng-kaz-alignments.txt"

with open(alignment_file, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        print(line.strip()) 
        if i == 9: 
            break

I	Мен
you.	сағындым.
I'll	Мен
them	оларға
tomorrow	ертең
come	келген
back.	соғамын.
Hurry	Тездет.
up.	Тездет.
I	Мен


In [None]:
import os

files_to_extract = ["simalign_outputs.7z"]

for file in files_to_extract:
    print(f"Extracting {file}...")
    try:
        exit_code = os.system(f"7z x {file} -o.")

        if exit_code == 0:
            print(f" Successfully extracted {file}!")
        else:
            print(f" Failed to extract {file}: Error code {exit_code}")
    except Exception as e:
        print(f" Failed to extract {file}: {e}")


Extracting simalign_outputs.7z...
 Successfully extracted simalign_outputs.7z!


In [None]:
alignment_dir = "simalign_outputs"
output_train_file = "alignment_train.tsv"

language_map = {
    "kaz": "kaz", "tel": "tel", "kat": "kat",
    "jav": "jav", "tgl": "tgl", "mal": "mal", "mar": "mar"
}

with open(output_train_file, "w", encoding="utf-8") as train_f:
    for file_name in os.listdir(alignment_dir):
        if file_name.endswith(".txt"):
            src_lang, tgt_lang = file_name.replace("-alignments.txt", "").split("-")
            tgt_lang = language_map.get(tgt_lang, tgt_lang)  

            file_path = os.path.join(alignment_dir, file_name)

            with open(file_path, "r", encoding="utf-8") as f:
                for line in f:
                    src_word, tgt_word = line.strip().split("\t")
                    train_f.write(f"[{src_lang}] {src_word}\t[{tgt_lang}] {tgt_word}\n")

print(f"Training data saved: {output_train_file}")

Training data saved: alignment_train.tsv


In [None]:
with open("alignment_train.tsv", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        print(line.strip())
        if i == 9:
            break

[eng] Let's	[kat] ვცადოთ!
[eng] something.	[kat] რაღაც
[eng] something.	[kat] ვცადოთ!
[eng] I	[kat] დავბრუნდები.
[eng] back	[kat] დავბრუნდები.
[eng] soon.	[kat] მალე
[eng] soon.	[kat] დავბრუნდები.
[eng] just	[kat] უბრალოდ
[eng] don't	[kat] არ
[eng] know	[kat] ვიცი


In [None]:
import pandas as pd

file_path = "tatoeba_low_resource.csv"
df = pd.read_csv(file_path)

print(df.head())
print("\nColumn Names:", df.columns)
print("\nTotal Rows:", len(df))

                                    source_sentence  \
0                              Let's try something.   
1                              Let's try something.   
2                            I have to go to sleep.   
3                            I have to go to sleep.   
4  Today is June 18th and it is Muiriel's birthday!   

                              target_sentence target_lang  
0                           Subukan natin to.         tgl  
1                               რაღაც ვცადოთ!         kat  
2                  Kailangan ko nang matulog.         tgl  
3                           Matutulog na ako.         tgl  
4  आज १८ जून आहे व आज म्यूरिएलचा वाढदिवस आहे!         mar  

Column Names: Index(['source_sentence', 'target_sentence', 'target_lang'], dtype='object')

Total Rows: 116587


In [None]:
import pandas as pd

file_path = "tatoeba_low_resource.csv"
df = pd.read_csv(file_path)

output_file = "tokenized_sentences.tsv"

with open(output_file, "w", encoding="utf-8") as f_out:
    for _, row in df.iterrows():
        src_sentence = row["source_sentence"]
        tgt_sentence = row["target_sentence"]

        src_tokenized = tokenizer(src_sentence, padding=False, truncation=True, return_tensors="pt")
        tgt_tokenized = tokenizer(tgt_sentence, padding=False, truncation=True, return_tensors="pt")

        src_tokenized_str = " ".join(map(str, src_tokenized["input_ids"].squeeze().tolist()))
        tgt_tokenized_str = " ".join(map(str, tgt_tokenized["input_ids"].squeeze().tolist()))

        f_out.write(src_tokenized_str + "\n")

print(f"Tokenization complete! Saved to {output_file}")

Tokenization complete! Saved to tokenized_sentences.tsv


In [None]:
with open("tokenized_sentences.tsv", "r", encoding="utf-8") as f:
    for i in range(3):
        print(f"Line {i+1}: {f.readline().strip()}")

Line 1: 0 10842 25 7 9790 9844 5 2
Line 2: 0 10842 25 7 9790 9844 5 2
Line 3: 0 87 765 47 738 47 60268 5 2


In [None]:
with open("tokenized_sentences.tsv", "r", encoding="utf-8") as f:
    for i in range(3):
        token_ids = list(map(int, f.readline().strip().split()))
        decoded_text = tokenizer.decode(token_ids)
        print(f"Decoded Sentence {i+1}: {decoded_text}")

Decoded Sentence 1: <s> Let's try something.</s>
Decoded Sentence 2: <s> Let's try something.</s>
Decoded Sentence 3: <s> I have to go to sleep.</s>


In [5]:
alignment_map = {}

with open("alignment_train.tsv", "r", encoding="utf-8") as f:
    for line in f:
        src, tgt = line.strip().split("\t")
        src_word = src.split("] ")[1].strip().lower()  
        tgt_word = tgt.split("] ")[1].strip()

        if src_word:  # Keep all valid words 
            if src_word not in alignment_map:
                alignment_map[src_word] = []
            alignment_map[src_word].append(tgt_word)

In [6]:
alignment_words = []

with open("alignment_train.tsv", "r", encoding="utf-8") as f:
    for line in f:
        src, tgt = line.strip().split("\t")
        src_word = src.split("] ")[1].strip().lower()  
        alignment_words.append(src_word)  # Keep duplicates

missing_words = [word for word in alignment_words if word not in alignment_map]

print(f"Total words in alignment_train_bucc.tsv (including duplicates): {len(alignment_words)}")
print(f"Total words in alignment_map (including duplicates): {len(alignment_map)}")
print(f"Total missing words (including duplicates): {len(missing_words)}")

if missing_words:
    print("Example missing words:", missing_words[:10])

Total words in alignment_train_bucc.tsv (including duplicates): 548855
Total words in alignment_map (including duplicates): 24000
Total missing words (including duplicates): 0


In [None]:
import torch
import random
from transformers import XLMRobertaTokenizerFast


tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

MASK_PROB = 0.2  
def apply_awp_masking(tokenized_input, tokenizer):
    input_ids = tokenized_input["input_ids"].clone()
    labels = torch.full_like(input_ids, -100)  

    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze().tolist())  
    words = tokenizer.convert_tokens_to_string(tokens).split()  
    mask = torch.zeros_like(input_ids, dtype=torch.bool)

    for i, word in enumerate(words):
        word_lower = word.lower()
        if word_lower in alignment_map and random.random() < MASK_PROB:
            mask[0, i] = True  

    # 80% → Replace with [MASK], 10% → Random token, 10% → Keep original
    rand = torch.rand(input_ids.shape)
    mask_80 = mask & (rand < 0.8)
    mask_10_rand = mask & (rand >= 0.8) & (rand < 0.9)

    input_ids[mask_80] = tokenizer.mask_token_id  

    # Replace 10% of masked tokens with random tokens
    random_tokens = torch.randint(len(tokenizer), input_ids.shape, dtype=torch.long)
    input_ids[mask_10_rand] = random_tokens[mask_10_rand]

    # Replace masked words with aligned words (if available)
    for i, word in enumerate(words):
        word_lower = word.lower()
        if mask[0, i] and word_lower in alignment_map:
            aligned_word = random.choice(alignment_map[word_lower])  
            aligned_tokens = tokenizer.tokenize(aligned_word)

            if aligned_tokens:
                aligned_token_ids = tokenizer.convert_tokens_to_ids(aligned_tokens)

                
                if all(token_id != tokenizer.unk_token_id for token_id in aligned_token_ids):
                    aligned_token_ids = torch.tensor(aligned_token_ids, dtype=torch.long)

                    # Replace labels with aligned word tokens
                    num_tokens_to_replace = min(len(aligned_token_ids), len(labels[0]) - i)
                    labels[0, i:i+num_tokens_to_replace] = aligned_token_ids[:num_tokens_to_replace]
                else:
                    labels[0, i] = tokenizer.mask_token_id  
            else:
                labels[0, i] = -100  

    return input_ids, labels

masked_output_file = "masked_aligned.tsv"

with open("tokenized_sentences.tsv", "r", encoding="utf-8") as f_in, open(masked_output_file, "w", encoding="utf-8") as f_out:
    for line in f_in:
        tokenized_ids = list(map(int, line.strip().split()))
        tokenized_tensor = torch.tensor(tokenized_ids).unsqueeze(0)  

        masked_input_ids, masked_labels = apply_awp_masking({"input_ids": tokenized_tensor}, tokenizer)

        # Convert tensors back to strings
        masked_input_str = " ".join(map(str, masked_input_ids.squeeze().tolist()))
        masked_label_str = " ".join(map(str, masked_labels.squeeze().tolist()))

        f_out.write(masked_input_str + "\t" + masked_label_str + "\n")

print(f"AWP Masking complete! Saved to {masked_output_file}")

AWP Masking complete! Saved to masked_aligned.tsv


In [None]:
file_path = "masked_aligned.tsv"

with open(file_path, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        print(line.strip())
        if i == 4:
            break

0 10842 25 7 9790 9844 5 2	-100 -100 -100 -100 -100 -100 -100 -100
0 10842 25 7 9790 9844 5 2	-100 -100 -100 -100 -100 -100 -100 -100
0 38397 185305 47 738 47 60268 5 2	-100 8158 25151 -100 -100 -100 -100 -100 -100
0 87 250001 250001 738 47 60268 5 2	-100 -100 25151 621 254 -100 -100 -100 -100
0 38396 83 18237 543 927 250001 442 83 2758 14 159053 25 7 101207 38 2	-100 -100 -100 -100 -100 -100 908 408 395 -100 -100 -100 -100 -100 -100 -100 -100


In [None]:
import torch
import torch.nn.functional as F

def get_word_embedding(word):
    """
    Extracts the hidden state embedding for a single word.
    """
    inputs = tokenizer(word, return_tensors="pt", padding="max_length", truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    last_hidden_state = outputs.last_hidden_state  

    return last_hidden_state[0, 1, :]  

def word_translation_ranking_loss(src_embeddings, tgt_embeddings, temperature=0.05):
    """
    Computes contrastive loss for word embeddings.
    """
    similarity_matrix = torch.mm(src_embeddings, tgt_embeddings.T)
    similarity_matrix /= temperature

    loss = F.cross_entropy(similarity_matrix, torch.arange(similarity_matrix.size(0)).to(similarity_matrix.device))

    return loss

In [None]:
def get_sentence_embedding(sentence):
    """
    Extracts the sentence embedding from XLM-R.
    """
    inputs = tokenizer(sentence, return_tensors="pt", padding="max_length", truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    last_hidden_state = outputs.last_hidden_state  

    # Use [CLS] token's embedding as the sentence representation
    return last_hidden_state[:, 0, :]  

def translation_ranking_loss(src_embeddings, tgt_embeddings, temperature=0.05):
    """
    Computes contrastive loss for sentence embeddings.
    """
    similarity_matrix = torch.mm(src_embeddings, tgt_embeddings.T)
    similarity_matrix /= temperature

    loss = F.cross_entropy(similarity_matrix, torch.arange(similarity_matrix.size(0)).to(similarity_matrix.device))

    return loss


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch.nn.functional as F
import gc

model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

model.to("cuda")

max_seq_length = 32
batch_size = 8
GPU_MEMORY_THRESHOLD = 11.5

alignment_map = {}
with open("alignment_train.tsv", "r", encoding="utf-8") as f:
    for line in f:
        src, tgt = line.strip().split("\t")
        src_word = src.split("] ")[1]
        tgt_word = tgt.split("] ")[1]
        alignment_map[src_word] = tgt_word

class MaskedAlignedDataset(Dataset):
    def __init__(self, file_path, tokenizer):
        self.data = []
        self.tokenizer = tokenizer

        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                masked_input, masked_labels = line.strip().split("\t")
                input_ids = list(map(int, masked_input.split()))
                label_ids = list(map(int, masked_labels.split()))

                # Ensure sequences have the same length
                input_ids = input_ids[:max_seq_length] + [tokenizer.pad_token_id] * (max_seq_length - len(input_ids))
                label_ids = label_ids[:max_seq_length] + [-100] * (max_seq_length - len(label_ids))

                attention_mask = [1 if token_id != tokenizer.pad_token_id else 0 for token_id in input_ids]

                aligned_word_mask = torch.zeros(max_seq_length, dtype=torch.long)
                for i, token_id in enumerate(input_ids):
                    token_str = tokenizer.decode([token_id]).strip()
                    if token_str in alignment_map:
                        aligned_word_mask[i] = 1

                self.data.append((input_ids, label_ids, aligned_word_mask, attention_mask))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids, label_ids, aligned_word_mask, attention_mask = self.data[idx]
        return (
            torch.tensor(input_ids),
            torch.tensor(label_ids),
            aligned_word_mask.clone().detach(),
            torch.tensor(attention_mask),
        )

dataset = MaskedAlignedDataset("masked_aligned.tsv", tokenizer)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

def awp_loss_function(predictions, labels, aligned_word_mask):
    valid_labels = labels.clone()

    valid_labels[aligned_word_mask == 0] = -100
    valid_labels[labels == -100] = -100

    if (valid_labels != -100).sum() == 0:
        return torch.tensor(1e-6, device=labels.device, dtype=torch.float32, requires_grad=True)

    loss = F.cross_entropy(
        predictions.view(-1, predictions.size(-1)),
        valid_labels.view(-1),
        ignore_index=-100
    )

    return loss.to(torch.float32)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scaler = torch.amp.GradScaler("cuda")

model.gradient_checkpointing_enable()

for epoch in range(2):
    model.train()
    total_loss = 0

    for step, (input_ids, labels, aligned_word_mask, attention_mask) in enumerate(dataloader):
        input_ids, labels, aligned_word_mask, attention_mask = (
            input_ids.to("cuda"),
            labels.to("cuda"),
            aligned_word_mask.to("cuda"),
            attention_mask.to("cuda"),
        )

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask).logits
        logits = F.log_softmax(outputs, dim=-1)  # Apply log softmax for numerical stability

        loss = awp_loss_function(logits, labels, aligned_word_mask)

        if torch.isnan(loss) or torch.isinf(loss):
            continue

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)

        if any(p.grad is not None for p in model.parameters()):
            scaler.step(optimizer)
            scaler.update()

        total_loss += loss.item()

        torch.cuda.synchronize()
        allocated_memory = torch.cuda.memory_allocated() / (1024 ** 3)
        if allocated_memory > GPU_MEMORY_THRESHOLD:
            torch.cuda.empty_cache()
            gc.collect()

    print(f"Epoch {epoch+1}, AWP Loss: {total_loss / max(1, len(dataloader))}")

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1, AWP Loss: 4.69418498058271
Epoch 2, AWP Loss: 4.311148664116258


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F
import gc
from torch.cuda.amp import autocast, GradScaler

model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to("cuda")

max_seq_length = 32
batch_size = 8

alignment_map = {}
with open("alignment_train.tsv", "r", encoding="utf-8") as f:
    for line in f:
        src, tgt = line.strip().split("\t")
        src_word = src.split("] ")[1]
        tgt_word = tgt.split("] ")[1]
        alignment_map[src_word] = tgt_word

class MaskedAlignedDataset(Dataset):
    def __init__(self, file_path, tokenizer):
        self.data = []
        self.tokenizer = tokenizer

        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                masked_input, masked_labels = line.strip().split("\t")
                input_ids = list(map(int, masked_input.split()))
                label_ids = list(map(int, masked_labels.split()))

                # Ensure sequences have the same length
                input_ids = input_ids[:max_seq_length] + [tokenizer.pad_token_id] * (max_seq_length - len(input_ids))
                label_ids = label_ids[:max_seq_length] + [-100] * (max_seq_length - len(label_ids))

                attention_mask = [1 if token_id != tokenizer.pad_token_id else 0 for token_id in input_ids]

                aligned_word_mask = torch.zeros(max_seq_length, dtype=torch.long)
                for i, token_id in enumerate(input_ids):
                    token_str = tokenizer.decode([token_id]).strip()
                    if token_str in alignment_map:
                        aligned_word_mask[i] = 1

                self.data.append((input_ids, label_ids, aligned_word_mask, attention_mask))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids, label_ids, aligned_word_mask, attention_mask = self.data[idx]
        return (
            torch.tensor(input_ids),
            torch.tensor(label_ids),
            aligned_word_mask.clone().detach(),
            torch.tensor(attention_mask),
        )

dataset = MaskedAlignedDataset("masked_aligned.tsv", tokenizer)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [None]:
for batch in dataloader:
    print("Input IDs shape:", batch[0].shape)  # (batch_size, max_seq_length)
    print("Labels shape:", batch[1].shape)  
    print("Aligned Word Mask shape:", batch[2].shape)  
    print("Attention Mask shape:", batch[3].shape)  
    break

Input IDs shape: torch.Size([8, 32])
Labels shape: torch.Size([8, 32])
Aligned Word Mask shape: torch.Size([8, 32])
Attention Mask shape: torch.Size([8, 32])


In [None]:
from transformers import XLMRobertaForMaskedLM, XLMRobertaModel

# Source model
xlmr_src = XLMRobertaForMaskedLM.from_pretrained("xlm-roberta-base").to("cuda")  

# Target model
xlmr_tgt = XLMRobertaModel.from_pretrained("xlm-roberta-base").to("cuda")  

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def forward_pass(transformer_model, input_ids, attention_mask):
    """
    Performs a forward pass through the XLM-R model.
    """
    outputs = transformer_model(input_ids, attention_mask=attention_mask, output_hidden_states=True)

    #extract hidden states 
    if hasattr(outputs, "hidden_states"):  
        hidden_states = outputs.hidden_states[-1] 
    else:
        hidden_states = outputs.logits  

    # Extract CLS token embedding for TR Loss
    sentence_embedding = hidden_states[:, 0, :]

    # Extract all token embeddings for WTR and AWP
    word_embeddings = hidden_states  

    # Get masked token predictions (for AWP loss) using lm_head
    masked_logits = transformer_model.lm_head(hidden_states) if hasattr(transformer_model, "lm_head") else None

    return sentence_embedding, word_embeddings, masked_logits

In [None]:
import torch.nn.functional as F

def word_translation_ranking_loss(source_embeddings, target_embeddings, temperature=0.05):
    """
    Computes contrastive loss for word embeddings in a batch.
    """
    # Normalize embeddings
    source_embeddings = F.normalize(source_embeddings, p=2, dim=-1)  
    target_embeddings = F.normalize(target_embeddings, p=2, dim=-1)  

    # Compute similarity matrix correctly
    similarity_scores = torch.matmul(source_embeddings, target_embeddings.permute(0, 2, 1)) / temperature  

    # Apply log softmax for numerical stability
    similarity_scores = F.log_softmax(similarity_scores, dim=-1)

    # Identity matrix for correct alignment
    batch_size, seq_len, _ = similarity_scores.shape
    alignment_labels = torch.arange(seq_len).expand(batch_size, -1).to(similarity_scores.device)

    # Apply contrastive loss (InfoNCE)
    loss = F.nll_loss(similarity_scores.view(-1, similarity_scores.size(-1)), alignment_labels.view(-1))
    return loss


def translation_ranking_loss(source_embeddings, target_embeddings, temperature=0.05):
    """
    Computes contrastive loss for sentence embeddings.
    """
    # Normalize sentence embeddings
    source_embeddings = F.normalize(source_embeddings, p=2, dim=-1)
    target_embeddings = F.normalize(target_embeddings, p=2, dim=-1)

    # Compute similarity matrix
    similarity_scores = torch.mm(source_embeddings, target_embeddings.T) / temperature

    # Apply log softmax for numerical stability
    similarity_scores = F.log_softmax(similarity_scores, dim=-1)

    # Identity matrix for correct alignment
    alignment_labels = torch.arange(similarity_scores.size(0)).to(similarity_scores.device)

    # Compute contrastive loss
    loss = F.nll_loss(similarity_scores, alignment_labels)
    return loss


def awp_loss_function(predictions, labels, aligned_word_mask):
    """
    Computes cross-entropy loss for Aligned Word Prediction (AWP).
    """
    # Clone labels to avoid in-place modification
    masked_labels = labels.clone()

    # Ignore tokens that are not aligned
    masked_labels[aligned_word_mask == 0] = -100
    masked_labels[labels == -100] = -100

    # Prevent NaN issues if no words are masked
    if (masked_labels != -100).sum() == 0:
        return torch.tensor(1e-6, device=labels.device, dtype=torch.float32, requires_grad=True)

    # Compute cross-entropy loss
    loss = F.cross_entropy(
        predictions.view(-1, predictions.size(-1)),
        masked_labels.view(-1),
        ignore_index=-100
    )

    return loss.to(torch.float32)

In [None]:
def compute_joint_loss(sentence_emb_src, sentence_emb_tgt, word_emb_src, word_emb_tgt, logits, labels, aligned_mask):
    """
    Computes the combined loss from TR, WTR, and AWP tasks.
    """
    alpha, beta, gamma = 0.8, 0.1, 0.1  # Weighting factors for loss functions

    loss_tr = translation_ranking_loss(sentence_emb_src, sentence_emb_tgt)  
    loss_wtr = word_translation_ranking_loss(word_emb_src, word_emb_tgt)  
    loss_awp = awp_loss_function(logits, labels, aligned_mask) 

    total_loss = alpha * loss_tr + beta * loss_wtr + gamma * loss_awp
    return total_loss

In [None]:
import torch.optim as optim

optimizer = optim.AdamW(list(xlmr_src.parameters()) + list(xlmr_tgt.parameters()), lr=5e-5)

In [None]:
import torch
import torch.cuda.amp as amp
from tqdm import tqdm


num_epochs = 2  
checkpoint_interval = 2000  


scaler = amp.GradScaler()


for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    total_loss = 0
    xlmr_src.train()
    xlmr_tgt.train()

    for step, batch in enumerate(tqdm(dataloader, desc="Training")):
        input_ids, labels, aligned_word_mask, attention_mask = (
            batch[0].to("cuda"),
            batch[1].to("cuda"),
            batch[2].to("cuda"),
            batch[3].to("cuda"),
        )

        optimizer.zero_grad()

        # Forward pass through both models (Source and Target)
        cls_emb_src, word_emb_src, logits_src = forward_pass(xlmr_src, input_ids, attention_mask)
        cls_emb_tgt, word_emb_tgt, _ = forward_pass(xlmr_tgt, input_ids, attention_mask)  

        # Compute the joint loss function (TR + WTR + AWP)
        total_batch_loss = compute_joint_loss(cls_emb_src, cls_emb_tgt, word_emb_src, word_emb_tgt, logits_src, labels, aligned_word_mask)

        # Perform backward pass with mixed precision
        scaler.scale(total_batch_loss).backward()

        # Gradient Clipping
        torch.nn.utils.clip_grad_norm_(list(xlmr_src.parameters()) + list(xlmr_tgt.parameters()), max_norm=0.5)

  
        scaler.step(optimizer)
        scaler.update()

        total_loss += total_batch_loss.item()

       
        if (step + 1) % checkpoint_interval == 0:
            checkpoint_path = f"wacse_xlmr_checkpoint_step{step + 1}_epoch{epoch + 1}.pth"
            torch.save({
                'epoch': epoch + 1,
                'step': step + 1,
                'model_src_state_dict': xlmr_src.state_dict(),
                'model_tgt_state_dict': xlmr_tgt.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': total_batch_loss.item()
            }, checkpoint_path)
            print(f"Checkpoint saved at {checkpoint_path}")

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1} completed. Average Loss: {avg_loss:.4f}")

    torch.save(xlmr_src.state_dict(), f"wacse_xlmr_src_epoch{epoch + 1}.pth")
    torch.save(xlmr_tgt.state_dict(), f"wacse_xlmr_tgt_epoch{epoch + 1}.pth")
    print(f"Model checkpoints saved for epoch {epoch + 1}")

  scaler = amp.GradScaler()



Epoch 1/2


Training:  14%|█▎        | 2000/14574 [18:09<157:27:29, 45.08s/it]

Checkpoint saved at wacse_xlmr_checkpoint_step2000_epoch1.pth


Training:  27%|██▋       | 4000/14574 [35:51<107:15:59, 36.52s/it]

Checkpoint saved at wacse_xlmr_checkpoint_step4000_epoch1.pth


Training:  41%|████      | 6000/14574 [52:39<49:12:33, 20.66s/it]

Checkpoint saved at wacse_xlmr_checkpoint_step6000_epoch1.pth


Training:  55%|█████▍    | 8000/14574 [1:09:12<29:31:53, 16.17s/it]

Checkpoint saved at wacse_xlmr_checkpoint_step8000_epoch1.pth


Training:  69%|██████▊   | 10000/14574 [1:27:39<63:19:59, 49.85s/it]

Checkpoint saved at wacse_xlmr_checkpoint_step10000_epoch1.pth


Training:  82%|████████▏ | 12000/14574 [1:45:51<33:48:24, 47.28s/it]

Checkpoint saved at wacse_xlmr_checkpoint_step12000_epoch1.pth


Training:  96%|█████████▌| 14000/14574 [2:04:35<8:49:50, 55.38s/it]

Checkpoint saved at wacse_xlmr_checkpoint_step14000_epoch1.pth


Training: 100%|██████████| 14574/14574 [2:09:05<00:00,  1.88it/s]


Epoch 1 completed. Average Loss: 1.1940
Model checkpoints saved for epoch 1

Epoch 2/2


Training:   0%|          | 7/14574 [00:03<1:57:19,  2.07it/s]


KeyboardInterrupt: 

In [None]:
import torch
from transformers import XLMRobertaForMaskedLM, XLMRobertaModel

# Load saved models for evaluation
xlmr_src = XLMRobertaForMaskedLM.from_pretrained("xlm-roberta-base").to("cuda")
xlmr_tgt = XLMRobertaModel.from_pretrained("xlm-roberta-base").to("cuda")

xlmr_src.load_state_dict(torch.load("wacse_xlmr_src_epoch1.pth"))
xlmr_tgt.load_state_dict(torch.load("wacse_xlmr_tgt_epoch1.pth"))

xlmr_src.eval()
xlmr_tgt.eval()

print("Trained model loaded for evaluation.")

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Trained model loaded for evaluation.


In [None]:
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm

def get_sentence_embedding(model, input_ids, attention_mask):
    """
    Extracts sentence embeddings using [CLS] token.
    """
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)

    # Extract hidden states correctly
    if hasattr(outputs, "hidden_states"):
        hidden_states = outputs.hidden_states[-1]  
    else:
        hidden_states = outputs.logits  

    return hidden_states[:, 0, :]

import pandas as pd
df = pd.read_csv("tatoeba_low_resource.csv")  
df_test = df.sample(10000)  

source_embeddings = []
target_embeddings = []

for _, row in tqdm(df_test.iterrows(), total=len(df_test), desc="Encoding Sentences"):
    src_tokens = tokenizer(row["source_sentence"], return_tensors="pt", padding=True, truncation=True).to("cuda")
    tgt_tokens = tokenizer(row["target_sentence"], return_tensors="pt", padding=True, truncation=True).to("cuda")

    # Get sentence embeddings
    src_emb = get_sentence_embedding(xlmr_src, src_tokens["input_ids"], src_tokens["attention_mask"])
    tgt_emb = get_sentence_embedding(xlmr_tgt, tgt_tokens["input_ids"], tgt_tokens["attention_mask"])

    source_embeddings.append(src_emb.cpu().numpy())
    target_embeddings.append(tgt_emb.cpu().numpy())

# Convert to numpy arrays
source_embeddings = np.vstack(source_embeddings)
target_embeddings = np.vstack(target_embeddings)

print("Sentence embeddings generated for evaluation.")

Encoding Sentences: 100%|██████████| 10000/10000 [04:09<00:00, 40.07it/s]


Sentence embeddings generated for evaluation.


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between source and target embeddings
similarity_matrix = cosine_similarity(source_embeddings, target_embeddings)

# Compute Top-1 Accuracy (Correct translation should be ranked highest)
correct = 0
for i in range(len(df_test)):
    ranked_indices = np.argsort(similarity_matrix[i])[::-1]  
    if ranked_indices[0] == i:
        correct += 1

retrieval_accuracy = correct / len(df_test)
print(f"Bitext Retrieval Accuracy: {retrieval_accuracy:.4f}")

Bitext Retrieval Accuracy: 0.5850
