# WECHSEL Tutorial

In this tutorial, we will see how to use WECHSEL to transfer a model trained in English to German using Langsfer.

# Imports

In [None]:
%load_ext autoreload

In [None]:
import warnings
from typing import Generator

import datasets
import torch
from transformers import (
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)

warnings.simplefilter("ignore")

In [None]:
%autoreload
from langsfer.high_level import wechsel
from langsfer.embeddings import TransformersEmbeddings, FastTextEmbeddings
from langsfer.utils import download_file

# Constants

In [None]:
# Source model and the streamed corpus (name + configuration) used below.
SOURCE_MODEL_NAME = "roberta-base"
DATASET_NAME = "oscar-corpus/oscar"
DATASET_CONFIG_NAME = "unshuffled_deduplicated_de"

# Number of streamed examples kept in total, and how many of them are used
# for training; the remainder becomes the validation split.
DATASET_SIZE = 10_000
TRAIN_DATASET_SIZE = 8_000

# Trainer settings shared by both training runs.
TRAIN_BATCH_SIZE = 16
MAX_TRAIN_STEPS = 16_000
EVAL_STEPS = 4_000

# Seed used when shuffling the streamed dataset.
SEED = 16

# Dataset

In [None]:
# Stream the corpus, shuffle it with a fixed seed and keep only the first
# DATASET_SIZE examples.
dataset = (
    datasets.load_dataset(
        DATASET_NAME,
        DATASET_CONFIG_NAME,
        split="train",
        streaming=True,
        trust_remote_code=True,
    )
    .shuffle(seed=SEED)
    .take(DATASET_SIZE)
)

# The first TRAIN_DATASET_SIZE examples are used for training; everything
# after them is used for validation.
train_dataset, val_dataset = (
    dataset.take(TRAIN_DATASET_SIZE),
    dataset.skip(TRAIN_DATASET_SIZE),
)

In [None]:
# Grab a single validation example (the one after skipping the first ten)
# to use as a running example for tokenization and generation.
sample_records = list(val_dataset.skip(10).take(1))
sample_text = sample_records[0]["text"]
print(sample_text)

# Embeddings and Tokenizers

In [None]:
# Load the source model's embeddings; the resulting object exposes the
# `tokenizer` and `embeddings_matrix` attributes used further below.
source_embeddings = TransformersEmbeddings.from_model_name_or_path(SOURCE_MODEL_NAME)

In [None]:
# Tokenize the sample text with the source tokenizer and show the result.
tokens = source_embeddings.tokenizer.tokenize(sample_text)
print(f"Number of tokens {len(tokens)}, tokens: {tokens}")

We train a new target tokenizer on the training dataset, using the same configuration as the source tokenizer.

In [None]:
def batch_iterator(
    dataset: datasets.Dataset, batch_size: int = 1000
) -> Generator[str, None, None]:
    """Yield the ``"text"`` column of ``dataset`` one batch at a time.

    Args:
        dataset: Dataset to iterate over; it must provide ``iter(batch_size=...)``
            and have a ``"text"`` column.
        batch_size: Number of examples requested per batch.

    Yields:
        The ``"text"`` entry of each batch.
    """
    # NOTE(review): each yielded item is presumably a list of strings (one
    # per example in the batch) rather than a single ``str``, and
    # ``train_dataset`` is loaded with ``streaming=True`` above, so both
    # annotations on this function look too narrow - confirm and adjust.
    for batch in dataset.iter(batch_size=batch_size):
        yield batch["text"]


# Train a new tokenizer on the training texts, starting from the source
# tokenizer and keeping the vocabulary size equal to the source tokenizer's.
target_tokenizer = source_embeddings.tokenizer.train_new_from_iterator(
    batch_iterator(train_dataset), vocab_size=len(source_embeddings.tokenizer)
)

In [None]:
# Tokenize the same sample text with the newly trained target tokenizer so
# the token count can be compared with the source tokenizer's output.
tokens = target_tokenizer.tokenize(sample_text)
print(f"Number of tokens {len(tokens)}, tokens: {tokens}")

We then load pre-trained fastText embeddings to use as auxiliary embeddings.

In [None]:
# Auxiliary fastText embeddings, one set per language. The tutorial transfers
# an English model to German, so the source side must load the English ("en")
# vectors and the target side the German ("de") vectors.
source_auxiliary_embeddings = FastTextEmbeddings.from_model_name_or_path("en")
target_auxiliary_embeddings = FastTextEmbeddings.from_model_name_or_path("de")

After that, we download a bilingual dictionary for English and German in order to be able to align the auxiliary embeddings.

In [None]:
# Download the English-German dictionary and store it locally as "german.txt".
dictionary_url = (
    "https://raw.githubusercontent.com/CPJKU/wechsel/main/dicts/data/german.txt"
)
bilingual_dictionary_file = download_file(dictionary_url, "german.txt")

If we open the file and read the first few lines, we can see that it maps English words to their German equivalent.

In [None]:
# Preview the first ten dictionary entries, each as a single-item dict built
# from the two tab-separated fields on the line.
# The encoding is given explicitly because the file contains German text and
# the platform default encoding is not UTF-8 everywhere.
# NOTE(review): assumes `bilingual_dictionary_file` is a `pathlib.Path`-like
# object whose `open` accepts `encoding` - confirm against `download_file`.
with bilingual_dictionary_file.open(encoding="utf-8") as f:
    # zip() with range(10) stops after ten lines, or earlier if the file is
    # shorter, instead of failing on an empty `readline()` result.
    dictionary_lines = [
        dict([line.strip().split("\t")]) for _, line in zip(range(10), f)
    ]

dictionary_lines

Finally, we instantiate the embedding initializer for WECHSEL.

In [None]:
# Build the WECHSEL embedding initializer from the source embeddings, the
# target tokenizer, both sets of auxiliary embeddings and the bilingual
# dictionary file downloaded above.
embedding_initializer = wechsel(
    source_embeddings=source_embeddings,
    target_tokenizer=target_tokenizer,
    target_auxiliary_embeddings=target_auxiliary_embeddings,
    source_auxiliary_embeddings=source_auxiliary_embeddings,
    bilingual_dictionary_file=bilingual_dictionary_file,
)

We then initialize the target embeddings.

In [None]:
# Compute the target embeddings; the shared SEED constant is used (instead of
# a repeated literal) so that every seeded step in the notebook stays in sync.
target_embeddings = embedding_initializer.initialize(seed=SEED, show_progress=True)

In [None]:
# Load the source checkpoint; its embedding layer is replaced further below.
# NOTE(review): SOURCE_MODEL_NAME is "roberta-base"; loading it through
# AutoModelForCausalLM presumably requires `is_decoder=True` for proper
# causal masking - confirm this is the intended setup.
target_model = AutoModelForCausalLM.from_pretrained(SOURCE_MODEL_NAME)

In [None]:
# Display the input-embedding weights before they are replaced.
target_model.get_input_embeddings().weight.data

In [None]:
# Display the shape of the input-embedding matrix before it is resized.
target_model.get_input_embeddings().weight.data.shape

In [None]:
# Resize the model's embedding layer to the size of the target vocabulary
target_model.resize_token_embeddings(len(target_tokenizer))
# Replace the source embeddings matrix with the target embeddings matrix.
# The new tensor is created with the dtype and device of the existing weights
# so that the swapped-in data is consistent with the rest of the model.
# NOTE(review): `embeddings_matrix` is presumably a NumPy array, possibly with
# a different float precision than the model - hence the explicit cast.
target_model.get_input_embeddings().weight.data = torch.as_tensor(
    target_embeddings.embeddings_matrix,
    dtype=target_model.get_input_embeddings().weight.dtype,
    device=target_model.get_input_embeddings().weight.device,
)

In [None]:
# Display the input-embedding weights after the replacement.
target_model.get_input_embeddings().weight.data

In [None]:
# Display the shape of the input-embedding matrix after the replacement.
target_model.get_input_embeddings().weight.data.shape

For the sake of comparison, we additionally initialize a similar model, but with a random initialization for the embedding layer.

In [None]:
# Baseline model: same checkpoint, but with randomly initialized embeddings.
target_model_from_scratch = AutoModelForCausalLM.from_pretrained(SOURCE_MODEL_NAME)
# The baseline must use the target vocabulary size as well.
target_model_from_scratch.resize_token_embeddings(len(target_tokenizer))

# Per-dimension statistics of the source embeddings; `as_tensor` accepts both
# NumPy arrays and tensors, so this works whichever type the matrix has.
source_embeddings_matrix = torch.as_tensor(source_embeddings.embeddings_matrix)
source_embeddings_mean = source_embeddings_matrix.mean(dim=0)
source_embeddings_std = source_embeddings_matrix.std(dim=0)

# Sample from N(mean, std) per embedding dimension. A seeded generator keeps
# the baseline reproducible. Scaling standard-normal samples is used because
# `torch.normal` does not accept tensor mean/std together with an output size.
random_generator = torch.Generator().manual_seed(SEED)
random_embeddings_matrix = (
    torch.randn(
        len(target_tokenizer),
        source_embeddings_matrix.shape[1],
        generator=random_generator,
    )
    * source_embeddings_std
    + source_embeddings_mean
)

# Write the random embeddings into the *baseline* model (not `target_model`,
# which must keep its WECHSEL-initialized embeddings), matching the dtype and
# device of the existing weights.
target_model_from_scratch.get_input_embeddings().weight.data = (
    random_embeddings_matrix.to(
        dtype=target_model_from_scratch.get_input_embeddings().weight.dtype,
        device=target_model_from_scratch.get_input_embeddings().weight.device,
    )
)

# Training

## Train validation splits

In [None]:
def _tokenize_batch(batch):
    """Tokenize the ``"text"`` column of a batch with the target tokenizer, truncating long inputs."""
    return target_tokenizer(batch["text"], truncation=True)


# The original columns are removed so only the tokenizer outputs remain.
raw_column_names = dataset.column_names

train_dataset = train_dataset.map(
    _tokenize_batch, batched=True, remove_columns=raw_column_names
).with_format("torch")

val_dataset = val_dataset.map(
    _tokenize_batch, batched=True, remove_columns=raw_column_names
).with_format("torch")

In [None]:
# Collator that batches the tokenized examples for both Trainer runs;
# `mlm=False` selects the non-masked (causal) language-modeling setup.
data_collator = DataCollatorForLanguageModeling(tokenizer=target_tokenizer, mlm=False)

## Training model from scratch

In [None]:
# Settings for the baseline run; results are written to "from_scratch".
from_scratch_settings = {
    "output_dir": "from_scratch",
    "max_steps": MAX_TRAIN_STEPS,
    "per_device_train_batch_size": TRAIN_BATCH_SIZE,
    "eval_strategy": "steps",
    "eval_steps": EVAL_STEPS,
    "report_to": "tensorboard",
    "bf16": True,
}
training_args = TrainingArguments(**from_scratch_settings)

# Trainer for the model whose embeddings were randomly initialized.
trainer = Trainer(
    model=target_model_from_scratch,
    args=training_args,
    data_collator=data_collator,
    tokenizer=target_tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Evaluation loss on the validation split before any training step.
eval_loss = trainer.evaluate()["eval_loss"]
print(f"Evaluation loss before training: {eval_loss:.3f}")

In [None]:
# Train the baseline model with the arguments configured above.
trainer.train()

In [None]:
# Evaluation loss on the validation split once training has finished.
eval_loss = trainer.evaluate()["eval_loss"]
print(f"Evaluation loss after training: {eval_loss:.3f}")

In [None]:
# Use the first third of the sample text's tokens as a prompt and let the
# trained model continue it.
sample_input_ids = target_tokenizer(sample_text)["input_ids"]
# Keep at least one token so the prompt is never empty for very short samples.
shortened_input_ids = sample_input_ids[: max(1, len(sample_input_ids) // 3)]

generated_token_ids = (
    trainer.model.generate(
        torch.as_tensor(shortened_input_ids).reshape(1, -1).to(trainer.model.device),
        max_length=300,
    )
    .detach()
    .cpu()
    .numpy()
    .reshape(-1)
)
# `add_special_tokens` is an encoding option and is not a `decode` parameter;
# `skip_special_tokens=True` is the decode option that removes special tokens
# from the printed text.
generated_text = target_tokenizer.decode(
    generated_token_ids, skip_special_tokens=True
)
print("Original Text:")
print(sample_text)
print("---")
print("Generated Text:")
print(generated_text)

## Training model with initialized embedding

In [None]:
# Settings for the WECHSEL run; results are written to "initialized_embedding".
initialized_embedding_settings = {
    "output_dir": "initialized_embedding",
    "max_steps": MAX_TRAIN_STEPS,
    "per_device_train_batch_size": TRAIN_BATCH_SIZE,
    "eval_strategy": "steps",
    "eval_steps": EVAL_STEPS,
    "report_to": "tensorboard",
    "bf16": True,
}
training_args = TrainingArguments(**initialized_embedding_settings)

# Trainer for the model whose embeddings were replaced with the target embeddings.
trainer = Trainer(
    model=target_model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=target_tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

We evaluate the model before training

In [None]:
# Evaluation loss on the validation split before any training step.
eval_loss = trainer.evaluate()["eval_loss"]
print(f"Evaluation loss before training: {eval_loss:.3f}")

We then train the model

In [None]:
# Train the model with the replaced embeddings using the arguments configured above.
trainer.train()

We finally evaluate the model after the training

In [None]:
# Evaluation loss on the validation split once training has finished.
eval_loss = trainer.evaluate()["eval_loss"]
print(f"Evaluation loss after training: {eval_loss:.3f}")

In [None]:
# Use the first third of the sample text's tokens as a prompt and let the
# trained model continue it.
sample_input_ids = target_tokenizer(sample_text)["input_ids"]
# Keep at least one token so the prompt is never empty for very short samples.
shortened_input_ids = sample_input_ids[: max(1, len(sample_input_ids) // 3)]

generated_token_ids = (
    trainer.model.generate(
        torch.as_tensor(shortened_input_ids).reshape(1, -1).to(trainer.model.device),
        max_length=300,
    )
    .detach()
    .cpu()
    .numpy()
    .reshape(-1)
)
# `add_special_tokens` is an encoding option and is not a `decode` parameter;
# `skip_special_tokens=True` is the decode option that removes special tokens
# from the printed text.
generated_text = target_tokenizer.decode(
    generated_token_ids, skip_special_tokens=True
)
print("Original Text:")
print(sample_text)
print("---")
print("Generated Text:")
print(generated_text)

# Summary

