In [None]:
# Install OpenNMT-py 3.x
# %pip installs into the environment of the running kernel, and the version
# constraint keeps the install on the 3.x series that this notebook targets.
%pip install "OpenNMT-py>=3,<4"

In [2]:
# Create the YAML configuration file (config.yaml) in the working directory.
# On a regular machine, you can create it manually or with nano.
# Note that we are using some smaller values here because the dataset is small.
# For larger datasets, consider increasing: train_steps, valid_steps, warmup_steps, save_checkpoint_steps, keep_checkpoint
#
# The length filter is set for both sides (src_seq_length and tgt_seq_length),
# so the [filtertoolong] transform applies the same limit to source and target.
#
# NOTE(review): gpu_ranks is left commented out in the config below; confirm
# whether training is expected to run on the GPU that the later cells check for.

config = '''# config.yaml


## Where the samples will be written
save_data: run

# Training files
data:
    corpus_1:
        path_src: en-zh.zh-filtered.zh.subword.train
        path_tgt: en-zh.en-filtered.en.subword.train
        transforms: [filtertoolong]
    valid:
        path_src: en-zh.zh-filtered.zh.subword.dev
        path_tgt: en-zh.en-filtered.en.subword.dev
        transforms: [filtertoolong]

# Vocabulary files, generated by onmt_build_vocab
src_vocab: run/source.vocab
tgt_vocab: run/target.vocab

# Vocabulary size - should be the same as in sentence piece
src_vocab_size: 10000
tgt_vocab_size: 10000

# Filter out source/target longer than n if [filtertoolong] enabled
src_seq_length: 150
tgt_seq_length: 150

# Tokenization options
src_subword_model: source.model
tgt_subword_model: target.model

# Where to save the log file and the output models/checkpoints
log_file: train.log
save_model: models/model.fren

# Stop training if it does not imporve after n validations
early_stopping: 4

# Default: 5000 - Save a model checkpoint for each n
save_checkpoint_steps: 1000

# To save space, limit checkpoints to last n
# keep_checkpoint: 3

seed: 3435

# Default: 100000 - Train the model to max n steps 
# Increase to 200000 or more for large datasets
# For fine-tuning, add up the required steps to the original steps
train_steps: 3000

# Default: 10000 - Run validation after n steps
valid_steps: 1000

# Default: 4000 - for large datasets, try up to 8000
warmup_steps: 1000
report_every: 100

# Number of GPUs, and IDs of GPUs
world_size: 1
#gpu_ranks: [1]

# Batching
bucket_size: 262144
num_workers: 0  # Default: 2, set to 0 when RAM out of memory
batch_type: "tokens"
batch_size: 4096   # Tokens per batch, change when CUDA out of memory
valid_batch_size: 2048
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]

# Optimization
model_dtype: "fp16"
optim: "adam"
learning_rate: 2
# warmup_steps: 8000
decay_method: "noam"
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"

# Model
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 6
dec_layers: 6
heads: 8
hidden_size: 512
word_vec_size: 512
transformer_ff: 2048
dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]
'''

# Write-only access is enough here; the explicit encoding keeps the written
# file independent of the platform's default locale.
with open("config.yaml", "w", encoding="utf-8") as config_yaml:
  config_yaml.write(config)

In [None]:
# Find the number of CPUs/cores on the machine
# (useful for choosing the -num_threads value when building the vocabulary)
!nproc --all

In [3]:
# Build Vocabulary
# The vocabulary files are written to the src_vocab/tgt_vocab paths set in config.yaml.

# -config: path to your config.yaml file
# -n_sample: use -1 to build the vocabulary on all the segments in the training dataset
# -num_threads: set it to the number of CPUs/cores on the machine to run faster

!onmt_build_vocab -config config.yaml -n_sample -1 -num_threads 7

Corpus corpus_1's weight should be given. We default it to 1 for you.
[2025-03-21 01:38:12,956 INFO] Counter vocab from -1 samples.
[2025-03-21 01:38:12,956 INFO] n_sample=-1: Build vocab on full datasets.
[2025-03-21 01:38:16,602 INFO] Counters src: 4594
[2025-03-21 01:38:16,602 INFO] Counters tgt: 2820


In [None]:
# Check if the GPU is active (-L lists the GPUs that nvidia-smi can see)
!nvidia-smi -L

In [None]:
# Check if the GPU is visible to PyTorch
import torch

gpu_available = torch.cuda.is_available()
print(gpu_available)

# Only query device 0 when CUDA is actually available; otherwise these calls
# raise instead of giving a readable answer.
if gpu_available:
    print(torch.cuda.get_device_name(0))

    # mem_get_info returns (free, total) in bytes; convert both to MiB.
    gpu_memory = torch.cuda.mem_get_info(0)
    print("Free GPU memory:", gpu_memory[0]/1024**2, "out of:", gpu_memory[1]/1024**2)
else:
    print("No CUDA device is visible to PyTorch.")

In [None]:
# Train the NMT model with the settings from config.yaml
# (checkpoints are saved using the save_model prefix set there, and the log goes to log_file)
!onmt_train -config config.yaml

[2025-03-21 01:39:11,476 INFO] Parsed 2 corpora from -data.
[2025-03-21 01:39:11,476 INFO] Get special vocabs from Transforms: {'src': [], 'tgt': []}.
[2025-03-21 01:39:11,486 INFO] The first 10 tokens of the vocabs are:['<unk>', '<blank>', '<s>', '</s>', '▁', '。', ',', ':', '?', '.']
[2025-03-21 01:39:11,487 INFO] The decoder start token is: <s>
[2025-03-21 01:39:11,487 INFO] Building model...
[2025-03-21 01:39:12,872 INFO] Switching model to float32 for amp/apex_amp
[2025-03-21 01:39:12,872 INFO] Non quantized layer compute is fp16
[2025-03-21 01:39:12,875 INFO] NMTModel(
  (encoder): TransformerEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(4600, 512, padding_idx=1)
        )
        (pe): PositionalEncoding()
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiHeadedAttention(
          (lin

## Translate

In [None]:
# Translate the "subworded" source file of the test dataset
# Change the model name, if needed.
# The output is written to en.translated, which is the file name that the
# inspection, desubwording, and evaluation cells read.
!onmt_translate -model models/model.fren_step_3000.pt -src en-zh.zh-filtered.zh.subword.test -output en.translated -gpu 0 -min_length 1

In [None]:
# Check the first 5 lines of the translation file (still subworded at this point)
!head -n 5 en.translated

In [None]:
# If needed install/update sentencepiece
# %pip installs into the environment of the running kernel.
%pip install --upgrade -q sentencepiece

# Desubword the translation file
# NOTE(review): assumes the MT-Preparation repository is already present in the
# working directory; it is not downloaded anywhere in the visible cells - confirm.
!python3 ./MT-Preparation/subwording/3-desubword.py ./target.model en.translated

In [None]:
# Desubword the target file (reference) of the test dataset
# Note: You might as well have split the files *before* subwording during dataset preparation,
# but sometimes datasets have tokenization issues, so this way you are sure the file is really untokenized.
!python3 ./MT-Preparation/subwording/3-desubword.py ./target.model en-zh.en-filtered.en.subword.test

In [None]:
# Check the first 5 lines of the desubworded translation file
!head -n 5 en.translated.desubword

# Check the first 5 lines of the desubworded reference, for a side-by-side comparison
!head -n 5 en-zh.en-filtered.en.subword.test.desubword

## Evaluation

In [None]:
# Download the BLEU script (compute-bleu.py) from the MT-Evaluation repository
!wget https://raw.githubusercontent.com/ymoslem/MT-Evaluation/main/BLEU/compute-bleu.py

In [None]:
# Install sacrebleu
# %pip installs into the environment of the running kernel.
%pip install sacrebleu

In [None]:
# Evaluate the translation (without subwording)
# Arguments as passed here: the desubworded reference first, then the desubworded translation.
!python3 compute-bleu.py en-zh.en-filtered.en.subword.test.desubword en.translated.desubword