## Training

In [19]:
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import torch
import csv
import pandas as pd
from datasets import Dataset

import logging
logging.basicConfig(level=logging.INFO)

print(torch.__version__)

2.7.1+cu128


In [2]:
!python -m torch.utils.collect_env

Collecting environment information...
PyTorch version: 2.7.1+cu128
Is debug build: False
CUDA used to build PyTorch: 12.8
ROCM used to build PyTorch: N/A

OS: Ubuntu 24.04.2 LTS (x86_64)
GCC version: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
Clang version: 18.1.3 (1ubuntu1)
CMake version: version 3.28.3
Libc version: glibc-2.39

Python version: 3.12.3 (main, Jun 18 2025, 17:59:45) [GCC 13.3.0] (64-bit runtime)
Python platform: Linux-6.11.0-26-generic-x86_64-with-glibc2.39
Is CUDA available: True
CUDA runtime version: 12.8.61
CUDA_MODULE_LOADING set to: LAZY
GPU models and configuration: GPU 0: NVIDIA GeForce RTX 3060 Ti
Nvidia driver version: 550.144.03
cuDNN version: Probably one of the following:
/usr/lib/x86_64-linux-gnu/libcudnn.so.9.10.2
/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.10.2
/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.10.2
/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.10.2
/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.10.2
/usr/lib/x86_64-li

In [21]:
df = pd.read_csv("data/korpus.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,osing,indonesia
0,0,iro wis madhang,kamu sudah makan
1,1,durung,belum
2,2,isun pancen arep madhang nang kene,aku memang mau makan di sini
3,3,riko arep pesen opo,kamu mau pesan apa
4,4,aku pesen nasi goreng lan teh anget,aku pesan nasi goreng dan teh hangat


In [22]:
df_bidirectional = pd.concat([
    pd.DataFrame({'src': '>>osing<< ' + df['indonesia'], 'tgt': df['osing']}),
    pd.DataFrame({'src': '>>indonesia<< ' + df['osing'], 'tgt': df['indonesia']})
], ignore_index=True)

df_bidirectional["src"] = df_bidirectional["src"].astype(str)
df_bidirectional["tgt"] = df_bidirectional["tgt"].astype(str)

df_bidirectional.head()

Unnamed: 0,src,tgt
0,>>osing<< kamu sudah makan,iro wis madhang
1,>>osing<< belum,durung
2,>>osing<< aku memang mau makan di sini,isun pancen arep madhang nang kene
3,>>osing<< kamu mau pesan apa,riko arep pesen opo
4,>>osing<< aku pesan nasi goreng dan teh hangat,aku pesen nasi goreng lan teh anget


In [23]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df_bidirectional, test_size=0.1)

In [24]:
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-id-en")

def tokenize_fn(batch):
    model_inputs = tokenizer(batch['src'], truncation=True, padding='max_length', max_length=64)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch['tgt'], truncation=True, padding='max_length', max_length=64)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [25]:
train_dataset = Dataset.from_pandas(train_df).map(tokenize_fn, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_fn, batched=True)

# Load base Marian model
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-id-en").to("cuda")

Map: 100%|██████████| 12474/12474 [00:00<00:00, 12765.29 examples/s]
Map: 100%|██████████| 1386/1386 [00:00<00:00, 12676.31 examples/s]


In [None]:
# Training config
training_args = Seq2SeqTrainingArguments(
    output_dir="models",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=10,
    fp16=True,
    predict_with_generate=True,
    logging_strategy='steps',    # Log every X steps
    logging_steps=100,           # Log every 100 steps
    report_to='tensorboard',
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

  trainer = Seq2SeqTrainer(


In [19]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.095,0.098915
2,0.0566,0.069307
3,0.0361,0.056398
4,0.0227,0.052887
5,0.016,0.049061
6,0.0126,0.048643
7,0.0099,0.047258
8,0.0079,0.046588
9,0.0061,0.047256
10,0.0048,0.047142


TrainOutput(global_step=15600, training_loss=0.029894565981932176, metrics={'train_runtime': 899.6568, 'train_samples_per_second': 138.653, 'train_steps_per_second': 17.34, 'total_flos': 2114241020559360.0, 'train_loss': 0.029894565981932176, 'epoch': 10.0})

In [20]:
model.save_pretrained("models/osing-translator")
tokenizer.save_pretrained("models/osing-translator")

('models/osing-translator/tokenizer_config.json',
 'models/osing-translator/special_tokens_map.json',
 'models/osing-translator/vocab.json',
 'models/osing-translator/source.spm',
 'models/osing-translator/target.spm',
 'models/osing-translator/added_tokens.json')

## Testing

In [26]:
from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainer
import torch

model_path="models/osing-translator"

tokenizer = MarianTokenizer.from_pretrained(model_path)
model = MarianMTModel.from_pretrained(model_path).to("cuda")

In [27]:
from sacrebleu import corpus_bleu
import numpy as np

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
        
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    # Replace -100 in labels (which we set as padding) with pad_token_id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Compute BLEU score
    bleu_score = corpus_bleu(decoded_preds, [decoded_labels])
    
    return {"bleu": bleu_score.score}

In [28]:
training_args = Seq2SeqTrainingArguments(
    output_dir="models",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=10,
    fp16=True,
    predict_with_generate=True,
    logging_strategy='steps',    # Log every X steps
    logging_steps=100,           # Log every 100 steps
    report_to='tensorboard',
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    eval_dataset=val_dataset,  # Use your validation dataset
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics,  # Use the BLEU function from earlier
)

eval_results = trainer.evaluate()
print(f"BLEU score: {eval_results['eval_bleu']}")

  trainer = Seq2SeqTrainer(


BLEU score: 96.189661770254
