## Training

In [1]:
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import torch
import csv
import pandas as pd
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("data/kamus.csv", quotechar='"')
df.head()

Unnamed: 0,osing,indonesian
0,Iro wis madhang?,Kamu sudah makan
1,"Durung, isun pancen arep madhang nang kene","Belum, aku memang mau makan di sini"
2,Riko arep pesen opo?,Kamu mau pesan apa?
3,Aku pesen nasi goreng lan teh anget,Aku pesan nasi goreng dan teh hangat
4,Riko kelendi kabare?,Bagaimana kabarmu?


In [3]:
df_bidirectional = pd.concat([
    pd.DataFrame({'src': '>>osing<< ' + df['indonesian'], 'tgt': df['osing']}),
    pd.DataFrame({'src': '>>indonesian<< ' + df['osing'], 'tgt': df['indonesian']})
], ignore_index=True)

In [4]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df_bidirectional, test_size=0.1)

In [5]:
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-id-en")

def tokenize_fn(example):
    model_inputs = tokenizer(example['src'], truncation=True, padding='max_length', max_length=64)
    labels = tokenizer(example['tgt'], truncation=True, padding='max_length', max_length=64)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:

train_dataset = Dataset.from_pandas(train_df).map(tokenize_fn, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_fn, batched=True)

# Load base Marian model
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-id-en")

Map: 100%|██████████| 3861/3861 [00:00<00:00, 12609.35 examples/s]
Map: 100%|██████████| 429/429 [00:00<00:00, 12349.99 examples/s]


Error while downloading from https://cdn-lfs.hf.co/Helsinki-NLP/opus-mt-id-en/e71532a9cfa6392e7ac5f725d3c9dc82ff6c5a9701b1a407db6dcc25bf4440ce?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1750268097&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc1MDI2ODA5N319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9IZWxzaW5raS1OTFAvb3B1cy1tdC1pZC1lbi9lNzE1MzJhOWNmYTYzOTJlN2FjNWY3MjVkM2M5ZGM4MmZmNmM1YTk3MDFiMWE0MDdkYjZkY2MyNWJmNDQ0MGNlP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=LFKEoqCt%7EusP-R7iN16akCS2pnm3OCOPdt%7E%7El7Cqd4Gdurpptgyoy2Cl-XJp2cdTfKw5gFO2IZiCcRGCQUVlM8U3wUY0PU2iJVofbrhgvQDKnwQQQ4fSYJphK9woNIK8CyIZKRxi16KlXEs-WypRUUBvGECKiMR20CWh6JYXu39j-l-gwMBVgfpVFNfM7HSM-zBbiS3dlnwq7yL8AOWalJy1mJjuaOcZszqeEAxzABbgdw1vKwfEI1jniG5AUnse72F3j9cLaHQdUHUVqonmdNcaySNyxKYLeQfTGMLDLybAmj4mY53Bc%7EvL-pGadGrg4d8009v2cQg36kKag6EI-g__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSC

In [7]:
# Training config
training_args = Seq2SeqTrainingArguments(
    output_dir="models",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=10,
    predict_with_generate=True,
    logging_dir='./logs',
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

  return torch._C._cuda_getDeviceCount() > 0
  trainer = Seq2SeqTrainer(


In [8]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5222,0.291898
2,0.2385,0.219548
3,0.1251,0.180258
4,0.0769,0.162918
5,0.0475,0.158777
6,0.0304,0.15212
7,0.0208,0.151856
8,0.0141,0.151311
9,0.0111,0.150808
10,0.0089,0.151016




TrainOutput(global_step=9660, training_loss=0.09964796182531747, metrics={'train_runtime': 6860.051, 'train_samples_per_second': 5.628, 'train_steps_per_second': 1.408, 'total_flos': 654407934935040.0, 'train_loss': 0.09964796182531747, 'epoch': 10.0})

In [10]:
model.save_pretrained("models/osing-translator")
tokenizer.save_pretrained("models/osing-translator")

('models/osing-translator/tokenizer_config.json',
 'models/osing-translator/special_tokens_map.json',
 'models/osing-translator/vocab.json',
 'models/osing-translator/source.spm',
 'models/osing-translator/target.spm',
 'models/osing-translator/added_tokens.json')

## Testing

In [2]:
from transformers import MarianTokenizer, MarianMTModel
import torch

model_path="models/osing-translator"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = MarianTokenizer.from_pretrained(model_path)
model = MarianMTModel.from_pretrained(model_path).to(device)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def translate(text, lang_tag=">>osing<< "):  # or >>indonesia<<
    input_text = lang_tag + text
    tokens = tokenizer([input_text], return_tensors="pt", padding=True)
    output = model.generate(**tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)

print(translate("apa kabar?", ">>osing<<"))
print(translate("kamu makan nasi", ">>osing<<"))
print(translate("saya mau makan terlebih dahulu", ">>osing<<"))
print(translate("siapa nama kamu?", ">>osing<<"))
print(translate("kamu sudah makan belum?"))
print(translate("Anak yang belum tahu", ">>indonesian<<"))

Riko kelendi kabare?
Riko madhang sego
isun arep madhang dhisik?
Sopo arane riko?
Riko wis madhang saka?
Walaupun, Walaupun, Walaupun, Walaupun, Walaupun, Meskipun muda Tawa unsapa.
