In [15]:
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import torch
import csv
import pandas as pd
from datasets import Dataset

In [5]:
df = pd.read_csv("data/kamus.csv", quotechar='"')
df.head()

Unnamed: 0,osing,indonesian
0,Iro wis madhang?,Kamu sudah makan
1,"Durung, isun pancen arep madhang nang kene","Belum, aku memang mau makan di sini"
2,Riko arep pesen opo?,Kamu mau pesan apa?
3,Aku pesen nasi goreng lan teh anget,Aku pesan nasi goreng dan teh hangat
4,Riko kelendi kabare?,Bagaimana kabarmu?


In [7]:
df_bidirectional = pd.concat([
    pd.DataFrame({'src': '>>osing<< ' + df['indonesian'], 'tgt': df['osing']}),
    pd.DataFrame({'src': '>>indonesian<< ' + df['osing'], 'tgt': df['indonesian']})
], ignore_index=True)

In [10]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df_bidirectional, test_size=0.1)

In [11]:
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-id-en")

def tokenize_fn(example):
    model_inputs = tokenizer(example['src'], truncation=True, padding='max_length', max_length=64)
    labels = tokenizer(example['tgt'], truncation=True, padding='max_length', max_length=64)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [13]:

train_dataset = Dataset.from_pandas(train_df).map(tokenize_fn, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_fn, batched=True)

# Load base Marian model
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-id-en")

Map: 100%|██████████| 3861/3861 [00:00<00:00, 9428.97 examples/s]
Map: 100%|██████████| 429/429 [00:00<00:00, 9819.08 examples/s]


In [18]:
# Training config
training_args = Seq2SeqTrainingArguments(
    output_dir="models",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=10,
    predict_with_generate=True,
    logging_dir='./logs',
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

  trainer = Seq2SeqTrainer(


In [19]:
trainer.train()

Epoch,Training Loss,Validation Loss




KeyboardInterrupt: 

In [None]:
model.save_pretrained("models/osing-translator")
tokenizer.save_pretrained("models/osing-translator")