# **Import Libraries**

In [None]:
import subprocess
import sys
import os

# Bootstrap cell: make sure every third-party package the notebook imports
# directly is installed before the real imports run.
import importlib
import importlib.util

print("Installing required packages...")
# pip distribution names; the importable module name is obtained by replacing
# '-' with '_'. 'datasets' and 'nltk' are listed because the import cell below
# uses them directly.
packages = [
    'transformers',
    'datasets',
    'peft',
    'evaluate',
    'nltk',
    'rouge_score',
    'sacrebleu',
    'sentencepiece',
    'accelerate',
    'bitsandbytes'
]

installed_any = False
for package in packages:
    module_name = package.replace('-', '_')
    # find_spec only locates a top-level module; it does not execute it. That
    # keeps the probe cheap and avoids failures from packages whose import can
    # raise something other than ImportError.
    if importlib.util.find_spec(module_name) is None:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
        installed_any = True

if installed_any:
    # Packages installed while the interpreter is running are only guaranteed
    # to be found by the import system after the finder caches are cleared.
    importlib.invalidate_caches()

import torch
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import numpy as np
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

Installing required packages...
Installing evaluate...
Installing rouge_score...
Installing sacrebleu...
Installing bitsandbytes...


# **Load Dataset**

**Overview Dataset**

In [None]:
# Download the STIF-Indonesia corpus and expose each split as a DataFrame.
print("Loading STIF-Indonesia dataset...")

dataset = load_dataset("haryoaw/stif-indonesia")

# The corpus calls its validation split 'dev'.
train_df, val_df, test_df = (
    pd.DataFrame(dataset[split_name]) for split_name in ('train', 'dev', 'test')
)

print(f"\nTrain: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")
print("\nSample data:")
train_df.head()

Loading STIF-Indonesia dataset...


README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

dev.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/1922 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/214 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/363 [00:00<?, ? examples/s]


Train: 1922 | Val: 214 | Test: 363

Sample data:


Unnamed: 0,informal,formal
0,alhamdulillah stlh libur xxxnumberxxx hari onb...,alhamdulillah setelah libur xxxnumberxxx hari ...
1,selamat sore min . saya mau pesan tiket ka via...,selamat sore admin . saya mau pesan tiket ka v...
2,iya kak terimakasih . tapi tadi sudah datang k...,iya kak terima kasih . tetapi tadi sudah datan...
3,malam min xxxuserxxx xxxuserxxx situs kalian e...,"malam admin xxxuserxxx xxxuserxxx , apakah sit..."
4,"min pembelian token pln apa ada kendala , ini ...","admin , pembelian token pln apa ada kendala ? ..."


# **Preprocess Datasets**

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of informal/formal sentence pairs for seq2seq training.

    Relies on the module-level `tokenizer` and `CONFIG`, which must be defined
    before this function is called.

    Args:
        examples: Batch mapping with an 'informal' and a 'formal' column, each
            an iterable of strings.

    Returns:
        The tokenizer output for the prompted informal sentences, with the
        token ids of the formal sentences stored under the "labels" key.
    """
    # Task prefix prepended to every source sentence.
    prompts = ["Formalize: " + text for text in examples['informal']]

    # No padding at this stage: sequences keep their own lengths.
    encoded_inputs = tokenizer(
        prompts,
        truncation=True,
        padding=False,
        max_length=CONFIG['MAX_INPUT_LENGTH']
    )

    encoded_targets = tokenizer(
        text_target=examples['formal'],
        truncation=True,
        padding=False,
        max_length=CONFIG['MAX_TARGET_LENGTH']
    )

    encoded_inputs["labels"] = encoded_targets["input_ids"]
    return encoded_inputs

# **Modeling and Training**

**Training Configuration**

In [None]:
# Central hyperparameters read by tokenization, training and generation code.
CONFIG = dict(
    # Base checkpoint
    MODEL_NAME="facebook/mbart-large-50",
    # Token limits for source and target sequences
    MAX_INPUT_LENGTH=256,
    MAX_TARGET_LENGTH=256,
    # Optimisation
    BATCH_SIZE=8,
    GRADIENT_ACCUMULATION_STEPS=2,
    LEARNING_RATE=1e-5,
    NUM_EPOCHS=10,
    # Regularisation
    LORA_DROPOUT=0.1,
    WARMUP_RATIO=0.05,
    WEIGHT_DECAY=0.01,
    LABEL_SMOOTHING=0.1,
    # Generation and early stopping
    NUM_BEAMS=5,
    EARLY_STOPPING_PATIENCE=3,
)

In [None]:
# Training arguments used by every LoRA run; tunable values come from CONFIG.
training_args = Seq2SeqTrainingArguments(
    # --- checkpointing: evaluate and save once per epoch ---
    output_dir="./results_optimized",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    save_safetensors=True,
    # --- best-checkpoint selection: lower eval_loss is better ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # --- optimisation schedule ---
    learning_rate=CONFIG['LEARNING_RATE'],
    lr_scheduler_type="cosine",
    warmup_ratio=CONFIG['WARMUP_RATIO'],
    weight_decay=CONFIG['WEIGHT_DECAY'],
    label_smoothing_factor=CONFIG['LABEL_SMOOTHING'],
    num_train_epochs=CONFIG['NUM_EPOCHS'],
    # --- batching ---
    per_device_train_batch_size=CONFIG['BATCH_SIZE'],
    per_device_eval_batch_size=CONFIG['BATCH_SIZE'],
    gradient_accumulation_steps=CONFIG['GRADIENT_ACCUMULATION_STEPS'],
    dataloader_num_workers=0,
    # --- generation settings used when predict_with_generate is on ---
    predict_with_generate=True,
    generation_max_length=CONFIG['MAX_TARGET_LENGTH'],
    generation_num_beams=CONFIG['NUM_BEAMS'],
    # --- precision: fp16 is requested only when a CUDA device is present ---
    fp16=torch.cuda.is_available(),
    # --- logging ---
    logging_steps=50,
    logging_first_step=True,
    report_to="none",
)

# LoRA sweep: one entry per rank, with lora_alpha fixed at twice the rank.
lora_configs = [
    {"r": rank, "lora_alpha": 2 * rank}
    for rank in (8, 16, 32, 64)
]

**Pipeline Training**

In [None]:
# PIPELINE TRAINING
# Fine-tunes one LoRA adapter per entry in `lora_configs`, starting every run
# from a freshly loaded base model. Each trained (model, tokenizer) pair is
# appended to `models_results` for the evaluation and saving cells.
models_results = []

for idx, cfg in enumerate(lora_configs, 1):
    print(f"\n\n=== TRAINING MODEL {idx}: r={cfg['r']}, lora_alpha={cfg['lora_alpha']} ===\n")

    # load model
    print(f"\nLoading model: {CONFIG['MODEL_NAME']}")
    # Source and target language are both Indonesian.
    # `tokenizer` is bound at module level on purpose: preprocess_function
    # reads it as a global.
    tokenizer = AutoTokenizer.from_pretrained(
        CONFIG['MODEL_NAME'],
        src_lang="id_ID",
        tgt_lang="id_ID"
    )

    # Half precision and automatic device placement only when CUDA is present.
    model = AutoModelForSeq2SeqLM.from_pretrained(
        CONFIG['MODEL_NAME'],
        dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None
    )

    # lora adapters
    print("\nConfiguring LoRA...")
    lora_config = LoraConfig(
        r=cfg["r"],
        lora_alpha=cfg["lora_alpha"],
        target_modules=[
            "q_proj", "v_proj", "k_proj", "out_proj",
            "fc1", "fc2"
        ],
        lora_dropout=CONFIG['LORA_DROPOUT'],
        bias="none",
        task_type=TaskType.SEQ_2_SEQ_LM
    )

    # NOTE(review): the model is not loaded in a quantized format here, yet it
    # still goes through prepare_model_for_kbit_training. Presumably the run
    # relies on what that helper does to the base weights; confirm before
    # removing this call.
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # preprocess data
    # NOTE(review): tokenization is repeated for every run, and the tokenized
    # test split is not used by the trainer below.
    print("\nPreprocessing datasets...")
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)
    test_dataset = Dataset.from_pandas(test_df)

    tokenized_train = train_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=train_dataset.column_names,
        desc="Tokenizing train",
        num_proc=4
    )

    tokenized_val = val_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=val_dataset.column_names,
        desc="Tokenizing validation",
        num_proc=4
    )

    tokenized_test = test_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=test_dataset.column_names,
        desc="Tokenizing test",
        num_proc=4
    )

    # Data collator: pads each batch dynamically. Label padding uses -100.
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        label_pad_token_id=-100,
        padding=True
    )

    # Trainer
    # `processing_class` replaces the deprecated `tokenizer` keyword of Trainer.
    # NOTE(review): every run shares `training_args`, so all runs write their
    # checkpoints into the same output_dir.
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        processing_class=tokenizer,
        data_collator=data_collator,
        callbacks=[
            EarlyStoppingCallback(early_stopping_patience=CONFIG['EARLY_STOPPING_PATIENCE'])
        ]
    )

    # Train
    trainer.train()

    # store results
    models_results.append((model, tokenizer))

    print("\nEvaluating model...")
    eval_results = trainer.evaluate()
    print("\nEvaluation Results:")
    for key, value in eval_results.items():
        print(f"  {key}: {value}")

    # clean mem
    # `del` only unbinds the loop-local names. The objects stay alive because
    # models_results still references them, so this does not release the
    # trained model's memory.
    del model
    del tokenizer
    # CUDA-only housekeeping, skipped on the CPU fallback path.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

print("\nAll models trained successfully!")



=== TRAINING MODEL 1: r=8, lora_alpha=16 ===


Loading model: facebook/mbart-large-50


tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]


Configuring LoRA...
trainable params: 4,325,376 || all params: 615,204,864 || trainable%: 0.7031

Preprocessing datasets...


Tokenizing train (num_proc=4):   0%|          | 0/1922 [00:00<?, ? examples/s]

Tokenizing validation (num_proc=4):   0%|          | 0/214 [00:00<?, ? examples/s]

Tokenizing test (num_proc=4):   0%|          | 0/363 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss
1,6.8691,6.182165
2,4.8419,4.463041
3,4.115,3.925009
4,4.2665,3.059696
5,2.9578,2.794426
6,2.8723,2.683839
7,2.8057,2.6377
8,2.7728,2.626582
9,2.7443,2.61525
10,2.7289,2.614149



Evaluating model...



Evaluation Results:
  eval_loss: 2.614149332046509
  eval_runtime: 3.2331
  eval_samples_per_second: 66.191
  eval_steps_per_second: 8.351
  epoch: 10.0


=== TRAINING MODEL 2: r=16, lora_alpha=32 ===


Loading model: facebook/mbart-large-50

Configuring LoRA...
trainable params: 8,650,752 || all params: 619,530,240 || trainable%: 1.3963

Preprocessing datasets...


Tokenizing train (num_proc=4):   0%|          | 0/1922 [00:00<?, ? examples/s]

Tokenizing validation (num_proc=4):   0%|          | 0/214 [00:00<?, ? examples/s]

Tokenizing test (num_proc=4):   0%|          | 0/363 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss
1,6.3628,4.759487
2,4.4835,3.915739
3,3.6474,2.936852
4,2.8799,2.660164
5,2.7145,2.585855
6,2.6633,2.53109
7,2.6191,2.506062
8,2.5893,2.496741
9,2.5695,2.488935
10,2.5578,2.490527



Evaluating model...



Evaluation Results:
  eval_loss: 2.4889354705810547
  eval_runtime: 2.8545
  eval_samples_per_second: 74.969
  eval_steps_per_second: 9.459
  epoch: 10.0


=== TRAINING MODEL 3: r=32, lora_alpha=64 ===


Loading model: facebook/mbart-large-50

Configuring LoRA...
trainable params: 17,301,504 || all params: 628,180,992 || trainable%: 2.7542

Preprocessing datasets...


Tokenizing train (num_proc=4):   0%|          | 0/1922 [00:00<?, ? examples/s]

Tokenizing validation (num_proc=4):   0%|          | 0/214 [00:00<?, ? examples/s]

Tokenizing test (num_proc=4):   0%|          | 0/363 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss
1,5.8298,4.511625
2,4.0287,3.247535
3,2.8142,2.611278
4,2.6837,2.533974
5,2.5477,2.465314
6,2.513,2.438838
7,2.4748,2.419171
8,2.4467,2.414804
9,2.4292,2.40582
10,2.4203,2.407368



Evaluating model...



Evaluation Results:
  eval_loss: 2.4058196544647217
  eval_runtime: 2.8604
  eval_samples_per_second: 74.814
  eval_steps_per_second: 9.439
  epoch: 10.0


=== TRAINING MODEL 4: r=64, lora_alpha=128 ===


Loading model: facebook/mbart-large-50

Configuring LoRA...
trainable params: 34,603,008 || all params: 645,482,496 || trainable%: 5.3608

Preprocessing datasets...


Tokenizing train (num_proc=4):   0%|          | 0/1922 [00:00<?, ? examples/s]

Tokenizing validation (num_proc=4):   0%|          | 0/214 [00:00<?, ? examples/s]

Tokenizing test (num_proc=4):   0%|          | 0/363 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss
1,5.1285,4.06291
2,3.7672,2.700506
3,2.6082,2.473125
4,2.4708,2.400123
5,2.3829,2.369742
6,2.353,2.348262
7,2.8452,2.990387
8,3.416,4.006779
9,3.0114,2.684305
10,2.6388,2.653162



Evaluating model...



Evaluation Results:
  eval_loss: 2.348262310028076
  eval_runtime: 2.8734
  eval_samples_per_second: 74.477
  eval_steps_per_second: 9.397
  epoch: 10.0

All models trained successfully!


# **Evaluation**

In [None]:
def normalize_text(model, informal_text):
    """Generate the formal version of a single informal sentence.

    Relies on the module-level `tokenizer` and `CONFIG`.

    Args:
        model: Fine-tuned seq2seq model; generation runs on `model.device`.
        informal_text: The informal sentence to rewrite.

    Returns:
        The decoded first generated sequence, without special tokens and with
        surrounding whitespace stripped.
    """
    device = model.device
    model.eval()

    # The prompt prefix has to be the one the model was fine-tuned with
    # (see preprocess_function); any other prefix is out of distribution.
    input_text = "Formalize: " + informal_text
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=CONFIG['MAX_INPUT_LENGTH'],
        truncation=True
    ).to(device)

    # Force decoding to start with the Indonesian language code.
    forced_bos_token_id = tokenizer.lang_code_to_id["id_ID"]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=CONFIG['MAX_TARGET_LENGTH'],
            num_beams=5,
            early_stopping=True,
            forced_bos_token_id=forced_bos_token_id,
            no_repeat_ngram_size=2,
            length_penalty=1.0
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

def predict_batch(model, texts, batch_size=8):
    """Generate formal versions for a list of informal sentences, in batches.

    Relies on the module-level `tokenizer` and `CONFIG`.

    Args:
        model: Fine-tuned seq2seq model; generation runs on `model.device`.
        texts: List of informal sentences.
        batch_size: Number of sentences generated per forward pass.

    Returns:
        A list with one stripped, decoded prediction per input sentence, in
        input order. An empty `texts` yields an empty list.
    """
    device = model.device
    model.eval()

    # Force decoding to start with the Indonesian language code.
    forced_bos_token_id = tokenizer.lang_code_to_id["id_ID"]
    all_predictions = []

    # The prompt prefix has to be the one the model was fine-tuned with
    # (see preprocess_function); any other prefix is out of distribution.
    texts = ["Formalize: " + text for text in texts]

    for i in tqdm(range(0, len(texts), batch_size), desc="Predicting"):
        batch = texts[i:i+batch_size]

        # Pad within the batch so the sentences form one rectangular tensor.
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            max_length=CONFIG['MAX_INPUT_LENGTH'],
            truncation=True,
            padding=True
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=CONFIG['MAX_TARGET_LENGTH'],
                num_beams=5,
                early_stopping=True,
                forced_bos_token_id=forced_bos_token_id,
                no_repeat_ngram_size=2
            )

        predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        all_predictions.extend([p.strip() for p in predictions])

    return all_predictions

In [None]:
def simple_tokenize(text):
    """Lowercase `text` and split it into word tokens and punctuation marks.

    A token is either a run of word characters or a single character that is
    neither a word character nor whitespace.

    Returns:
        The tokens as a list of strings, in order of appearance.
    """
    import re
    token_pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)
    return token_pattern.findall(text.lower())

def token_accuracy(pred, truth):
    """Position-wise token accuracy of `pred` against `truth`.

    Both strings are tokenized with simple_tokenize. Tokens are compared index
    by index over the overlapping prefix, and the number of matches is divided
    by the number of reference tokens (or by 1 when the reference is empty).

    Returns:
        The accuracy as a float.
    """
    predicted = simple_tokenize(pred)
    reference = simple_tokenize(truth)
    # zip stops at the shorter sequence, so only shared positions are compared.
    matches = sum(1 for p_tok, r_tok in zip(predicted, reference) if p_tok == r_tok)
    denominator = max(len(reference), 1)
    return matches / denominator

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
def bleu_score(pred, truth):
    """Sentence-level BLEU of `pred` against the single reference `truth`.

    Both strings are tokenized with simple_tokenize, and NLTK's `method1`
    smoothing function is passed to `sentence_bleu`.

    Returns:
        The value returned by `sentence_bleu`.
    """
    hypothesis = simple_tokenize(pred)
    references = [simple_tokenize(truth)]
    smoothing = SmoothingFunction().method1
    return sentence_bleu(references, hypothesis, smoothing_function=smoothing)

# One shared ROUGE-L scorer, built once and reused for every sentence pair.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)


def rouge_l_score(pred, truth):
    """Return the ROUGE-L F-measure of `pred` scored against `truth`."""
    rouge_l = scorer.score(truth, pred)['rougeL']
    return rouge_l.fmeasure

In [None]:
# Score every trained model on a slice of the test set and collect the
# per-model averages, plus the per-sentence rows, in `all_model_metrics`.
all_model_metrics = []

# `tokenizer` is rebound at module level by this loop on purpose:
# predict_batch reads it as a global.
for idx, (model, tokenizer) in enumerate(models_results, 1):
    print(f"\n==================== EVALUATION MODEL {idx} ====================\n")

    # NOTE(review): only test rows 150-199 are scored, not the full test split.
    results_df = test_df[150:200].copy()
    test_predictions = predict_batch(model, results_df['informal'].tolist())
    results_df['normalized'] = test_predictions

    token_acc_list = []
    bleu_list = []
    rouge_list = []

    for row in results_df.itertuples():
        token_acc_list.append(token_accuracy(row.normalized, row.formal))
        bleu_list.append(bleu_score(row.normalized, row.formal))
        rouge_list.append(rouge_l_score(row.normalized, row.formal))

    # Average once and reuse the values for both the printout and the record.
    avg_token_accuracy = float(np.mean(token_acc_list))
    avg_bleu = float(np.mean(bleu_list))
    avg_rouge_l = float(np.mean(rouge_list))

    print("=== Evaluation Metrics ===")
    print(f"Average token-level accuracy: {avg_token_accuracy*100:.2f}%")
    print(f"Average BLEU score          : {avg_bleu*100:.2f}%")
    print(f"Average ROUGE-L             : {avg_rouge_l*100:.2f}%")

    metrics = {
        "model_id": idx,
        "avg_token_accuracy": avg_token_accuracy,
        "avg_bleu": avg_bleu,
        "avg_rougeL": avg_rouge_l,
        "results": results_df.to_dict(orient="records")
    }
    all_model_metrics.append(metrics)

    # `del` only unbinds the loop names; models_results still holds the objects.
    del model
    del tokenizer
    # CUDA-only housekeeping, skipped on the CPU fallback path.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()





Predicting:   0%|          | 0/7 [00:00<?, ?it/s]

=== Evaluation Metrics ===
Average token-level accuracy: 40.22%
Average BLEU score          : 47.16%
Average ROUGE-L             : 77.01%




Predicting:   0%|          | 0/7 [00:00<?, ?it/s]

=== Evaluation Metrics ===
Average token-level accuracy: 41.91%
Average BLEU score          : 47.45%
Average ROUGE-L             : 77.54%




Predicting:   0%|          | 0/7 [00:00<?, ?it/s]

=== Evaluation Metrics ===
Average token-level accuracy: 43.40%
Average BLEU score          : 49.57%
Average ROUGE-L             : 79.99%




Predicting:   0%|          | 0/7 [00:00<?, ?it/s]

=== Evaluation Metrics ===
Average token-level accuracy: 41.09%
Average BLEU score          : 50.18%
Average ROUGE-L             : 80.27%


In [None]:
# Build the per-model summary table, export it, and print a short report.
metrics_df = pd.DataFrame(all_model_metrics)

# The averages are stored as fractions; convert them to percentages.
for metric_column in ("avg_token_accuracy", "avg_bleu", "avg_rougeL"):
    metrics_df[metric_column] = metrics_df[metric_column] * 100

metrics_df.to_csv("models_evaluation_metrics.csv", index=False)

for idx, row in metrics_df.iterrows():
    print(f"\n=== Model {row['model_id']} ===")
    print(f"Avg Token Accuracy: {row['avg_token_accuracy']:.2f}%")
    print(f"Avg BLEU Score: {row['avg_bleu']:.2f}%")
    print(f"Avg ROUGE-L Score: {row['avg_rougeL']:.2f}%\n")

    print("Examples Results:")
    # Show the evaluated rows at positions 45-49 as qualitative examples.
    example_rows = row['results'][45:50]
    for i, r in enumerate(example_rows, 1):
        print(f"[{i}]\noriginal   : {r['informal']}")
        print(f"normalized : {r['normalized']}")
        print(f"truth      : {r['formal']}")
        print("-" * 50)


=== Model 1 ===
Avg Token Accuracy: 40.22%
Avg BLEU Score: 47.16%
Avg ROUGE-L Score: 77.01%

Examples Results:
[1]
original   : limit transfer dr bca ke bank lain per hari brp ya via klikbca ?
normalized : limit transfer dari bca ke bank lain per hari berapa ? via klikbca ?
truth      : batas pengiriman dari bca ke bank lain per hari berapa melalu klikbca ?
--------------------------------------------------
[2]
original   : ti ati tar nyasar lagi ke kampung lain .
normalized : ti ati tar nyasar lagi ke kampung lain .
truth      : hati - hati nanti tersesat lagi ke kampung lain .
--------------------------------------------------
[3]
original   : gak bisa . gagal terus , ra aura .
normalized : tidak bisa . gagal terus , ra aura !
truth      : tidak bisa . gagal terus , ra aura .
--------------------------------------------------
[4]
original   : malam kak , apa cs disini aktif ? mau sampaikan keluhan tracking paket nyasar .
normalized : malam kak , apa cs disini aktif ? mau sampaikan k

# **Save Model**

In [None]:
from google.colab import files
import shutil
import os

# Write each trained model and its tokenizer to a dedicated folder, zip the
# folder, and hand the archive to the Colab download helper.
for idx, (model, tokenizer) in enumerate(models_results, 1):
    folder_name = f"text-norm-model-{idx}"
    zip_path = f"{folder_name}.zip"
    print(f"\nSaving Model {idx} to '{folder_name}/' ...")

    os.makedirs(folder_name, exist_ok=True)

    # Model first, then tokenizer, both into the same folder.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(folder_name)
    print(f"LoRA adapters for Model {idx} saved to '{folder_name}/'")

    # base_name and root_dir are both the folder, so the archive is written to
    # '<folder_name>.zip' and contains the folder's contents.
    shutil.make_archive(folder_name, 'zip', folder_name)
    files.download(zip_path)
    print(f"Model {idx} archived and ready for download.\n")

print("All models saved and archived successfully!")


Saving Model 1 to 'text-norm-model-1/' ...
LoRA adapters for Model 1 saved to 'text-norm-model-1/'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model 1 archived and ready for download.


Saving Model 2 to 'text-norm-model-2/' ...
LoRA adapters for Model 2 saved to 'text-norm-model-2/'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model 2 archived and ready for download.


Saving Model 3 to 'text-norm-model-3/' ...
LoRA adapters for Model 3 saved to 'text-norm-model-3/'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model 3 archived and ready for download.


Saving Model 4 to 'text-norm-model-4/' ...
LoRA adapters for Model 4 saved to 'text-norm-model-4/'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model 4 archived and ready for download.

All models saved and archived successfully!
