# **Import Libraries**

In [None]:
import warnings
warnings.filterwarnings("ignore")

!pip install -U pip --quiet
!pip install evaluate --quiet
!pip install "unsloth[colab-new]" --extra-index-url https://download.pytorch.org/whl/cu124 --quiet
!pip install trl peft accelerate bitsandbytes datasets pandas --quiet
!pip install rouge_score --quiet

import torch
import pandas as pd
from datasets import Dataset, load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel, UnslothTrainer, UnslothTrainingArguments
from peft import LoraConfig, TaskType
import numpy as np
from transformers import DataCollatorForLanguageModeling
import evaluate
import re
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m1.0/1.8 MB[0m [31m31.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for rouge_score (pyproject.toml) ... [?25l[?25hdone
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# **Load Dataset**

**Dataset Overview**

In [None]:
# Fetch the STIF-Indonesia corpus, which pairs an "informal" sentence with its "formal" rewrite.
print("Loading STIF-Indonesia dataset...")

dataset = load_dataset("haryoaw/stif-indonesia")

# Convert each split to a pandas DataFrame; the validation split is named "dev" in this dataset.
train_df, val_df, test_df = (
    pd.DataFrame(dataset[split_name]) for split_name in ("train", "dev", "test")
)

print(f"\nTrain: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")
print("\nSample data:")
train_df.head()

Loading STIF-Indonesia dataset...


README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

dev.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/1922 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/214 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/363 [00:00<?, ? examples/s]


Train: 1922 | Val: 214 | Test: 363

Sample data:


Unnamed: 0,informal,formal
0,alhamdulillah stlh libur xxxnumberxxx hari onb...,alhamdulillah setelah libur xxxnumberxxx hari ...
1,selamat sore min . saya mau pesan tiket ka via...,selamat sore admin . saya mau pesan tiket ka v...
2,iya kak terimakasih . tapi tadi sudah datang k...,iya kak terima kasih . tetapi tadi sudah datan...
3,malam min xxxuserxxx xxxuserxxx situs kalian e...,"malam admin xxxuserxxx xxxuserxxx , apakah sit..."
4,"min pembelian token pln apa ada kendala , ini ...","admin , pembelian token pln apa ada kendala ? ..."


# **Preprocess Datasets**

**Prompt Template for Causal LM**

In [None]:
# Alpaca-style prompt template with three positional `{}` slots, filled in this
# order by `str.format`: instruction, input text, response. Passing an empty
# response yields the bare prompt that ends right after "### Response:".
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

**Tokenizing**

In [None]:
def tokenize_with_masking(examples, tokenizer, max_length=None, prompt_template=None):
    """Tokenize informal/formal pairs for causal-LM fine-tuning with prompt masking.

    Every pair is rendered twice with the prompt template: once with an empty
    response (the prompt only) and once with the formal sentence followed by the
    tokenizer's EOS token (the full training text). The labels are a copy of the
    full text's token ids in which every position covered by the prompt is set
    to ``-100``, so that only the response tokens keep a real label.

    Args:
        examples: Batch mapping with parallel ``"informal"`` and ``"formal"``
            sequences of strings (the shape ``Dataset.map(batched=True)`` passes).
        tokenizer: Callable tokenizer that exposes ``eos_token`` and returns a
            mapping with ``"input_ids"`` and ``"attention_mask"`` lists.
        max_length: Truncation length for both tokenizations. Defaults to the
            module-level ``max_seq_length``.
        prompt_template: Format string with three positional slots
            (instruction, input, response). Defaults to the module-level
            ``alpaca_prompt``.

    Returns:
        dict with the keys ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
        each a list holding one (unpadded) list per example.

    Raises:
        ValueError: If the tokenizer has no EOS token to terminate the response.
    """
    # Fall back to the notebook-level settings so existing two-argument callers keep working.
    if max_length is None:
        max_length = max_seq_length
    if prompt_template is None:
        prompt_template = alpaca_prompt

    eos_token = tokenizer.eos_token
    if eos_token is None:
        raise ValueError("tokenizer.eos_token is None; an EOS token is required to terminate responses")

    instruction = "Ubah teks berikut menjadi bahasa Indonesia yang baku dan sesuai dengan kata di kamus. Normalisasikan semua kata slang menjadi bentuk formal sehingga kalimat menjadi mudah dibaca."

    input_ids_list = []
    attention_mask_list = []
    labels_list = []

    for informal, formal in zip(examples["informal"], examples["formal"]):
        prompt_without_response = prompt_template.format(instruction, informal, "")
        full_text = prompt_template.format(instruction, informal, formal) + eos_token

        full_tokens = tokenizer(full_text, truncation=True, max_length=max_length, padding=False)
        prompt_tokens = tokenizer(prompt_without_response, truncation=True, max_length=max_length, padding=False)

        input_ids = full_tokens["input_ids"]
        attention_mask = full_tokens["attention_mask"]

        # Clamp the mask to the sequence length so a prompt that tokenizes longer
        # than the (truncated) full text cannot index past the end of the labels.
        prompt_length = min(len(prompt_tokens["input_ids"]), len(input_ids))
        labels = [-100] * prompt_length + list(input_ids[prompt_length:])

        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        labels_list.append(labels)

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "labels": labels_list,
    }

# **Modeling and Pipeline Training**

**Training Configuration**

In [None]:
def _build_training_args():
    """Create the ``UnslothTrainingArguments`` shared by every LoRA run in this notebook."""
    # Probe bf16 support once; fp16 is used only when bf16 is unavailable.
    bf16_available = torch.cuda.is_bf16_supported()

    return UnslothTrainingArguments(
        # Where checkpoints and logs go; no external experiment tracker.
        output_dir="./results",
        logging_dir="./logs",
        report_to="none",

        # Batching: 4 samples per step x 4 accumulation steps per optimizer update.
        per_device_train_batch_size=4,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=4,

        # Schedule: three epochs (max_steps=-1 leaves the epoch count in charge).
        num_train_epochs=3,
        max_steps=-1,

        # Optimisation: 8-bit AdamW, cosine schedule with a short warmup.
        optim="adamw_8bit",
        learning_rate=2e-4,
        weight_decay=0.01,
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",

        # Log, evaluate and checkpoint on the same 50-step cadence.
        logging_steps=50,
        eval_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=50,
        save_total_limit=2,

        # Select the checkpoint by validation loss and reload it at the end.
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",

        # Mixed precision
        bf16=bf16_available,
        fp16=not bf16_available,

        # Others
        seed=3407,
        remove_unused_columns=False,
    )


print("Setting up training configuration...")
training_args = _build_training_args()

# Model-loading settings handed to FastLanguageModel.from_pretrained in the training cell.
max_seq_length = 2048
dtype = None
load_in_4bit = True

# LoRA sweep: ranks 8, 16 and 32, each with lora_alpha fixed at twice the rank.
lora_configs = [{"r": rank, "lora_alpha": 2 * rank} for rank in (8, 16, 32)]

Setting up training configuration...


**Pipeline Training**

In [None]:
# PIPELINE TRAINING
# Trains one LoRA adapter per entry in `lora_configs`, each on a freshly loaded
# base model, and keeps every (model, tokenizer) pair in `models_results` for
# the evaluation and saving cells.
import copy

from transformers import DataCollatorForSeq2Seq

models_results = []

# The raw datasets do not depend on the LoRA settings, so build them once.
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(val_df)

for idx, cfg in enumerate(lora_configs, 1):
    print(f"\n\n=== TRAINING MODEL {idx}: r={cfg['r']}, lora_alpha={cfg['lora_alpha']} ===\n")

    # Load base model
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Llama-3.2-1B-Instruct",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    # Setup LoRA: only rank and alpha vary between runs.
    lora_config = {
        "r": cfg["r"],
        "lora_alpha": cfg["lora_alpha"],
        "lora_dropout": 0.1,
        "bias": "none",
        "target_modules": [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        "use_gradient_checkpointing": "unsloth",
        "use_rslora": True,
        "random_state": 3407,
        "loftq_config": None,
    }
    model = FastLanguageModel.get_peft_model(model, **lora_config)

    # Tokenize datasets; the raw text columns are dropped so only
    # input_ids / attention_mask / labels remain.
    train_dataset_tokenized = train_dataset.map(
        lambda x: tokenize_with_masking(x, tokenizer), batched=True, remove_columns=train_dataset.column_names
    )
    eval_dataset_tokenized = eval_dataset.map(
        lambda x: tokenize_with_masking(x, tokenizer), batched=True, remove_columns=eval_dataset.column_names
    )

    # Data collator: DataCollatorForLanguageModeling(mlm=False) rebuilds `labels`
    # from `input_ids`, which would discard the prompt masking computed during
    # tokenization. DataCollatorForSeq2Seq keeps the precomputed labels and pads
    # them with -100 so padded positions carry no label.
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        padding=True,
        label_pad_token_id=-100,
    )

    # Give every run its own checkpoint directory. With one shared output_dir,
    # checkpoint rotation (save_total_limit) would mix in stale checkpoints left
    # behind by the previous runs.
    run_args = copy.copy(training_args)
    run_args.output_dir = f"{training_args.output_dir}/model-{idx}-r{cfg['r']}"

    # Trainer
    trainer = UnslothTrainer(
        model=model,
        train_dataset=train_dataset_tokenized,
        eval_dataset=eval_dataset_tokenized,
        processing_class=tokenizer,
        args=run_args,
        data_collator=data_collator,
        compute_metrics=None,
        preprocess_logits_for_metrics=None
    )

    # Train
    trainer.train()

    # store results
    models_results.append((model, tokenizer))

    print("\nEvaluating model...")
    eval_results = trainer.evaluate()
    print("\nEvaluation Results:")
    for key, value in eval_results.items():
        print(f"  {key}: {value}")

    # Drop the loop-local names and release cached allocator blocks. The model
    # itself stays alive because `models_results` still references it for the
    # evaluation and saving cells.
    del model
    del tokenizer
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

print("\nAll models trained successfully!")



=== TRAINING MODEL 1: r=8, lora_alpha=16 ===

==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.11.3 patched 16 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Map:   0%|          | 0/1922 [00:00<?, ? examples/s]

Map:   0%|          | 0/214 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,922 | Num Epochs = 3 | Total steps = 363
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 5,636,096 of 1,241,450,496 (0.45% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
50,1.0258,0.696894
100,0.7025,0.65016
150,0.5717,0.637368
200,0.4398,0.613589
250,0.3939,0.616142
300,0.2544,0.643964
350,0.2251,0.648785


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient



Evaluating model...



Evaluation Results:
  eval_loss: 0.6135830283164978
  eval_runtime: 7.0188
  eval_samples_per_second: 30.489
  eval_steps_per_second: 7.694
  epoch: 3.0


=== TRAINING MODEL 2: r=16, lora_alpha=32 ===

==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Map:   0%|          | 0/1922 [00:00<?, ? examples/s]

Map:   0%|          | 0/214 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,922 | Num Epochs = 3 | Total steps = 363
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Step,Training Loss,Validation Loss
50,0.9548,0.70653
100,0.6957,0.63016
150,0.5379,0.635032
200,0.3781,0.629245
250,0.3281,0.628155
300,0.1629,0.669591
350,0.1323,0.676155



Evaluating model...



Evaluation Results:
  eval_loss: 0.6282569766044617
  eval_runtime: 7.0091
  eval_samples_per_second: 30.532
  eval_steps_per_second: 7.704
  epoch: 3.0


=== TRAINING MODEL 3: r=32, lora_alpha=64 ===

==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Map:   0%|          | 0/1922 [00:00<?, ? examples/s]

Map:   0%|          | 0/214 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,922 | Num Epochs = 3 | Total steps = 363
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 22,544,384 of 1,258,358,784 (1.79% trained)


Step,Training Loss,Validation Loss
50,1.01,0.767831
100,0.7657,0.709595
150,0.5927,0.71742
200,0.4057,0.658566
250,0.3363,0.651552
300,0.1351,0.684535
350,0.1133,0.686678



Evaluating model...



Evaluation Results:
  eval_loss: 0.6515281796455383
  eval_runtime: 7.0862
  eval_samples_per_second: 30.199
  eval_steps_per_second: 7.62
  epoch: 3.0

All models trained successfully!


# **Evaluation**

In [None]:
# --- Alpaca template ---
# Inference-time prompt with the normalization instruction already filled in and
# a single `{}` slot for the informal input text. It stops right after
# "### Response:" so that generation continues with the answer.
# NOTE(review): the instruction text is duplicated from tokenize_with_masking;
# keep both copies identical so inference prompts match the training prompts.
alpaca_prompt_norm = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Ubah teks berikut menjadi bahasa Indonesia yang baku dan sesuai dengan kata di kamus. Normalisasikan semua kata slang menjadi bentuk formal sehingga kalimat menjadi mudah dibaca.

### Input:
{}

### Response:
"""

def simple_tokenize(text):
    """Lowercase ``text`` and split it into word tokens and single punctuation marks.

    Returns a list of strings; whitespace is discarded.
    """
    lowered = text.lower()
    # Match either a run of word characters, or one character that is neither
    # a word character nor whitespace.
    return re.findall(r"\w+|[^\w\s]", lowered, re.UNICODE)

def token_accuracy(pred, truth):
    """Position-wise token match rate of ``pred`` against ``truth``.

    Both strings are tokenized with ``simple_tokenize`` and compared index by
    index over their overlapping prefix. The number of matching positions is
    divided by the number of reference tokens (treated as 1 when the reference
    is empty, which avoids a division by zero).
    """
    hyp_tokens = simple_tokenize(pred)
    ref_tokens = simple_tokenize(truth)
    # zip stops at the shorter sequence, i.e. at min(len(hyp), len(ref)).
    matches = sum(1 for hyp, ref in zip(hyp_tokens, ref_tokens) if hyp == ref)
    return matches / max(len(ref_tokens), 1)

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
def bleu_score(pred, truth):
    """Sentence-level BLEU of ``pred`` against the single reference ``truth``.

    Both strings are tokenized with ``simple_tokenize``; NLTK's ``method1``
    smoothing function is applied.
    """
    hypothesis = simple_tokenize(pred)
    references = [simple_tokenize(truth)]  # sentence_bleu takes a list of references
    smoothing = SmoothingFunction().method1
    return sentence_bleu(references, hypothesis, smoothing_function=smoothing)

# One ROUGE-L scorer shared by every call to rouge_l_score.
# NOTE(review): use_stemmer=True presumably applies an English stemmer - confirm
# that this is intended for Indonesian text.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
def rouge_l_score(pred, truth):
    """Return the ROUGE-L F-measure for ``pred``, scored with ``truth`` as the first argument."""
    rouge_scores = scorer.score(truth, pred)
    return rouge_scores['rougeL'].fmeasure

all_model_metrics = []

# Every model is scored on the same fixed 50-row slice of the test split.
eval_rows = test_df[150:200]
device = "cuda" if torch.cuda.is_available() else "cpu"

for idx, (model, tokenizer) in enumerate(models_results, 1):
    print(f"\n==================== EVALUATION MODEL {idx} ====================\n")

    model.to(device)
    FastLanguageModel.for_inference(model)

    results = []
    for _, row in eval_rows.iterrows():
        text = row['informal']
        full_prompt = alpaca_prompt_norm.format(text)
        inputs = tokenizer([full_prompt], return_tensors="pt").to(device)

        # Greedy decoding. Sampling knobs (temperature / top_p) are left out on
        # purpose: with do_sample=False they are ignored and only trigger warnings.
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
        )

        # Decode only the newly generated tokens. Splitting the full decoded text
        # on "### Response:" would pick the wrong segment whenever the input
        # itself contains that marker.
        prompt_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

        results.append({
            "original": text,
            "normalized": response,
            "truth": row['formal']
        })

    token_acc_list = [token_accuracy(r['normalized'], r['truth']) for r in results]
    bleu_list = [bleu_score(r['normalized'], r['truth']) for r in results]
    rouge_list = [rouge_l_score(r['normalized'], r['truth']) for r in results]

    # Average each metric once and reuse the value for printing and storage.
    avg_token_accuracy = np.mean(token_acc_list)
    avg_bleu = np.mean(bleu_list)
    avg_rouge_l = np.mean(rouge_list)

    print("=== Evaluation Metrics ===")
    print(f"Average token-level accuracy: {avg_token_accuracy*100:.2f}%")
    print(f"Average BLEU score          : {avg_bleu*100:.2f}%")
    print(f"Average ROUGE-L             : {avg_rouge_l*100:.2f}%")

    metrics = {
        "model_id": idx,
        "avg_token_accuracy": avg_token_accuracy,
        "avg_bleu": avg_bleu,
        "avg_rougeL": avg_rouge_l,
        "results": results
    }
    all_model_metrics.append(metrics)

    # Drop the loop-local names and release cached allocator blocks. The models
    # stay alive because `models_results` still references them.
    del model
    del tokenizer
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()



=== Evaluation Metrics ===
Average token-level accuracy: 43.42%
Average BLEU score          : 50.14%
Average ROUGE-L             : 80.69%


=== Evaluation Metrics ===
Average token-level accuracy: 45.01%
Average BLEU score          : 53.18%
Average ROUGE-L             : 82.47%


=== Evaluation Metrics ===
Average token-level accuracy: 46.71%
Average BLEU score          : 54.98%
Average ROUGE-L             : 82.73%


In [None]:
# Tabulate the per-model metrics, rescale the averages to percentages and write them to CSV.
metrics_df = pd.DataFrame(all_model_metrics)
for pct_column in ("avg_token_accuracy", "avg_bleu", "avg_rougeL"):
    metrics_df[pct_column] = metrics_df[pct_column] * 100
metrics_df.to_csv("models_evaluation_metrics.csv", index=False)

# Print a summary per model, followed by the last five evaluated examples.
for idx, row in metrics_df.iterrows():
    print(f"\n=== Model {row['model_id']} ===")
    print(f"Avg Token Accuracy: {row['avg_token_accuracy']:.2f}%")
    print(f"Avg BLEU Score: {row['avg_bleu']:.2f}%")
    print(f"Avg ROUGE-L Score: {row['avg_rougeL']:.2f}%\n")

    print("Examples Results:")
    shown_examples = row['results'][45:50]
    for i, r in enumerate(shown_examples, start=1):
        print(f"[{i}]\noriginal   : {r['original']}")
        print(f"normalized : {r['normalized']}")
        print(f"truth      : {r['truth']}")
        print("-" * 50)


=== Model 1 ===
Avg Token Accuracy: 43.42%
Avg BLEU Score: 50.14%
Avg ROUGE-L Score: 80.69%

Examples Results:
[1]
original   : limit transfer dr bca ke bank lain per hari brp ya via klikbca ?
normalized : batas transfer dari bca ke bank lain per hari berapa? melalui klikbca?
truth      : batas pengiriman dari bca ke bank lain per hari berapa melalu klikbca ?
--------------------------------------------------
[2]
original   : ti ati tar nyasar lagi ke kampung lain .
normalized : ti ati tar, nyasar lagi ke kampung lain.
truth      : hati - hati nanti tersesat lagi ke kampung lain .
--------------------------------------------------
[3]
original   : gak bisa . gagal terus , ra aura .
normalized : tidak bisa. selalu gagal. ra aura.
truth      : tidak bisa . gagal terus , ra aura .
--------------------------------------------------
[4]
original   : malam kak , apa cs disini aktif ? mau sampaikan keluhan tracking paket nyasar .
normalized : malam kak, apa cs di sini aktif? mau sampaikan ke

# **Save Model**

In [None]:
from google.colab import files
import shutil
import os

# For each stored (model, tokenizer) pair: save both into a per-model folder,
# zip that folder and hand the archive to the Colab download helper.
for idx, (model, tokenizer) in enumerate(models_results, start=1):
    folder_name = f"text-norm-model-{idx}"
    zip_path = f"{folder_name}.zip"
    print(f"\nSaving Model {idx} to '{folder_name}/' ...")

    os.makedirs(folder_name, exist_ok=True)

    # Model first, then tokenizer, both into the same folder.
    for saveable in (model, tokenizer):
        saveable.save_pretrained(folder_name)
    print(f"LoRA adapters for Model {idx} saved to '{folder_name}/'")

    # The archive is named after the folder and contains the folder's contents.
    shutil.make_archive(base_name=folder_name, format='zip', root_dir=folder_name)
    files.download(zip_path)
    print(f"Model {idx} archived and ready for download.\n")

print("All models saved and archived successfully!")


Saving Model 1 to 'text-norm-model-1/' ...
LoRA adapters for Model 1 saved to 'text-norm-model-1/'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model 1 archived and ready for download.


Saving Model 2 to 'text-norm-model-2/' ...
LoRA adapters for Model 2 saved to 'text-norm-model-2/'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model 2 archived and ready for download.


Saving Model 3 to 'text-norm-model-3/' ...
LoRA adapters for Model 3 saved to 'text-norm-model-3/'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model 3 archived and ready for download.

All models saved and archived successfully!
