<a href="https://colab.research.google.com/github/Aisyaululasmi/Assignment/blob/main/Assignment_3_PBAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random
import numpy as np
import torch

SEED = 42

def set_seed(sd=SEED):
    random.seed(sd)
    np.random.seed(sd)
    torch.manual_seed(sd)
    torch.cuda.manual_seed_all(sd)

In [2]:
set_seed()

In [3]:
from datasets import load_dataset
ds = load_dataset("intanm/indonesian-financial-sentiment-analysis")
df_train = ds["train"].to_pandas()
df_test = ds["test"].to_pandas()

print(df_train.head())

train.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/1322 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/652 [00:00<?, ? examples/s]

                                                text  label
0  Kenalin Ini Rika, Teller BRI yang Nyikat Uang ...      0
1  Kepo Prospek UMKM? Yuk Daftar BRI Microfinance...      1
2  BRI dan Cita Tenun Indonesia Garap Pameran ANT...      1
3  Dirut BRI: 2023 Jadi Tahun Ekspansi, Masyaraka...      1
4  Lampaui Target, BRI Sukses Jual SBN SR018 Hing...      2


In [4]:
id2label = {0:"negative", 1:"neutral", 2:"positive"}
label2id = {v:k for k,v in id2label.items()}

In [5]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [6]:
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback)
import evaluate, numpy as np

import transformers
from sklearn.metrics import confusion_matrix, classification_report
print(transformers.__version__)


def run_experiment(model_name, max_len=192, lr=2e-5, epochs=4, bs=16, warmup_ratio=0.1, wd=0.01, outdir=None):
    outdir = outdir or model_name.split("/")[-1]
    print(outdir)
    print(f"\n=== Fine-tuning {model_name} ===")
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    # GPT-2 (decoder-only) butuh pad_token
    if "gpt2" in model_name.lower() and tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    def tok(batch):
        return tokenizer(batch["text"], truncation=True, max_length=max_len)
    dss = ds.map(tok, batched=True)
    dss = dss.remove_columns([c for c in dss["train"].column_names if c not in ["input_ids","attention_mask","label"]])
    dss = dss.class_encode_column("label")

    # 3) Model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3,
        id2label=id2label, label2id=label2id
    )
    if "gpt2" in model_name.lower():
        model.config.pad_token_id = tokenizer.pad_token_id

    # 4) Metrics
    acc = evaluate.load("accuracy")
    prec = evaluate.load("precision")
    rec = evaluate.load("recall")
    f1 = evaluate.load("f1")

    def compute_metrics(p):
        preds = np.argmax(p.predictions, axis=1)
        res = {
            "accuracy": acc.compute(predictions=preds, references=p.label_ids)["accuracy"],
            "precision_macro": prec.compute(predictions=preds, references=p.label_ids, average="macro")["precision"],
            "recall_macro": rec.compute(predictions=preds, references=p.label_ids, average="macro")["recall"],
            "f1_macro": f1.compute(predictions=preds, references=p.label_ids, average="macro")["f1"],
            "f1_weighted": f1.compute(predictions=preds, references=p.label_ids, average="weighted")["f1"],
        }
        return res

    data_collator = DataCollatorWithPadding(tokenizer)

    train_args = TrainingArguments(
        output_dir=outdir,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        num_train_epochs=epochs,
        learning_rate=lr,
        weight_decay=wd,
        warmup_ratio=warmup_ratio,
        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=42,
    )  # tanpa evaluation_strategy/save_strategy/load_best_model_at_end

    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=dss["train"],
        eval_dataset=dss["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_res = trainer.evaluate()

    # 6) Error analysis sekilas (confusion matrix & contoh salah prediksi)
    preds = np.argmax(trainer.predict(dss["test"]).predictions, axis=1)
    y_true = np.array(dss["test"]["label"])
    cm = confusion_matrix(y_true, preds, labels=[0,1,2])
    print("\nConfusion Matrix [rows=true 0/1/2 neg/neu/pos, cols=pred]:\n", cm)
    print("\nClassification Report:\n", classification_report(y_true, preds, target_names=[id2label[i] for i in [0,1,2]]))

    # contoh 10 salah prediksi teratas
    wrong_idx = np.where(preds != y_true)[0][:10].tolist()
    subset = ds["test"].select(wrong_idx)  # HuggingFace way
    for ex, yhat in zip(subset, preds[wrong_idx]):
        print(f"[WRONG] {ex['text']}\n true={ex['label']}  pred={int(yhat)}\n")

    return eval_res

4.57.1


In [7]:
res_roberta_en   = run_experiment("roberta-base")                          # English encoder  :contentReference[oaicite:3]{index=3}
res_indobert     = run_experiment("indobenchmark/indobert-base-p2")       # Indonesian encoder  :contentReference[oaicite:4]{index=4}
res_xlmr         = run_experiment("xlm-roberta-base")                     # Multilingual encoder  :contentReference[oaicite:5]{index=5}
res_gpt2_id      = run_experiment("cahya/gpt2-small-indonesian-522M")

roberta-base

=== Fine-tuning roberta-base ===


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/1322 [00:00<?, ? examples/s]

Map:   0%|          | 0/652 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/1322 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1322 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/652 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/652 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


Step,Training Loss



Confusion Matrix [rows=true 0/1/2 neg/neu/pos, cols=pred]:
 [[132   8  16]
 [ 17  48 117]
 [  8  26 280]]

Classification Report:
               precision    recall  f1-score   support

    negative       0.84      0.85      0.84       156
     neutral       0.59      0.26      0.36       182
    positive       0.68      0.89      0.77       314

    accuracy                           0.71       652
   macro avg       0.70      0.67      0.66       652
weighted avg       0.69      0.71      0.67       652

[WRONG] BRI: Tak Benar Ada Pegawai Kena Virus Korona, Hanya Radang Tenggorokan
 true=1  pred=0

[WRONG] BRI Mau Buyback Saham Rp 1,5 Triliun!
 true=1  pred=2

[WRONG] BRI Menanam Targetkan Tanam 1,75 Juta Bibit Pohon Produktif
 true=1  pred=2

[WRONG] BRI Buka Layanan Terbatas saat Idul Fitri 2022, Simak Jadwalnya
 true=1  pred=2

[WRONG] BRI Tawarkan Obligasi Berwawasan Lingkungan Rp 5 Triliun untuk Tahap I 2022
 true=1  pred=2

[WRONG] Jawab Kebutuhan Nasabah, BRI Luncurkan Produk

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/1322 [00:00<?, ? examples/s]

Map:   0%|          | 0/652 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/1322 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1322 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/652 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/652 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


Step,Training Loss



Confusion Matrix [rows=true 0/1/2 neg/neu/pos, cols=pred]:
 [[148   5   3]
 [ 13 114  55]
 [  3  46 265]]

Classification Report:
               precision    recall  f1-score   support

    negative       0.90      0.95      0.93       156
     neutral       0.69      0.63      0.66       182
    positive       0.82      0.84      0.83       314

    accuracy                           0.81       652
   macro avg       0.80      0.81      0.80       652
weighted avg       0.80      0.81      0.81       652

[WRONG] BRI: Tak Benar Ada Pegawai Kena Virus Korona, Hanya Radang Tenggorokan
 true=1  pred=0

[WRONG] Virtual Coaching, Cara BRI Bina Pelaku UMKM Saat Pandemi Covid-19
 true=2  pred=1

[WRONG] BRI Menanam Targetkan Tanam 1,75 Juta Bibit Pohon Produktif
 true=1  pred=2

[WRONG] BRI Jawab Kebutuhan PMI Lewat BRIFast Remittance
 true=2  pred=1

[WRONG] Jawab Kebutuhan Nasabah, BRI Luncurkan Produk Asuransi PGH
 true=1  pred=2

[WRONG] Dengan Program Ini, BRI Kembangkan Kompetensi Tal

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/1322 [00:00<?, ? examples/s]

Map:   0%|          | 0/652 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/1322 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1322 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/652 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/652 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss



Confusion Matrix [rows=true 0/1/2 neg/neu/pos, cols=pred]:
 [[147   5   4]
 [ 12  63 107]
 [  0  18 296]]

Classification Report:
               precision    recall  f1-score   support

    negative       0.92      0.94      0.93       156
     neutral       0.73      0.35      0.47       182
    positive       0.73      0.94      0.82       314

    accuracy                           0.78       652
   macro avg       0.79      0.74      0.74       652
weighted avg       0.78      0.78      0.75       652

[WRONG] BRI: Tak Benar Ada Pegawai Kena Virus Korona, Hanya Radang Tenggorokan
 true=1  pred=0

[WRONG] BRI Mau Buyback Saham Rp 1,5 Triliun!
 true=1  pred=2

[WRONG] Virtual Coaching, Cara BRI Bina Pelaku UMKM Saat Pandemi Covid-19
 true=2  pred=1

[WRONG] BRI Menanam Targetkan Tanam 1,75 Juta Bibit Pohon Produktif
 true=1  pred=2

[WRONG] BRI Tawarkan Obligasi Berwawasan Lingkungan Rp 5 Triliun untuk Tahap I 2022
 true=1  pred=2

[WRONG] Jawab Kebutuhan Nasabah, BRI Luncurkan Prod

tokenizer_config.json:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Map:   0%|          | 0/1322 [00:00<?, ? examples/s]

Map:   0%|          | 0/652 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/1322 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1322 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/652 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/652 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at cahya/gpt2-small-indonesian-522M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/510M [00:00<?, ?B/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 0, 'bos_token_id': 0}.


Step,Training Loss



Confusion Matrix [rows=true 0/1/2 neg/neu/pos, cols=pred]:
 [[140  10   6]
 [ 14  91  77]
 [  6  45 263]]

Classification Report:
               precision    recall  f1-score   support

    negative       0.88      0.90      0.89       156
     neutral       0.62      0.50      0.55       182
    positive       0.76      0.84      0.80       314

    accuracy                           0.76       652
   macro avg       0.75      0.75      0.75       652
weighted avg       0.75      0.76      0.75       652

[WRONG] BRI Mau Buyback Saham Rp 1,5 Triliun!
 true=1  pred=2

[WRONG] BRI Menanam Targetkan Tanam 1,75 Juta Bibit Pohon Produktif
 true=1  pred=2

[WRONG] BRI Tawarkan Obligasi Berwawasan Lingkungan Rp 5 Triliun untuk Tahap I 2022
 true=1  pred=2

[WRONG] Jawab Kebutuhan Nasabah, BRI Luncurkan Produk Asuransi PGH
 true=1  pred=2

[WRONG] Dengan Program Ini, BRI Kembangkan Kompetensi Talenta Muda
 true=1  pred=2

[WRONG] QLola by BRI, Solusi Terintegrasi Layanan Wholesale Banking
 t

In [8]:
def to_row(tag, res):
    return [tag, round(res["eval_accuracy"],4), round(res["eval_precision_macro"],4),
            round(res["eval_recall_macro"],4), round(res["eval_f1_macro"],4), round(res["eval_f1_weighted"],4)]

In [10]:
import pandas as pd

summary = pd.DataFrame(
    [to_row("roberta-base (EN)", res_roberta_en),
     to_row("IndoBERT-base-p2 (ID)", res_indobert),
     to_row("XLM-R-base (multi)", res_xlmr),
     to_row("GPT2-small-ID (decoder)", res_gpt2_id)],
    columns=["Model","Acc","Prec(m)","Rec(m)","F1(m)","F1(w)"]
)
print("\n=== Summary ===\n", summary)


=== Summary ===
                      Model     Acc  Prec(m)  Rec(m)   F1(m)   F1(w)
0        roberta-base (EN)  0.7055   0.7014  0.6672  0.6591  0.6743
1    IndoBERT-base-p2 (ID)  0.8083   0.8046  0.8063  0.8047  0.8054
2       XLM-R-base (multi)  0.7761   0.7948  0.7437  0.7415  0.7500
3  GPT2-small-ID (decoder)  0.7577   0.7528  0.7450  0.7460  0.7507
