In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import numpy as np
import torch
import pandas as pd
from datasets import load_from_disk
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          TrainingArguments, Trainer, DataCollatorWithPadding,
                          EarlyStoppingCallback)
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from torch.nn import CrossEntropyLoss

In [3]:
# Konfigurasi
model_name = "indobenchmark/indobert-base-p2"   # ganti kalau mau model lain / lokal
dir = "/content/drive/MyDrive/deteksi_cd-indo"  # pusat direktori
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# Hyperparameter training
NUM_EPOCHS = 15
PER_DEVICE_TRAIN_BATCH_SIZE = 16
PER_DEVICE_EVAL_BATCH_SIZE = 32
LR = 1e-5
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 200
MAX_GRAD_NORM = 1.0
SEED = 42
SAVE_TOTAL_LIMIT = 3

os.makedirs(dir, exist_ok=True)

In [5]:
# Muat tokenized dataset (sudah tersimpan)
train_ds = load_from_disk(os.path.join(dir, "train"))
val_ds   = load_from_disk(os.path.join(dir, "validation"))
test_ds  = load_from_disk(os.path.join(dir, "test"))

print("Loaded tokenized datasets. Sample columns:", train_ds.column_names)

Loaded tokenized datasets. Sample columns: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask']


In [6]:
# Muat tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Pastikan num_labels sesuai dataset: ambil dari config jika sudah ada,
# atau deduksi dari label values di train csv
train_csv = pd.read_csv(os.path.join(dir, "train.csv"))
num_labels = len(sorted(train_csv["label_id"].unique()))
print("Num labels detected:", num_labels)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    ignore_mismatched_sizes=True  # aman jika head shape beda
)
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Num labels detected: 11


pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [7]:
# Hitung class weights dari train.csv
y = train_csv["label_id"].values
classes = np.unique(y)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y)  # ndarray
class_weights = torch.tensor(cw, dtype=torch.float)  # akan dipindahkan ke device di compute_loss
print("Class weights:", cw)

Class weights: [2.26780303 2.82006594 2.04613807 0.82340806 1.22583948 1.75571848
 2.10958421 0.21563896 1.50768068 1.55952071 1.42853734]


In [8]:
# Custom Trainer untuk memakai class_weights pada CrossEntropyLoss
class WeightedTrainer(Trainer):
    def __init__(self, class_weights_tensor=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights_tensor = class_weights_tensor

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Perbaikan: terima **kwargs supaya kompatibel dengan versi transformers
        yang mengirimkan argumen tambahan (mis. num_items_in_batch).
        """
        labels = inputs.get("labels")
        # Pastikan kita tidak mengirim 'labels' dua kali ke model jika model
        # menerima labels sebagai argumen (AutoModelForSequenceClassification menerima labels)
        model_inputs = {k: v for k, v in inputs.items() if k != "labels"}

        # Panggil model (jika model mendukung labels, bisa juga dikirim langsung,
        # tapi kita ingin mengontrol loss sehingga tidak mengandalkan model's loss)
        outputs = model(**model_inputs)
        logits = outputs.logits

        # Move weights ke device yang sama dengan logits
        if self.class_weights_tensor is not None:
            weight = self.class_weights_tensor.to(logits.device)
            loss_fct = CrossEntropyLoss(weight=weight)
        else:
            loss_fct = CrossEntropyLoss()

        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

In [9]:
# compute_metrics -> macro-F1 utama
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average="macro")
    weighted_f1 = f1_score(labels, preds, average="weighted")
    # classification report string (per-class) - optional, but helpful to save
    clf_report = classification_report(labels, preds, digits=4, zero_division=0)
    return {
        "accuracy": acc,
        "eval_macro_f1": macro_f1,
        "eval_weighted_f1": weighted_f1,
        "clf_report": clf_report  # note: Trainer expects numeric metrics; this string will be saved in logs though
    }

In [10]:
# TrainingArguments dan Trainer
training_args = TrainingArguments(
    output_dir=dir,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=200,
    eval_steps=None,
    save_total_limit=SAVE_TOTAL_LIMIT,
    load_best_model_at_end=True,
    metric_for_best_model="eval_macro_f1",
    greater_is_better=True,
    seed=SEED,
    fp16=True if torch.cuda.is_available() else False,  # mixed precision jika ada GPU
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    class_weights_tensor=class_weights,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2))

  super().__init__(*args, **kwargs)
You are adding a <class 'transformers.trainer_callback.EarlyStoppingCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
EarlyStoppingCallback
NotebookProgressCallback


In [11]:
# Mulai training
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdamaifani12[0m ([33mdamaifani12-uin-sunan-kalijaga[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Accuracy,Clf Report
1,2.0518,1.437096,0.537322,0.606681,0.552139,precision recall f1-score support  0 0.6923 0.3000 0.4186 30  1 0.9474 0.7500 0.8372 24  2 0.1417 0.5455 0.2250 33  3 0.5926 0.3855 0.4672 83  4 0.7619 0.5818 0.6598 55  5 0.1458 0.5385 0.2295 39  6 0.8947 0.5312 0.6667 32  7 0.8874 0.6254 0.7337 315  8 0.6087 0.3111 0.4118 45  9 0.4286 0.4773 0.4516 44  10 0.9444 0.7083 0.8095 48  accuracy 0.5521 748  macro avg 0.6405 0.5231 0.5373 748 weighted avg 0.7282 0.5521 0.6067 748
2,1.2206,1.24375,0.552121,0.602912,0.569519,precision recall f1-score support  0 0.4865 0.6000 0.5373 30  1 0.7407 0.8333 0.7843 24  2 0.6522 0.4545 0.5357 33  3 0.6944 0.3012 0.4202 83  4 0.5205 0.6909 0.5938 55  5 0.2530 0.5385 0.3443 39  6 0.6000 0.4688 0.5263 32  7 0.9479 0.5778 0.7179 315  8 0.1895 0.6444 0.2929 45  9 0.5000 0.5909 0.5417 44  10 0.7872 0.7708 0.7789 48  accuracy 0.5695 748  macro avg 0.5793 0.5883 0.5521 748 weighted avg 0.7168 0.5695 0.6029 748
3,0.9294,1.220146,0.576924,0.646921,0.627005,precision recall f1-score support  0 0.2794 0.6333 0.3878 30  1 0.8077 0.8750 0.8400 24  2 0.5000 0.5152 0.5075 33  3 0.4805 0.4458 0.4625 83  4 0.7143 0.5455 0.6186 55  5 0.2414 0.5385 0.3333 39  6 0.6800 0.5312 0.5965 32  7 0.8721 0.7143 0.7853 315  8 0.4595 0.3778 0.4146 45  9 0.5600 0.6364 0.5957 44  10 0.8409 0.7708 0.8043 48  accuracy 0.6270 748  macro avg 0.5851 0.5985 0.5769 748 weighted avg 0.6885 0.6270 0.6469 748
4,0.7451,1.231061,0.554525,0.60626,0.589572,precision recall f1-score support  0 0.4222 0.6333 0.5067 30  1 0.7000 0.8750 0.7778 24  2 0.3519 0.5758 0.4368 33  3 0.4231 0.3976 0.4099 83  4 0.6491 0.6727 0.6607 55  5 0.2727 0.5385 0.3621 39  6 0.5385 0.6562 0.5915 32  7 0.9433 0.5810 0.7191 315  8 0.3617 0.3778 0.3696 45  9 0.4576 0.6136 0.5243 44  10 0.6324 0.8958 0.7414 48  accuracy 0.5896 748  macro avg 0.5229 0.6198 0.5545 748 weighted avg 0.6734 0.5896 0.6063 748
5,0.5243,1.34763,0.579689,0.657034,0.645722,precision recall f1-score support  0 0.3953 0.5667 0.4658 30  1 0.8750 0.8750 0.8750 24  2 0.4571 0.4848 0.4706 33  3 0.4314 0.5301 0.4757 83  4 0.5738 0.6364 0.6034 55  5 0.3091 0.4359 0.3617 39  6 0.6154 0.5000 0.5517 32  7 0.8691 0.7587 0.8102 315  8 0.4146 0.3778 0.3953 45  9 0.5227 0.5227 0.5227 44  10 0.9048 0.7917 0.8444 48  accuracy 0.6457 748  macro avg 0.5789 0.5891 0.5797 748 weighted avg 0.6763 0.6457 0.6570 748
6,0.3532,1.482804,0.577826,0.648903,0.639037,precision recall f1-score support  0 0.3704 0.6667 0.4762 30  1 0.8400 0.8750 0.8571 24  2 0.3721 0.4848 0.4211 33  3 0.5132 0.4699 0.4906 83  4 0.5714 0.6545 0.6102 55  5 0.3276 0.4872 0.3918 39  6 0.4651 0.6250 0.5333 32  7 0.8604 0.7238 0.7862 315  8 0.5172 0.3333 0.4054 45  9 0.5778 0.5909 0.5843 44  10 0.8085 0.7917 0.8000 48  accuracy 0.6390 748  macro avg 0.5658 0.6093 0.5778 748 weighted avg 0.6735 0.6390 0.6489 748
7,0.2179,1.598135,0.600362,0.664896,0.656417,precision recall f1-score support  0 0.4872 0.6333 0.5507 30  1 0.9130 0.8750 0.8936 24  2 0.5185 0.4242 0.4667 33  3 0.5059 0.5181 0.5119 83  4 0.5893 0.6000 0.5946 55  5 0.2877 0.5385 0.3750 39  6 0.5882 0.6250 0.6061 32  7 0.8470 0.7556 0.7987 315  8 0.4706 0.3556 0.4051 45  9 0.5510 0.6136 0.5806 44  10 0.8298 0.8125 0.8211 48  accuracy 0.6564 748  macro avg 0.5989 0.6138 0.6004 748 weighted avg 0.6820 0.6564 0.6649 748
8,0.1305,1.792582,0.59648,0.665474,0.65508,precision recall f1-score support  0 0.4146 0.5667 0.4789 30  1 0.8800 0.9167 0.8980 24  2 0.4054 0.4545 0.4286 33  3 0.4783 0.5301 0.5029 83  4 0.6800 0.6182 0.6476 55  5 0.2800 0.5385 0.3684 39  6 0.6667 0.6250 0.6452 32  7 0.8536 0.7587 0.8034 315  8 0.4848 0.3556 0.4103 45  9 0.6571 0.5227 0.5823 44  10 0.7800 0.8125 0.7959 48  accuracy 0.6551 748  macro avg 0.5982 0.6090 0.5965 748 weighted avg 0.6863 0.6551 0.6655 748
9,0.0813,1.925894,0.586397,0.658795,0.656417,precision recall f1-score support  0 0.4872 0.6333 0.5507 30  1 0.8800 0.9167 0.8980 24  2 0.5385 0.4242 0.4746 33  3 0.5758 0.4578 0.5101 83  4 0.5763 0.6182 0.5965 55  5 0.3182 0.5385 0.4000 39  6 0.5143 0.5625 0.5373 32  7 0.8339 0.7810 0.8066 315  8 0.4412 0.3333 0.3797 45  9 0.5814 0.5682 0.5747 44  10 0.6500 0.8125 0.7222 48  accuracy 0.6564 748  macro avg 0.5815 0.6042 0.5864 748 weighted avg 0.6700 0.6564 0.6588 748


TrainOutput(global_step=3375, training_loss=0.6654069010416667, metrics={'train_runtime': 591.0763, 'train_samples_per_second': 151.935, 'train_steps_per_second': 9.517, 'total_flos': 3526433011028670.0, 'train_loss': 0.6654069010416667, 'epoch': 9.0})

In [12]:
# 8) Simpan model & tokenizer final (best model sudah di-load karena load_best_model_at_end=True)
trainer.save_model(os.path.join(dir, "best_model"))
tokenizer.save_pretrained(os.path.join(dir, "best_model"))
print("Model and tokenizer saved to:", os.path.join(dir, "best_model"))

Model and tokenizer saved to: /content/drive/MyDrive/deteksi_cd-indo/best_model


In [13]:
# Evaluasi pada test set (load best model bila perlu)
print("Evaluating on test set...")
test_metrics = trainer.evaluate(eval_dataset=test_ds)
print("Test metrics (Trainer):", test_metrics)

# Jika mau confusion matrix / per-class report terpisah:
preds_output = trainer.predict(test_ds)
preds = np.argmax(preds_output.predictions, axis=1)
labels = preds_output.label_ids

Evaluating on test set...


Test metrics (Trainer): {'eval_macro_f1': 0.5660522155962026, 'eval_weighted_f1': 0.6391764659746146, 'eval_loss': 1.6644164323806763, 'eval_accuracy': 0.6248331108144193, 'eval_clf_report': '              precision    recall  f1-score   support\n\n           0     0.4595    0.5667    0.5075        30\n           1     0.9200    0.9583    0.9388        24\n           2     0.5833    0.6364    0.6087        33\n           3     0.4615    0.4337    0.4472        83\n           4     0.6522    0.5357    0.5882        56\n           5     0.2500    0.4737    0.3273        38\n           6     0.3953    0.5312    0.4533        32\n           7     0.8662    0.7373    0.7966       316\n           8     0.4200    0.4565    0.4375        46\n           9     0.3404    0.3721    0.3556        43\n          10     0.7826    0.7500    0.7660        48\n\n    accuracy                         0.6248       749\n   macro avg     0.5574    0.5865    0.5661       749\nweighted avg     0.6640    0.6248 

In [14]:
# classification report lengkap
print("Classification report (test):")
print(classification_report(labels, preds, digits=4, zero_division=0))

Classification report (test):
              precision    recall  f1-score   support

           0     0.4595    0.5667    0.5075        30
           1     0.9200    0.9583    0.9388        24
           2     0.5833    0.6364    0.6087        33
           3     0.4615    0.4337    0.4472        83
           4     0.6522    0.5357    0.5882        56
           5     0.2500    0.4737    0.3273        38
           6     0.3953    0.5312    0.4533        32
           7     0.8662    0.7373    0.7966       316
           8     0.4200    0.4565    0.4375        46
           9     0.3404    0.3721    0.3556        43
          10     0.7826    0.7500    0.7660        48

    accuracy                         0.6248       749
   macro avg     0.5574    0.5865    0.5661       749
weighted avg     0.6640    0.6248    0.6392       749

