### Environment Setup and Imports

Import all required libraries and dependencies for model training and evaluation.

In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorWithPadding
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from collections import Counter
import torch
import torch.nn as nn

# IndoBERT Model Training

This notebook demonstrates the fine-tuning and evaluation of IndoBERT for Indonesian tweet classification. Each section is clearly marked with headers and brief explanations.

### Data Loading

Load the preprocessed train, validation, and test datasets.

In [2]:
# Read the three preprocessed splits in one pass (train, validation, test).
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../dataset/processed/train.csv',
        '../dataset/processed/validation.csv',
        '../dataset/processed/test.csv',
    )
)

#### Data Overview

Display dataset shapes and preview the training data.

In [3]:
# Report the (rows, columns) shape of every split, then preview the training rows.
shape_report = [
    ("Train shape:", train_df),
    ("Validation shape:", val_df),
    ("Test shape:", test_df),
]
for caption, frame in shape_report:
    print(caption, frame.shape)

print("\nTrain DataFrame head:")
print(train_df.head())

Train shape: (3500, 2)
Validation shape: (750, 2)
Test shape: (750, 2)

Train DataFrame head:
                                        cleaned_text  label
0  warga kp bayam itu yang lahannya dipakai buat ...      5
1  team pesona bobby kertanegara mulai turun tang...      5
2  kapolri jenderal listyo sigit prabowo menghadi...      6
3  ini lah hasil dari kinerja pak prabowo selama ...      5
4  puluhan ribu masyarakat menyambut kedatangan a...      5


#### Label Distribution

Analyze the distribution of labels in each dataset split.

In [4]:
# Check label distribution in train, validation, and test sets
def print_label_distribution(df, name):
    """Print how many rows carry each label, and that label's share of the split.

    Args:
        df: DataFrame with a ``label`` column.
        name: Human-readable split name used in the heading line.
    """
    label_counts = df['label'].value_counts().sort_index()
    total = label_counts.sum()

    print(f"{name} label distribution:")
    for label, count in label_counts.items():
        pct = count / total * 100
        print(f"  Label {label}: {count} ({pct:.2f}%)")
    # Trailing blank line separates consecutive reports.
    print()

# Same report for each split, in train -> validation -> test order.
for split_name, split_df in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    print_label_distribution(split_df, split_name)

Train label distribution:
  Label 0: 43 (1.23%)
  Label 1: 257 (7.34%)
  Label 2: 14 (0.40%)
  Label 3: 280 (8.00%)
  Label 4: 280 (8.00%)
  Label 5: 2081 (59.46%)
  Label 6: 411 (11.74%)
  Label 7: 134 (3.83%)

Validation label distribution:
  Label 0: 9 (1.20%)
  Label 1: 55 (7.33%)
  Label 2: 3 (0.40%)
  Label 3: 60 (8.00%)
  Label 4: 60 (8.00%)
  Label 5: 446 (59.47%)
  Label 6: 88 (11.73%)
  Label 7: 29 (3.87%)

Test label distribution:
  Label 0: 10 (1.33%)
  Label 1: 55 (7.33%)
  Label 2: 3 (0.40%)
  Label 3: 60 (8.00%)
  Label 4: 60 (8.00%)
  Label 5: 445 (59.33%)
  Label 6: 88 (11.73%)
  Label 7: 29 (3.87%)



### Model and Tokenizer Initialization

Load the IndoBERT model and tokenizer for sequence classification.

In [5]:
# Hub checkpoint to fine-tune and the number of target classes (labels 0-7).
model_name = "indobenchmark/indobert-large-p2"
num_labels = 8

# The classification head is not stored in the checkpoint and is initialised
# randomly at load time. Seed PyTorch first so the starting weights are the same
# on every run; 42 matches the default seed TrainingArguments uses later.
torch.manual_seed(42)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-large-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Dataset Preparation and Tokenization

Convert data to Hugging Face Datasets and tokenize for model input.

In [None]:
# Rows without cleaned_text cannot be tokenized, so filter them out of every split.
# The raw frames are left untouched; the filtered copies get a *_clean suffix.
train_df_clean, val_df_clean, test_df_clean = (
    split_df.dropna(subset=['cleaned_text'])
    for split_df in (train_df, val_df, test_df)
)

def tokenize_function(batch):
    """Tokenize the ``cleaned_text`` entries of a batch, truncating at 256 tokens.

    No padding is applied here; the result of the tokenizer call is returned as is.
    """
    texts = batch['cleaned_text']
    encoded = tokenizer(texts, truncation=True, max_length=256)
    return encoded

# Columns handed to the model as PyTorch tensors
columns = ['input_ids', 'attention_mask', 'label']


def _to_torch_dataset(frame):
    """Turn one cleaned split into a tokenized Hugging Face Dataset in torch format."""
    dataset = Dataset.from_pandas(frame[['cleaned_text', 'label']])
    dataset = dataset.map(tokenize_function, batched=True)
    dataset.set_format(type='torch', columns=columns)
    return dataset


# Convert, tokenize and format each split with the same pipeline
train_dataset = _to_torch_dataset(train_df_clean)
val_dataset = _to_torch_dataset(val_df_clean)
test_dataset = _to_torch_dataset(test_df_clean)

### Metrics, Data Collator, and Trainer Setup

Define evaluation metrics, data collator, class weights, and configure the custom Trainer.

In [7]:
def compute_metrics(eval_pred):
    """Compute accuracy and support-weighted precision/recall/F1 from model outputs.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds the
            per-class scores (classes on the last axis) and ``labels`` the true
            class ids.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    predictions, labels = eval_pred
    # If the scores arrive wrapped in a tuple, the logits are its first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.argmax(predictions, axis=-1)

    # zero_division=0 keeps the value scikit-learn already falls back to when a
    # class receives no predictions, but without emitting a warning at every
    # evaluation step (likely here because some classes are very rare).
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    accuracy = accuracy_score(labels, predictions)

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Collator that pads the examples of each batch with the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Calculate class weights for weighted loss function.
# Weight of class c = total_samples / (num_classes * count_c), i.e. inverse frequency.
# Counts are taken from train_df_clean (the rows that are actually trained on), and
# the vector is indexed 0..num_labels-1 so its length always matches the classifier
# head, even if a class happens to be absent from the training split.
label_counts = Counter(train_df_clean['label'])
num_classes = num_labels
total_samples = len(train_df_clean)
class_weights = torch.tensor(
    [
        # A class with no training rows gets weight 0.0 instead of dividing by zero.
        total_samples / (num_classes * label_counts[class_id]) if label_counts[class_id] else 0.0
        for class_id in range(num_classes)
    ],
    dtype=torch.float,
)

class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; must contain ``labels`` next to the model inputs.
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            **kwargs: Extra arguments passed by the Trainer; accepted and ignored.
        """
        labels = inputs.get("labels")
        # Forward without the labels so the model does not also compute its own
        # unweighted loss, which would be discarded. `inputs` itself is not mutated.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get('logits')
        # Put the weights wherever the logits live; this avoids having to inspect
        # how the model is wrapped to find its device.
        weights = class_weights.to(logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Improved training arguments
training_args = TrainingArguments(
    output_dir="../logs/indobert/results_indobert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    warmup_steps=500,
    # NOTE: logging_dir is only consumed by the TensorBoard integration; set
    # report_to="tensorboard" below if those logs are wanted.
    logging_dir='../logs/indobert/detailed_logs',
    logging_steps=10,
    # Evaluate and checkpoint on the same 100-step schedule so the best checkpoint
    # can be restored at the end of training.
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=3,
    # The string "none" disables external loggers. Python None does not: in
    # Transformers 4.x it falls back to "all" installed integrations (e.g. W&B).
    report_to="none",
    # Mixed precision needs a CUDA device; fall back to fp32 so the notebook
    # (including the evaluation section) also runs on CPU-only machines.
    fp16=torch.cuda.is_available()
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    # Stop after 7 consecutive evaluations (700 steps) without a better F1.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=7)]
)

### Model Training

Train the IndoBERT model using the custom Trainer and early stopping.

In [None]:
train_output = trainer.train()

# The evaluation section loads the model from this directory, so write the trained
# weights there. With load_best_model_at_end=True, trainer.model is the best
# checkpoint once training finishes.
trainer.save_model("../models/indobert/final_model")

# Keep the training summary as the cell's displayed output.
train_output

# Model Training Results

Summary of model performance and training history.

The model was trained on a Kaggle Notebook to leverage GPU acceleration, which significantly speeds up the training process for large models like IndoBERT. The training was configured for 15 epochs.

***

### Training History

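The table below shows the model's performance metrics on the validation set at different steps throughout the training process. Note that validation loss reaches its minimum at step 400 (1.218) and then climbs above 3.0 while training loss keeps falling, which indicates overfitting; weighted F1 nevertheless continues to improve slowly and peaks at step 1500 (0.793).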

| Step | Training Loss | Validation Loss | Accuracy | F1 | Precision | Recall |
| :--- | :--- | :--- | :--- | :--- | :--- | :--- |
| 100 | 1.949000 | 1.949710 | 0.321762 | 0.353605 | 0.543864 | 0.321762 |
| 200 | 1.489900 | 1.622133 | 0.514019 | 0.529652 | 0.676132 | 0.514019 |
| 300 | 1.328500 | 1.352426 | 0.463284 | 0.467727 | 0.724507 | 0.463284 |
| 400 | 0.714900 | 1.218028 | 0.683578 | 0.698927 | 0.769663 | 0.683578 |
| 500 | 0.525400 | 1.333650 | 0.724967 | 0.738125 | 0.781990 | 0.724967 |
| 600 | 0.230400 | 1.508070 | 0.781041 | 0.779882 | 0.785440 | 0.781041 |
| 700 | 0.170600 | 1.593431 | 0.777036 | 0.779755 | 0.790676 | 0.777036 |
| 800 | 0.036600 | 2.004199 | 0.783712 | 0.779772 | 0.778672 | 0.783712 |
| 900 | 0.055700 | 2.261222 | 0.773031 | 0.770436 | 0.772057 | 0.773031 |
| 1000 | 0.019700 | 2.102774 | 0.759680 | 0.763243 | 0.773193 | 0.759680 |
| 1100 | 0.131900 | 2.729170 | 0.798398 | 0.788991 | 0.784868 | 0.798398 |
| 1200 | 0.036700 | 2.929872 | 0.798398 | 0.790167 | 0.787715 | 0.798398 |
| 1300 | 0.078500 | 3.012218 | 0.794393 | 0.788406 | 0.784523 | 0.794393 |
| 1400 | 0.089700 | 3.154575 | 0.798398 | 0.790414 | 0.785774 | 0.798398 |
| 1500 | 0.025700 | 3.104950 | 0.799733 | 0.793144 | 0.790542 | 0.799733 |
| 1600 | 0.104200 | 3.259660 | 0.794393 | 0.786386 | 0.782162 | 0.794393 |

### Model Loading and Evaluation

Load the best model and evaluate on validation and test sets.

In [None]:
# Load the model from the specified path
loaded_model = AutoModelForSequenceClassification.from_pretrained("../models/indobert/final_model")

# Assigning trainer.model bypasses the device placement the Trainer performs on the
# model it was constructed with, so move the loaded weights explicitly to the
# device the Trainer sends its batches to (a no-op if they are already there).
loaded_model.to(trainer.args.device)

# Update the trainer to use the loaded model
trainer.model = loaded_model

# Evaluate on validation and test sets (both result dicts use the "eval_" prefix)
val_results = trainer.evaluate(eval_dataset=val_dataset)
test_results = trainer.evaluate(eval_dataset=test_dataset)

#### Results Summary

Display and compare evaluation metrics for validation and test sets.

In [23]:
def print_results_table(val_results, test_results):
    """Print validation and test metrics side by side as a fixed-width table.

    Args:
        val_results: Mapping of metric name to value for the validation split.
        test_results: Mapping of metric name to value for the test split.

    Floats are shown with three decimals. A metric that is missing (or ``None``)
    in one split is shown as ``N/A`` without changing how the other column is
    formatted.
    """
    metrics = ['eval_accuracy', 'eval_f1', 'eval_precision', 'eval_recall', 'eval_loss']

    def _cell(results, metric):
        # Each cell is formatted on its own, so one missing or non-float value
        # cannot strip the rounding from, or crash, the neighbouring column.
        value = results.get(metric)
        if value is None:
            return f"{'N/A':<16}"
        if isinstance(value, float):
            return f"{value:<16.3f}"
        return f"{str(value):<16}"

    header = "|| Metric           || Validation       || Test             ||"
    separator = "||------------------||------------------||------------------||"
    rows = [
        f"|| {metric:<16} || {_cell(val_results, metric)} || {_cell(test_results, metric)} ||"
        for metric in metrics
    ]
    print("\n".join([header, separator] + rows))

# Validation metrics go in the first value column, test metrics in the second.
print_results_table(val_results=val_results, test_results=test_results)

|| Metric           || Validation       || Test             ||
||------------------||------------------||------------------||
|| eval_accuracy    || 0.800            || 0.767            ||
|| eval_f1          || 0.793            || 0.761            ||
|| eval_precision   || 0.791            || 0.764            ||
|| eval_recall      || 0.800            || 0.767            ||
|| eval_loss        || 2.843            || 3.052            ||
