In [1]:
!pip install -r requirements.txt


Collecting torch>=2.2.0 (from -r requirements.txt (line 1))
  Using cached torch-2.9.1-cp312-cp312-win_amd64.whl.metadata (30 kB)
Collecting transformers==4.30.2 (from -r requirements.txt (line 2))
  Using cached transformers-4.30.2-py3-none-any.whl.metadata (113 kB)
Collecting pandas==2.0.3 (from -r requirements.txt (line 3))
  Using cached pandas-2.0.3.tar.gz (5.3 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: still running...
  Getting requirements to build wheel: still running...
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting numpy==1.24.3 (from -r requirements.txt (line 4))
  Using cached numpy-1.24.3.tar.gz (10.9 MB)
  Installing build dependencies: started
  Installing build dependencies: finis

  error: subprocess-exited-with-error
  
  × Getting requirements to build wheel did not run successfully.
  │ exit code: 1
  ╰─> [33 lines of output]
      Traceback (most recent call last):
        File "D:\Anaconda\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 389, in <module>
          main()
        File "D:\Anaconda\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 373, in main
          json_out["return_val"] = hook(**hook_input["kwargs"])
                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "D:\Anaconda\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 137, in get_requires_for_build_wheel
          backend = _build_backend()
                    ^^^^^^^^^^^^^^^^
        File "D:\Anaconda\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 70, in _build_backend
          obj = import_module(mod_path)
                ^^^^^^^^^^^^^^^^^^^^

In [3]:
"""
Notebook 3: Model Training with IndoBERT
Optimized training for cyberbullying detection
"""

# =====================================================
# IMPORT LIBRARIES
# =====================================================
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup
)
warnings.filterwarnings('ignore')

# =====================================================
# SETUP DEVICE
# =====================================================
print("=== SETUP DEVICE ===")
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device used: {device}")

# =====================================================
# HYPERPARAMETERS (OPTIMIZED)
# =====================================================
MAX_LEN = 128                  # token length every comment is padded/truncated to
BATCH_SIZE = 16
# NOTE(review): the training log saved in this notebook shows "Epoch 1/5", i.e. it
# was produced with a different EPOCHS value than the one below — confirm which is intended.
EPOCHS = 3
LEARNING_RATE = 2e-5
WARMUP_RATIO = 0.1             # fraction of all optimizer steps spent in linear warmup
EARLY_STOPPING_PATIENCE = 2    # epochs without a validation-F1 improvement before stopping

# Fix the random state so shuffling, classifier-head initialisation and dropout
# are repeatable across "Restart & Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)        # seeds the CPU generator and every CUDA device

# =====================================================
# LOAD DATA
# =====================================================
print("\n=== LOADING DATA ===")

# Read the train / validation / test CSV files in one pass.
train_df, val_df, test_df = map(
    pd.read_csv,
    (
        '../data/processed/train.csv',
        '../data/processed/val.csv',
        '../data/processed/test.csv',
    ),
)

# Report the number of rows in each split, one per line.
print(
    f"Train: {len(train_df)}",
    f"Val  : {len(val_df)}",
    f"Test : {len(test_df)}",
    sep="\n",
)

# =====================================================
# CLASS WEIGHT (IMPORTANT)
# =====================================================
# Weight each class with scikit-learn's 'balanced' scheme, computed from the
# training labels, and use those weights in the cross-entropy loss.
train_labels = train_df['label']
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)

# Move the weights to the training device as float32 before building the loss.
class_weights = torch.tensor(balanced_weights, dtype=torch.float).to(device)
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

print(f"Class Weights: {class_weights}")

# =====================================================
# DATASET CLASS
# =====================================================
class CyberbullyingDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        texts: Indexable sequence of comments. Each entry is converted with
            ``str``; missing entries (``None`` / NaN) are tokenized as ``""``.
        labels: Indexable sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors that have a leading batch axis of 1.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize comment ``idx`` and return its tensors.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` with the batch axis
            removed, plus ``'labels'`` as a 0-dim ``torch.long`` tensor.
        """
        raw_text = self.texts[idx]
        # pandas reads empty CSV cells back as NaN; str(NaN) would feed the
        # literal word "nan" to the model, so treat missing text as empty.
        text = "" if pd.isna(raw_text) else str(raw_text)
        label = self.labels[idx]

        # Call the tokenizer directly; `encode_plus` is the deprecated spelling.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # squeeze(0) drops only the batch axis; a bare squeeze() would also
            # collapse the sequence axis when max_len == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }


ModuleNotFoundError: No module named 'pandas'

In [None]:

# =====================================================
# LOAD TOKENIZER & MODEL
# =====================================================
print("\n=== LOADING INDOBERT ===")
MODEL_NAME = "indobenchmark/indobert-base-p1"

# The classifier head must emit one logit per entry of `class_weights`:
# CrossEntropyLoss requires its weight vector to have exactly one value per class.
# Deriving the count from the data keeps the two in agreement.
NUM_LABELS = int(class_weights.numel())

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS
)

# Move the model to the same device the loss weights live on.
model.to(device)
print("✓ IndoBERT loaded")

# =====================================================
# DATALOADERS
# =====================================================
def _build_split(frame, shuffle):
    """Wrap one split in a CyberbullyingDataset and a DataLoader over it."""
    dataset = CyberbullyingDataset(
        frame['cleaned_comment'].values,
        frame['label'].values,
        tokenizer,
        MAX_LEN
    )
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle)
    return dataset, loader


# Only the training split is shuffled; validation keeps file order.
train_dataset, train_loader = _build_split(train_df, shuffle=True)
val_dataset, val_loader = _build_split(val_df, shuffle=False)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches  : {len(val_loader)}")



=== LOADING INDOBERT ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ IndoBERT loaded
Train batches: 66
Val batches  : 15


In [None]:
# =====================================================
# OPTIMIZER & SCHEDULER
# =====================================================
# AdamW over every model parameter.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)

# The scheduler is stepped once per batch, so its horizon is batches-per-epoch
# times the number of epochs; the first WARMUP_RATIO share of it is warmup.
steps_per_epoch = len(train_loader)
total_steps = steps_per_epoch * EPOCHS
warmup_steps = int(WARMUP_RATIO * total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=warmup_steps,
)

# =====================================================
# TRAIN & EVAL FUNCTIONS
# =====================================================
def train_epoch(model, data_loader):
    """Run one optimization pass over ``data_loader``.

    Uses the notebook-level ``device``, ``optimizer``, ``scheduler`` and
    ``loss_fn`` objects rather than taking them as arguments.

    Args:
        model: Classifier called with ``input_ids`` and ``attention_mask``;
            its output must expose ``.logits``.
        data_loader: Iterable of batches, each a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.

    Returns:
        tuple: ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64
        tensor (kept as a tensor because callers use ``.item()`` on it);
        ``mean_loss`` is the mean of the per-batch loss values.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.train()
    losses = []
    correct = 0
    total = 0

    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # The loss is computed outside the model so the class weights apply.
        loss = loss_fn(outputs.logits, labels)
        preds = torch.argmax(outputs.logits, dim=1)

        # Count hits as plain ints so the counter has a well-defined type even
        # when the loop body never runs.
        correct += (preds == labels).sum().item()
        total += labels.size(0)
        losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    if total == 0:
        raise ValueError("train_epoch received a data_loader with no samples")

    accuracy = torch.tensor(correct / total, dtype=torch.double)
    return accuracy, np.mean(losses)


def eval_model(model, data_loader):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Uses the notebook-level ``device`` and ``loss_fn`` objects.

    Args:
        model: Classifier called with ``input_ids`` and ``attention_mask``;
            its output must expose ``.logits``.
        data_loader: Iterable of batches, each a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.

    Returns:
        tuple: ``(accuracy, f1, mean_loss)`` where ``accuracy`` is a Python
        float, ``f1`` is the weighted-average F1 over all evaluated samples,
        and ``mean_loss`` is the mean of the per-batch loss values.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()
    losses = []
    preds_all = []
    labels_all = []
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            loss = loss_fn(outputs.logits, labels)
            preds = torch.argmax(outputs.logits, dim=1)

            # Count hits as plain ints so the counter has a well-defined type
            # even when the loop body never runs.
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            losses.append(loss.item())
            # Keep every prediction/label on the CPU for the F1 computation.
            preds_all.extend(preds.cpu().numpy())
            labels_all.extend(labels.cpu().numpy())

    if total == 0:
        raise ValueError("eval_model received a data_loader with no samples")

    acc = correct / total
    f1 = f1_score(labels_all, preds_all, average='weighted')

    return acc, f1, np.mean(losses)

: 

In [None]:



# =====================================================
# TRAINING LOOP WITH EARLY STOPPING
# =====================================================
print("\n=== START TRAINING ===")

# Per-epoch metrics, one list entry per completed epoch.
history = {
    'train_acc': [],
    'train_loss': [],
    'val_acc': [],
    'val_f1': [],
    'val_loss': []
}

# Start below any attainable F1 so the first epoch is always checkpointed,
# even if its validation F1 is exactly 0.
best_val_f1 = float('-inf')
patience_counter = 0

# torch.save does not create missing directories; make sure the target exists
# before spending an epoch of training time.
best_model_path = Path('../models/best_model.pt')
best_model_path.parent.mkdir(parents=True, exist_ok=True)

for epoch in range(EPOCHS):
    print(f"\n--- Epoch {epoch+1}/{EPOCHS} ---")

    train_acc, train_loss = train_epoch(model, train_loader)
    val_acc, val_f1, val_loss = eval_model(model, val_loader)

    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val   Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f}")

    # float() accepts both a 0-dim tensor and a plain number.
    history['train_acc'].append(float(train_acc))
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_f1'].append(val_f1)
    history['val_loss'].append(val_loss)

    # Checkpoint only on a strict validation-F1 improvement; otherwise count
    # the epoch towards early stopping.
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), best_model_path)
        patience_counter = 0
        print(f"✓ Best model saved (Val F1 = {val_f1:.4f})")
    else:
        patience_counter += 1
        print(f"EarlyStopping {patience_counter}/{EARLY_STOPPING_PATIENCE}")

    if patience_counter >= EARLY_STOPPING_PATIENCE:
        print("⛔ Early stopping triggered")
        break

print("\n=== TRAINING FINISHED ===")
print(f"Best Validation F1: {best_val_f1:.4f}")

# =====================================================
# SAVE TRAINING HISTORY
# =====================================================
# Persist the per-epoch metrics as CSV, creating the output folder if needed
# (to_csv does not create missing directories).
history_df = pd.DataFrame(history)
history_path = Path('../results/metrics/training_history.csv')
history_path.parent.mkdir(parents=True, exist_ok=True)
history_df.to_csv(history_path, index=False)
print("✓ Training history saved")

# =====================================================
# PLOT TRAINING HISTORY
# =====================================================
# savefig does not create missing directories; make sure the target exists.
figure_path = Path('../results/figures/training_history.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)

# Number epochs from 1 so the x axis matches the "Epoch n/N" lines in the log.
epochs_run = list(range(1, len(history['train_loss']) + 1))

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: accuracy per epoch.
ax_acc.plot(epochs_run, history['train_acc'], label='Train Acc')
ax_acc.plot(epochs_run, history['val_acc'], label='Val Acc')
ax_acc.set(title='Accuracy', xlabel='Epoch', ylabel='Accuracy')
ax_acc.set_xticks(epochs_run)
ax_acc.legend()
ax_acc.grid(alpha=0.3)

# Right panel: loss per epoch.
ax_loss.plot(epochs_run, history['train_loss'], label='Train Loss')
ax_loss.plot(epochs_run, history['val_loss'], label='Val Loss')
ax_loss.set(title='Loss', xlabel='Epoch', ylabel='Loss')
ax_loss.set_xticks(epochs_run)
ax_loss.legend()
ax_loss.grid(alpha=0.3)

fig.tight_layout()
fig.savefig(figure_path, dpi=300)
plt.show()

print("\n➡️ Lanjut ke Notebook 04: Evaluation Metrics")



=== START TRAINING ===

--- Epoch 1/5 ---


Training: 100%|██████████| 66/66 [23:46<00:00, 21.62s/it]
Validation: 100%|██████████| 15/15 [01:12<00:00,  4.85s/it]


Train Loss: 0.6797 | Train Acc: 0.5782
Val   Loss: 0.6413 | Val Acc: 0.5841 | Val F1: 0.5800
✓ Best model saved (Val F1 = 0.5800)

--- Epoch 2/5 ---


Training: 100%|██████████| 66/66 [28:54<00:00, 26.28s/it]
Validation: 100%|██████████| 15/15 [01:41<00:00,  6.74s/it]


Train Loss: 0.5135 | Train Acc: 0.7517
Val   Loss: 0.6692 | Val Acc: 0.6814 | Val F1: 0.6788
✓ Best model saved (Val F1 = 0.6788)

--- Epoch 3/5 ---


Training:  27%|██▋       | 18/66 [09:51<28:51, 36.08s/it]