In [10]:
# Pastikan library terinstall
!pip install transformers datasets torch scikit-learn


Defaulting to user installation because normal site-packages is not writeable


In [18]:
import pandas as pd
import numpy as np
import torch
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)

In [23]:
# --- 1. LOAD DATA ---
# Reads the preprocessed article texts and the sentiment labels from two
# separate CSV files, pairs them by row position, and keeps only rows whose
# label is one of the three supported sentiment classes.
print("Loading Data...")

df_text = pd.read_csv('../Data/hasil_preprocessing_bert.csv', sep=',', on_bad_lines='skip')
df_label = pd.read_csv('../Data/hasil_pelabelan_transjakarta.csv', sep=',', on_bad_lines='skip')

df_text = df_text[['Isi Berita']].rename(columns={'Isi Berita': 'text'})
df_label = df_label[['Sentimen']].rename(columns={'Sentimen': 'label_str'})

# The two frames are joined purely by position. Because malformed lines are
# skipped independently in each file, a length mismatch means texts and labels
# may no longer line up row-for-row. Surface that instead of truncating silently.
if len(df_text) != len(df_label):
    print(
        f"PERINGATAN: jumlah baris teks ({len(df_text)}) dan label ({len(df_label)}) "
        "berbeda; pasangan teks-label bisa bergeser setelah pemotongan."
    )

# Equalise the row counts (truncate the longer frame).
min_len = min(len(df_text), len(df_label))
df = pd.concat([df_text.iloc[:min_len], df_label.iloc[:min_len]], axis=1)

# Drop rows with a missing text or a missing label.
df = df.dropna()

# Normalise labels: lowercase and strip surrounding whitespace.
df['label_str'] = df['label_str'].str.lower().str.strip()

# Keep only rows with a recognised label.
valid_labels = ['positif', 'negatif', 'netral']
df = df[df['label_str'].isin(valid_labels)].copy()

print(f"Total Data Valid: {len(df)}")
print(df.head())

Loading Data...
Total Data Valid: 150
                                                text label_str
0  Dalam rangka memperingati Hari Perhubungan Nas...   positif
1  Sejumlah layananTransjakartapagi ini masih ber...   positif
2  Truk mengalami gangguan di Jalan Gatot Subroto...   negatif
3  Wakil Gubernur DKI JakartaRano Karnomenargetka...   positif
4  Bus listrik Transjakarta sempat mengalami kece...   negatif


In [24]:
# --- 2. LABEL PREPARATION & CLASS WEIGHTS ---
# Encode the string labels as integer class ids.
label_map = {'negatif': 0, 'netral': 1, 'positif': 2}
df['label'] = df['label_str'].map(label_map)

# Per-class sample counts, ordered by class id.
class_counts = df['label'].value_counts().sort_index()
print("\nDistribusi Kelas:", class_counts.to_dict())

# Weighted-loss strategy: give rarer classes a larger weight so the model is
# not biased towards the majority class.
#   weight = total_samples / (n_classes * samples_in_class)
# A class with no samples falls back to a neutral weight of 1.0.
total_samples = len(df)
n_classes = 3
weights = []
for i in range(n_classes):
    count = class_counts.get(i, 0)
    if count > 0:
        w = total_samples / (n_classes * count)
    else:
        w = 1.0
    weights.append(w)

# Place the weights on the same device that will be used for training.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_weights = torch.tensor(weights, dtype=torch.float).to(device)

print(f"Class Weights (Neg, Net, Pos): {class_weights}")


Distribusi Kelas: {0: 52, 1: 14, 2: 84}
Class Weights (Neg, Net, Pos): tensor([0.9615, 3.5714, 0.5952])


In [25]:
# --- 3. SPLIT DATA (80% Train, 10% Val, 10% Test) ---
all_texts = df['text'].tolist()
all_labels = df['label'].tolist()

# First cut: 80% train, 20% held out. Stratified to keep class proportions.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    all_texts, all_labels,
    test_size=0.2, random_state=42, stratify=all_labels,
)

# Second cut: divide the held-out 20% evenly into validation and test,
# again stratified on the labels.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels,
    test_size=0.5, random_state=42, stratify=temp_labels,
)

In [None]:
# --- 4. TOKENISATION (IndoBERT) ---
# NOTE(review): this cell was never executed (no execution count) and is an
# exact duplicate of the cell that follows it; the later definitions shadow
# these ones. Consider deleting one of the two copies.
PRETRAINED_MODEL = "indobenchmark/indobert-base-p1"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)

class TransJakartaDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenised texts with integer labels.

    The texts are tokenised once, in the constructor, using the module-level
    ``tokenizer`` with truncation and padding enabled and ``max_length=128``.
    """

    def __init__(self, texts, labels):
        """Tokenise ``texts`` up front and keep ``labels`` alongside them."""
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoding fields at ``idx`` as tensors, plus a ``'labels'`` tensor."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label list."""
        return len(self.labels)

# One dataset per split.
train_dataset = TransJakartaDataset(train_texts, train_labels)
val_dataset = TransJakartaDataset(val_texts, val_labels)
test_dataset = TransJakartaDataset(test_texts, test_labels)

In [26]:
# --- 4. TOKENISATION & DATASET ---
PRETRAINED_MODEL = "indobenchmark/indobert-base-p1"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)


class TransJakartaDataset(torch.utils.data.Dataset):
    """Map-style dataset of tokenised texts and their integer labels.

    All texts are tokenised eagerly in the constructor via the module-level
    ``tokenizer`` (truncation and padding enabled, ``max_length=128``).
    """

    def __init__(self, texts, labels):
        """Store ``labels`` and tokenise ``texts`` in one call."""
        self.labels = labels
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)

    def __len__(self):
        """Number of examples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build one example: each encoding field at ``idx`` as a tensor, plus ``'labels'``."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item


# Build the three splits in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    TransJakartaDataset(split_texts, split_labels)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

In [29]:
# --- 6. TRAINING ---
# Load the pretrained checkpoint with a 3-output classification head (one
# output per entry in label_map) and move it to the selected device. The
# classifier layer is not part of the checkpoint, so it starts from a fresh
# initialisation (see the warning printed below this cell).
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=3).to(device)

def compute_metrics(pred):
    """Compute accuracy and support-weighted precision / recall / F1.

    Args:
        pred: prediction object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; argmax over the last axis gives
            the predicted class id).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # zero_division=0: a class that is never predicted scores 0 rather than warning.
    prf = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

class WeightedTrainer(Trainer):
    """Trainer that uses class-weighted cross-entropy as its loss.

    The original notebook referenced ``WeightedTrainer`` without defining it in
    any remaining cell, so a fresh-kernel run failed with ``NameError`` and the
    ``class_weights`` tensor computed earlier was never applied. Defining the
    class here makes the notebook reproducible from top to bottom.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return cross-entropy between the logits and labels, weighted by ``class_weights``.

        Args:
            model: the model being trained or evaluated.
            inputs: batch mapping that contains a ``"labels"`` entry.
            return_outputs: when True, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: absorbs extra arguments that newer ``transformers``
                releases pass (for example ``num_items_in_batch``).
        """
        labels = inputs["labels"]
        # Run the forward pass without labels so the model does not also
        # compute its own unweighted loss; `inputs` itself is left untouched.
        forward_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**forward_inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


training_args = TrainingArguments(
    output_dir='./results_final',
    num_train_epochs=4,              # 4 epochs
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # NOTE(review): with 150 valid rows, an 80% train split and batch size 16,
    # training runs for roughly 8 steps/epoch * 4 epochs = 32 optimisation steps
    # on a single device, so a 100-step warmup never finishes and the learning
    # rate never reaches its configured peak. Consider `warmup_ratio` or a
    # smaller `warmup_steps`; left unchanged here to keep the reported run.
    warmup_steps=100,
    weight_decay=0.01,
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    # Restore the checkpoint with the best validation F1 once training ends.
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# --- 7. RUN ---
print("Memulai Training dengan Weighted Loss...")
trainer.train()

# Final evaluation on the held-out test split.
print("\nEvaluasi Final pada Data Test:")
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)

# Persist the model and tokenizer side by side so they can be reloaded together.
save_dir = "./model_transjakarta_weighted"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Memulai Training dengan Weighted Loss...




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.133426,0.066667,0.008889,0.004762,0.066667
2,1.184000,1.033478,0.6,0.671296,0.761905,0.6
3,1.053500,0.937346,0.933333,0.901754,0.873333,0.933333
4,0.904300,0.753723,0.866667,0.891667,0.955556,0.866667





Evaluasi Final pada Data Test:




{'eval_loss': 1.1611748933792114, 'eval_accuracy': 0.6, 'eval_f1': 0.541130604288499, 'eval_precision': 0.5060606060606061, 'eval_recall': 0.6, 'eval_runtime': 2.3938, 'eval_samples_per_second': 6.266, 'eval_steps_per_second': 0.418, 'epoch': 4.0}


('./model_transjakarta_weighted\\tokenizer_config.json',
 './model_transjakarta_weighted\\special_tokens_map.json',
 './model_transjakarta_weighted\\vocab.txt',
 './model_transjakarta_weighted\\added_tokens.json')