In [33]:
# Import library yang diperlukan
import evaluate
import numpy as np
import torch
from datasets import load_dataset, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

In [34]:
# 1. Load the dataset by its repository id
ds = load_dataset(path="dwisaji/indonesia-telecomunication-sentiment-dataset")

In [36]:
# 2. Convert the label column to ClassLabel.
# The position of each name is its integer id, so this order has to agree with
# the id_to_labels / label_to_id mapping that is handed to the model
# (0 = Positif, 1 = Netral, 2 = Negatif).
# NOTE(review): assumes the raw column encodes 2 = Negatif, as the printed
# samples in this notebook suggest - confirm against the dataset card.
label_names = ['Positif', 'Netral', 'Negatif']
ds = ds.cast_column('label', ClassLabel(names=label_names))

In [37]:
# 3. Stratified 80/20 split of the 'train' split, with a fixed seed
train_val_split = ds['train'].train_test_split(stratify_by_column='label', test_size=0.2, seed=42)
train_data, val_data = train_val_split['train'], train_val_split['test']



In [65]:
# Export the validation split to val_data.csv, passing index=False through to_csv
val_data.to_csv(path_or_buf='val_data.csv', index=False)

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 212.02ba/s]


27517

In [38]:
# 4. Tokenization: the tokenizer is loaded from the same checkpoint name as the model
model_name = "indolem/indobert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Truncation is enabled with a limit of 512 tokens; no padding argument is
    passed here. Returns whatever the tokenizer produces for the batch.
    """
    texts = examples['text']
    encoded = tokenizer(texts, max_length=512, truncation=True)
    return encoded


# Tokenize both splits in batched mode (training split first, then validation)
tokenized_train, tokenized_val = (
    split.map(preprocess_function, batched=True)
    for split in (train_data, val_data)
)


In [39]:
# Rename the 'label' column to 'labels' on both splits
tokenized_train, tokenized_val = (
    split.rename_column('label', 'labels')
    for split in (tokenized_train, tokenized_val)
)

In [40]:
# Drop the raw 'text' column from both splits
tokenized_train, tokenized_val = [
    split.remove_columns(['text']) for split in (tokenized_train, tokenized_val)
]

In [41]:
# Switch both tokenized splits to the "torch" output format (in place)
tokenized_train.set_format(type="torch")
tokenized_val.set_format(type="torch")

In [42]:
# Summarize the columns and the number of rows of both tokenized splits
print(f"Kolom tokenized_train: {tokenized_train.column_names}")
print(f"Kolom tokenized_val: {tokenized_val.column_names}")
print()
print("Jumlah data training:", len(tokenized_train))
print("Jumlah data validasi:", len(tokenized_val))

Kolom tokenized_train: ['labels', 'input_ids', 'token_type_ids', 'attention_mask']
Kolom tokenized_val: ['labels', 'input_ids', 'token_type_ids', 'attention_mask']

Jumlah data training: 1977
Jumlah data validasi: 495


In [43]:
# Show the first encoded example of each split (heading, then the example)
print("Contoh data training:", tokenized_train[0], sep="\n")
print("Contoh data validasi:", tokenized_val[0], sep="\n")

Contoh data training:
{'labels': tensor(0), 'input_ids': tensor([    3, 24816,  5805,  3151, 25033,   935,  4143, 23579,   931, 13729,
            4]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}
Contoh data validasi:
{'labels': tensor(2), 'input_ids': tensor([    3,  3353,  1522, 22603,   934,     4]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1])}


In [46]:
# 6. Model configuration
# Keep this order in sync with the integer encoding of the dataset's label column.
id_to_labels = {0: 'Positif', 1: 'Netral', 2: 'Negatif'}
# Inverse mapping, derived so the two dictionaries cannot drift apart
label_to_id = {name: idx for idx, name in id_to_labels.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    id2label=id_to_labels,
    label2id=label_to_id,
    num_labels=len(id_to_labels),
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
# Inspect the first validation example: text, numeric label and label name
sample = val_data[0]
print(
    f"Contoh teks: {sample['text']}",
    f"Label asli (numerik): {sample['label']}",
    f"Label asli (nama): {id_to_labels[sample['label']]}",
    sep="\n",
)

Contoh teks: erorrrr
Label asli (numerik): 2
Label asli (nama): Negatif


In [11]:
# 7. Data collator built around the tokenizer
data_collator = DataCollatorWithPadding(tokenizer)

In [13]:
# Load the accuracy and F1 metric objects used by compute_metrics
accuracy, f1 = (evaluate.load(metric_name) for metric_name in ("accuracy", "f1"))

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    accuracy_result = accuracy.compute(predictions=predicted_ids, references=references)
    f1_result = f1.compute(predictions=predicted_ids, references=references, average='weighted')
    return {"accuracy": accuracy_result['accuracy'], "f1": f1_result['f1']}

In [14]:

# 9. Training configuration
training_args = TrainingArguments(
    # Output directory for the run
    output_dir="indobert-sentiment-3class",
    # Optimization settings
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch; "f1" selects the best model to reload at the end.
    # NOTE(review): recent transformers releases call this argument `eval_strategy` -
    # confirm against the installed version before renaming.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)



In [16]:
# 10. Initialize the Trainer with the model, both tokenized splits and the metric function
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

  trainer = Trainer(


In [17]:
# 11. Run training
trainer.train()

# 12. Final evaluation on the Trainer's eval dataset
print("Evaluasi pada data validasi:")
results = trainer.evaluate()
print("Hasil Evaluasi:", results)

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.91766,0.606061,0.550422
2,No log,0.88126,0.648485,0.591184
3,No log,0.75502,0.684848,0.645486
4,No log,0.757981,0.692929,0.656836
5,0.792100,0.75978,0.692929,0.659129


Evaluasi pada data validasi:


Hasil Evaluasi: {'eval_loss': 0.7597801685333252, 'eval_accuracy': 0.692929292929293, 'eval_f1': 0.6591292103931533, 'eval_runtime': 20.7868, 'eval_samples_per_second': 23.813, 'eval_steps_per_second': 1.491, 'epoch': 5.0}


In [52]:
# Example prediction
def predict(text):
    """Predict the sentiment label of a single text.

    Args:
        text: One raw input string.

    Returns:
        Tuple ``(label_name, confidence)``: ``label_name`` is looked up in
        ``id_to_labels`` and ``confidence`` is the softmax probability of the
        predicted class.
    """
    # Dropout has to be off for inference, otherwise repeated calls can disagree.
    model.eval()
    # Use the same 512-token cap as preprocess_function instead of relying on
    # whatever limit the tokenizer happens to be configured with.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # The model may live on a GPU after training; the inputs must be on the same device.
    inputs = inputs.to(model.device)
    # No gradients are needed for a prediction.
    with torch.no_grad():
        logits = model(**inputs).logits
    # torch.softmax is numerically stable, unlike exponentiating the raw logits.
    probs = torch.softmax(logits, dim=-1)[0].cpu().numpy()
    pred_id = int(np.argmax(probs))
    return id_to_labels[pred_id], probs[pred_id]

# Try the problematic example
text = "oi ngerti bahasa indonesia kan pokoknya sampe jam belon beres gua gratis jam mokad gratis dst dst lu gila mati service melulu jam berkali &amp"
prediction, confidence = predict(text)
print(f"\nText: {text}\nPrediksi: {prediction} (Confidence: {confidence:.2f})")


Text: oi ngerti bahasa indonesia kan pokoknya sampe jam belon beres gua gratis jam mokad gratis dst dst lu gila mati service melulu jam berkali &amp
Prediksi: Netral (Confidence: 0.39)


In [64]:
# Take one text from the validation split (row 206) and compare its label with the prediction
sample = val_data[206]
text_sample, label_sample = sample['text'], sample['label']

print(
    f"Contoh teks: {text_sample}",
    f"Label asli: {id_to_labels[label_sample]}",
    sep="\n",
)

prediction, confidence = predict(text_sample)
# First line printed is the character length of the text, then the comparison
print(
    len(sample['text']),
    f"\nText: {text_sample}",
    f"Label Asli: {id_to_labels[label_sample]}",
    f"Prediksi: {prediction} (Confidence: {confidence:.2f})",
    sep="\n",
)


Contoh teks: oi ngerti bahasa indonesia kan pokoknya sampe jam belon beres gua gratis jam mokad gratis dst dst lu gila mati service melulu jam berkali &amp
Label asli: Negatif
142

Text: oi ngerti bahasa indonesia kan pokoknya sampe jam belon beres gua gratis jam mokad gratis dst dst lu gila mati service melulu jam berkali &amp
Label Asli: Negatif
Prediksi: Netral (Confidence: 0.39)


## Evaluasi model pada data validasi (`val_data`)

In [21]:
# Re-create the stratified split; the arguments are the same as in step 3
train_val_split = ds['train'].train_test_split(stratify_by_column='label', test_size=0.2, seed=42)
# The 'test' part of the split serves as the validation data
train_data, val_data = train_val_split['train'], train_val_split['test']

In [22]:
# Tokenize the validation split, rename the label column and drop the raw text
tokenized_val = (
    val_data
    .map(preprocess_function, batched=True)
    .rename_column('label', 'labels')
    .remove_columns(['text'])
)
# set_format works in place, so it stays outside the chain
tokenized_val.set_format(type="torch")

Map: 100%|██████████| 495/495 [00:00<00:00, 2577.23 examples/s]


In [23]:
# Rebuild the Trainer around the current model, evaluating on the re-tokenized validation split
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_val,  # validation data goes here
    train_dataset=tokenized_train,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [24]:
# Evaluate on the validation data through the Trainer
results = trainer.evaluate()
print("Hasil Evaluasi pada Data Validasi:", results)

# Same metrics computed by hand from the raw predictions
predictions = trainer.predict(tokenized_val)
logits = predictions.predictions
labels = predictions.label_ids
metrics = compute_metrics((logits, labels))
print("Hasil Evaluasi Manual:", metrics)

Hasil Evaluasi pada Data Validasi: {'eval_loss': 0.7597801685333252, 'eval_model_preparation_time': 0.0081, 'eval_accuracy': 0.692929292929293, 'eval_f1': 0.6591292103931533, 'eval_runtime': 20.987, 'eval_samples_per_second': 23.586, 'eval_steps_per_second': 1.477}
Hasil Evaluasi Manual: {'accuracy': 0.692929292929293, 'f1': 0.6591292103931533}
