In [2]:
# 📌 [Cell 1] - Install / Upgrade Transformers (jalankan dulu)
!pip install --upgrade transformers datasets scikit-learn --quiet

In [3]:
# 📌 [Cell 1] - Disable logging to wandb and install / upgrade Transformers
import os
# Switch off the Weights & Biases integration; this is set before
# transformers is imported below.
# NOTE(review): the Trainer output further down reports WANDB_DISABLED as
# deprecated in favour of the `report_to` option — confirm before moving to v5.
os.environ["WANDB_DISABLED"] = "true"

# 📌 [Cell 1] - Installation & library imports
# Install Hugging Face Transformers and Datasets (when running in Colab or
# Jupyter); the pip command itself lives in the preceding cell.

# Import the main libraries
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset

In [4]:
# Location of the dataset CSV (a Colab-style `/content` path).
DATASET_CSV = '/content/sample_data/dtst.csv'

# Read the raw table and preview the first rows.
df = pd.read_csv(DATASET_CSV)
df.head()

Unnamed: 0,context,emotional_conclusion
0,Abdulah ingin sarapan pagi dan pergi ke warung...,Cerita ini mencerminkan kebiasaan masyarakat l...
1,Anisa ingin membeli nasi gurih dan kopi.,Cerita ini menunjukkan keterikatan masyarakat ...
2,Icut memasak menu khas aceh.,Cerita ini menggambarkan bagaimana budaya loka...
3,Hasna ingin makan siang dan pergi ke warung na...,Cerita ini menyoroti pentingnya makanan tradis...
4,Rais ingin makan malam khas Aceh dan pergi ke ...,Cerita ini menyoroti pentingnya makanan tradis...


In [5]:
# 📌 [Cell 3] - Encode labels (conclusion text -> integer id)

# Distinct conclusion texts, in order of first appearance in the frame.
conclusion_texts = df['emotional_conclusion'].unique()

# Forward and reverse lookup tables between label text and numeric id.
label2id = dict(zip(conclusion_texts, range(len(conclusion_texts))))
id2label = {idx: text for text, idx in label2id.items()}

# Attach the numeric label column that the later cells train on.
df['label'] = df['emotional_conclusion'].map(label2id)

preview_columns = ['context', 'emotional_conclusion', 'label']
df[preview_columns].head()

Unnamed: 0,context,emotional_conclusion,label
0,Abdulah ingin sarapan pagi dan pergi ke warung...,Cerita ini mencerminkan kebiasaan masyarakat l...,0
1,Anisa ingin membeli nasi gurih dan kopi.,Cerita ini menunjukkan keterikatan masyarakat ...,1
2,Icut memasak menu khas aceh.,Cerita ini menggambarkan bagaimana budaya loka...,2
3,Hasna ingin makan siang dan pergi ke warung na...,Cerita ini menyoroti pentingnya makanan tradis...,3
4,Rais ingin makan malam khas Aceh dan pergi ke ...,Cerita ini menyoroti pentingnya makanan tradis...,3


In [6]:
# 📌 [Cell 4] - Split & convert to Hugging Face Datasets

# 75 % / 25 % train/test split with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified; the evaluation report further
# down shows one class with support 0 in the test set — consider whether
# that is acceptable for this dataset.
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)

# Only the input text and its numeric label are handed to the model pipeline.
model_columns = ['context', 'label']
train_dataset, test_dataset = (
    Dataset.from_pandas(split_df[model_columns])
    for split_df in (train_df, test_df)
)


In [7]:
# 📌 [Cell 5] - Tokenization with IndoBERT

model_name = "indobenchmark/indobert-base-p1"

# Explicit truncation limit. Without it the tokenizer logs
# "no maximum length is provided ... Default to no truncation", i.e.
# `truncation=True` is silently ignored and an over-long text would reach the
# model untruncated. 512 is the usual BERT-base position-embedding limit
# (NOTE(review): confirm against the checkpoint's `max_position_embeddings`).
MAX_LENGTH = 512

tokenizer = BertTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize a batch of examples from the dataset.

    Args:
        examples: Batch mapping as passed by `Dataset.map(batched=True)`;
            the texts are read from its "context" entry.

    Returns:
        The tokenizer output for the batch, truncated to at most
        `MAX_LENGTH` tokens per text. Padding is left to the data collator.
    """
    return tokenizer(examples["context"], truncation=True, max_length=MAX_LENGTH)

train_tokenized = train_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)

# Pads each batch dynamically to the longest sequence in that batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/608 [00:00<?, ? examples/s]

In [8]:
# 📌 [Cell 6] - Load the IndoBERT model

# One output unit per distinct label.
num_labels = len(label2id)

# Classification-head settings; the id/label maps are stored in the model
# config so predictions can be translated back to label text.
classifier_settings = dict(
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

model = BertForSequenceClassification.from_pretrained(model_name, **classifier_settings)


pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# 📌 [Cell 7] - TrainingArguments & Trainer

training_args = TrainingArguments(
    output_dir="./indo-bert-emo",
    eval_strategy="epoch",           # evaluate on the test split after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Disable external experiment trackers (wandb etc.) explicitly. This is the
    # supported replacement for the WANDB_DISABLED environment variable, which
    # the library reports as deprecated and scheduled for removal in v5.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    # `processing_class` supersedes the deprecated `tokenizer` argument of
    # Trainer; the tokenizer is saved alongside the model checkpoints.
    processing_class=tokenizer,
    data_collator=data_collator,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [10]:
# 📌 [Cell 8] - Train the model

# Run fine-tuning; the returned TrainOutput is shown as the cell result.
train_result = trainer.train()
train_result


Epoch,Training Loss,Validation Loss
1,0.0666,0.015983
2,0.0004,0.037562
3,0.0002,0.027775
4,0.0003,0.023515
5,0.0002,0.029079


TrainOutput(global_step=2280, training_loss=0.03711579437163381, metrics={'train_runtime': 6022.1232, 'train_samples_per_second': 1.512, 'train_steps_per_second': 0.379, 'total_flos': 85520337001320.0, 'train_loss': 0.03711579437163381, 'epoch': 5.0})

In [11]:
# 📌 [Cell 9] - Model evaluation

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

predictions = trainer.predict(test_tokenized)
# Take the ground truth from the prediction output itself so it is guaranteed
# to be aligned row-for-row with the predicted logits.
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(-1)

# Fix the full, ordered class list. Without `labels=`, scikit-learn infers the
# classes from the data, and `target_names` (which covers every class) no
# longer matches whenever a class is absent from both y_true and y_pred —
# the test split here already has zero support for one class.
class_ids = sorted(id2label)
class_names = [id2label[class_id] for class_id in class_ids]

# Print the classification report
# zero_division=0 keeps the reported value (0.0) for undefined metrics but
# suppresses the repeated UndefinedMetricWarning output.
print(classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=class_names,
    zero_division=0,
))

# Compute the confusion matrix (rows/columns follow `class_ids`, so its shape
# is always num_classes x num_classes)
conf_matrix = confusion_matrix(y_true, y_pred, labels=class_ids)
print("\nConfusion Matrix:")
print(conf_matrix)

# Compute the remaining metrics.
# 'weighted' averages per-class scores by support, which suits an imbalanced
# dataset; use 'macro' or 'micro' instead if that fits your needs better.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

print(f"\nAccuracy: {accuracy:.4f}")
print(f"Precision (Weighted): {precision:.4f}")
print(f"Recall (Weighted): {recall:.4f}")
print(f"F1-Score (Weighted): {f1:.4f}")

                                                                                                               precision    recall  f1-score   support

Cerita ini mencerminkan kebiasaan masyarakat lokal dalam bersosialisasi di warung kopi sebagai bagian budaya.       0.00      0.00      0.00         0
     Cerita ini menunjukkan keterikatan masyarakat terhadap budaya lokal melalui konsumsi produk khas daerah.       1.00      1.00      1.00        17
            Cerita ini menggambarkan bagaimana budaya lokal tercermin dalam kehidupan sehari-hari masyarakat.       1.00      1.00      1.00       541
             Cerita ini menyoroti pentingnya makanan tradisional sebagai bagian dari identitas budaya daerah.       1.00      0.98      0.99        50

                                                                                                     accuracy                           1.00       608
                                                                                            

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
# 📌 [Cell 10] - Predict new text

def predict_emosi(text, max_length=512):
    """Classify a single text with the fine-tuned model.

    Args:
        text: The input string to classify.
        max_length: Maximum number of tokens kept after truncation. An
            explicit limit is required for `truncation=True` to take effect
            with this tokenizer.

    Returns:
        The label text looked up in `id2label` for the highest-scoring class.

    Side effects:
        Puts `model` into evaluation mode.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    # The model may live on a GPU after training; inputs must be on the same
    # device or the forward pass raises a device-mismatch error.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    # Disable dropout and gradient tracking for deterministic, cheaper inference.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=1).item()
    return id2label[pred]

# Example: classify one short sample text and show the predicted label.
sample_text = "wow."
predict_emosi(sample_text)


'Cerita ini menggambarkan bagaimana budaya lokal tercermin dalam kehidupan sehari-hari masyarakat.'