# # 📌 [Cell 0] - Install / Upgrade Transformers (jalankan dulu)

In [2]:
!pip install --upgrade transformers datasets scikit-learn --quiet

# 📌 [Cell 1] - Nonaktifkan logging ke wandb dan Install / Upgrade Transformers

In [3]:
import os
os.environ["WANDB_DISABLED"] = "true"

# 📌 [Cell 1] - Installasi & Import Library
# Import library utama
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset

In [4]:
df = pd.read_csv('/content/sample_data/dtst.csv')
df.head()

Unnamed: 0,context,emotional_conclusion
0,Abdulah ingin sarapan pagi dan pergi ke warung...,Cerita ini mencerminkan kebiasaan masyarakat l...
1,Anisa ingin membeli nasi gurih dan kopi.,Cerita ini menunjukkan keterikatan masyarakat ...
2,Icut memasak menu khas aceh.,Cerita ini menggambarkan bagaimana budaya loka...
3,Hasna ingin makan siang dan pergi ke warung na...,Cerita ini menyoroti pentingnya makanan tradis...
4,Rais ingin makan malam khas Aceh dan pergi ke ...,Cerita ini menyoroti pentingnya makanan tradis...


# 📌 [Cell 3] - Encode Label (emosi ke angka)

In [5]:


# Buat label numerik
label2id = {label: i for i, label in enumerate(df['emotional_conclusion'].unique())}
id2label = {i: label for label, i in label2id.items()}

df['label'] = df['emotional_conclusion'].map(label2id)
df[['context', 'emotional_conclusion', 'label']].head()

Unnamed: 0,context,emotional_conclusion,label
0,Abdulah ingin sarapan pagi dan pergi ke warung...,Cerita ini mencerminkan kebiasaan masyarakat l...,0
1,Anisa ingin membeli nasi gurih dan kopi.,Cerita ini menunjukkan keterikatan masyarakat ...,1
2,Icut memasak menu khas aceh.,Cerita ini menggambarkan bagaimana budaya loka...,2
3,Hasna ingin makan siang dan pergi ke warung na...,Cerita ini menyoroti pentingnya makanan tradis...,3
4,Rais ingin makan malam khas Aceh dan pergi ke ...,Cerita ini menyoroti pentingnya makanan tradis...,3


# 📌 [Cell 4] - Split & Konversi ke Huggingface Dataset

In [6]:


train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)

# Konversi ke Huggingface Dataset
train_dataset = Dataset.from_pandas(train_df[['context', 'label']])
test_dataset = Dataset.from_pandas(test_df[['context', 'label']])


# 📌 [Cell 5] - Tokenisasi dengan IndoBERT

In [7]:

model_name = "indobenchmark/indobert-base-p1"

tokenizer = BertTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["context"], truncation=True)

train_tokenized = train_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/608 [00:00<?, ? examples/s]

# 📌 [Cell 6] - Load IndoBERT Model

In [8]:


num_labels = len(label2id)

model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)


pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 📌 [Cell 7] - TrainingArguments & Trainer

In [9]:

training_args = TrainingArguments(
    output_dir="./indo-bert-emo",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
