In [1]:
!pip install -U transformers datasets scikit-learn

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, sciki

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from google.colab import files

In [3]:
uploaded = files.upload()

In [15]:
df = pd.read_csv("Twitter_Emotion_Dataset.csv")
df = df[['tweet', 'label']].dropna()
df.columns = ['text', 'label']
df.head()


Unnamed: 0,text,label
0,"Soal jln Jatibaru,polisi tdk bs GERTAK gubernu...",anger
1,"Sesama cewe lho (kayaknya), harusnya bisa lebi...",anger
2,Kepingin gudeg mbarek Bu hj. Amad Foto dari go...,happy
3,"Jln Jatibaru,bagian dari wilayah Tn Abang.Peng...",anger
4,"Sharing pengalaman aja, kemarin jam 18.00 bata...",happy


PREPROCESSING & LABEL ENCODING

In [16]:
le = LabelEncoder()
df['label_id'] = le.fit_transform(df['label'])
label_names = le.classes_
num_labels = len(label_names)

print("Label:", list(label_names))
print("Jumlah label:", num_labels)


Label: ['anger', 'fear', 'happy', 'love', 'sadness']
Jumlah label: 5


In [17]:
dataset = Dataset.from_pandas(df[['text', 'label_id']].rename(columns={'label_id': 'label'}))
dataset = dataset.train_test_split(test_size=0.2, seed=42)

LOAD IndoBERT & Tokenizer

In [18]:
model_name = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [19]:
#tahap tokenisasi dataset

def tokenize(example):
    return tokenizer(example['text'], truncation=True, padding='max_length', max_length=128)

dataset = dataset.map(tokenize, batched=True)


Map:   0%|          | 0/3520 [00:00<?, ? examples/s]

Map:   0%|          | 0/881 [00:00<?, ? examples/s]

In [20]:
# LOAD MODEL IndoBERT
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

TRAINING MODEL

In [24]:
# Ambil sebagian data
small_train_dataset = dataset['train'].select(range(400))  # ambil 400 data untuk training
small_test_dataset = dataset['test'].select(range(100))     # ambil 100 data untuk testing


In [25]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

def compute_metrics(p):
    from sklearn.metrics import accuracy_score, f1_score
    preds = p.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(p.label_ids, preds),
        'f1': f1_score(p.label_ids, preds, average='weighted')
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [26]:
import transformers
print(transformers.__version__)


4.53.0


TRAINING MODEL

In [27]:
import os
os.environ["WANDB_DISABLED"] = "true"

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.251636,0.48,0.466418
2,No log,1.164467,0.56,0.553483
3,No log,1.326346,0.57,0.570318
4,No log,1.442922,0.55,0.548813


TrainOutput(global_step=100, training_loss=0.7200222778320312, metrics={'train_runtime': 2314.3594, 'train_samples_per_second': 0.691, 'train_steps_per_second': 0.043, 'total_flos': 105247256985600.0, 'train_loss': 0.7200222778320312, 'epoch': 4.0})

In [28]:
trainer.evaluate()

{'eval_loss': 1.164467453956604,
 'eval_accuracy': 0.56,
 'eval_f1': 0.5534834944957917,
 'eval_runtime': 38.3607,
 'eval_samples_per_second': 2.607,
 'eval_steps_per_second': 0.182,
 'epoch': 4.0}

In [29]:
from sklearn.preprocessing import LabelEncoder

# Inisialisasi ulang LabelEncoder dan fit dengan label-label yang sama
label_encoder = LabelEncoder()
label_encoder.fit(['anger', 'fear', 'happy', 'love', 'sadness'])

In [43]:
import torch

def predict_emotion(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits).item()
    label = label_encoder.inverse_transform([predicted_class])[0]
    return label

# Contoh prediksi
print(predict_emotion("Pulang udah H-4 lebaran dilema sekali. Seperti tidak bisa melakukan apa2 dirumah sebelum lebaran. Buka puasa bareng cuman 3 hari sama keluarga begitu juga sahur."))
sadness

sadness


SIMPAN


In [44]:
# Simpan sebagai file .ipynb
!jupyter nbconvert --to notebook --output="emotion_detector.ipynb" /content/Untitled0.ipynb


This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr