In [5]:
import pandas as pd

# ============================================================
# 1. LOAD & PISAHKAN DATA LABELED VS UNLABELED
# ============================================================

# Load data awal
df = pd.read_csv("/content/drive/MyDrive/ruangguru_preprocessed+250_manual_label.csv", sep=";")

# Pisahkan data yang punya manual_label
df_labeled = df[df["manual_label"].notna()]

# Pisahkan data tanpa manual_label
df_unlabeled = df[df["manual_label"].isna()]

# Simpan file
df_labeled.to_csv("data_250_manual_labeled.csv", index=False)
df_unlabeled.to_csv("data_750_unlabeled.csv", index=False)

print("Jumlah data labeled  :", len(df_labeled))
print("Jumlah data unlabeled:", len(df_unlabeled))

Jumlah data labeled  : 250
Jumlah data unlabeled: 751


In [6]:
# ============================================================
# 2. INSTALL & IMPORT LIBRARIES
# ============================================================

!pip install transformers datasets torch --quiet

import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch


In [7]:
# ============================================================
# 3. LOAD DATA LABELED UNTUK TRAINING
# ============================================================

df = pd.read_csv("/content/data_250_manual_labeled.csv")

# Mapping label ke angka
label_map = {"negatif": 0, "netral": 1, "positif": 2}
df["label"] = df["manual_label"].map(label_map)

# Dataset HuggingFace
dataset = Dataset.from_pandas(df[["cleaned_text", "label"]])

# Split 85% train, 15% test
dataset = dataset.train_test_split(test_size=0.15, seed=42)

In [8]:
# ============================================================
# 4. TOKENIZER
# ============================================================

model_name = "indobenchmark/indobert-base-p1"
tokenizer = BertTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["cleaned_text"], padding="max_length", truncation=True, max_length=128)

dataset_tokenized = dataset.map(tokenize, batched=True)

dataset_tokenized = dataset_tokenized.remove_columns(["cleaned_text"])
dataset_tokenized.set_format("torch")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/212 [00:00<?, ? examples/s]

Map:   0%|          | 0/38 [00:00<?, ? examples/s]

In [9]:
# ============================================================
# 5. LOAD MODEL
# ============================================================

model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# ============================================================
# 6. TRAINING CONFIGURATION
# ============================================================

training_args = TrainingArguments(
    output_dir="./sentiment_bert",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    report_to="none" # Added to disable wandb logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"],
)

In [11]:
# ============================================================
# 7. TRAIN MODEL
# ============================================================

trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.785028
2,No log,0.703709
3,No log,0.678089
4,No log,0.681144




TrainOutput(global_step=108, training_loss=0.49229731383147063, metrics={'train_runtime': 1351.2153, 'train_samples_per_second': 0.628, 'train_steps_per_second': 0.08, 'total_flos': 55780044558336.0, 'train_loss': 0.49229731383147063, 'epoch': 4.0})

In [12]:
# ============================================================
# 8. PREDIKSI DATA UNLABELED (750 DATA)
# ============================================================

df_unlabeled = pd.read_csv("/content/data_750_unlabeled.csv")

enc = tokenizer(
    df_unlabeled["cleaned_text"].tolist(),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

with torch.no_grad():
    outputs = model(**enc)
    preds = torch.argmax(outputs.logits, dim=1).tolist()

# Mapping label angka → teks
reverse_map = {0: "negatif", 1: "netral", 2: "positif"}
df_unlabeled["predicted_label"] = [reverse_map[p] for p in preds]

# 🔥 Hapus kolom manual_label yang kosong
df_unlabeled = df_unlabeled.drop(columns=["manual_label"], errors="ignore")

# Simpan hasil prediksi
df_unlabeled.to_csv("predicted_750_self_training_sentimen_labeling.csv", index=False)

print(df_unlabeled.head())

   score                                            content  \
0      4  aplikasi nya bagus dan edukatif namun sering l...   
1      5  Bagus banget aplikasi nya, bisa belajar dari m...   
2      4  menurut saya udah bagus cuman, cara nawarin pa...   
3      4  ini apk bagus, di jelasin detail yang gada di ...   
4      5  kak, tolong yah ini aplikasi belajarnya bagus ...   

                                        cleaned_text predicted_label  
0  aplikasi bagus edukatif buka aplikasi loadingn...          netral  
1  bagus banget aplikasi ajar download ajar inter...         positif  
2  sudah bagus cuman nawar paket ajar over chat k...         negatif  
3  aplikasi bagus jelas detail gada buku materi b...         negatif  
4  tolong aplikasi ajar bagus beli langgan kurang...         negatif  
