MODEL IndoBERT

In [2]:
pip install torch transformers datasets scikit-learn pandas




In [3]:
import pandas as pd
import numpy as np

df = pd.read_excel("Data_Manual.xlsx")

df.columns = df.columns.str.lower()
df['label'] = df['label'].astype(str).str.strip()
df['label'].replace({'nan': np.nan, '' : np.nan}, inplace=True)

labeled = df[df['label'].notna()]
unlabeled = df[df['label'].isna()]

print("Data berlabel :", len(labeled))
print("Data tidak berlabel :", len(unlabeled))


Data berlabel : 251
Data tidak berlabel : 817


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['label'].replace({'nan': np.nan, '' : np.nan}, inplace=True)


In [4]:
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset

lbl_enc = LabelEncoder()
labeled['label_id'] = lbl_enc.fit_transform(labeled['label'])

dataset = Dataset.from_pandas(labeled[['review', 'label_id']])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled['label_id'] = lbl_enc.fit_transform(labeled['label'])


In [5]:
from transformers import AutoTokenizer
from datasets import Features, ClassLabel, Value # Import Features and ClassLabel

tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

def tokenize(batch):
    return tokenizer(batch['review'], truncation=True, padding='max_length', max_length=128)

# Re-initialize dataset from 'labeled' to ensure a consistent starting state
dataset = Dataset.from_pandas(labeled[['review', 'label_id']])

# Conditionally remove the '__index_level_0__' column if it exists
if "__index_level_0__" in dataset.column_names:
    dataset = dataset.remove_columns(["__index_level_0__"])

# Cast 'label_id' to ClassLabel type for stratification and apply other features
features = Features({
    'review': Value(dtype='string'),
    'label_id': ClassLabel(names=lbl_enc.classes_.tolist()) # Convert numpy array to list
})
dataset = dataset.cast(features) # Apply the new features schema

dataset = dataset.train_test_split(test_size=0.2, stratify_by_column="label_id")
dataset = dataset.map(tokenize, batched=True)
dataset = dataset.remove_columns(["review"])
# Rename 'label_id' to 'labels' as expected by the model for training
dataset = dataset.rename_column("label_id", "labels")
dataset.set_format("torch")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Casting the dataset:   0%|          | 0/251 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

In [6]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

model = AutoModelForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=len(lbl_enc.classes_)
)


pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
args = TrainingArguments(
    output_dir="./sentiment-bert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
trainer.train()

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmawarpratama1204[0m ([33mmawarpratama1204-universitas-teknologi-yogyakarta[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss




TrainOutput(global_step=52, training_loss=0.5516298734224759, metrics={'train_runtime': 1532.2497, 'train_samples_per_second': 0.522, 'train_steps_per_second': 0.034, 'total_flos': 52622683545600.0, 'train_loss': 0.5516298734224759, 'epoch': 4.0})

In [8]:
from torch.nn.functional import softmax

def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = softmax(outputs.logits, dim=1)
    label_id = torch.argmax(probs).item()
    confidence = torch.max(probs).item()
    return label_id, confidence

pred_labels = []
conf_scores = []

for text in unlabeled['review']:
    label, conf = predict(text)
    pred_labels.append(label)
    conf_scores.append(conf)

unlabeled['label_id'] = pred_labels
unlabeled['confidence'] = conf_scores


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled['label_id'] = pred_labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled['confidence'] = conf_scores


In [9]:
threshold = 0.70
unlabeled.loc[unlabeled['confidence'] >= threshold, 'label'] = \
    lbl_enc.inverse_transform(unlabeled.loc[unlabeled['confidence'] >= threshold, 'label_id'])


In [10]:
print("Jumlah data yang sudah terlabel:", len(labeled))

Jumlah data yang sudah terlabel: 251


In [11]:
args = TrainingArguments(
    output_dir="./sentiment-bert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
trainer.train()

# Explicitly save the model and tokenizer after training
trainer.save_model(args.output_dir)
tokenizer.save_pretrained(args.output_dir)



Step,Training Loss




('./sentiment-bert/tokenizer_config.json',
 './sentiment-bert/special_tokens_map.json',
 './sentiment-bert/vocab.txt',
 './sentiment-bert/added_tokens.json',
 './sentiment-bert/tokenizer.json')

In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd

# Load model yang sudah dilatih
model_path = "./sentiment-bert"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Baca dataset
df = pd.read_excel("Data_Manual.xlsx")

# Mapping label (ubah sesuai label saat training)
label_map = {0: "Negatif", 1: "Netral", 2: "Positif"}

# Fungsi prediksi
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        label = torch.argmax(probs).item()
        confidence = float(probs[0][label])
    return label, confidence

# Pastikan kolom text ada
if 'text' not in df.columns:
    if 'review' in df.columns:
        df.rename(columns={'review': 'text'}, inplace=True)
    else:
        raise ValueError("Kolom 'text' atau 'review' tidak ditemukan!")

# Tambahkan kolom hasil prediksi
df["predicted_label_id"] = ""
df["predicted_label"] = ""
df["confidence"] = ""

# Loop untuk prediksi tiap baris
for idx, row in df.iterrows():
    label_id, conf = predict(row["text"])
    df.at[idx, "predicted_label_id"] = label_id
    df.at[idx, "predicted_label"] = label_map[label_id]
    df.at[idx, "confidence"] = round(conf, 4)  # biar rapi 4 desimal

# Simpan hasil
output_file = "hasil_final.csv"
df.to_csv(output_file, index=False, encoding="utf-8")

print(f"✔️ Semua data berhasil terlabeli → {output_file}")


✔️ Semua data berhasil terlabeli → hasil_final.csv


LLM Groq (llama3.1-8b-instant)

In [13]:
!pip install transformers torch pandas groq


Collecting groq
  Downloading groq-0.37.1-py3-none-any.whl.metadata (16 kB)
Downloading groq-0.37.1-py3-none-any.whl (137 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.37.1


In [14]:
from transformers import AutoModelForSequenceClassification, BertTokenizerFast
from groq import Groq
import pandas as pd
import torch
import time


In [15]:
model_path = "./sentiment-bert"

tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

label_map = {0: "negatif", 1: "netral", 2: "positif"}


In [16]:
df = pd.read_csv("hasil_final.csv")

print(df.head())


                                                text    Label       .1  .2  \
0  Harganya naik menjadi 40k krn liburan. Anak2 b...  Negatif  NaN           
1  Tempat wisata keluarga yang cocok juga untuk e...  Positif  NaN           
2  Tempat ini bagus untuk berlibur bersama keluar...  Positif  NaN           
3  Tempat ini lebih ke mini zoo ya sama spot foto...  Positif  NaN           
4  Tempat mudah ditemukan karena petunjuk jalan y...      NaN  NaN           

   .3  .4  .5  .6  predicted_label_id predicted_label  confidence  
0                                   0         Negatif      0.9946  
1                                   2         Positif      0.9927  
2                                   1          Netral      0.9441  
3                                   2         Positif      0.9271  
4                                   2         Positif      0.9705  


In [17]:
client = Groq(api_key="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")


In [18]:
def predict_indobert(text):
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**tokens).logits

    prediction = torch.argmax(logits).item()
    return label_map[prediction]


In [19]:
def refine_with_llm(text, model_label):
    prompt = f"""
Teks: "{text}"

Prediksi awal model: {model_label}

Perbaiki jika salah, jika benar tetap.
Jawab dalam format JSON:
{{
  "label_final": "positif/negatif/netral",
  "alasan": "penjelasan singkat"
}}
"""

    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content


In [20]:
results = []

for i, text in enumerate(df["text"]):
    print(f"[{i+1}/{len(df)}] Processing...")

    # Ambil prediksi IndoBERT
    bert_label = predict_indobert(text)
    success = False
    while not success:
        try:
            refined_label = refine_with_llm(text, bert_label)
            success = True
        except Exception as e:
            print("⚠ Rate limit, retrying in 5s...")
            time.sleep(5)

    results.append({
        "text": text,
        "label_bert": bert_label,
        "label_llm": refined_label
    })

    time.sleep(2)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[1/1068] Processing...
[2/1068] Processing...
[3/1068] Processing...
[4/1068] Processing...
[5/1068] Processing...
[6/1068] Processing...
[7/1068] Processing...
[8/1068] Processing...
[9/1068] Processing...
[10/1068] Processing...
[11/1068] Processing...
[12/1068] Processing...
[13/1068] Processing...
[14/1068] Processing...
[15/1068] Processing...
[16/1068] Processing...
[17/1068] Processing...
[18/1068] Processing...
[19/1068] Processing...
[20/1068] Processing...
[21/1068] Processing...
[22/1068] Processing...
[23/1068] Processing...
[24/1068] Processing...
[25/1068] Processing...
[26/1068] Processing...
[27/1068] Processing...
[28/1068] Processing...
[29/1068] Processing...
[30/1068] Processing...
[31/1068] Processing...
[32/1068] Processing...
[33/1068] Processing...
[34/1068] Processing...
[35/1068] Processing...
[36/1068] Processing...
[37/1068] Processing...
[38/1068] Processing...
[39/1068] Processing...
[40/1068] Processing...
[41/1068] Processing...
[42/1068] Processing...
[

In [21]:
output_df = pd.DataFrame(results)
output_df.to_csv("self_training.csv", index=False)

print("DONE — saved as self_training.csv")


DONE — saved as self_training.csv


In [None]:
import json
import torch
import time
import pandas as pd

#  IndoBERT prediction with confidence
def predict_indobert(text):
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    with torch.no_grad():
        logits = model(**tokens).logits

    # Softmax → confidence score
    probs = torch.softmax(logits, dim=1).numpy()[0]
    pred_idx = torch.argmax(logits).item()
    predicted_label = label_map[pred_idx]
    confidence = float(probs[pred_idx])

    return predicted_label, confidence


def refine_with_llm(text, bert_label):
    prompt = f"""
Teks: "{text}"

Prediksi awal model: {bert_label}

Periksa apakah label benar. Jika salah, ubah.
Jawab dalam format JSON:
{{
  "label_final": "positif / negatif / netral",
  "alasan": "penjelasan singkat"
}}
"""

    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[{"role": "user", "content": prompt}]
    )

    content = response.choices[0].message.content

    try:
        parsed = json.loads(content)
        final_label = parsed.get("label_final", "").strip().lower()
        alasan = parsed.get("alasan", "")
    except:
        final_label = "ERROR"
        alasan = "Format LLM tidak sesuai JSON"

    return final_label, alasan, content


# Loop processing
results = []

for i, text in enumerate(df["text"]):
    print(f"[{i+1}/{len(df)}] Processing...")

    indobert_label, conf_score = predict_indobert(text)

    # Retry mechanism if rate limit
    success = False
    while not success:
        try:
            final_label, reason, raw_llm = refine_with_llm(text, indobert_label)
            success = True
        except Exception as e:
            print("⚠ Rate limit or connection error. Retrying in 5s...")
            time.sleep(5)

    # Store result
    results.append({
        "text": text,
        "label_indobert": indobert_label,
        "confidence": round(conf_score, 4),
        "label_llm_final": final_label,
        "alasan_llm": reason,
        "raw_llm_output": raw_llm
    })

    time.sleep(2)

output_df = pd.DataFrame(results)
output_df.to_csv("self_training_enhanced.csv", index=False)

print("DONE — File saved as self_training_enhanced.csv")


[1/1068] Processing...
[2/1068] Processing...
[3/1068] Processing...
[4/1068] Processing...
[5/1068] Processing...
[6/1068] Processing...
[7/1068] Processing...
[8/1068] Processing...
[9/1068] Processing...
[10/1068] Processing...
[11/1068] Processing...
[12/1068] Processing...
[13/1068] Processing...
[14/1068] Processing...
[15/1068] Processing...
[16/1068] Processing...
[17/1068] Processing...
[18/1068] Processing...
[19/1068] Processing...
[20/1068] Processing...
[21/1068] Processing...
[22/1068] Processing...
[23/1068] Processing...
[24/1068] Processing...
[25/1068] Processing...
[26/1068] Processing...
[27/1068] Processing...
[28/1068] Processing...
[29/1068] Processing...
[30/1068] Processing...
[31/1068] Processing...
[32/1068] Processing...
[33/1068] Processing...
[34/1068] Processing...
[35/1068] Processing...
[36/1068] Processing...
[37/1068] Processing...
[38/1068] Processing...
[39/1068] Processing...
[40/1068] Processing...
[41/1068] Processing...
[42/1068] Processing...
[

In [None]:
import pandas as pd

df = pd.read_csv("self_training_enhanced.csv")

print(df.head())

import pandas as pd
from tabulate import tabulate

# Menampilkan tabel rapi
print(tabulate(df, headers='keys', tablefmt='grid', showindex=False))

# Simpan ke CSV baru dengan format rapi
df.to_csv("self_training_.csv", index=False)

print("\n✔ File rapi telah disimpan sebagai: self_training_.csv")

