In [1]:
pip install torch transformers datasets scikit-learn pandas




In [4]:
import pandas as pd
import numpy as np

df = pd.read_excel("Data_Manual.xlsx")

df.columns = df.columns.str.lower()
df['label'] = df['label'].astype(str).str.strip()
df['label'].replace({'nan': np.nan, '' : np.nan}, inplace=True)

labeled = df[df['label'].notna()]
unlabeled = df[df['label'].isna()]

print("Data berlabel :", len(labeled))
print("Data tidak berlabel :", len(unlabeled))


Data berlabel : 251
Data tidak berlabel : 817


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['label'].replace({'nan': np.nan, '' : np.nan}, inplace=True)


In [5]:
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset

lbl_enc = LabelEncoder()
labeled['label_id'] = lbl_enc.fit_transform(labeled['label'])

dataset = Dataset.from_pandas(labeled[['review', 'label_id']])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled['label_id'] = lbl_enc.fit_transform(labeled['label'])


In [15]:
from transformers import AutoTokenizer
from datasets import Features, ClassLabel, Value # Import Features and ClassLabel

tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

def tokenize(batch):
    return tokenizer(batch['review'], truncation=True, padding='max_length', max_length=128)

# Re-initialize dataset from 'labeled' to ensure a consistent starting state
dataset = Dataset.from_pandas(labeled[['review', 'label_id']])

# Conditionally remove the '__index_level_0__' column if it exists
if "__index_level_0__" in dataset.column_names:
    dataset = dataset.remove_columns(["__index_level_0__"])

# Cast 'label_id' to ClassLabel type for stratification and apply other features
features = Features({
    'review': Value(dtype='string'),
    'label_id': ClassLabel(names=lbl_enc.classes_.tolist()) # Convert numpy array to list
})
dataset = dataset.cast(features) # Apply the new features schema

dataset = dataset.train_test_split(test_size=0.2, stratify_by_column="label_id")
dataset = dataset.map(tokenize, batched=True)
dataset = dataset.remove_columns(["review"])
# Rename 'label_id' to 'labels' as expected by the model for training
dataset = dataset.rename_column("label_id", "labels")
dataset.set_format("torch")

Casting the dataset:   0%|          | 0/251 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

In [16]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

model = AutoModelForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=len(lbl_enc.classes_)
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
args = TrainingArguments(
    output_dir="./sentiment-bert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
trainer.train()



Step,Training Loss




TrainOutput(global_step=52, training_loss=0.6594502375676081, metrics={'train_runtime': 1257.8203, 'train_samples_per_second': 0.636, 'train_steps_per_second': 0.041, 'total_flos': 52622683545600.0, 'train_loss': 0.6594502375676081, 'epoch': 4.0})

In [18]:
from torch.nn.functional import softmax

def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = softmax(outputs.logits, dim=1)
    label_id = torch.argmax(probs).item()
    confidence = torch.max(probs).item()
    return label_id, confidence

pred_labels = []
conf_scores = []

for text in unlabeled['review']:
    label, conf = predict(text)
    pred_labels.append(label)
    conf_scores.append(conf)

unlabeled['label_id'] = pred_labels
unlabeled['confidence'] = conf_scores


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled['label_id'] = pred_labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled['confidence'] = conf_scores


In [21]:
threshold = 0.70
unlabeled.loc[unlabeled['confidence'] >= threshold, 'label'] = \
    lbl_enc.inverse_transform(unlabeled.loc[unlabeled['confidence'] >= threshold, 'label_id'])


In [25]:
print("Jumlah data yang sudah terlabel:", len(labeled))

Jumlah data yang sudah terlabel: 251


In [28]:
args = TrainingArguments(
    output_dir="./sentiment-bert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
trainer.train()

# Explicitly save the model and tokenizer after training
trainer.save_model(args.output_dir)
tokenizer.save_pretrained(args.output_dir)



Step,Training Loss




('./sentiment-bert/tokenizer_config.json',
 './sentiment-bert/special_tokens_map.json',
 './sentiment-bert/vocab.txt',
 './sentiment-bert/added_tokens.json',
 './sentiment-bert/tokenizer.json')

In [31]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd

# Load model yang sudah dilatih
model_path = "./sentiment-bert"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Baca dataset
df = pd.read_excel("Data_Manual.xlsx")

# Mapping label (ubah sesuai label saat training)
label_map = {0: "Negatif", 1: "Netral", 2: "Positif"}

# Fungsi prediksi
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        label = torch.argmax(probs).item()
        confidence = float(probs[0][label])
    return label, confidence

# Pastikan kolom text ada
if 'text' not in df.columns:
    if 'review' in df.columns:
        df.rename(columns={'review': 'text'}, inplace=True)
    else:
        raise ValueError("Kolom 'text' atau 'review' tidak ditemukan!")

# Tambahkan kolom hasil prediksi
df["predicted_label_id"] = ""
df["predicted_label"] = ""
df["confidence"] = ""

# Loop untuk prediksi tiap baris
for idx, row in df.iterrows():
    label_id, conf = predict(row["text"])
    df.at[idx, "predicted_label_id"] = label_id
    df.at[idx, "predicted_label"] = label_map[label_id]
    df.at[idx, "confidence"] = round(conf, 4)  # biar rapi 4 desimal

# Simpan hasil
output_file = "hasil_final.csv"
df.to_csv(output_file, index=False, encoding="utf-8")

print(f"✔️ Semua data berhasil terlabeli → {output_file}")


✔️ Semua data berhasil terlabeli → hasil_final.csv
