In [None]:
# @title [STEP 1] Instalasi Library
%pip install transformers datasets accelerate scikit-learn pandas torch

In [None]:
# @title [STEP 2] Load Data & Split Train-Test
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# 1. Load data
# Only the file read is guarded. A missing file must stop the notebook right here;
# printing and carrying on would surface later as an unrelated NameError on `train_df`.
try:
    # Uses the dataset that has already been labelled
    df = pd.read_csv('labeled_kandidat_spam.csv', sep=';')
except FileNotFoundError:
    print("‚ùå Error: File 'labeled_kandidat_spam.csv' tidak ditemukan. Pastikan file ada di folder yang sama!")
    raise

# Rows without a label cannot be cast to int, and a missing comment would be turned into
# the literal text "nan" by astype(str), so drop incomplete rows before casting.
df = df.dropna(subset=['label', 'comment_text'])

# Make sure labels are integers and comments are strings
df['label'] = df['label'].astype(int)
df['comment_text'] = df['comment_text'].astype(str)

# 2. Split data (80% train, 20% test)
# stratify=df['label'] keeps the spam ratio the same in the train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

print(f"Data Loaded! Total: {len(df)}")
print(f"Distibution Label: \n{df['label'].value_counts()}")
print(f"Training Set: {len(train_df)} baris")
print(f"Testing Set: {len(test_df)} baris")

In [None]:
# @title [STEP 3] Tokenisasi Data
from transformers import BertTokenizer
from datasets import Dataset

# 1. IndoBERT checkpoint identifier and its tokenizer
PRETRAINED_MODEL = "indobenchmark/indobert-base-p1"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)

# 2. Wrap both pandas splits as HuggingFace Dataset objects (train first, then test)
train_dataset, test_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (train_df, test_df)
)

# 3. Tokenization function
def tokenize_function(examples):
    """Encode the comments of a batch with the module-level ``tokenizer``.

    Args:
        examples: Mapping with a "comment_text" entry holding the text(s) to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts, requested with padding to
        ``max_length`` and truncation at 128 tokens.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    comments = examples["comment_text"]
    return tokenizer(comments, **encode_options)

# 4. Apply the tokenizer to both splits in batched mode (train first, then test)
print("‚öôÔ∏è Sedang melakukan tokenisasi...")
tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

print("‚úÖ Tokenisasi Selesai!")

In [None]:
# @title [STEP 4] Setup Model & Metrics
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

# 1. Load IndoBERT for classification (2 labels: spam / not spam)
# The classification head added on top of the pretrained encoder is newly (randomly)
# initialised, and nothing earlier in the notebook seeds torch's RNG. Seed it first so
# repeated runs start from the same head weights. 42 matches the split's random_state.
torch.manual_seed(42)
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL,
    num_labels=2
)

# 2. Metric computation function
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall from an evaluation result.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores, or a tuple whose first element is those scores).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids

    # When the model returns extra outputs, predictions arrive as a tuple with the
    # logits first; unwrap it instead of failing on tuple.argmax.
    raw = pred.predictions
    logits = raw[0] if isinstance(raw, tuple) else raw
    preds = logits.argmax(-1)

    # zero_division=0 makes the "no positive predicted / no positive present" case
    # explicit: the metric is reported as 0 without a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Status line shown once the setup cell has finished.
ready_message = "‚úÖ Model IndoBERT siap dilatih!"
print(ready_message)

In [None]:
# @title [STEP 5] Eksekusi Training
# Training settings - tuned with a target of > 85% accuracy
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=15,             # raised to 15 epochs
    per_device_train_batch_size=8,   # batch size 8 for stability on a local machine
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    # A checkpoint is written after every epoch; without a cap that is 15 full model
    # copies under ./results. With load_best_model_at_end the best checkpoint is always
    # retained, so a limit of 2 keeps the best one plus the most recent one.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy", # the main target is accuracy
    learning_rate=3e-5,
)

# NOTE(review): the test split is used as eval_dataset, so the "best" checkpoint is
# selected on the same data that is later reported as the final result. Consider carving
# a separate validation split out of the training data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

print("üöÄ MEMULAI TRAINING...")
trainer.train()
print("üéâ TRAINING SELESAI!")

In [None]:
# @title [STEP 6] Evaluasi Hasil Akhir
print("üìä Mengevaluasi Model pada Data Test...")
results = trainer.evaluate()

# Report table: (line prefix, key in the evaluation result), printed in this order.
divider = "=" * 30
report_rows = (
    ("üéØ Accuracy  : ", 'eval_accuracy'),
    ("‚≠ê F1-Score  : ", 'eval_f1'),
    ("üéØ Precision : ", 'eval_precision'),
    ("üì° Recall    : ", 'eval_recall'),
)

print("\n" + divider)
print("HASIL AKHIR:")
print(divider)
for row_prefix, metric_key in report_rows:
    print(f"{row_prefix}{results[metric_key]:.4f}")
print(divider)

# Save the model and tokenizer for later use, but only when accuracy is at least 0.85.
target_reached = results['eval_accuracy'] >= 0.85
if not target_reached:
    print("‚ö†Ô∏è Model belum mencapai target 85%. Coba tuning lagi.")
else:
    for artifact in (model, tokenizer):
        artifact.save_pretrained("./indobert-spam-detection-final")
    print("‚úÖ Model berhasil mencapai target (>85%) dan tersimpan di folder 'indobert-spam-detection-final'")

In [None]:
# @title [STEP 7] Demo Prediksi
import torch
import torch.nn.functional as F

def predict_spam(text):
    """Classify one comment with the module-level ``model`` and ``tokenizer``.

    Args:
        text: The comment to classify (a single string).

    Returns:
        tuple: ``(label, confidence)`` where ``label`` is "AMAN" for class 0 or "SPAM"
        for class 1, and ``confidence`` is the softmax probability of that class as a
        float.
    """
    # Inference must not depend on whichever mode the model was left in by an earlier
    # cell: switch to eval mode so dropout is disabled.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Move every input tensor to the device the model lives on.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Work on the first (only) row and take the argmax over the class dimension.
    # A bare torch.argmax(probs) indexes the flattened tensor, which is only a valid
    # class id when there is exactly one row.
    probs = F.softmax(outputs.logits, dim=-1)[0]
    pred_idx = int(torch.argmax(probs, dim=-1).item())
    confidence = probs[pred_idx].item()

    label_map = {0: "AMAN", 1: "SPAM"}
    return label_map[pred_idx], confidence

# Sample comments for the demo
test_texts = [
    "Wah keren banget videonya, semangat terus bang!",
    "Info gacor maxwin hari ini klik link di bio",
    "Halo kak, mau tanya cara install nya gimana?",
    "Situs terpercaya deposit pulsa tanpa potongan"
]

print("--- DEMO PREDIKSI ---")
# map() is lazy, so each comment is classified right before its own lines are printed.
for sample, (verdict, certainty) in zip(test_texts, map(predict_spam, test_texts)):
    print(f"Text: '{sample}'")
    print(f"Prediksi: {verdict} (Yakin: {certainty:.2%})\n")