<a href="https://colab.research.google.com/github/DysnomiaBorealis/Spam_SMS_Redux/blob/main/Spam_SMS_Redux.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Initialize

In [1]:
!pip install -q "transformers>=4.42" datasets evaluate scikit-learn accelerate

# ===== 1) Imports & setup =====
from datasets import load_dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer, set_seed)
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import numpy as np
import torch, os

# Seed the random number generators so runs are repeatable.
set_seed(42)

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

Device: cuda


Load the dataset from Google Drive

In [9]:
import os
from datasets import load_dataset, DatasetDict, ClassLabel
from google.colab import drive
if not os.path.exists("/content/drive/MyDrive"):
    drive.mount("/content/drive")
else:
    print("Google Drive already mounted.")

DATA_DIR = "/content/drive/MyDrive/SMS_Spam_Data_Preprocessed"
os.makedirs(DATA_DIR, exist_ok=True)

csv_path = f"{DATA_DIR}/indo_spam.csv"

if not os.path.exists(csv_path):
    !pip install -q gdown
    !gdown --id 1IaSDTNUo6_TqkDiTrs5y9m7386Nb-NJz -O {csv_path}
else:
    print(f"Dataset already exists at {csv_path}")

raw = load_dataset("csv", data_files={"full": csv_path})
class_names = ["ham", "spam"]
raw_proc = raw["full"].map(map_label, remove_columns=raw["full"].column_names)
raw_proc = raw_proc.cast_column("label", ClassLabel(names=class_names))
raw_proc = raw_proc.train_test_split(test_size=0.2, seed=42, stratify_by_column="label")
tmp = raw_proc["train"].train_test_split(test_size=0.25, seed=42, stratify_by_column="label")

dataset = DatasetDict({
    "train": tmp["train"],
    "eval": tmp["test"],
    "test": raw_proc["test"]
})

print(dataset)

Google Drive already mounted.
Dataset already exists at /content/drive/MyDrive/SMS_Spam_Data_Preprocessed/indo_spam.csv


Casting the dataset:   0%|          | 0/2045 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'Teks'],
        num_rows: 1227
    })
    eval: Dataset({
        features: ['label', 'Teks'],
        num_rows: 409
    })
    test: Dataset({
        features: ['label', 'Teks'],
        num_rows: 409
    })
})


Load the IndoBERT tokenizer and tokenize the dataset

In [10]:
MODEL_NAME = "indobenchmark/indobert-base-p2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tok(batch):
    """Tokenize the ``Teks`` field of a batch.

    Sequences are truncated to 256 tokens and left unpadded; padding is
    applied per batch by the data collator.
    """
    texts = batch["Teks"]
    return tokenizer(texts, truncation=True, padding=False, max_length=256)


# The collator pads each batch dynamically at load time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize every split, drop the raw text column, and expose torch tensors.
tokenized = dataset.map(tok, batched=True, remove_columns=["Teks"])
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/1227 [00:00<?, ? examples/s]

Map:   0%|          | 0/409 [00:00<?, ? examples/s]

Map:   0%|          | 0/409 [00:00<?, ? examples/s]

Model and Metrics

In [11]:
# Label mappings stored in the model config so predictions can be shown by name.
LABEL2ID = {"ham": 0, "spam": 1}
ID2LABEL = {idx: name for name, idx in LABEL2ID.items()}

# Load the pretrained encoder with a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)
model = model.to(device)

# ===== 5) Metrics =====
def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 (positive class = label 1) from logits.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    argmax over the last axis of the logits.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predicted_ids)
    positive_f1 = f1_score(labels, predicted_ids, average="binary", pos_label=1)
    return {"accuracy": accuracy, "f1": positive_f1}

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Training Setup

In [13]:
import inspect

OUT_DIR = "/content/drive/MyDrive/SMS_Spam_Results_Clean"

# Evaluate and checkpoint once per epoch; after training, reload the checkpoint
# with the highest eval F1.
args = TrainingArguments(
    output_dir=f"{OUT_DIR}/checkpoints",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=50,
    report_to="none",
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    seed=42,
)

# Newer transformers releases renamed Trainer's `tokenizer` argument to
# `processing_class` and deprecated the old name. Pick whichever keyword the
# installed version accepts so the cell runs on both old and new releases.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["eval"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()
eval_res = trainer.evaluate()
print("Eval:", eval_res)


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1572,0.081614,0.982885,0.976431
2,0.0781,0.113203,0.97555,0.966667
3,0.0209,0.106278,0.982885,0.976271
4,0.0003,0.109224,0.982885,0.976271


Eval: {'eval_loss': 0.08161398023366928, 'eval_accuracy': 0.9828850855745721, 'eval_f1': 0.9764309764309764, 'eval_runtime': 0.7609, 'eval_samples_per_second': 537.527, 'eval_steps_per_second': 68.341, 'epoch': 4.0}


In [14]:
# NOTE(review): this calls train() a second time on the same Trainer, so the model
# that the previous cell already fine-tuned is trained for another full run.
# The metrics below and the model saved later therefore depend on this cell being
# executed in addition to the first training cell -- confirm this is intentional.
trainer.train()
# Re-evaluate on the eval split after the second run and show the metrics.
eval_res = trainer.evaluate()
print("Eval:", eval_res)

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0622,0.089265,0.982885,0.976589
2,0.0345,0.093023,0.98533,0.979866
3,0.0011,0.135982,0.977995,0.969697
4,0.0001,0.132591,0.982885,0.976271


Eval: {'eval_loss': 0.09302283823490143, 'eval_accuracy': 0.9853300733496333, 'eval_f1': 0.9798657718120806, 'eval_runtime': 0.7824, 'eval_samples_per_second': 522.73, 'eval_steps_per_second': 66.46, 'epoch': 4.0}


Test Report

In [15]:
# Run the trained model on the held-out test split.
pred = trainer.predict(tokenized["test"])
y_pred = pred.predictions.argmax(axis=-1)
# Take the ground-truth labels from the prediction output: they are returned
# alongside the predictions, so they are aligned with them and do not depend on
# how the dataset is currently formatted.
y_true = pred.label_ids

print("\nClassification report (test):\n",
      classification_report(y_true, y_pred, target_names=["ham","spam"]))
# Rows are true classes and columns are predicted classes, in label-id order.
print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))


Classification report (test):
               precision    recall  f1-score   support

         ham       0.99      0.97      0.98       260
        spam       0.95      0.99      0.97       149

    accuracy                           0.98       409
   macro avg       0.97      0.98      0.98       409
weighted avg       0.98      0.98      0.98       409

Confusion matrix:
 [[253   7]
 [  2 147]]


Save model and tokenizer

In [16]:
# Persist the trained model and its tokenizer side by side so the directory
# can be reloaded on its own for inference.
FINAL_DIR = os.path.join(OUT_DIR, "final_best_model")
trainer.save_model(FINAL_DIR)
tokenizer.save_pretrained(FINAL_DIR)

('/content/drive/MyDrive/SMS_Spam_Results_Clean/final_best_model/tokenizer_config.json',
 '/content/drive/MyDrive/SMS_Spam_Results_Clean/final_best_model/special_tokens_map.json',
 '/content/drive/MyDrive/SMS_Spam_Results_Clean/final_best_model/vocab.txt',
 '/content/drive/MyDrive/SMS_Spam_Results_Clean/final_best_model/added_tokens.json',
 '/content/drive/MyDrive/SMS_Spam_Results_Clean/final_best_model/tokenizer.json')

Inference helper

In [17]:
# Reload the saved artifacts from disk for inference.
infer_tok = AutoTokenizer.from_pretrained(FINAL_DIR)
infer_model = AutoModelForSequenceClassification.from_pretrained(FINAL_DIR)
infer_model = infer_model.to(device)
# Switch to evaluation mode for inference.
infer_model.eval()

def predict(text: str):
    """Classify a single message, print the result, and return the label id.

    The text is tokenized (truncated to 256 tokens), run through the reloaded
    model without gradient tracking, and the argmax class is looked up in the
    model config's ``id2label`` mapping.
    """
    encoded = infer_tok(text, return_tensors="pt", truncation=True, max_length=256)
    encoded = encoded.to(device)
    with torch.no_grad():
        logits = infer_model(**encoded).logits
    label_id = logits.argmax(dim=-1).item()
    label = infer_model.config.id2label[label_id]
    print(f'"{text}" -> {label} [{label_id}]')
    return label_id

In [22]:
# Sample messages to sanity-check the reloaded model.
demo_messages = [
    "Selamat! Anda mendapat hadiah 10 juta hubungi sekarang.",
    "Rapat pindah ke jam 10 ya, terima kasih.",
    "Jakpot 88, topup 10k dapat 10jt",
    "di tokcer99 anda dapat meraup ratusan juta rupiah",
    "jika saya belum pulang mohon turunkan burung",
]
demo_label_ids = [predict(message) for message in demo_messages]
# Echo the id returned for the final message as the cell's output value.
demo_label_ids[-1]

"Selamat! Anda mendapat hadiah 10 juta hubungi sekarang." -> spam [1]
"Rapat pindah ke jam 10 ya, terima kasih." -> ham [0]
"Jakpot 88, topup 10k dapat 10jt" -> spam [1]
"di tokcer99 anda dapat meraup ratusan juta rupiah" -> spam [1]
"jika saya belum pulang mohon turunkan burung" -> ham [0]


0