In [None]:
# Report the installed versions of the data-loading libraries.
import datasets
import fsspec

for library in (datasets, fsspec):
    print(f"{library.__name__} version:", library.__version__)

datasets version: 3.6.0
fsspec version: 2025.3.0


In [None]:
pip install -U datasets



In [None]:
import pandas as pd
import tensorflow as tf
from datasets import Dataset, load_dataset, DatasetDict, ClassLabel
from transformers import BertTokenizer, TFAutoModelForSequenceClassification

# === A. Clean the CSV (drop rows whose text or label is missing or blank) ===
required_columns = ["text_final", "Status_Normalized"]

# Drop rows where the text or the label is NaN.
df = pd.read_csv('/content/news.csv').dropna(subset=required_columns)

# Strip surrounding whitespace and KEEP the stripped values, so that a label
# such as " HOAX" is stored as "HOAX" and still matches the ClassLabel names
# used when the CSV is loaded as a dataset. `astype(str)` keeps this working
# even if pandas parsed a column as a non-string dtype.
df = df.assign(
    **{column: df[column].astype(str).str.strip() for column in required_columns}
)

# Drop rows whose text or label is empty once whitespace has been removed.
df = df[(df[required_columns] != "").all(axis=1)]

# Save the cleaned CSV to a new file
clean_csv_path = "/content/data_clean.csv"
df.to_csv(clean_csv_path, index=False)

In [None]:
# === B. Load the dataset from the CSV and label-encode it ===
# ClassLabel maps the label strings to integers automatically; the integer for
# each label is its position in `names`.
label_feature = ClassLabel(names=["HOAX", "FAKTA"])

# Read the cleaned CSV as a Hugging Face dataset (single "train" split) and
# cast the label column to the ClassLabel feature.
raw_ds = load_dataset("csv", data_files={"train": clean_csv_path})
raw_ds = raw_ds.cast_column("Status_Normalized", label_feature)

# Hold out 10% of the rows for validation (90:10 split, fixed seed).
split = raw_ds["train"].train_test_split(seed=42, test_size=0.1)
dataset = DatasetDict(train=split["train"], validation=split["test"])

Generating train split: 0 examples [00:00, ? examples/s]

Casting the dataset:   0%|          | 0/1483 [00:00<?, ? examples/s]

In [None]:
# === C. Tokenisation ===
model_name = "indobenchmark/indobert-lite-base-p2"
# NOTE(review): loading this checkpoint through BertTokenizer emits a
# "tokenizer class ... is not the same type" warning because the checkpoint
# declares AlbertTokenizer. Presumably BertTokenizer is intended since the repo
# ships a vocab.txt — confirm against the model card.
tokenizer = BertTokenizer.from_pretrained(model_name)


def preprocess_fn(examples):
    """Tokenise one batch of dataset rows.

    Args:
        examples: Batch mapping as supplied by `map(..., batched=True)`; only
            its "text_final" entry is read.

    Returns:
        The tokenizer output for the batch, truncated and padded to
        `max_length=128`.
    """
    encode_kwargs = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(examples["text_final"], **encode_kwargs)


# Tokenise both splits, drop the raw text column, then rename the label column
# to "labels".
tokenized_ds = dataset.map(
    preprocess_fn, batched=True, remove_columns=["text_final"]
).rename_column("Status_Normalized", "labels")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


Map:   0%|          | 0/1334 [00:00<?, ? examples/s]

Map:   0%|          | 0/149 [00:00<?, ? examples/s]

In [None]:
# === D. Set the output format to TensorFlow ===
# Expose only the columns the model needs: input_ids, attention_mask, labels.
tokenized_ds.set_format(
    type="tensorflow",
    columns=["input_ids", "attention_mask", "labels"],
)


# === E. Convert to tf.data.Dataset, casting the labels to tf.int32 ===
def to_tf_dataset(split_name):
    """Build an unbatched `tf.data.Dataset` of (features, label) pairs.

    Args:
        split_name: Key of the split inside `tokenized_ds` (e.g. "train").

    Returns:
        A dataset created with `from_tensor_slices` whose elements are
        `({"input_ids": ..., "attention_mask": ...}, label)`, with the label
        cast to `tf.int32`.
    """
    split_ds = tokenized_ds[split_name]
    model_inputs = {
        column: split_ds[column] for column in ("input_ids", "attention_mask")
    }
    int_labels = tf.cast(split_ds["labels"], tf.int32)
    return tf.data.Dataset.from_tensor_slices((model_inputs, int_labels))

batch_size = 16

# Size the shuffle buffer to the whole training split. A fixed buffer of 1000 is
# smaller than the training set (1334 rows in the recorded run), which shuffles
# only within a sliding window instead of uniformly over all examples.
shuffle_buffer_size = tokenized_ds["train"].num_rows

# Training pipeline: shuffle (seeded) -> batch -> prefetch.
train_ds = (
    to_tf_dataset("train")
    .shuffle(shuffle_buffer_size, seed=42)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
# Validation pipeline: same batching, but no shuffling.
valid_ds = (
    to_tf_dataset("validation")
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
from transformers import create_optimizer

# === F. Load, compile, and fine-tune the model ===
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2  # number of classes, matching the ClassLabel mapping
)

# Single source of truth for the epoch count: it sets both the length of the
# learning-rate schedule and the number of epochs passed to `model.fit`, so the
# two can no longer drift apart.
num_epochs = 3

# Total number of training steps (`train_ds` is batched, so its length is the
# number of batches per epoch).
steps_per_epoch = len(train_ds)
num_train_steps = steps_per_epoch * num_epochs

# Build the optimizer and its learning-rate schedule
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
    weight_decay_rate=0.01
)

# from_logits=True: the loss is given unnormalised scores, not probabilities.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])

# Keep the History object so the per-epoch metrics can be inspected afterwards
# instead of only showing its repr as the cell output.
history = model.fit(train_ds, validation_data=valid_ds, epochs=num_epochs)

tf_model.h5:   0%|          | 0.00/63.1M [00:00<?, ?B/s]

Some layers from the model checkpoint at indobenchmark/indobert-lite-base-p2 were not used when initializing TFAlbertForSequenceClassification: ['sop_classifier']
- This IS expected if you are initializing TFAlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFAlbertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-lite-base-p2 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7d501d160510>

In [None]:
!pip install tensorflowjs

Collecting tensorflowjs
  Downloading tensorflowjs-4.22.0-py3-none-any.whl.metadata (3.2 kB)
Collecting packaging~=23.1 (from tensorflowjs)
  Downloading packaging-23.2-py3-none-any.whl.metadata (3.2 kB)
Downloading tensorflowjs-4.22.0-py3-none-any.whl (89 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.1/89.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading packaging-23.2-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: packaging, tensorflowjs
  Attempting uninstall: packaging
    Found existing installation: packaging 24.2
    Uninstalling packaging-24.2:
      Successfully uninstalled packaging-24.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-cloud-bigquery 3.33.0 requires packa

In [None]:
# === G. Simpan Model ke Berbagai Format ===
# SavedModel
saved_model_dir = "./indobert_tf_savedmodel"
model.save(saved_model_dir, save_format="tf")

# HDF5
h5_path = "./indobert_tf_model.h5"
model.save_weights(h5_path, save_format="h5")

# TF Lite
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
tflite_model = converter.convert()
with open("./indobert_model.tflite", "wb") as f:
    f.write(tflite_model)

# TFJS
!python -m tensorflowjs.converters.converter \
  --input_format=keras \
  ./indobert_tf_model.h5 \
  ./indobert_tfjs_model

2025-05-30 06:51:53.329826: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748587913.349464    2775 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748587913.355350    2775 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[32m🌲 Try [0m[34mhttps://ydf.readthedocs.io[0m[32m, the successor of TensorFlow Decision Forests with more features and faster training![0m


In [None]:
# Compress the folders into zip files
!zip -r /content/indobert_tf_savedmodel.zip ./indobert_tf_savedmodel
!zip -r /content/indobert_tfjs_model.zip ./indobert_tfjs_model

# Download the zip files
from google.colab import files
files.download('/content/indobert_tf_savedmodel.zip')
files.download('/content/indobert_tfjs_model.zip')

  adding: indobert_tf_savedmodel/ (stored 0%)
  adding: indobert_tf_savedmodel/keras_metadata.pb (deflated 89%)
  adding: indobert_tf_savedmodel/variables/ (stored 0%)
  adding: indobert_tf_savedmodel/variables/variables.data-00000-of-00001 (deflated 18%)
  adding: indobert_tf_savedmodel/variables/variables.index (deflated 71%)
  adding: indobert_tf_savedmodel/fingerprint.pb (stored 0%)
  adding: indobert_tf_savedmodel/saved_model.pb (deflated 94%)
  adding: indobert_tf_savedmodel/assets/ (stored 0%)
  adding: indobert_tfjs_model/ (stored 0%)
  adding: indobert_tfjs_model/group1-shard6of12.bin (deflated 7%)
  adding: indobert_tfjs_model/group1-shard2of12.bin (deflated 7%)
  adding: indobert_tfjs_model/group1-shard8of12.bin (deflated 7%)
  adding: indobert_tfjs_model/group1-shard10of12.bin (deflated 7%)
  adding: indobert_tfjs_model/model.json (deflated 89%)
  adding: indobert_tfjs_model/group1-shard5of12.bin (deflated 7%)
  adding: indobert_tfjs_model/group1-shard1of12.bin (deflated 7%

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Download the HDF5 weights file and the TFLite model
for artifact_path in (h5_path, "./indobert_model.tflite"):
    files.download(artifact_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Inference**

In [None]:
import tensorflow as tf
import numpy as np
from scipy.special import softmax
from transformers import BertTokenizer
import time

# Load the tokenizer and the exported SavedModel
saved_model_dir = "./indobert_tf_savedmodel"
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-lite-base-p2")
model = tf.keras.models.load_model(saved_model_dir)

# Class index -> label name, in the same order as the ClassLabel names used
# when the training labels were encoded.
label_map = dict(enumerate(["HOAX", "FAKTA"]))

# Tulis input paragraf langsung
input_text = """
Kepolisian Daerah (Polda) Metro Jaya membongkar paksa bangunan yang didirikan organisasi
masyarakat (ormas) Gerakan Rakyat Indonesia Bersatu (GRIB) Jaya di Pondok Betung, Pondok Aren,
Tangerang Selatan, Banten. Lahan itu sebelumnya diklaim Badan Meteorologi, Klimatologi, Geofisika
(BMKG) sebagai milik mereka.
"""
texts = [input_text]

# Tokenise the batch (same max_length as used for training)
inputs = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="tf"
)

# Inference (timed)
start = time.time()
output = model(inputs)
logits = output["logits"].numpy()
probs = softmax(logits, axis=1)[0]  # probabilities for the single input
pred_index = int(np.argmax(probs))
pred_label = label_map[pred_index]
confidence = probs[pred_index]
end = time.time()

# Custom label with a confidence threshold.
# With two classes the highest softmax probability is always >= 0.5, so the
# previous threshold of 0.3 could never select "Belum Terverifikasi". The
# threshold has to lie in (0.5, 1) for the fallback to be reachable.
# TODO: 0.7 is a starting point — tune it on validation data.
threshold = 0.7
final_label = "Belum Terverifikasi" if confidence < threshold else pred_label

# Print the results: input text, per-class probabilities, predicted label,
# top confidence, thresholded label and elapsed inference time.
report_lines = ["\n📝 Teks:", input_text, "\n📊 Probabilitas:"]
report_lines.extend(
    f"- {label}: {probs[i]:.4f}" for i, label in label_map.items()
)
report_lines += [
    f"\n✅ Label Prediksi: {pred_label}",
    f"🎯 Confidence Tertinggi: {confidence:.4f}",
    f"🔖 Label Final (threshold {threshold}): {final_label}",
    f"⏱️ Waktu Inference: {end - start:.2f} detik",
]
print("\n".join(report_lines))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizer'. 
The class this function is called from is 'BertTokenizer'.



📝 Teks:

Kepolisian Daerah (Polda) Metro Jaya membongkar paksa bangunan yang didirikan organisasi 
masyarakat (ormas) Gerakan Rakyat Indonesia Bersatu (GRIB) Jaya di Pondok Betung, Pondok Aren, 
Tangerang Selatan, Banten. Lahan itu sebelumnya diklaim Badan Meteorologi, Klimatologi, Geofisika 
(BMKG) sebagai milik mereka.


📊 Probabilitas:
- HOAX: 0.0030
- FAKTA: 0.9970

✅ Label Prediksi: FAKTA
🎯 Confidence Tertinggi: 0.9970
🔖 Label Final (threshold 0.3): FAKTA
⏱️ Waktu Inference: 0.95 detik


In [None]:
# `df_test` was never defined in this notebook, so this cell raised a NameError.
# Build it from the inference results before exporting.
# NOTE(review): assumes the intent was to export the prediction(s) produced by
# the inference cell — adjust the columns if a separate test set was meant.
import pandas as pd

df_test = pd.DataFrame(
    {
        "text": [text.strip() for text in texts],
        "predicted_label": [pred_label],
        "confidence": [float(confidence)],
        "final_label": [final_label],
    }
)
df_test.to_excel("result.xlsx", index=False)

NameError: name 'df_test' is not defined