In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers.optimization_tf import AdamWeightDecay # ADDED: AdamW recommended for BERT fine-tuning
import matplotlib.pyplot as plt

# ==== IndoBERT Model Configuration ====
# Publicly accessible IndoBERT base checkpoint used throughout this script.
MODEL_NAME = "indobenchmark/indobert-base-p1"
NUM_LABELS = 3  # passed as num_labels when the classification model is loaded
MAX_LEN = 60  # fixed sequence length used by the tokenizer (pad/truncate)

# TensorFlow configuration: restrict the TF logger to ERROR-level messages.
tf.get_logger().setLevel("ERROR")

# ==== 1️⃣ Load Data ====
# Load the cleaned tweet dataset. When the CSV is missing, fall back to a small
# synthetic dataset so the rest of the pipeline can still be exercised.
try:
    # Change this path if the file lives somewhere else
    df = pd.read_csv("../data/dataset_clean.csv")
except FileNotFoundError:
    print("WARNING: 'dataset_clean.csv' not found. Creating dummy data.")
    # Each sample text is stored together with its own label. Building both
    # columns from the same list of pairs guarantees they have equal length
    # (a dict of unequal-length lists makes pd.DataFrame raise ValueError)
    # and that every label actually matches its text.
    dummy_samples = [
        ("pelayanan sangat buruk sekali", "negatif"),
        ("produk ini lumayan lah", "netral"),
        ("sangat bagus sekali puas", "positif"),
        ("netral saja tidak ada komentar", "netral"),
        ("parah banget kecewa", "negatif"),
        ("ini oke punya", "positif"),
    ]
    # 6 samples * 300 repetitions = 1800 rows, 600 per class
    df = pd.DataFrame(dummy_samples * 300, columns=['clean_tweet', 'sentimen'])

print(f"Loaded dataset: {df.shape}")
print(df['sentimen'].value_counts())

# ==== 2️⃣ Encode Label ====
# Map the string sentiment labels to integer ids and keep the encoder's
# class list so predictions can be reported with their original names.
label_encoder = LabelEncoder()
label_encoder.fit(df['sentimen'])
df['label'] = label_encoder.transform(df['sentimen'])
classes = label_encoder.classes_
print("Classes:", classes)

# ==== 3️⃣ Split Data ====
# Step 1: keep 70% for training and hold out 30%, stratified by sentiment.
X_train, X_temp, y_train, y_temp = train_test_split(
    df['clean_tweet'],
    df['label'],
    stratify=df['sentimen'],
    test_size=0.3,
    random_state=42,
)
# Step 2: cut the held-out part in half to get validation and test sets,
# stratified by the encoded label.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=0.5,
    random_state=42,
)
# Convert the label Series to numpy arrays before handing them to TF
y_train = y_train.values
y_val = y_val.values
y_test = y_test.values

print(f"Sizes -> train: {X_train.shape}, val: {X_val.shape}, test: {X_test.shape}")

# ==== 4️⃣ Tokenize with the IndoBERT Tokenizer ====
# Load the tokenizer that belongs to the chosen IndoBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_data(texts):
    """Tokenize texts with the IndoBERT tokenizer.

    Args:
        texts: The texts to encode. May be an object exposing ``tolist()``
            (e.g. a pandas Series or numpy array), any other iterable of
            strings (list, tuple), or a single string, which is treated as
            a batch of one.

    Returns:
        The tokenizer output as TensorFlow tensors, with every sequence
        padded or truncated to ``MAX_LEN`` tokens.
    """
    if isinstance(texts, str):
        # A bare string must not be split into characters by list()
        batch = [texts]
    elif hasattr(texts, "tolist"):
        batch = texts.tolist()
    else:
        batch = list(texts)

    return tokenizer(
        batch,
        max_length=MAX_LEN,
        truncation=True,
        padding='max_length',
        return_tensors='tf'  # return TensorFlow tensors
    )

# Tokenize every split (train, validation, test) in that order
X_train_enc, X_val_enc, X_test_enc = (
    tokenize_data(split) for split in (X_train, X_val, X_test)
)

print("\nData Tokenization done for IndoBERT.")

# ==== 5️⃣ Load the IndoBERT Model for Classification ====
# TFAutoModelForSequenceClassification puts an output (Dense) layer on top of
# BERT. from_pt=True is deliberately not passed; it was removed to avoid a
# PyTorch vulnerability error.
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# Print the model summary
print("\nIndoBERT Model Architecture:")
model.summary()

# ==== 6️⃣ Compile ====
# The model outputs raw logits (no softmax), hence from_logits=True.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# AdamW is the optimizer recommended for BERT fine-tuning; the AdamWeightDecay
# class from transformers is used to avoid a Keras/TF serialization error.
optimizer = AdamWeightDecay(weight_decay_rate=0.01, learning_rate=2e-5)
model.compile(loss=loss_fn, optimizer=optimizer, metrics=['accuracy'])

# ==== 7️⃣ Callbacks (removed to work around an error) ====
# No callbacks are passed to fit(): they were dropped to avoid a conflict
# between transformers and tf.keras.callbacks. Without EarlyStopping, the
# number of epochs is kept small (5) to limit overfitting.

# ==== 8️⃣ Train (Fine-Tuning) ====
print("\nStarting IndoBERT Fine-Tuning...")
# The encoded inputs hold the tokenizer output (input_ids, attention_mask)
history = model.fit(
    x=X_train_enc,
    y=y_train,
    batch_size=32,
    epochs=5,  # few epochs, since there is no EarlyStopping
    verbose=2,
    validation_data=(X_val_enc, y_val),
)
print("Fine-Tuning finished.")

# ==== 9️⃣ Evaluate ====
# Evaluation uses the weights as they are after the final epoch.

# The model returns logits; the predicted class is the index of the largest
# logit in each row.
test_outputs = model.predict(X_test_enc, verbose=0)
logits = test_outputs.logits
y_pred = np.argmax(logits, axis=1)

print("\nClassification Report (Test Data - IndoBERT):")
report = classification_report(y_test, y_pred, target_names=classes)
print(report)

# ==== 📊 Plot Curves ====
# Two side-by-side panels (accuracy on the left, loss on the right), each
# showing the training and validation curve recorded in `history`.
plt.figure(figsize=(12, 5))

panels = (
    ('accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy',
     'Accuracy Over Epochs (IndoBERT)', 'Accuracy'),
    ('loss', 'val_loss', 'Train Loss', 'Validation Loss',
     'Loss Over Epochs (IndoBERT)', 'Loss'),
)

for slot, (train_key, val_key, train_label, val_label, title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()


Loaded dataset: (1815, 3)
sentimen
positif    612
netral     607
negatif    596
Name: count, dtype: int64
Classes: ['negatif' 'netral' 'positif']
Sizes -> train: (1270,), val: (272,), test: (273,)

Data Tokenization done for IndoBERT.


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



IndoBERT Model Architecture:
Model: "tf_bert_for_sequence_classification_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  124441344 
                                                                 
 dropout_303 (Dropout)       multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
Total params: 124443651 (474.71 MB)
Trainable params: 124443651 (474.71 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________

Starting IndoBERT Fine-Tuning...


AttributeError: 'EarlyStopping' object has no attribute '_implements_train_batch_hooks'