In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import matplotlib.pyplot as plt

# Configure TensorFlow logging so that unnecessary warnings are not shown:
# only messages at ERROR level or above are emitted by the TF logger.
tf_logger = tf.get_logger()
tf_logger.setLevel('ERROR')

# ==== 1️⃣ Load Data ====
# The cleaned dataset is expected at DATA_PATH and must provide a
# 'clean_tweet' text column and a 'sentimen' label column.
DATA_PATH = "../data/dataset_clean.csv"
try:
    # Change DATA_PATH if the file lives somewhere else
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    # Fall back to a small synthetic dataset so the rest of the script can
    # still be run for demonstration purposes.
    print("WARNING: 'dataset_clean.csv' not found. Creating dummy data for demonstration.")
    # Every text is stored next to its own label, so both columns are
    # guaranteed to have the same length and each sentence carries the
    # sentiment it actually expresses (600 rows per class).
    dummy_samples = [
        ("pelayanan sangat buruk sekali", "negatif"),
        ("produk ini lumayan lah", "netral"),
        ("sangat bagus sekali puas", "positif"),
        ("netral saja tidak ada komentar", "netral"),
        ("parah banget kecewa", "negatif"),
        ("ini oke punya", "positif"),
    ] * 300
    df = pd.DataFrame(dummy_samples, columns=['clean_tweet', 'sentimen'])

# Rows without a text or a label cannot be used for training; empty cells in
# a CSV are read back as NaN and would otherwise reach the label encoder and
# tokenizer as non-string values.
df = df.dropna(subset=['clean_tweet', 'sentimen'])
df['clean_tweet'] = df['clean_tweet'].astype(str)

print(f"Loaded dataset: {df.shape}")
print(df['sentimen'].value_counts())

# ==== 2️⃣ Encode Label ====
# Fit the encoder on the sentiment strings, then store the integer ids in a
# new 'label' column. `classes` keeps the original label names, indexed by id.
label_encoder = LabelEncoder().fit(df['sentimen'])
df['label'] = label_encoder.transform(df['sentimen'])
classes = label_encoder.classes_
print("Classes:", classes)

# ==== 3️⃣ Split Data ====
# First hold out 30% of the rows, then cut that hold-out in half to obtain the
# validation and test sets. Both splits are stratified so the sentiment
# distribution stays balanced in every subset.
texts, labels = df['clean_tweet'], df['label']
X_train, X_temp, y_train, y_temp = train_test_split(
    texts,
    labels,
    test_size=0.3,
    stratify=df['sentimen'],
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)
print(f"Sizes -> train: {X_train.shape}, val: {X_val.shape}, test: {X_test.shape}")

# ==== 4️⃣ Tokenize ====
MAX_LEN = 60
VOCAB_SIZE = 5000
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
# The vocabulary is fitted on the training texts only
tokenizer.fit_on_texts(X_train)

# Turn every split into integer sequences of length MAX_LEN ('post' padding)
X_train_seq, X_val_seq, X_test_seq = (
    pad_sequences(tokenizer.texts_to_sequences(split_texts), maxlen=MAX_LEN, padding='post')
    for split_texts in (X_train, X_val, X_test)
)

# ==== 5️⃣ Embedding Matrix (Random/Scratch) ====
EMBEDDING_DIM = 300
# No FastText vectors are loaded here, so the matrix is filled with small
# uniformly distributed random values, one row per vocabulary index.
embedding_matrix = np.random.uniform(
    low=-0.05,
    high=0.05,
    size=(VOCAB_SIZE, EMBEDDING_DIM),
)
print("No pretrained FastText found; training embeddings from scratch (fallback).")

# ==== 6️⃣ Build Model (FIXED REGULARIZATION) ====
def build_bilstm(vocab_size, embedding_dim, embedding_matrix, trainable_embed=False,
                 max_len=None, num_classes=None):
    """Build a Bi-LSTM text classifier regularised with dropout.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        embedding_matrix: Array of shape ``(vocab_size, embedding_dim)`` used
            as the initial value of the embedding weights.
        trainable_embed: Whether the embedding weights are updated during
            training.
        max_len: Length of the input sequences. Defaults to the module-level
            ``MAX_LEN``.
        num_classes: Number of units of the softmax output layer. Defaults to
            ``len(classes)`` using the module-level ``classes``.

    Returns:
        An uncompiled ``Sequential`` model.

    Raises:
        ValueError: If ``embedding_matrix`` does not have shape
            ``(vocab_size, embedding_dim)``.
    """
    # Validate first so a mismatching matrix fails with a clear message
    # instead of an opaque error from inside the embedding layer.
    matrix_shape = np.shape(embedding_matrix)
    if matrix_shape != (vocab_size, embedding_dim):
        raise ValueError(
            f"embedding_matrix has shape {matrix_shape}, "
            f"expected {(vocab_size, embedding_dim)}"
        )

    # Fall back to the script-level configuration when not given explicitly.
    if max_len is None:
        max_len = MAX_LEN
    if num_classes is None:
        num_classes = len(classes)

    model = Sequential([
        # An explicit Input replaces the legacy `input_length` argument of
        # Embedding and keeps the model built right after construction.
        tf.keras.Input(shape=(max_len,), dtype='int32'),

        # A Constant initializer replaces the legacy `weights=[...]` argument.
        Embedding(vocab_size, embedding_dim,
                  embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                  trainable=trainable_embed, name='embedding_layer'),

        # High dropout / recurrent_dropout to counter overfitting in the LSTM
        Bidirectional(LSTM(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=False), name='bidirectional_lstm'),

        # Aggressive dropout right after the Bi-LSTM
        Dropout(0.5, name='dropout_1'),

        Dense(64, activation='relu', name='dense_hidden'),

        # Additional dropout for the hidden dense layer
        Dropout(0.5, name='dropout_2'),

        Dense(num_classes, activation='softmax', name='output_layer')
    ])
    return model

# Build the model and print its summary.
# The embedding matrix is randomly initialised (no pretrained vectors are
# loaded), so the embedding layer has to be trainable; leaving it frozen would
# feed fixed random vectors to the LSTM and the embeddings would never be
# "trained from scratch".
model = build_bilstm(VOCAB_SIZE, EMBEDDING_DIM, embedding_matrix, trainable_embed=True)
model.build(input_shape=(None, MAX_LEN))
print("\nModel Architecture:")
model.summary()

# ==== 7️⃣ Compile ====
# Labels are integer ids, hence the sparse variant of categorical cross-entropy.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# ==== 8️⃣ Callbacks (IMPROVED STOPPING) ====
os.makedirs("../models", exist_ok=True)

# Early stopping: end training once val_loss has not improved for 5 epochs and
# go back to the weights that produced the lowest val_loss.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# Keep only the best model (by val_loss) on disk.
best_checkpoint = ModelCheckpoint(
    "../models/lstm_fasttext_best.keras",
    monitor='val_loss',
    save_best_only=True,
    verbose=0,
)

# Halve the learning rate when val_loss has not improved for 2 epochs.
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    verbose=1,
    min_lr=1e-6,
)

callbacks = [early_stopping, best_checkpoint, lr_on_plateau]

# ==== 9️⃣ Train (Increased Epochs to allow EarlyStopping to work) ====
print("\nStarting Training...")
# The epoch count is only an upper bound (50); the EarlyStopping callback
# decides when training actually ends. verbose=2 prints one line per epoch.
history = model.fit(
    x=X_train_seq,
    y=y_train,
    batch_size=32,
    epochs=50,
    verbose=2,
    callbacks=callbacks,
    validation_data=(X_val_seq, y_val),
)
print("Training finished.")

# ==== 🔟 Evaluate ====
# Best effort: reload the best weights written by ModelCheckpoint. If that
# fails, evaluation continues with the weights currently held by the model.
try:
    model.load_weights("../models/lstm_fasttext_best.keras")
except Exception as e:
    print(f"\nCould not load best weights (Maybe directory error): {e}")
else:
    print("\nLoaded best model weights for evaluation.")

# The predicted class is the index of the highest softmax probability per row
test_probabilities = model.predict(X_test_seq, verbose=0)
y_pred = test_probabilities.argmax(axis=1)

print("\nClassification Report (Test Data):")
report = classification_report(y_test, y_pred, target_names=classes)
print(report)

# ==== 📊 Plot Curves ====
# One panel per metric: (train key, validation key, train label,
# validation label, panel title, y-axis label).
curve_panels = (
    ('accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy',
     'Accuracy Over Epochs', 'Accuracy'),
    ('loss', 'val_loss', 'Train Loss', 'Validation Loss',
     'Loss Over Epochs', 'Loss'),
)

plt.figure(figsize=(12, 5))

for panel_no, panel in enumerate(curve_panels, start=1):
    train_key, val_key, train_label, val_label, panel_title, y_label = panel
    # Accuracy goes in the left panel, loss in the right one
    plt.subplot(1, 2, panel_no)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()




ValueError: All arrays must be of the same length