In [93]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from sklearn.metrics import f1_score, classification_report
import time
import pandas as pd
import os

# Log the TensorFlow version next to the results so a run can be matched to its environment.
print(f"TensorFlow Version: {tf.__version__}")

TensorFlow Version: 2.17.0


In [94]:
# Config & parameters

# Text pipeline: VOCAB_SIZE is passed to the TextVectorization layer as
# `max_tokens` and to the Embedding layer as `input_dim`; MAX_SEQUENCE_LENGTH is
# the fixed `output_sequence_length` every sentence is padded/truncated to.
VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 100

# Model size.
EMBEDDING_DIM = 128
LSTM_UNITS = 64
DROPOUT_RATE = 0.3

# Training schedule.
BATCH_SIZE = 64
EPOCHS = 5

# Reproducibility: seed Python's `random`, NumPy and TensorFlow in one call so
# weight initialisation and dropout are repeatable across
# "Restart Kernel -> Run All" executions.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

In [95]:
# Resolve the dataset folder relative to the current working directory.
# abspath collapses the ".." hops and normalises the separators, so the printed
# path (and any error message built from it) shows the real location on disk.
DATASET_BASE_DIR = os.path.abspath(
    os.path.join(os.getcwd(), "..", "..", "dataset", "nusax-sentiment")
)

print(f"Dataset base directory: {DATASET_BASE_DIR}")

# Fail early with a readable message instead of a pandas error on the first CSV.
if not os.path.isdir(DATASET_BASE_DIR):
    raise FileNotFoundError(
        f"Dataset directory not found: {DATASET_BASE_DIR} "
        "(the path is resolved relative to the current working directory)"
    )

train_dataset = pd.read_csv(os.path.join(DATASET_BASE_DIR, "train.csv"))
valid_dataset = pd.read_csv(os.path.join(DATASET_BASE_DIR, "valid.csv"))
test_dataset = pd.read_csv(os.path.join(DATASET_BASE_DIR, "test.csv"))

# The following cell reads the `text` and `label` columns of every split;
# check for them here so a schema problem is reported where the data is loaded.
for _split_name, _split_frame in (
    ("train", train_dataset),
    ("valid", valid_dataset),
    ("test", test_dataset),
):
    _missing_columns = {"text", "label"} - set(_split_frame.columns)
    if _missing_columns:
        raise KeyError(
            f"{_split_name}.csv is missing column(s): {sorted(_missing_columns)}"
        )

Dataset base directory: c:\ITB\semester_6\ml\tubes_2\Tubes-ML-2\src\lstm\keras\../../dataset/nusax-sentiment/


In [96]:

# Integer id for each sentiment class name. The model is compiled with
# `sparse_categorical_crossentropy`, which needs integer class ids as targets.
# NOTE(review): assumes the CSVs store sentiment either as integer ids already or
# as the names below — confirm against the files and adjust the mapping if needed.
LABEL_TO_ID = {"negative": 0, "neutral": 1, "positive": 2}


def encode_labels(label_column):
    """Return the labels of one split as a list of integer class ids.

    Args:
        label_column: pandas Series holding either numeric class ids or
            class-name strings (matched case-insensitively, surrounding
            whitespace ignored).

    Returns:
        A list of Python ints, one per row.

    Raises:
        ValueError: If a string label is not a key of ``LABEL_TO_ID``.
    """
    if pd.api.types.is_numeric_dtype(label_column):
        # Already numeric: keep the values, just make sure they are integers.
        return label_column.astype("int64").tolist()

    label_names = label_column.astype(str).str.strip().str.lower()
    unknown_names = sorted(set(label_names) - set(LABEL_TO_ID))
    if unknown_names:
        raise ValueError(f"Unexpected label value(s): {unknown_names}")
    return label_names.map(LABEL_TO_ID).tolist()


train_texts = train_dataset['text'].tolist()
train_labels = encode_labels(train_dataset['label'])

val_texts = valid_dataset['text'].tolist()
val_labels = encode_labels(valid_dataset['label'])

test_texts = test_dataset['text'].tolist()
test_labels = encode_labels(test_dataset['label'])

print(f"\nTraining samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")
print(f"Test samples: {len(test_texts)}")


Training samples: 500
Validation samples: 100
Test samples: 400


In [97]:
print("--- Creating and Adapting TextVectorization Layer ---")
# Tokeniser shared by the models: keeps at most VOCAB_SIZE tokens and emits a
# fixed-length sequence of MAX_SEQUENCE_LENGTH token ids per sentence.
vectorizer = tf.keras.layers.TextVectorization(
    name="text_vectorization",
    output_sequence_length=MAX_SEQUENCE_LENGTH,
    max_tokens=VOCAB_SIZE,
)
# The vocabulary is learned from the training split only.
vectorizer.adapt(train_texts)

# Sanity check: show the first training sentence before and after vectorization.
first_sentence = train_texts[0]
print("\nOriginal sentence:")
print(first_sentence)
print("\nVectorized sentence:")
print(vectorizer([first_sentence]))

--- Creating and Adapting TextVectorization Layer ---

Original sentence:
Nikmati cicilan 0% hingga 12 bulan untuk pemesanan tiket pesawat air asia dengan kartu kredit bni!

Vectorized sentence:
tf.Tensor(
[[1758 1080 1145  196 2834  198   11  607  177  847  283  547    7  507
   496  415    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]], shape=(1, 100), dtype=int64)


In [98]:
def create_tf_dataset(texts, labels, shuffle=False, seed=None):
    """Wrap parallel lists of texts and labels in a batched, prefetched dataset.

    Args:
        texts: Sequence of raw sentences.
        labels: Sequence of labels, aligned with ``texts``.
        shuffle: If true, shuffle the examples (before batching) and reshuffle
            on every pass over the dataset. Defaults to False, which keeps the
            original file order.
        seed: Optional seed forwarded to ``Dataset.shuffle``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(texts, labels)`` batches of at most
        ``BATCH_SIZE`` examples.
    """
    dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
    if shuffle:
        # A buffer as large as the split gives a full uniform shuffle; shuffling
        # before `batch` changes batch membership from epoch to epoch.
        dataset = dataset.shuffle(
            buffer_size=max(len(texts), 1),
            seed=seed,
            reshuffle_each_iteration=True,
        )
    return dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# `Model.fit` does not shuffle `tf.data.Dataset` inputs, so the training split is
# shuffled here; validation and test keep their file order for stable evaluation.
train_ds = create_tf_dataset(train_texts, train_labels, shuffle=True)
val_ds = create_tf_dataset(val_texts, val_labels)
test_ds = create_tf_dataset(test_texts, test_labels)

In [99]:
def build_model(is_bidirectional, units=LSTM_UNITS, num_classes=3):
    """Build and compile a text classifier with a single (Bi)LSTM layer.

    The model takes raw strings, tokenises them with the module-level
    ``vectorizer``, embeds the token ids, runs one LSTM layer (optionally
    wrapped in ``Bidirectional``), applies dropout and ends in a softmax layer.

    Args:
        is_bidirectional: If true, wrap the LSTM layer in ``Bidirectional``;
            otherwise use a plain LSTM layer.
        units: ``units`` argument of the LSTM layer. Defaults to ``LSTM_UNITS``.
        num_classes: Width of the softmax output layer. Defaults to 3, the
            value that was previously hard-coded.

    Returns:
        A compiled ``tf.keras.Model`` named ``Bidirectional_LSTM_Model`` or
        ``Unidirectional_LSTM_Model``.

    Raises:
        ValueError: If ``num_classes`` is smaller than 2.
    """
    if num_classes < 2:
        raise ValueError(f"num_classes must be at least 2, got {num_classes}")

    inputs = tf.keras.Input(shape=(1,), dtype=tf.string, name="input_text")
    x = vectorizer(inputs)
    x = tf.keras.layers.Embedding(
        input_dim=VOCAB_SIZE,
        output_dim=EMBEDDING_DIM,
        name="embedding"
    )(x)

    if is_bidirectional:
        x = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(units, name="lstm_layer"),
            name="bidirectional_lstm"
        )(x)
    else:
        x = tf.keras.layers.LSTM(units, name="lstm_layer")(x)

    x = tf.keras.layers.Dropout(DROPOUT_RATE, name="dropout")(x)
    outputs = tf.keras.layers.Dense(num_classes, activation="softmax", name="output")(x)

    model_type = "Bidirectional" if is_bidirectional else "Unidirectional"
    model = tf.keras.Model(inputs, outputs, name=f"{model_type}_LSTM_Model")

    # The sparse loss expects integer class ids as targets, not one-hot vectors.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    return model


In [100]:
# Build the unidirectional model first, then the bidirectional one.
uni_model, bi_model = (
    build_model(is_bidirectional=use_both_directions)
    for use_both_directions in (False, True)
)

# Visual divider printed between the two summaries.
divider = "\n" + "=" * 50 + "\n"

print("--- Unidirectional Model Summary ---")
uni_model.summary()

print(divider)

print("--- Bidirectional Model Summary ---")
bi_model.summary()

--- Unidirectional Model Summary ---




--- Bidirectional Model Summary ---


In [101]:
# NOTE(review): this imports a private TensorFlow module (`tensorflow.python...`) and
# the name is not used in any of the visible cells — looks like a debugging leftover.
# TODO confirm it is unused, then remove it (or move it to the import cell at the top).
from tensorflow.python.framework.tensor_util import make_tensor_proto

In [102]:
# NOTE(review): the output saved with this cell ends in
# "OverflowError: Python int too large to convert to C long" during epoch 1.
# No traceback was recorded, so the cause is unconfirmed. Things to check first:
# (1) that the `label` column really holds integer class ids, and (2) whether
# running the TextVectorization step in the tf.data pipeline instead of inside
# the model avoids the error on this platform — TODO confirm.
print("--- Training Unidirectional Model ---")
# perf_counter is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
uni_history = uni_model.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)
uni_train_time = time.perf_counter() - start_time
print(f"\nTraining finished in {uni_train_time:.2f} seconds.")

--- Training Unidirectional Model ---
Epoch 1/5


OverflowError: Python int too large to convert to C long