# In [None]:
import tensorflow as tf
import transformers
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from transformers import TFXLNetModel, XLMTokenizer, XLNetTokenizer

# Fine-tune a pre-trained XLNet encoder with a small binary-classification head.
#
# Pipeline: tokenize raw texts -> frozen XLNet encoder -> summary vector taken
# from the final sequence position -> Dense head -> sigmoid probability.

# Choose model and tokenizer. The tokenizer class must match the checkpoint:
# an XLNet checkpoint ships an XLNet (SentencePiece) vocabulary.
# NOTE: "xlnet-base-cased" is pre-trained on English text; swap in a different
# checkpoint if your data is in another language.
model_name = "xlnet-base-cased"
tokenizer = XLNetTokenizer.from_pretrained(model_name)

# Fixed sequence length used for padding/truncation. XLNet has no built-in
# maximum length, so padding="max_length" needs an explicit value.
MAX_LENGTH = 128  # Adjust to the typical length of your texts

# Names of the tokenizer outputs that are fed to the Keras model.
INPUT_NAMES = ("input_ids", "attention_mask")

# Load and preprocess data
# Placeholders: replace `...` with your own loading code. Texts are expected
# to be raw strings (tokenization happens below); labels are expected to be
# 0/1 values, one per text, to match the sigmoid/binary_crossentropy head.
train_texts, train_labels = ...  # Load your data (cleaned raw texts + labels)
test_texts, test_labels = ...  # Load your test data (cleaned raw texts + labels)

# Convert text to fixed-size NumPy arrays of token IDs and attention masks.
# return_tensors="np" is what gives the encodings a `.shape` and makes them
# directly usable by `Model.fit` / `Model.evaluate`.
train_encodings = tokenizer(
    train_texts,
    padding="max_length",
    truncation=True,
    max_length=MAX_LENGTH,
    return_tensors="np",
)
test_encodings = tokenizer(
    test_texts,
    padding="max_length",
    truncation=True,
    max_length=MAX_LENGTH,
    return_tensors="np",
)

# Define model inputs
sequence_length = train_encodings["input_ids"].shape[1]
input_ids = Input(shape=(sequence_length,), dtype=tf.int32, name="input_ids")
attention_mask = Input(
    shape=(sequence_length,), dtype=tf.int32, name="attention_mask"
)

# Load the pre-trained encoder and keep a handle on it so it can be frozen.
# Freezing must happen on the encoder itself, before the final model exists.
base_model = TFXLNetModel.from_pretrained(model_name)
# Freeze the pre-trained weights for initial training of the new head
# (optional: set to True to fine-tune the whole network end to end).
base_model.trainable = False

# Pass inputs through XLNet. The attention mask keeps padding tokens from
# influencing the real tokens.
outputs = base_model(input_ids, attention_mask=attention_mask)
last_hidden_state = outputs[0]  # (batch, sequence_length, hidden_size)

# Reduce the per-token states to one vector per example. XLNet pads on the
# left and places its <cls> summary token last, so the final position is the
# sequence representation.
sequence_summary = last_hidden_state[:, -1, :]

# Add new layers for fine-tuning
x = Dense(128, activation="relu")(sequence_summary)  # Adjust hidden layer size and activation
output = Dense(1, activation="sigmoid")(x)  # Adjust for your classification task (e.g., binary)

# Create the final fine-tuned model
model = Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile the model (optimizer, loss function, metrics)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model on your data (the last 20% of samples is held out for validation)
model.fit(
    x={name: train_encodings[name] for name in INPUT_NAMES},
    y=train_labels,
    validation_split=0.2,  # Adjust validation split ratio
    epochs=3,  # Adjust number of epochs based on dataset and task complexity
)

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(
    {name: test_encodings[name] for name in INPUT_NAMES},
    test_labels,
)
print("Test Accuracy:", test_acc)
