In [None]:
import json
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification


In [None]:
# Read the labelled sentiment corpus; the path is resolved relative to the
# kernel's working directory. Later cells read its "content" and " label" columns.
dataset = pd.read_csv(filepath_or_buffer="PD_sentiment_analysis.csv")

In [None]:
# Tokenizer that belongs to the pretrained sentiment checkpoint.
tokenizer = AutoTokenizer.from_pretrained("w11wo/indonesian-roberta-base-sentiment-classifier")

# Upper bound on tokens per example. This tokenizer ships without a predefined
# maximum length, so truncation only takes effect when max_length is explicit.
# NOTE(review): 512 is the usual RoBERTa-base limit - confirm against
# model.config.max_position_embeddings.
max_length = 512

# Tokenize the whole corpus in one call: pad every row to the longest sequence
# in the batch and cut anything longer than max_length so over-long texts
# cannot overflow the model's position embeddings.
tokenized_data = tokenizer(
    dataset["content"].tolist(),
    return_tensors="tf",
    padding=True,
    truncation=True,
    max_length=max_length,
)

# Convert labels to a NumPy array for Keras.
# NOTE(review): the column key has a leading space (" label"), presumably
# mirroring the CSV header - verify against the file.
labels = np.array(dataset[" label"])

In [None]:
# Hub identifier of the pretrained Indonesian RoBERTa sentiment classifier.
checkpoint = "w11wo/indonesian-roberta-base-sentiment-classifier"

# Build the TensorFlow sequence-classification model from that checkpoint,
# requesting a 3-way classification head (assumes the task has 3 classes).
model = TFAutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [None]:
# Compile
# Labels are integer class ids and the model emits raw logits (softmax is only
# applied at prediction time), hence the sparse loss with from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Fine-tuning a pretrained transformer needs a small learning rate. The string
# shortcut 'adam' uses Keras' default of 1e-3, which is large enough to wreck
# the pretrained weights and leave the classifier near chance level.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
# Training hyperparameters
num_epochs = 15
batch_size = 32
validation_split = 0.2

# Keras works with a plain dict of tensors; the tokenizer returns a
# BatchEncoding wrapper, so unwrap it first.
features = dict(tokenized_data)

# Hold out the trailing `validation_split` fraction of rows. This is the same
# slice Keras' `validation_split` argument would pick (last samples, taken
# before shuffling), but splitting explicitly lets the final evaluation run on
# rows the model never trained on.
split_at = int(len(labels) * (1 - validation_split))
if split_at == 0 or split_at == len(labels):
    raise ValueError("Dataset is too small to split into training and held-out rows")

train_features = {name: tensor[:split_at] for name, tensor in features.items()}
heldout_features = {name: tensor[split_at:] for name, tensor in features.items()}
train_labels = labels[:split_at]
heldout_labels = labels[split_at:]

# Training
history = model.fit(
    train_features,
    train_labels,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(heldout_features, heldout_labels),
)

# Evaluate on the held-out rows only; scoring the full dataset would mostly
# measure how well the model memorised its own training examples.
# Distinct names keep the `loss` object from the compile cell intact.
test_loss, test_accuracy = model.evaluate(heldout_features, heldout_labels)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# Save the fine-tuned model and its tokenizer into the same directory so they
# can be reloaded together.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test Loss: 1.1311838626861572, Test Accuracy: 0.4615384638309479


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

In [None]:
# Reload the fine-tuned model and tokenizer from disk, replacing the in-memory
# `model` / `tokenizer` objects from the training cells.
fine_tuned_model_path = "./fine_tuned_model"
model = TFAutoModelForSequenceClassification.from_pretrained(fine_tuned_model_path)
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)

# Text to classify
text = "argumen yang bapak berikan kurang bagus"

# Tokenize. The saved tokenizer has no predefined maximum length, so
# truncation=True alone is silently ignored; an explicit max_length makes it
# effective.
# NOTE(review): 512 is the usual RoBERTa-base limit - confirm against
# model.config.max_position_embeddings.
inputs = tokenizer(text, return_tensors="tf", padding=True, truncation=True, max_length=512)
outputs = model(inputs)

# Turn the logits of the single input row into class probabilities.
logits = outputs.logits
probabilities = tf.nn.softmax(logits, axis=-1).numpy()[0]

# Class-index -> label-name mapping. This is hardcoded, not read from the model
# config. NOTE(review): it must match the integer encoding of the training
# labels - confirm against the dataset.
id2label = {0: "positive", 1: "neutral", 2: "negative"}

# Pair each probability with its label by class index (not by dict ordering),
# so an index missing from id2label fails loudly instead of being dropped.
# Scores are emitted as 3-decimal strings.
output_list = [
    {"label": id2label[class_id], "score": f"{prob:.3f}"}
    for class_id, prob in enumerate(probabilities)
]

# Convert the list to a JSON string
output_json = json.dumps(output_list, indent=4)

# Print the JSON string
print(output_json)


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at ./fine_tuned_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[
    {
        "label": "positive",
        "score": "0.251"
    },
    {
        "label": "neutral",
        "score": "0.118"
    },
    {
        "label": "negative",
        "score": "0.631"
    }
]


In [None]:
# Write the prediction JSON next to the saved model files, then report where
# it went.
file_path = f"{fine_tuned_model_path}/sentiment_analysis_pred.json"

with open(file_path, "w") as json_file:
    # end="" keeps the file content exactly equal to output_json (no extra newline).
    print(output_json, end="", file=json_file)

print(f"Output saved to {file_path}")

Output saved to ./fine_tuned_model/sentiment_analysis_pred.json
