In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import re
from nltk.corpus import wordnet
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
import nltk

file_path = '/kaggle/input/mergeddataset/merged_dataset_fix_updated.csv'
dataset = pd.read_csv(file_path)

dataset.dropna(inplace=True)

def clean_text(text):
    text = re.sub(r'\[USERNAME\]', '', text)  
    text = re.sub(r'[^a-zA-Z\s]', '', text)  
    text = text.lower()  
    text = text.strip()  
    return text

dataset['cleaned_text'] = dataset['text'].apply(clean_text)
dataset.head()

Unnamed: 0,text,label,cleaned_text
0,Saya merasa bahwa itu menciptakan lingkungan y...,fear,saya merasa bahwa itu menciptakan lingkungan y...
1,Saya merasa enggan meminta apapun,fear,saya merasa enggan meminta apapun
2,Saya takut untuk benar -benar menunjukkan apa ...,fear,saya takut untuk benar benar menunjukkan apa y...
3,Saya pikir dia merasa sedikit tidak berdaya da...,fear,saya pikir dia merasa sedikit tidak berdaya da...
4,Saya tentu merasa tersiksa,fear,saya tentu merasa tersiksa


In [6]:
nltk.download('stopwords')

print("Distribusi awal label:")
print(dataset['label'].value_counts())

vectorizer = TfidfVectorizer(stop_words=stopwords.words('indonesian'))
X = vectorizer.fit_transform(dataset['text'])
y = dataset['label']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

X_resampled_texts = vectorizer.inverse_transform(X_resampled)
X_resampled_texts = [' '.join(text) for text in X_resampled_texts]

df_resampled = pd.DataFrame({'text': X_resampled_texts, 'label': y_resampled})

print("Distribusi label setelah augmentasi:")
print(df_resampled['label'].value_counts())

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Distribusi awal label:
label
anger       6101
fear        6049
love        6037
happy       6017
surprise    6000
sadness     5997
Name: count, dtype: int64




Distribusi label setelah augmentasi:
label
fear        6101
sadness     6101
love        6101
happy       6101
surprise    6101
anger       6101
Name: count, dtype: int64


In [7]:
label_encoder = LabelEncoder()
df_resampled['label'] = label_encoder.fit_transform(df_resampled['label'])

X_train, X_test, y_train, y_test = train_test_split(df_resampled['text'], df_resampled['label'], test_size=0.2, random_state=42)

model_name = "indobenchmark/indobert-base-p1"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

def encode_texts(texts, tokenizer, max_len=128):
    return tokenizer(
        texts.tolist(),
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='tf'
    )

train_encodings = encode_texts(X_train, tokenizer)
test_encodings = encode_texts(X_test, tokenizer)

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train.values
)).shuffle(len(X_train)).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test.values
)).batch(16)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

model.compile(optimizer=optimizer, 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=['accuracy'])

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(train_dataset,
                    epochs=5,
                    validation_data=test_dataset,
                    callbacks=[early_stopping])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, precision_score, recall_score

# Predict on the test dataset
y_pred_probs = model.predict(test_dataset).logits
y_pred = tf.argmax(y_pred_probs, axis=1).numpy()

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report
class_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print("Classification Report:")
print(class_report)

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Sensitivity (Recall): {recall:.4f}")

Confusion Matrix:
[[1133   44   18    1   38    5]
 [  28 1096   28    5   12   63]
 [  48   57 1046   69   41   19]
 [  12   13   28 1110   28    3]
 [  66   72   60   24  965   15]
 [   7   31   37    2   14 1084]]
Classification Report:
              precision    recall  f1-score   support

       anger       0.88      0.91      0.89      1239
        fear       0.83      0.89      0.86      1232
       happy       0.86      0.82      0.84      1280
        love       0.92      0.93      0.92      1194
     sadness       0.88      0.80      0.84      1202
    surprise       0.91      0.92      0.92      1175

    accuracy                           0.88      7322
   macro avg       0.88      0.88      0.88      7322
weighted avg       0.88      0.88      0.88      7322

Accuracy: 0.8787
F1 Score: 0.8782
Precision: 0.8789
Sensitivity (Recall): 0.8787


In [20]:
import os

saved_model_dir = './saved_model'
model.save_pretrained(saved_model_dir)
tokenizer.save_pretrained(saved_model_dir)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

In [22]:
import os
import tensorflow as tf

model_tflite_path = "/kaggle/working/model.tflite"

converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

with open(model_tflite_path, "wb") as f:
    f.write(tflite_model)

from IPython.display import FileLink
FileLink(model_tflite_path)

Summary on the non-converted ops:
---------------------------------
 * Accepted dialects: tfl, builtin, func
 * Non-Converted Ops: 222, Total Ops 1337, % non-converted = 16.60 %
 * 222 ARITH ops

- arith.constant:  222 occurrences  (f32: 206, i32: 16)



  (f32: 172)
  (f32: 24)
  (f32: 1)
  (i32: 48)
  (i32: 1)
  (f32: 74)
  (f32: 3, i32: 96)
  (f32: 12)
  (f32: 50)
  (f32: 88)

  (i32: 73)
  (i32: 1)
  (i32: 96)
  (f32: 168, i32: 1)
  (f32: 25)
  (i32: 50)
  (f32: 12)
  (f32: 25)
  (f32: 1, i32: 15)
  (f32: 26)
  (f32: 1)
  (f32: 48)
