In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification

# Location of the emotions CSV inside the Kaggle input directory.
file_path = '/kaggle/input/emotions/emotions.csv'

# Read the CSV and immediately discard every row that contains a missing value,
# so that later string cleaning never receives NaN.
dataset = pd.read_csv(file_path).dropna()

def clean_text(text):
    """Normalise one raw text string for tokenisation.

    Steps, in order:
      1. delete every literal ``[USERNAME]`` placeholder,
      2. delete every character that is not an ASCII letter or whitespace
         (characters are removed, not replaced, so ``don't`` becomes ``dont``),
      3. lower-case the result,
      4. trim leading and trailing whitespace.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    without_placeholder = re.sub(r'\[USERNAME\]', '', text)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_placeholder)
    return letters_only.lower().strip()

# Keep the raw text and store its normalised form in a separate column.
dataset['cleaned_text'] = dataset['text'].map(clean_text)

# Replace the label column with integer class ids; the fitted encoder is kept
# so the number of classes can be read from it later.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(dataset['label'])
dataset['label'] = encoded_labels

# Show how many rows each class has before balancing.
print("Sample counts per class before under-sampling:")
class_counts_before = dataset['label'].value_counts()
print(class_counts_before)

def under_sampling(df, label_col, random_state=42):
    """Balance ``df`` by randomly down-sampling every class to the rarest class size.

    The sampling uses ``DataFrameGroupBy.sample`` instead of
    ``groupby(...).apply(lambda g: g.sample(...))``. The ``apply`` form operates
    on the grouping column, which pandas warns about, and it would lose
    ``label_col`` once ``apply`` stops passing grouping columns to the function.

    Args:
        df: input DataFrame containing ``label_col``.
        label_col: name of the column holding the class labels.
        random_state: seed (or ``None`` for non-deterministic sampling) used to
            draw the rows. Defaults to 42, the same seed this notebook passes
            to ``train_test_split``, so the balanced frame is reproducible.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index in which every class has
        exactly as many rows as the smallest class of ``df``. Rows are grouped
        by class. An empty ``df`` yields an empty DataFrame.
    """
    # Nothing to balance; also avoids asking for a NaN-sized sample below.
    if df.empty:
        return df.reset_index(drop=True)

    # Size of the rarest class: every class is cut down to this many rows.
    min_count = int(df[label_col].value_counts().min())

    sampled_df = (
        df.groupby(label_col)
        .sample(n=min_count, random_state=random_state)
        .reset_index(drop=True)
    )
    return sampled_df

# Balance the classes before splitting.
dataset = under_sampling(dataset, 'label')

print("\nSample counts per class after under-sampling:")
class_counts_after = dataset['label'].value_counts()
print(class_counts_after)

# 70/30 split on the cleaned text, stratified so both parts keep the same
# class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['cleaned_text'],
    dataset['label'],
    test_size=0.3,
    stratify=dataset['label'],
    random_state=42,
)

2024-05-31 08:34:55.500905: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-31 08:34:55.501047: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-31 08:34:55.620426: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Sample counts per class before under-sampling:
label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64

Sample counts per class after under-sampling:
label
0    14972
1    14972
2    14972
3    14972
4    14972
5    14972
Name: count, dtype: int64


  sampled_df = df.groupby(label_col).apply(lambda x: x.sample(min_count)).reset_index(drop=True)


In [2]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
model_name = "bert-base-multilingual-cased"

# One output unit per class seen by the label encoder.
num_classes = len(label_encoder.classes_)

tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
)

def encode_texts(texts, tokenizer, max_len=128):
    """Tokenize a batch of texts into fixed-length TensorFlow tensors.

    Args:
        texts: the texts to encode. May be a pandas Series / NumPy array
            (anything exposing ``tolist()``), any other iterable of strings
            such as a list or tuple, or a single string (treated as a batch
            of one).
        tokenizer: callable invoked as ``tokenizer(list_of_str, **options)``.
        max_len: every sequence is truncated or padded to exactly this many
            tokens.

    Returns:
        Whatever ``tokenizer`` returns for the batch, requested with
        ``return_tensors='tf'``.
    """
    if isinstance(texts, str):
        # A bare string would otherwise be split into single characters.
        text_list = [texts]
    elif hasattr(texts, 'tolist'):
        # Series / ndarray: convert to a plain Python list.
        text_list = texts.tolist()
    else:
        text_list = list(texts)

    return tokenizer(
        text_list,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='tf'
    )

# Tokenize both splits up front.
train_encodings = encode_texts(X_train, tokenizer)
test_encodings = encode_texts(X_test, tokenizer)

# Training pipeline: (features dict, label) pairs, shuffled with a buffer as
# large as the training set, then grouped into batches of 16.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), y_train.values)
)
train_dataset = train_dataset.shuffle(len(X_train)).batch(16)

# Evaluation pipeline: same structure but left unshuffled, so predictions
# come back in the same order as y_test.
test_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(test_encodings), y_test.values)
)
test_dataset = test_dataset.batch(16)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Adam with a learning rate of 2e-5.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

# Labels are integer class ids and the loss is told to expect raw logits.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# Single training epoch. NOTE: the test split doubles as validation data here.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=1,
)

Cause: for/else statement not yet supported


I0000 00:00:1717144656.539351     107 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




In [4]:
pip install googletrans==4.0.0-rc1

  pid, fd = os.forkpty()


Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.5.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googl

In [5]:
from googletrans import Translator

# Emotion name for each integer class id; looked up with the argmax of the
# classifier's logits when labelling new texts.
emotion_labels = dict(enumerate(
    ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
))

def translate_texts(texts, src_lang='id', dest_lang='en'):
    """Translate each text with googletrans and echo every result.

    Args:
        texts: iterable of strings to translate.
        src_lang: language code of the input texts (default ``'id'``).
        dest_lang: language code to translate into (default ``'en'``).

    Returns:
        A list with the translated string for every input, in input order.

    Note:
        Texts are translated one at a time with a single ``Translator``
        instance; presumably each call contacts the translation service, so
        this needs network access (confirm against googletrans).
    """
    translator = Translator()
    results = []
    for source_text in texts:
        translated = translator.translate(source_text, src=src_lang, dest=dest_lang).text
        print(f'Translated Text: {translated}')  # Show the translation result
        results.append(translated)
    return results

def predict_new_texts(model, tokenizer, texts, label_encoder, max_len=128):
    """Predict an emotion name for each text.

    Each text is translated with ``translate_texts`` (using its default
    languages), normalised with ``clean_text``, tokenized, and classified.
    The arg-max class id is mapped to a name via ``emotion_labels``.

    Args:
        model: classifier whose ``predict`` output exposes ``.logits``.
        tokenizer: tokenizer callable used to encode the cleaned texts.
        texts: iterable of input strings.
        label_encoder: accepted for interface compatibility; not used, because
            class names come from the module-level ``emotion_labels`` mapping.
        max_len: token length every input is truncated or padded to.

    Returns:
        A list of emotion names, one per input text, in input order. An empty
        input yields an empty list without calling the translator, tokenizer
        or model.
    """
    # Materialise once so generators work and emptiness can be checked.
    texts = list(texts)
    if not texts:
        return []

    translated_texts = translate_texts(texts)
    clean_texts = [clean_text(text) for text in translated_texts]

    inputs = tokenizer(clean_texts, max_length=max_len, truncation=True, padding='max_length', return_tensors='tf')

    # The model returns raw logits; arg-max over the class axis gives the
    # predicted class id for each text.
    outputs = model.predict(inputs)
    pred_classes = np.argmax(outputs.logits, axis=1)

    labels = [emotion_labels[pred] for pred in pred_classes]
    return labels

# Indonesian example sentences used to spot-check the classifier.
new_texts = [
    "Saya merasa sangat sedih dan kesepian akhir-akhir ini, tidak tahu harus berbuat apa.",
    "Saya sangat marah dan kecewa dengan teman-teman saya.",
    "Hari ini saya merasa sangat bahagia dan penuh semangat!",
    "Saya merasa takut dan cemas setiap kali memikirkan masa depan.",
    "Saya merasa sangat bangga dengan pencapaian saya hari ini.",
    "Saya merasa hampa dan tidak termotivasi untuk melakukan apa pun.",
    "Saya merasa begitu dicintai dan diperhatikan oleh keluarga saya.",
    "Saya merasa kesal dengan situasi di tempat kerja saya.",
    "Saya merasa penuh harapan dan optimis tentang peluang baru ini.",
    "Saya merasa bersalah dan menyesal tentang kesalahan yang saya buat.",
]

# Translate, clean and classify all sentences in one call.
predicted_labels = predict_new_texts(model, tokenizer, new_texts, label_encoder)

# Print every original sentence next to its predicted emotion.
for source_text, emotion in zip(new_texts, predicted_labels):
    print(f'Text: {source_text}\nPredicted Label: {emotion}\n')


Translated Text: I feel very sad and lonely lately, don't know what to do.
Translated Text: I am very angry and disappointed with my friends.
Translated Text: Today I feel very happy and full of enthusiasm!
Translated Text: I feel scared and anxious every time I think of the future.
Translated Text: I feel very proud of my achievements today.
Translated Text: I feel empty and not motivated to do anything.
Translated Text: I feel so loved and cared for by my family.
Translated Text: I was annoyed with the situation at my workplace.
Translated Text: I feel full and optimistic about this new opportunity.
Translated Text: I feel guilty and sorry about the mistakes I made.
Text: Saya merasa sangat sedih dan kesepian akhir-akhir ini, tidak tahu harus berbuat apa.
Predicted Label: sadness

Text: Saya sangat marah dan kecewa dengan teman-teman saya.
Predicted Label: anger

Text: Hari ini saya merasa sangat bahagia dan penuh semangat!
Predicted Label: joy

Text: Saya merasa takut dan cemas seti

In [8]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

# Predict on the (unshuffled) test pipeline and take the arg-max class id
# from the logits of each example.
y_pred = model.predict(test_dataset)
y_pred_labels = np.argmax(y_pred.logits, axis=1)

# Encoded labels are indices into label_encoder.classes_, so class ids and
# display names are taken from the encoder. Building the names from
# dataset['label'].unique() would follow the order in which labels first
# appear, which is not guaranteed to match the sorted order the report uses.
class_ids = np.arange(len(label_encoder.classes_))
target_names = [str(name) for name in label_encoder.classes_]

cm = confusion_matrix(y_test, y_pred_labels, labels=class_ids)

evaluation_report = classification_report(
    y_test,
    y_pred_labels,
    labels=class_ids,
    target_names=target_names,
)

# Support-weighted averages across classes.
f1 = f1_score(y_test, y_pred_labels, average='weighted')

precision = precision_score(y_test, y_pred_labels, average='weighted')
recall = recall_score(y_test, y_pred_labels, average='weighted')

accuracy = accuracy_score(y_test, y_pred_labels)

print("Confusion Matrix:")
print(cm)

print("\nEvaluation Report:")
print(evaluation_report)

print("\nF1 Score:", f1)
print("Precision:", precision)
print("Sensitivity (Recall):", recall)
print("Accuracy:", accuracy)

Confusion Matrix:
[[4269    8    1  132   78    3]
 [  11 4036  357   26   20   41]
 [   3    2 4483    2    1    1]
 [  18   10    8 4264  192    0]
 [  23    5    2   40 4032  390]
 [   2    5    0    1    6 4478]]

Evaluation Report:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      4491
           1       0.99      0.90      0.94      4491
           2       0.92      1.00      0.96      4492
           3       0.95      0.95      0.95      4492
           4       0.93      0.90      0.91      4492
           5       0.91      1.00      0.95      4492

    accuracy                           0.95     26950
   macro avg       0.95      0.95      0.95     26950
weighted avg       0.95      0.95      0.95     26950


F1 Score: 0.9483115710294252
Precision: 0.9502338158910858
Sensitivity (Recall): 0.9484972170686456
Accuracy: 0.9484972170686456
