In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report



In [4]:
# Load data
df = pd.read_csv("top123-2.csv")

In [7]:
# Ubah kolom emosi jadi list label
df['top_emotions'] = df[['top_1', 'top_2', 'top_3']].values.tolist()
df['top_emotions'] = df['top_emotions'].apply(lambda x: list(filter(None, x)))  # hapus None

# Tokenisasi teks
MAX_NUM_WORDS = 10000
MAX_SEQ_LENGTH = 100

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token='<OOV>')
df['cleaned_statement'] = df['cleaned_statement'].astype(str)
tokenizer.fit_on_texts(df['cleaned_statement'])
sequences = tokenizer.texts_to_sequences(df['cleaned_statement'])
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post')


In [8]:
# Pastikan list emosi tidak mengandung NaN/None dan semua elemennya string
def clean_emotion_list(emotion_list):
    if not isinstance(emotion_list, list):
        return []
    return [str(emotion) for emotion in emotion_list if pd.notnull(emotion)]

df['top_emotions'] = df['top_emotions'].apply(clean_emotion_list)


In [9]:

# MultiLabel Binarizer
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['top_emotions'])

In [10]:
# Split data
X_train, X_temp, y_train, y_temp = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [12]:
# Model
model = Sequential()
model.add(Embedding(input_dim=MAX_NUM_WORDS, output_dim=128, input_length=MAX_SEQ_LENGTH))
model.add(Bidirectional(LSTM(64, return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(len(mlb.classes_), activation='sigmoid'))  # sigmoid karena multi-label

model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=1e-3), metrics=['accuracy'])

model.summary()




In [13]:
# Train
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=64,
    callbacks=[early_stop]
)



Epoch 1/10
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 71ms/step - accuracy: 0.5636 - loss: 0.4573 - val_accuracy: 0.6320 - val_loss: 0.2578
Epoch 2/10
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 85ms/step - accuracy: 0.6502 - loss: 0.2569 - val_accuracy: 0.6437 - val_loss: 0.2309
Epoch 3/10
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 83ms/step - accuracy: 0.6727 - loss: 0.2132 - val_accuracy: 0.7340 - val_loss: 0.2081
Epoch 4/10
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 83ms/step - accuracy: 0.7078 - loss: 0.1790 - val_accuracy: 0.7388 - val_loss: 0.1890
Epoch 5/10
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 84ms/step - accuracy: 0.7098 - loss: 0.1603 - val_accuracy: 0.7311 - val_loss: 0.1847
Epoch 6/10
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 83ms/step - accuracy: 0.7159 - loss: 0.1416 - val_accuracy: 0.7447 - val_loss: 0.1786
Epoch 7/10
[1m1

In [14]:
# Evaluate
y_pred = model.predict(X_test)
y_pred_bin = (y_pred > 0.5).astype(int)

print("Classification Report:")
print(classification_report(y_test, y_pred_bin, target_names=mlb.classes_))


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step
Classification Report:
              precision    recall  f1-score   support

     anxiety       0.73      0.34      0.47        32
        fear       0.82      0.86      0.84       266
 nervousness       0.90      0.94      0.92       368
     neutral       0.95      0.95      0.95       568
     sadness       0.88      0.73      0.80       233
       shame       0.82      0.91      0.86       204
   suffering       0.73      0.52      0.60       120

   micro avg       0.88      0.86      0.87      1791
   macro avg       0.83      0.75      0.78      1791
weighted avg       0.88      0.86      0.87      1791
 samples avg       0.89      0.88      0.88      1791



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [30]:
def predict_top3_emotions(text, tokenizer, model, mlb, max_len=100):
    # Lowercase & konversi teks
    text = str(text).lower()

    # Tokenisasi & padding
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len, padding='post')

    # Prediksi
    pred = model.predict(padded)[0]

    # Ambil 3 skor tertinggi
    top3_indices = np.argsort(pred)[-3:][::-1]
    top3_labels = [mlb.classes_[i] for i in top3_indices]
    top3_scores = [round(pred[i], 4) for i in top3_indices]

    return list(zip(top3_labels, top3_scores))  # Hasil: [(label1, score1), ...]


In [36]:
new_text = "wake feel discomfort body wake every day discomfort hand calve foot really pain unpleasant difficult explain anyone ever sensation"
top3 = predict_top3_emotions(new_text, tokenizer, model, mlb)

print("Teks:", new_text)
print("Top 3 Emosi dan Skor:")
for label, score in top3:
    print(f"- {label}: {score}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
Teks: wake feel discomfort body wake every day discomfort hand calve foot really pain unpleasant difficult explain anyone ever sensation
Top 3 Emosi dan Skor:
- nervousness: 0.9909999966621399
- fear: 0.983299970626831
- sadness: 0.9301000237464905
