# 1. Import library

In [15]:
# 1. Import libraries (sama seperti sebelumnya)
from empath import Empath
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import contractions
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, Layer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from sklearn.metrics import classification_report, multilabel_confusion_matrix, hamming_loss

In [16]:
# NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Inisialisasi
lexicon = Empath()
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

custom_stopwords = {'like', 'get', 'go', 'know', 'would', 'could', 'also'}
stop_words.update(custom_stopwords)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# 2. Data Loading

In [17]:
df = pd.read_csv("filtered.csv")
df.head()

Unnamed: 0,statement,label
0,Leaves are also standby in front of the PC ......,Normal
1,"Bismillah for Eid 2021, you get a few question...",Normal
2,I want to spend a lot of time shopping for sna...,Normal
3,"I like to be grateful, don't you think, if you...",Normal
4,why is this person blg parcel hampers blah bla...,Normal


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10299 entries, 0 to 10298
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  10299 non-null  object
 1   label      10299 non-null  object
dtypes: object(2)
memory usage: 161.1+ KB


# 3. Preprocessing

## 3.1. Cleaning n lemmetizing

In [19]:
def clean_text(text):
    text = text.lower()  # Mengubah teks menjadi huruf kecil 
    text = contractions.fix(text)  # Memperbaiki kontraksi 
    text = re.sub(r'http\S+|www\S+', '', text)  # Menghapus URL 
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Menghapus karakter non-ASCII 
    text = re.sub(r'\d+', '', text)  # Menghapus angka 
    text = re.sub(rf"[{re.escape(string.punctuation)}]", '', text)  # Menghapus tanda baca 
    text = re.sub(r'\s+', ' ', text).strip()  # Ganti multiple spasi dengan 1 spasi)
    return text  # Mengembalikan teks yang sudah dibersihkan

def preprocess_text(text):
    text = clean_text(text) 
    words = text.split()  # Memisahkan teks menjadi list kata 
    processed_words = []  # List untuk menyimpan kata yang sudah diproses
    for word in words:
        if word not in stop_words and len(word) > 2:  # Filter: hapus stopword dan kata dengan panjang ≤ 2
            lemma = lemmatizer.lemmatize(word, pos='v')  # Lemmatisasi sebagai verb (e.g., "running" → "run")
            lemma = lemmatizer.lemmatize(lemma, pos='n')  # Lemmatisasi sebagai noun (e.g., "wolves" → "wolf")
            lemma = lemmatizer.lemmatize(lemma, pos='a')  # Lemmatisasi sebagai adjective (e.g., "better" → "good")
            lemma = lemmatizer.lemmatize(lemma, pos='r')  # Lemmatisasi sebagai adverb (e.g., "quickly" → "quick")
            processed_words.append(lemma)  # Tambahkan kata yang sudah dilematisasi ke list
    return ' '.join(processed_words)  # Gabungkan list kata menjadi teks dengan spasi

df['statement'] = df['statement'].astype(str)
df['cleaned_statement'] = df['statement'].apply(preprocess_text)


## 3.2. Emotion Extraction with Emapth

In [20]:
#Daftar emosi dari Empath yang relevan
emotions = ['anxiety', 'fear', 'nervousness', 'sadness', 'suffering', 'shame']

#Analisis teks menggunakan lexicon Empath, dengan kategori yang telah didefinisikan 
def label_from_empath(text):
    scores = lexicon.analyze(text, categories=emotions, normalize=True)
    return scores


In [21]:
#Mendefinisikan keywords emosi yang tidak terdeteksi Empath
emotion_keywords = [
    'sleep', 'restless', 'panic', 'worried', 'scared',
    'cry', 'sad', 'guilt', 'confused', 'fear',
    'dizzy', 'pressure', 'tired', 'alone', 'anxious', 'hopeless', 'worthless', 'suicidal', 
    'overwhelmed', 'isolated', 'numb', 'empty',
    'heartbroken', 'misery', 'despair', 'regret'
]

#Fungsi cek keywords
def contains_emotion_keyword(text):
    return any(re.search(rf'\b{kw}\b', text.lower()) for kw in emotion_keywords)


In [22]:
#Set fallback
def top_n_emotions_with_fallback(score_dict, text, label, n=3):
    #Jika label normal, fallback ke emosi neutral
    if label.lower() == 'normal':
        return ['neutral']
    
    #Jika bukan dictionary atau jika kosong, langsung fallback ke anxiety
    if not isinstance(score_dict, dict) or not score_dict:
        return ['anxiety']  

    sorted_items = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
    top = [emotion for emotion, score in sorted_items[:n] if score > 0]

    #Jika tidak ada emosi dengan skor positif, fallback ke anxiety saja
    return top if top else ['anxiety']


In [23]:
#Membuat empath_score dari cleaned_statement
df['empath_scores'] = df['cleaned_statement'].apply(label_from_empath)

#Membuat top_emotion berdasarkan empath_score tertinggi dengan menerapkan fallback jika skor kurang
df['top_emotions'] = df.apply(
    lambda row: top_n_emotions_with_fallback(
        row['empath_scores'], row['cleaned_statement'], row['label']
    ), axis=1
)


# 4. Modeling

In [None]:
#Hapus None dari kolom top_emotion
df['top_emotions'] = df['top_emotions'].apply(lambda x: list(filter(None, x)))  # hapus None
#Tokenisasi teks
MAX_NUM_WORDS = 10000
MAX_SEQ_LENGTH = 100

#Membuat tokenizer untuk mengubah teks menjadi token numerik, dengan handling untuk kata-kata yang tidak ada dalam vocab
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token='<OOV>')
df['cleaned_statement'] = df['cleaned_statement'].astype(str)
#Melatih tokenizer pada teks agar bisa mengenali kata-kata dan membuat mapping numerik
tokenizer.fit_on_texts(df['cleaned_statement'])
sequences = tokenizer.texts_to_sequences(df['cleaned_statement'])

#Menambahkan padding agar setiap sequence memiliki panjang yang seragam
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post')

In [None]:
#Pastikan list emosi tidak mengandung NaN/None dan semua elemennya string
def clean_emotion_list(emotion_list):
    if not isinstance(emotion_list, list):
        return []
    return [str(emotion) for emotion in emotion_list if pd.notnull(emotion)]

df['top_emotions'] = df['top_emotions'].apply(clean_emotion_list)


In [None]:
#MultiLabel Binarizer
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['top_emotions'])

In [49]:
#Split data
X_train, X_temp, y_train, y_temp = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [51]:
# Model
model = Sequential()
model.add(Embedding(input_dim=MAX_NUM_WORDS, output_dim=128, input_length=MAX_SEQ_LENGTH))
model.add(Bidirectional(LSTM(64, return_sequences=True)))  # LSTM pertama
model.add(Bidirectional(LSTM(32, return_sequences=False)))  # LSTM kedua untuk informasi lebih dalam
model.add(Dropout(0.3))  # Mengurangi dropout agar tidak kehilangan terlalu banyak informasi
model.add(Dense(len(mlb.classes_), activation='sigmoid'))  # Aktivasi sigmoid karena multi-label

# Compile Model
optimizer = Adam(learning_rate=1e-3, decay=1e-5)  # Menggunakan decay untuk stabilitas
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Summary Model
model.summary()




In [52]:
# Train
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=64,
    callbacks=[early_stop]
)


Epoch 1/20
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 162ms/step - accuracy: 0.5459 - loss: 0.4604 - val_accuracy: 0.6291 - val_loss: 0.2664
Epoch 2/20
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 150ms/step - accuracy: 0.6527 - loss: 0.2673 - val_accuracy: 0.6417 - val_loss: 0.2425
Epoch 3/20
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 150ms/step - accuracy: 0.6672 - loss: 0.2346 - val_accuracy: 0.6456 - val_loss: 0.2189
Epoch 4/20
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 150ms/step - accuracy: 0.7114 - loss: 0.2041 - val_accuracy: 0.7437 - val_loss: 0.2033
Epoch 5/20
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 151ms/step - accuracy: 0.7361 - loss: 0.1748 - val_accuracy: 0.8117 - val_loss: 0.1917
Epoch 6/20
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 151ms/step - accuracy: 0.7567 - loss: 0.1593 - val_accuracy: 0.7903 - val_loss: 0.1881
Epoch 7/20

In [53]:
# Evaluate
y_pred = model.predict(X_test)
y_pred_bin = (y_pred > 0.5).astype(int)

print("Classification Report:")
print(classification_report(y_test, y_pred_bin, target_names=mlb.classes_))


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 64ms/step
Classification Report:
              precision    recall  f1-score   support

     anxiety       0.45      0.28      0.35        32
        fear       0.82      0.87      0.85       266
 nervousness       0.89      0.94      0.92       368
     neutral       0.95      0.94      0.94       568
     sadness       0.82      0.79      0.81       233
       shame       0.83      0.83      0.83       204
   suffering       0.73      0.55      0.63       120

   micro avg       0.87      0.86      0.86      1791
   macro avg       0.78      0.74      0.76      1791
weighted avg       0.86      0.86      0.86      1791
 samples avg       0.88      0.87      0.87      1791



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# def predict_top3_emotions(text, tokenizer, model, mlb, max_len=100):
#     # Lowercase & konversi teks
#     text = str(text).lower()

#     # Tokenisasi & padding
#     seq = tokenizer.texts_to_sequences([text])
#     padded = pad_sequences(seq, maxlen=max_len, padding='post')

#     # Prediksi
#     pred = model.predict(padded)[0]

#     # Ambil 3 skor tertinggi
#     top3_indices = np.argsort(pred)[-3:][::-1]
#     top3_labels = [mlb.classes_[i] for i in top3_indices]
#     top3_scores = [round(pred[i], 4) for i in top3_indices]

#     return list(zip(top3_labels, top3_scores))  # Hasil: [(label1, score1), ...]


In [58]:
def predict_top3_emotions(text, tokenizer, model, mlb, max_len=100):
    text = preprocess_text(text)  # Terapkan preprocessing sebelum tokenisasi
    seq = tokenizer.texts_to_sequences([text])  # Tokenisasi teks yang sudah dibersihkan
    padded = pad_sequences(seq, maxlen=max_len, padding='post')  # Padding untuk panjang seragam
    
    pred = model.predict(padded)[0]  # Prediksi dengan model
    top3_indices = np.argsort(pred)[-3:][::-1]  # Ambil 3 skor tertinggi
    top3_labels = [mlb.classes_[i] for i in top3_indices]  # Label emosi teratas
    top3_scores = [round(pred[i], 4) for i in top3_indices]  # Skor prediksi
    
    return list(zip(top3_labels, top3_scores))  # Format output: [(label1, score1), ...]


In [60]:
new_text = "Anxiety AFTER dental work I had a tooth removed today and my anxiety is super high. Not sure why..I am taking Advi for pain and Lexapro as normal..I don't like how I feel right now though so keyed up, nausea, etc."

# Terapkan preprocessing sebelum inference
cleaned_text = preprocess_text(new_text)

# Prediksi emosi berdasarkan teks yang sudah dibersihkan
top3 = predict_top3_emotions(cleaned_text, tokenizer, model, mlb)

# Cetak hasil teks setelah preprocessing dan prediksi emosi
print("Teks Asli:", new_text)
print("Teks Setelah Preprocessing:", cleaned_text)
print("Top 3 Emosi dan Skor:")
for label, score in top3:
    print(f"- {label}: {score}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
Teks Asli: Anxiety AFTER dental work I had a tooth removed today and my anxiety is super high. Not sure why..I am taking Advi for pain and Lexapro as normal..I don't like how I feel right now though so keyed up, nausea, etc.
Teks Setelah Preprocessing: anxiety dental work tooth remove today anxiety super high sure whyi take advi pain lexapro normali feel right though key nausea etc
Top 3 Emosi dan Skor:
- fear: 0.9725000262260437
- nervousness: 0.9685999751091003
- sadness: 0.9358000159263611
