# 1. Import libraries

In [23]:
# 1. Import libraries (sama seperti sebelumnya)
from empath import Empath
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import contractions
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, Layer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from sklearn.metrics import classification_report, multilabel_confusion_matrix, hamming_loss

In [24]:
# NLTK corpora used below: the English stop-word list and the WordNet data
# (plus its multilingual companion) that the lemmatizer reads.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Shared objects reused by the preprocessing and labelling cells.
lemmatizer = WordNetLemmatizer()
lexicon = Empath()

# Extra high-frequency words to filter out on top of NLTK's English stop words.
custom_stopwords = {'like', 'get', 'go', 'know', 'would', 'could', 'also'}
stop_words = set(stopwords.words('english')) | custom_stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# 2. Data Loading

In [25]:
# Load the labelled statements; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv("filtered.csv")
# Preview the first rows (last expression, so the cell displays it).
df.head()

Unnamed: 0,statement,label
0,Leaves are also standby in front of the PC ......,Normal
1,"Bismillah for Eid 2021, you get a few question...",Normal
2,I want to spend a lot of time shopping for sna...,Normal
3,"I like to be grateful, don't you think, if you...",Normal
4,why is this person blg parcel hampers blah bla...,Normal


In [26]:
# Summarize the columns, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10299 entries, 0 to 10298
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  10299 non-null  object
 1   label      10299 non-null  object
dtypes: object(2)
memory usage: 161.1+ KB


# 3. Preprocessing

## 3.1 Cleaning and lemmatizing

In [27]:
def clean_text(text):
    """Normalize a raw statement into lowercase, space-separated ASCII words.

    Steps, in order: lowercase, expand contractions, then apply the regex
    substitutions listed below and strip leading/trailing whitespace.

    Args:
        text: The raw statement as a ``str``.

    Returns:
        The cleaned string.
    """
    text = contractions.fix(text.lower())

    # (pattern, replacement) pairs; order matters because later steps rely on
    # what earlier ones removed.
    substitutions = (
        (r'http\S+|www\S+', ''),                       # URLs
        (r'[^\x00-\x7F]+', ' '),                       # runs of non-ASCII characters
        (r'\d+', ''),                                  # digits
        (rf"[{re.escape(string.punctuation)}]", ''),   # ASCII punctuation
        (r'\s+', ' '),                                 # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

def preprocess_text(text):
    """Clean a statement, drop stop words and short tokens, and lemmatize the rest.

    The text is first normalized with ``clean_text`` and split on whitespace.
    Tokens that are stop words or have length <= 2 are discarded. Each remaining
    token is passed through the WordNet lemmatizer once per part of speech
    (verb, noun, adjective, adverb), with each pass feeding the next. The
    resulting lemma is checked against ``stop_words`` again, so inflected forms
    that reduce to a stop word (e.g. "going" -> "go") are removed as well.

    Args:
        text: The raw statement as a ``str``.

    Returns:
        The surviving lemmas joined by single spaces (``''`` if none survive).
    """
    kept = []
    for word in clean_text(text).split():
        # Cheap filter first: skip stop words and tokens of length <= 2.
        if word in stop_words or len(word) <= 2:
            continue

        lemma = word
        for pos in ('v', 'n', 'a', 'r'):  # verb, noun, adjective, adverb
            lemma = lemmatizer.lemmatize(lemma, pos=pos)

        # Second filter: the stop-word set holds base forms such as 'go',
        # 'know', 'get' and 'like', which the pre-lemmatization check cannot
        # match against their inflected variants.
        if lemma in stop_words:
            continue

        kept.append(lemma)
    return ' '.join(kept)

# Coerce every statement to str first: the cleaning helpers call string
# methods (e.g. .lower()) on each value.
df['statement'] = df['statement'].astype(str)
# Store the cleaned, lemmatized text alongside the original statement.
df['cleaned_statement'] = df['statement'].apply(preprocess_text)


In [28]:
# Empath categories scored for every statement.
emotions = ['anxiety', 'fear', 'nervousness', 'sadness', 'suffering', 'shame']


def label_from_empath(text):
    """Score ``text`` against the ``emotions`` categories with the Empath lexicon.

    Args:
        text: The (already cleaned) statement to analyze.

    Returns:
        Whatever ``lexicon.analyze`` returns for a normalized analysis limited
        to ``emotions`` -- presumably a category -> score mapping; callers
        should confirm the result is a dict before using it.
    """
    return lexicon.analyze(text, categories=emotions, normalize=True)


In [29]:
# Whole-word cues that signal emotional content in a statement.
emotion_keywords = [
    'sleep', 'restless', 'panic', 'worried', 'scared',
    'cry', 'sad', 'guilt', 'confused', 'fear',
    'dizzy', 'pressure', 'tired', 'alone', 'anxious',
    'hopeless', 'worthless', 'suicidal',
    'overwhelmed', 'isolated', 'numb', 'empty',
    'heartbroken', 'misery', 'despair', 'regret',
]


def contains_emotion_keyword(text):
    """Return True if ``text`` contains any entry of ``emotion_keywords`` as a whole word.

    Matching is case-insensitive (the text is lowercased) and uses ``\\b`` word
    boundaries, so e.g. 'sad' does not match inside 'sadness'.
    """
    lowered = text.lower()
    for keyword in emotion_keywords:
        if re.search(rf'\b{keyword}\b', lowered):
            return True
    return False


In [36]:
def top_n_emotions_with_fallback(score_dict, text, label, n=3):
    """Pick up to ``n`` highest-scoring emotions for a row, with fixed fallbacks.

    Args:
        score_dict: Mapping of emotion name -> score; anything that is not a
            non-empty dict triggers the fallback.
        text: Unused; kept so the signature stays stable for callers.
        label: The row's class label (string); 'normal' in any casing
            short-circuits to neutral.
        n: Maximum number of emotions to return.

    Returns:
        ``['neutral']`` for normal rows; otherwise the emotions among the top
        ``n`` scores that are strictly positive, highest first, or
        ``['anxiety']`` when there are none.
    """
    if label.lower() == 'normal':
        return ['neutral']

    has_scores = isinstance(score_dict, dict) and bool(score_dict)
    if has_scores:
        # Rank emotion names by score, highest first (stable for ties).
        ranked = sorted(score_dict, key=score_dict.get, reverse=True)
        positive = [name for name in ranked[:n] if score_dict[name] > 0]
        if positive:
            return positive

    # Missing, empty, or all-zero scores: default to 'anxiety'.
    return ['anxiety']


In [37]:
# Score every cleaned statement with Empath; one result object per row.
df['empath_scores'] = df['cleaned_statement'].apply(label_from_empath)

# Row-wise: reduce the scores (together with the row's label) to a list of the
# top emotions, using the helper's fallbacks where there is nothing to rank.
df['top_emotions'] = df.apply(
    lambda row: top_n_emotions_with_fallback(
        row['empath_scores'], row['cleaned_statement'], row['label']
    ), axis=1
)


In [42]:
def extract_top_emotions_separate(score_dict, text, label, n=3):
    """Return exactly ``n`` emotion slots for one row, padded with ``None``.

    Intended for spreading the top emotions over separate columns, so the
    result always has a fixed length.

    Args:
        score_dict: Mapping of emotion name -> score; anything that is not a
            non-empty dict triggers the 'anxiety' fallback.
        text: Unused; kept so the signature stays stable for callers.
        label: The row's class label (string); 'normal' in any casing yields
            'neutral'.
        n: Number of slots to return (expected to be a positive integer).
            Ranking, truncation and padding all follow this value.

    Returns:
        A list of length ``n``: the emotions among the top ``n`` scores that
        are strictly positive (highest first), or ``'neutral'`` / ``'anxiety'``
        as the single entry for the fallback cases, followed by ``None``
        padding.
    """
    if label.lower() == 'normal':
        chosen = ['neutral']
    elif not isinstance(score_dict, dict) or not score_dict:
        # No usable scores at all.
        chosen = ['anxiety']
    else:
        ranked = sorted(score_dict.items(), key=lambda item: item[1], reverse=True)
        # Keep only strictly positive scores; fall back if none remain.
        chosen = [emotion for emotion, score in ranked[:n] if score > 0] or ['anxiety']

    # Truncate, then pad with None so the result has exactly n slots.
    chosen = chosen[:n]
    return chosen + [None] * (n - len(chosen))


In [43]:
# Row-wise: build the fixed-length (None-padded) list of top emotions for each
# row; the next cell splits these lists into separate columns.
top_emotions_list = df.apply(
    lambda row: extract_top_emotions_separate(
        row['empath_scores'], row['cleaned_statement'], row['label']
    ), axis=1
)

In [44]:
# Spread each row's padded emotion list over one column per rank.
# (The loop variable is deliberately not called `emotions`, which is the name
# of the module-level category list.)
df['top_1'] = [ranked[0] for ranked in top_emotions_list]
df['top_2'] = [ranked[1] for ranked in top_emotions_list]
df['top_3'] = [ranked[2] for ranked in top_emotions_list]

# Quick look at the first ten rows of the new columns.
print("Sample data with separate emotion columns:")
print(
    df[['statement', 'label', 'top_1', 'top_2', 'top_3']].head(10)
)


Sample data with separate emotion columns:
                                           statement   label    top_1 top_2  \
0  Leaves are also standby in front of the PC ......  Normal  neutral  None   
1  Bismillah for Eid 2021, you get a few question...  Normal  neutral  None   
2  I want to spend a lot of time shopping for sna...  Normal  neutral  None   
3  I like to be grateful, don't you think, if you...  Normal  neutral  None   
4  why is this person blg parcel hampers blah bla...  Normal  neutral  None   
5  I want to take a day off from work and then ta...  Normal  neutral  None   
6  How many bbl tickets are now? How come I want ...  Normal  neutral  None   
7  I bought a shirt for 200, but I've never worn ...  Normal  neutral  None   
8  Chinese foreigners can enter at will because t...  Normal  neutral  None   
9  09.35 WIB #Tol_JORR_E TMII - Cikunir - Cakung ...  Normal  neutral  None   

  top_3  
0  None  
1  None  
2  None  
3  None  
4  None  
5  None  
6  None  
7  None

In [45]:
# Persist the labelled frame to CSV in the working directory.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gets
# an extra unnamed first column; pass index=False if downstream readers do not
# expect it -- confirm with consumers before changing.
df.to_csv("top123-2.csv")