# 1. Import library

In [1]:
# 1. Import libraries (same as before)
from empath import Empath
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import contractions
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, Layer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from sklearn.metrics import classification_report, multilabel_confusion_matrix, hamming_loss

In [2]:
# Download the NLTK resources this notebook asks for (stop words, WordNet, OMW 1.4)
for resource_name in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource_name)

# Shared objects: the Empath lexicon and the WordNet lemmatizer
lexicon = Empath()
lemmatizer = WordNetLemmatizer()

# Stop-word set: NLTK's English list extended with a few extra filler words
custom_stopwords = {'like', 'get', 'go', 'know', 'would', 'could', 'also'}
stop_words = set(stopwords.words('english')) | custom_stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# 2. Data Loading

In [3]:
# Load the labelled statements; the path is relative to the working directory
df = pd.read_csv("datasets.csv")
# Show the first rows as the cell output
df.head()

Unnamed: 0,statement,label
0,Leaves are also standby in front of the PC ......,Normal
1,"Bismillah for Eid 2021, you get a few question...",Normal
2,I want to spend a lot of time shopping for sna...,Normal
3,"I like to be grateful, don't you think, if you...",Normal
4,why is this person blg parcel hampers blah bla...,Normal


In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11053 entries, 0 to 11052
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  11053 non-null  object
 1   label      11053 non-null  object
dtypes: object(2)
memory usage: 172.8+ KB


# 3. Preprocessing

## 3.1 Cleaning and lemmatizing

In [5]:
def clean_text(text):
    """Normalize a raw statement into lowercase ASCII words separated by single spaces.

    Steps, in order: lowercase, expand contractions with ``contractions.fix``,
    remove URLs, replace runs of non-ASCII characters with a space, delete
    digits, delete ASCII punctuation, then collapse whitespace and strip.

    Args:
        text: The string to clean (``str`` methods are called on it directly,
            so callers must pass a string).

    Returns:
        The cleaned string; it can be empty when nothing survives cleaning.
    """
    text = text.lower()
    text = contractions.fix(text)
    # URL prefixes are anchored at a word boundary and "www" must be followed
    # by a dot. The previous unanchored pattern also matched inside ordinary
    # words, so e.g. "awwww" was cut down to "a" and any token that merely
    # contained "http" lost its tail.
    text = re.sub(r'\bhttp\S+|\bwww\.\S+', '', text)
    # Non-ASCII runs become a space so neighbouring words are not glued together
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'\d+', '', text)
    # Punctuation is deleted outright (not replaced by a space)
    text = re.sub(rf"[{re.escape(string.punctuation)}]", '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def preprocess_text(text):
    """Clean ``text``, drop stop words and short tokens, and lemmatize the rest.

    The text is first passed through ``clean_text`` and split on whitespace.
    Tokens that are in ``stop_words`` or have two characters or fewer are
    discarded. Each remaining token is lemmatized four times in a row, as a
    verb, noun, adjective and adverb, each pass feeding the next.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    kept_lemmas = []
    for token in clean_text(text).split():
        if len(token) <= 2 or token in stop_words:
            continue
        # Chain the lemmatizer over the POS tags in the fixed order v, n, a, r
        for pos_tag in ('v', 'n', 'a', 'r'):
            token = lemmatizer.lemmatize(token, pos=pos_tag)
        kept_lemmas.append(token)
    return ' '.join(kept_lemmas)

# Force every statement to str so the string methods in the cleaning step apply
statement_as_text = df['statement'].astype(str)
df['statement'] = statement_as_text
# Cleaned, stop-word-filtered and lemmatized version of each statement
df['cleaned_statement'] = statement_as_text.apply(preprocess_text)


In [None]:
# Empath categories treated as the relevant emotions
emotions = [
    'anxiety',
    'fear',
    'nervousness',
    'sadness',
    'suffering',
    'shame',
]


def label_from_empath(text):
    """Score ``text`` against the selected Empath categories.

    Returns whatever ``lexicon.analyze`` produces when restricted to
    ``emotions`` with ``normalize=True``.
    NOTE(review): the downstream helpers check ``isinstance(..., dict)``, so
    this presumably can return a non-dict for some inputs - confirm.
    """
    return lexicon.analyze(text, categories=emotions, normalize=True)


In [7]:
# Words whose presence triggers the keyword fallback when Empath scores are weak
emotion_keywords = [
    'sleep',
    'restless',
    'panic',
    'worried',
    'scared',
    'cry',
    'sad',
    'guilt',
    'confused',
    'fear',
    'dizzy',
    'pressure',
    'tired',
    'alone',
]


def contains_emotion_keyword(text):
    """Return True if any entry of ``emotion_keywords`` occurs in ``text`` as a whole word.

    Matching is case-insensitive on the text side (the text is lowercased) and
    uses ``\\b`` word boundaries, so "sad" matches but "sadness" does not.

    NOTE(review): the labelling cells pass the lemmatized ``cleaned_statement``
    here; inflected entries such as 'worried' or 'tired' may never occur in
    lemmatized text - confirm whether the raw statement should be checked.
    """
    lowered = text.lower()
    for kw in emotion_keywords:
        if re.search(rf'\b{kw}\b', lowered):
            return True
    return False


In [8]:
def top_n_emotions_with_fallback(score_dict, text, label, n=3):
    """Pick up to ``n`` emotion names for one row, with keyword and neutral fallbacks.

    Args:
        score_dict: Mapping of emotion name to score; anything that is not a
            dict is treated as "no usable scores".
        text: Text checked for emotion keywords when the scores are unusable.
        label: Dataset label; 'normal' (case-insensitive) always yields neutral.
        n: Maximum number of emotions to return.

    Returns:
        ``['neutral']`` for normal rows; ``['anxiety']`` or ``['neutral']``
        when the scores are missing or sum to less than 0.05, depending on
        whether ``text`` contains an emotion keyword; otherwise the names of
        the highest-scoring emotions among the first ``n`` with a positive
        score (ties keep the dict's order).
    """
    if label.lower() != 'normal':
        weak_signal = not isinstance(score_dict, dict) or sum(score_dict.values()) < 0.05
        if weak_signal:
            # Default to anxiety only when a keyword is present
            return ['anxiety'] if contains_emotion_keyword(text) else ['neutral']
        # Stable descending sort of the emotion names by their score
        ranked_names = sorted(score_dict, key=score_dict.get, reverse=True)
        positives = [name for name in ranked_names[:n] if score_dict[name] > 0]
        if positives:
            return positives
    return ['neutral']


In [9]:
# Empath category scores for every cleaned statement
df['empath_scores'] = df['cleaned_statement'].apply(label_from_empath)


def _top_emotions_for_row(row):
    """Return the emotion list for one DataFrame row."""
    return top_n_emotions_with_fallback(
        row['empath_scores'],
        row['cleaned_statement'],
        row['label'],
    )


# One list of emotion names per row
df['top_emotions'] = df.apply(_top_emotions_for_row, axis=1)


In [10]:
def extract_top_emotions_separate(score_dict, text, label, n=3):
    """Return exactly three slots holding the top emotions, for separate columns.

    Args:
        score_dict: Mapping of emotion name to score; anything that is not a
            dict is treated as "no usable scores".
        text: Text checked for emotion keywords when the scores are unusable.
        label: Dataset label; 'normal' (case-insensitive) always yields neutral.
        n: How many of the highest-ranked emotions are considered.

    Returns:
        A three-element list. Normal rows give ``['neutral', None, None]``.
        When the scores are missing or sum to less than 0.05 the first slot is
        'anxiety' if ``text`` contains an emotion keyword, else 'neutral'.
        Otherwise the slots hold the positive-scoring emotions among the first
        ``n`` ranked ones, padded with ``None`` and cut to three entries.
    """
    if label.lower() == 'normal':
        return ['neutral', None, None]

    weak_signal = not isinstance(score_dict, dict) or sum(score_dict.values()) < 0.05
    if weak_signal:
        first_slot = 'anxiety' if contains_emotion_keyword(text) else 'neutral'
        return [first_slot, None, None]

    # Stable descending sort of the emotion names by their score
    ranked_names = sorted(score_dict, key=score_dict.get, reverse=True)
    positives = [name for name in ranked_names[:n] if score_dict[name] > 0]

    # Fill the missing slots with None, then keep exactly three entries
    padding = [None] * (3 - len(positives))
    return (positives + padding)[:3]

In [11]:
def _ranked_emotions_for_row(row):
    """Return the three-slot emotion list for one DataFrame row."""
    return extract_top_emotions_separate(
        row['empath_scores'],
        row['cleaned_statement'],
        row['label'],
    )


# Row-wise result: one three-element list per row
top_emotions_list = df.apply(_ranked_emotions_for_row, axis=1)

In [12]:
# Spread each row's three slots into one column per rank
for rank, column_name in enumerate(('top_1', 'top_2', 'top_3')):
    df[column_name] = [ranked[rank] for ranked in top_emotions_list]

print("Sample data with separate emotion columns:")
preview_columns = ['statement', 'label', 'top_1', 'top_2', 'top_3']
print(df[preview_columns].head(10))


Sample data with separate emotion columns:
                                           statement   label    top_1 top_2  \
0  Leaves are also standby in front of the PC ......  Normal  neutral  None   
1  Bismillah for Eid 2021, you get a few question...  Normal  neutral  None   
2  I want to spend a lot of time shopping for sna...  Normal  neutral  None   
3  I like to be grateful, don't you think, if you...  Normal  neutral  None   
4  why is this person blg parcel hampers blah bla...  Normal  neutral  None   
5  I want to take a day off from work and then ta...  Normal  neutral  None   
6  How many bbl tickets are now? How come I want ...  Normal  neutral  None   
7  I bought a shirt for 200, but I've never worn ...  Normal  neutral  None   
8  Chinese foreigners can enter at will because t...  Normal  neutral  None   
9  09.35 WIB #Tol_JORR_E TMII - Cikunir - Cakung ...  Normal  neutral  None   

  top_3  
0  None  
1  None  
2  None  
3  None  
4  None  
5  None  
6  None  
7  None

In [13]:
# Save the full frame (all columns built above) to top123.csv in the working
# directory. to_csv is called with its defaults, so the row index is written
# as the first column (pandas default index=True).
df.to_csv("top123.csv")