# Import Libraries n Load Data

In [1]:
from empath import Empath
import pandas as pd
import numpy as np
import re
import string
import nltk
from sklearn.preprocessing import LabelEncoder
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

ModuleNotFoundError: No module named 'empath'

In [None]:
df = pd.read_csv("anxiety_data.csv")
df.head()

Unnamed: 0,statement,label
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6092 entries, 0 to 6091
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  6092 non-null   object
 1   label      6092 non-null   object
dtypes: object(2)
memory usage: 95.3+ KB


In [None]:
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import contractions  # For handling contractions like "I've" -> "I have"

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Add custom stopwords if needed (domain-specific words that don't carry meaning)
custom_stopwords = {'like', 'get', 'go', 'know', 'would', 'could', 'also'}
stop_words.update(custom_stopwords)

def clean_text(text):
    """
    Comprehensive text cleaning function that handles:
    - Lowercasing
    - URL removal
    - Contraction expansion
    - Non-ASCII character removal
    - Number removal
    - Punctuation removal
    - Extra whitespace removal
    """
    # Convert to lowercase
    text = text.lower()
    
    # Expand contractions
    text = contractions.fix(text)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = re.sub(rf"[{re.escape(string.punctuation)}]", '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def preprocess_text(text):
    """
    Full text preprocessing pipeline:
    1. Clean text
    2. Tokenize
    3. Remove stopwords
    4. Lemmatize tokens
    """
    # Clean the text first
    text = clean_text(text)
    
    # Tokenize
    words = text.split()
    
    # Remove stopwords and lemmatize
    processed_words = []
    for word in words:
        if word not in stop_words and len(word) > 2:  # Remove short words
            # Lemmatize considering part of speech (verb by default)
            lemma = lemmatizer.lemmatize(word, pos='v')  # Try verb first
            lemma = lemmatizer.lemmatize(lemma, pos='n')  # Then noun
            lemma = lemmatizer.lemmatize(lemma, pos='a')  # Then adjective
            lemma = lemmatizer.lemmatize(lemma, pos='r')  # Then adverb
            processed_words.append(lemma)
    
    return ' '.join(processed_words)

# Ensure the column is string type
df['statement'] = df['statement'].astype(str)

# Apply preprocessing
df['cleaned_statement'] = df['statement'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\laila\AppData\Roaming\nltk_data...


In [None]:
df

Unnamed: 0,statement,label,cleaned_statement
0,oh my gosh,Anxiety,gosh
1,"trouble sleeping, confused mind, restless hear...",Anxiety,trouble sleep confuse mind restless heart tune
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,wrong back dear forward doubt stay restless re...
3,I've shifted my focus to something else but I'...,Anxiety,shift focus something else still worry
4,"I'm restless and restless, it's been a month n...",Anxiety,restless restless month boy mean
...,...,...,...
6087,"Help with your HIV, STD anxiety I recently got...",Anxiety,help hiv std anxiety recently get full std tes...
6088,I’ve been just kind of denying or ignoring thi...,Anxiety,kind deny ignore problem awhile occasionally e...
6089,Body Pulling after waking up. I woke up this m...,Anxiety,body pull wake wake morning immediately slam w...
6090,other health scare so about an hour ago? i wou...,Anxiety,health scare hour ago say nowhere get weird cr...


In [None]:
lexicon = Empath()

# Contoh kategori yang relevan untuk anxiety
# emotions = ['anxiety', 'fear', 'nervousness', 'sadness', 'confusion', 'suffering', 'optimism']
# emotions = ['anxiety', 'fear', 'nervousness', 'sadness', 'confusion', 'loneliness', 'shame', 'neglect']
emotions = ['anxiety', 'fear', 'nervousness', 'sadness', 'confusion', 'suffering', 'shame',]

def label_from_empath(text):
    scores = lexicon.analyze(text, categories=emotions, normalize=True)
    if scores:
        label = max(scores, key=scores.get)
        return label, scores
    return "anxiety", {}


In [None]:

# df["cleaned"] = df["statement"].astype(str).apply(preprocess_text)
df["empath_label"], df["empath_scores"] = zip(*df["cleaned_statement"].apply(label_from_empath))


In [None]:
df

Unnamed: 0,statement,label,cleaned_statement,empath_label,empath_scores,label_encoded
0,oh my gosh,Anxiety,gosh,anxiety,"{'anxiety': 0.0, 'fear': 0.0, 'nervousness': 0...",0
1,"trouble sleeping, confused mind, restless hear...",Anxiety,trouble sleep confuse mind restless heart tune,confusion,"{'anxiety': 0.0, 'fear': 0.0, 'nervousness': 0...",5
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,wrong back dear forward doubt stay restless re...,anxiety,"{'anxiety': 0.0, 'fear': 0.0, 'nervousness': 0...",0
3,I've shifted my focus to something else but I'...,Anxiety,shift focus something else still worry,nervousness,"{'anxiety': 0.0, 'fear': 0.0, 'nervousness': 0...",2
4,"I'm restless and restless, it's been a month n...",Anxiety,restless restless month boy mean,anxiety,"{'anxiety': 0.0, 'fear': 0.0, 'nervousness': 0...",0
...,...,...,...,...,...,...
6087,"Help with your HIV, STD anxiety I recently got...",Anxiety,help hiv std anxiety recently get full std tes...,nervousness,"{'anxiety': 0.0, 'fear': 0.05063291139240506, ...",2
6088,I’ve been just kind of denying or ignoring thi...,Anxiety,kind deny ignore problem awhile occasionally e...,fear,"{'anxiety': 0.0, 'fear': 0.02564102564102564, ...",1
6089,Body Pulling after waking up. I woke up this m...,Anxiety,body pull wake wake morning immediately slam w...,nervousness,"{'anxiety': 0.0, 'fear': 0.0, 'nervousness': 0...",2
6090,other health scare so about an hour ago? i wou...,Anxiety,health scare hour ago say nowhere get weird cr...,nervousness,"{'anxiety': 0.0, 'fear': 0.0, 'nervousness': 0...",2


In [None]:
# Lihat distribusi emosi berdasarkan hasil Empath
df["empath_label"].value_counts()


empath_label
nervousness    3197
fear            921
anxiety         643
shame           515
suffering       430
sadness         369
confusion        17
Name: count, dtype: int64

In [None]:
# Buat mapping ke angka sesuai urutan
emotion_to_index = {emotion: idx for idx, emotion in enumerate(emotions)}

# Terapkan ke kolom label
df['label_encoded'] = df['empath_label'].map(emotion_to_index)


In [None]:
df

Unnamed: 0,statement,label,cleaned_statement,empath_label,empath_scores,label_encoded
0,oh my gosh,Anxiety,gosh,anxiety,"{'anxiety': 0.0, 'fear': 0.0, 'nervousness': 0...",0
1,"trouble sleeping, confused mind, restless hear...",Anxiety,trouble sleep confuse mind restless heart tune,confusion,"{'anxiety': 0.0, 'fear': 0.0, 'nervousness': 0...",4
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,wrong back dear forward doubt stay restless re...,anxiety,"{'anxiety': 0.0, 'fear': 0.0, 'nervousness': 0...",0
3,I've shifted my focus to something else but I'...,Anxiety,shift focus something else still worry,nervousness,"{'anxiety': 0.0, 'fear': 0.0, 'nervousness': 0...",2
4,"I'm restless and restless, it's been a month n...",Anxiety,restless restless month boy mean,anxiety,"{'anxiety': 0.0, 'fear': 0.0, 'nervousness': 0...",0
...,...,...,...,...,...,...
6087,"Help with your HIV, STD anxiety I recently got...",Anxiety,help hiv std anxiety recently get full std tes...,nervousness,"{'anxiety': 0.0, 'fear': 0.05063291139240506, ...",2
6088,I’ve been just kind of denying or ignoring thi...,Anxiety,kind deny ignore problem awhile occasionally e...,fear,"{'anxiety': 0.0, 'fear': 0.02564102564102564, ...",1
6089,Body Pulling after waking up. I woke up this m...,Anxiety,body pull wake wake morning immediately slam w...,nervousness,"{'anxiety': 0.0, 'fear': 0.0, 'nervousness': 0...",2
6090,other health scare so about an hour ago? i wou...,Anxiety,health scare hour ago say nowhere get weird cr...,nervousness,"{'anxiety': 0.0, 'fear': 0.0, 'nervousness': 0...",2


In [None]:
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# 1. Konversi label encoded ke one-hot
y = to_categorical(df['label_encoded'])

# 2. Tokenisasi teks
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned_statement'])

sequences = tokenizer.texts_to_sequences(df['cleaned_statement'])
X = pad_sequences(sequences, maxlen=100)

# 3. Split data untuk training dan testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=df['label_encoded']  # Penting biar distribusi label tetap
)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.layers import Layer  # For custom attention layer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# Custom Attention Layer (since Keras doesn't have a built-in standalone Attention layer for this use case)
class AttentionLayer(Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight', 
                               shape=(input_shape[-1], 1),
                               initializer='random_normal',
                               trainable=True)
        self.b = self.add_weight(name='attention_bias',
                                shape=(input_shape[1], 1),
                                initializer='zeros',
                                trainable=True)
        super(AttentionLayer, self).build(input_shape)

    def call(self, x):
        # Alignment scores
        e = tf.tanh(tf.matmul(x, self.W) + self.b)
        # Attention weights
        alpha = tf.nn.softmax(e, axis=1)
        # Context vector
        context = x * alpha
        context = tf.reduce_sum(context, axis=1)
        return context

# Callbacks
earlystop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)  # Increased patience
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1, min_lr=1e-5)

# Model Architecture
model = Sequential()

# Embedding layer (consider using pre-trained embeddings)
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))

# Bidirectional LSTM with return_sequences=True for Attention
model.add(Bidirectional(LSTM(64, return_sequences=True)))

# Custom Attention Layer
model.add(AttentionLayer())

# Dropout with reduced rate (from 0.6 to 0.5)
model.add(Dropout(0.5))

# Dense layers
model.add(Dense(64, activation='relu'))  # Increased from 32 to 64
model.add(Dropout(0.3))
model.add(Dense(y.shape[1], activation='softmax'))

# Optimizer with lower learning rate
optimizer = Adam(learning_rate=0.0005)  # Reduced from 0.001

model.compile(loss='categorical_crossentropy', 
              optimizer=optimizer, 
              metrics=['accuracy', 
                      tf.keras.metrics.Precision(),
                      tf.keras.metrics.Recall()])

model.summary()

# Training with class weights if imbalance exists
history = model.fit(
    X_train, y_train,
    epochs=30,  # Increased max epochs
    batch_size=64,  # Increased batch size
    validation_split=0.2,
    callbacks=[earlystop, reduce_lr],
    # Add if you have imbalanced classes
)



Epoch 1/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 81ms/step - accuracy: 0.4100 - loss: 1.7479 - precision_2: 0.3925 - recall_2: 0.0633 - val_accuracy: 0.5221 - val_loss: 1.4382 - val_precision_2: 0.4970 - val_recall_2: 0.2564 - learning_rate: 5.0000e-04
Epoch 2/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 85ms/step - accuracy: 0.5319 - loss: 1.4629 - precision_2: 0.5145 - recall_2: 0.2006 - val_accuracy: 0.5221 - val_loss: 1.2995 - val_precision_2: 0.6055 - val_recall_2: 0.3415 - learning_rate: 5.0000e-04
Epoch 3/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 89ms/step - accuracy: 0.5159 - loss: 1.3547 - precision_2: 0.6134 - recall_2: 0.3064 - val_accuracy: 0.5303 - val_loss: 1.1739 - val_precision_2: 0.7358 - val_recall_2: 0.4913 - learning_rate: 5.0000e-04
Epoch 4/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 101ms/step - accuracy: 0.5562 - loss: 1.1669 - precision_2: 0.7898 - recall_2: 0.4029 -

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Evaluasi Model
y_pred = model.predict(X_test)

# Mengonversi prediksi ke kelas sebenarnya
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

# Print akurasi dasar
test_loss, test_acc, test_precision, test_recall = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")

# Laporan klasifikasi
print("\nClassification Report:")
print(classification_report(y_true_classes, y_pred_classes))

# Matriks kebingungan (Confusion Matrix)
print("\nConfusion Matrix:")
print(confusion_matrix(y_true_classes, y_pred_classes))


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.7834 - loss: 0.8393 - precision_1: 0.8296 - recall_1: 0.7560
Test Accuracy: 0.7744
Test Precision: 0.8196
Test Recall: 0.7416

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.84      0.79       129
           1       0.68      0.70      0.69       184
           2       0.90      0.91      0.91       640
           4       0.33      0.20      0.25        74
           5       0.00      0.00      0.00         3
           6       0.51      0.44      0.47        86
           7       0.61      0.66      0.64       103

    accuracy                           0.77      1219
   macro avg       0.54      0.54      0.54      1219
weighted avg       0.76      0.77      0.77      1219


Confusion Matrix:
[[109   8   7   3   0   1   1]
 [ 22 129  20   6   0   5   2]
 [  1  26 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
model.save("emotion_model_v1.h5")

