# 1. Import library and Data

In [473]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split
from keras_tuner.tuners import RandomSearch
from tensorflow.keras.optimizers import Adam

In [475]:
# Load Training Dataset
df_train = pd.read_csv('./Dataset_DL/english_dataset/train/eng_t_a.csv')

In [477]:
# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# 2. Preprocessing

In [479]:
# Function to remove noise and normalize text
def preprocess_text(text):
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    
    # Convert text to lowercase
    text = text.lower()
    
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join tokens back into a single string
    processed_text = ' '.join(tokens)
    
    return processed_text

# Example text
text = "But not very happy. I love this product! It's amazing, and works perfectly."

# Preprocess text
cleaned_text = preprocess_text(text)
print(cleaned_text)

but not very happy i love this product it amazing and work perfectly


In [481]:
# Apply the preprocess_text function to the 'text' column 
df_train['cleaned_text'] = df_train['text'].apply(preprocess_text)

In [483]:
# Extract text and labels 
X = df_train['cleaned_text'] 
y = df_train[['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']]

# Convert labels to one-hot encoding
y = np.array(y)

In [485]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [487]:
# Tokenize text data
vocab_size = 10000  # Set a vocabulary size
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [489]:
# Pad sequences to a fixed length
max_sequence_length = 100  # Fixed max length
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

# 3. Model

In [565]:
# Hyperparameter tuning function
def build_model(hp):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=128))
    model.add(LSTM(units=hp.Int('lstm_units_1', min_value=64, max_value=256, step=32), return_sequences=True))
    model.add(BatchNormalization())
    model.add(Dropout(hp.Float('dropout_1', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(LSTM(units=hp.Int('lstm_units_2', min_value=32, max_value=128, step=16), return_sequences=False))
    model.add(BatchNormalization())
    model.add(Dropout(hp.Float('dropout_2', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(units=hp.Int('dense_units', min_value=16, max_value=128, step=16), activation='relu'))
    model.add(Dropout(hp.Float('dropout_3', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(units=5, activation='sigmoid'))  # Multi-class classification when the labels are Multiple, independent labels
    model.compile(
        optimizer=Adam(learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model


In [567]:
model.summary()

In [569]:
# Set up Keras Tuner
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    directory='tuner_results',
    project_name='sentiment_analysis_lstm'
)

In [571]:
# Search for the best hyperparameters
tuner.search(
    X_train_padded, y_train,
    epochs=10,
    validation_data=(X_test_padded, y_test),
    batch_size=64
)

Trial 10 Complete [00h 01m 01s]
val_accuracy: 0.5150421261787415

Best val_accuracy So Far: 0.5162454843521118
Total elapsed time: 00h 10m 56s


In [573]:
# Retrieve the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the best model
best_model = tuner.hypermodel.build(best_hps)

# Train the best model
history = best_model.fit(
    X_train_padded, y_train,
    epochs=10,
    validation_data=(X_test_padded, y_test),
    batch_size=64
)

Epoch 1/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 277ms/step - accuracy: 0.3599 - loss: 0.6390 - val_accuracy: 0.5150 - val_loss: 0.6107
Epoch 2/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 262ms/step - accuracy: 0.5062 - loss: 0.5814 - val_accuracy: 0.5150 - val_loss: 0.5919
Epoch 3/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 265ms/step - accuracy: 0.4795 - loss: 0.5827 - val_accuracy: 0.5150 - val_loss: 0.5770
Epoch 4/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 263ms/step - accuracy: 0.5051 - loss: 0.5745 - val_accuracy: 0.5150 - val_loss: 0.5725
Epoch 5/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 276ms/step - accuracy: 0.4802 - loss: 0.5780 - val_accuracy: 0.5150 - val_loss: 0.5698
Epoch 6/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 274ms/step - accuracy: 0.4897 - loss: 0.5747 - val_accuracy: 0.5150 - val_loss: 0.5691
Epoch 7/10
[1m31/31[0m [

# 4. Evaluation

In [575]:
# Evaluate the best model
loss, accuracy = best_model.evaluate(X_test_padded, y_test)
print(f'Test Accuracy: {accuracy}')
print(f"Best Hyperparameters: {best_hps.values}")

[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 60ms/step - accuracy: 0.5117 - loss: 0.5685
Test Accuracy: 0.5150421261787415
Best Hyperparameters: {'lstm_units_1': 256, 'dropout_1': 0.4, 'lstm_units_2': 48, 'dropout_2': 0.30000000000000004, 'dense_units': 128, 'dropout_3': 0.2, 'learning_rate': 0.01}


In [577]:
# Unnecessary if it run locally
# Save the trained model 
best_model.save('./Model/sentiment_analysis_model.keras')

# Inferencing unlabeled dataset for the real test

In [682]:
# Load Testing Dataset
df_test = pd.read_csv('./Dataset_DL/english_dataset/test/eng_a.csv')

df_test_top = df_test[['id', 'text']].copy()

In [684]:
# Apply the preprocess_text function to the 'text' column 
df_test['text'] = df_test['text'].apply(preprocess_text)

In [686]:
# Tokenize the text data 
vocab_size = 10000  # Set a vocabulary size
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>') 
tokenizer.fit_on_texts(df_test['text']) 
new_sequences = tokenizer.texts_to_sequences(df_test['text'])

In [688]:
# Pad sequences to a fixed length
max_sequence_length = 100  # Fixed max length
new_padded = pad_sequences(new_sequences, maxlen=max_sequence_length, padding='post')

In [690]:
# Unnecessary if it run locally
# from tensorflow.keras.models import load_model

# Load the trained model 
#model = load_model('./Model/sentiment_analysis_model.keras')

## Using the threshold 0.5 to classify whether an emotion is present.

In [693]:
# Make predictions on the new data 
predictions = best_model.predict(new_padded)

# Define a threshold for classification
threshold = 0.5

# Convert predictions to readable format 
predicted_labels = (predictions > threshold).astype(int)

# Combine with the new dataset 
df_test[['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']] = predicted_labels

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step


In [709]:
for i in range(10):
    print(predictions[i])

[0.13138212 0.5473095  0.28300503 0.342344   0.3279194 ]
[0.13138212 0.5473095  0.28300503 0.342344   0.3279194 ]
[0.13138212 0.5473095  0.28300503 0.342344   0.3279194 ]
[0.13138212 0.5473095  0.28300503 0.342344   0.32791942]
[0.13138212 0.5473095  0.28300503 0.342344   0.32791942]
[0.13138212 0.5473095  0.28300503 0.342344   0.32791942]
[0.13138212 0.5473095  0.28300503 0.342344   0.32791942]
[0.13138212 0.5473095  0.28300503 0.342344   0.32791942]
[0.13138212 0.5473095  0.28300503 0.342344   0.3279194 ]
[0.13138212 0.5473095  0.28300503 0.342344   0.32791942]


In [695]:
df_test

Unnamed: 0,id,text,Anger,Fear,Joy,Sadness,Surprise
0,eng_dev_track_a_00001,my mouth fell open no no no i,0,1,0,0,0
1,eng_dev_track_a_00002,you can barely make out your daughter pale for...,0,1,0,0,0
2,eng_dev_track_a_00003,but after blinking my eye for a few time lepas...,0,1,0,0,0
3,eng_dev_track_a_00004,slowly rising to my foot i came to the conclus...,0,1,0,0,0
4,eng_dev_track_a_00005,i noticed this month after moving in and doing...,0,1,0,0,0
...,...,...,...,...,...,...,...
111,eng_dev_track_a_00112,arch stop your progression,0,1,0,0,0
112,eng_dev_track_a_00113,this star start to move across the sky,0,1,0,0,0
113,eng_dev_track_a_00114,and my foot hurt,0,1,0,0,0
114,eng_dev_track_a_00115,so i cried my eye out and did the drawing,0,1,0,0,0


In [697]:
# Count the occurrences of 1 in each label column 
anger_count = df_test['Anger'].sum() 
fear_count = df_test['Fear'].sum() 
joy_count = df_test['Joy'].sum() 
sadness_count = df_test['Sadness'].sum() 
surprise_count = df_test['Surprise'].sum()

print("Count of value 1 in each label column:") 
print(f"Anger: {anger_count}") 
print(f"Fear: {fear_count}") 
print(f"Joy: {joy_count}") 
print(f"Sadness: {sadness_count}") 
print(f"Surprise: {surprise_count}")

Count of value 1 in each label column:
Anger: 0
Fear: 116
Joy: 0
Sadness: 0
Surprise: 0


## Using TOP-N Selection as the most likely emotions.

In [700]:
# Define emotions corresponding to model output columns
emotions = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']

# Number of top emotions to select
top_n = 2

# Function to get the top-N emotions for each prediction row
def get_top_n_emotions(prediction_row, emotions, top_n):
    # Get indices of top-N emotions
    top_indices = prediction_row.argsort()[-top_n:][::-1]
    # Map indices to emotion names
    return [emotions[i] for i in top_indices]

In [702]:
# Apply the function to all predictions
df_test_top['Top_2_Emotions'] = [get_top_n_emotions(row, emotions, top_n) for row in predictions]

# (Optional) Add probabilities of the top-N emotions
df_test_top['Top_2_Probs'] = [sorted(row, reverse=True)[:top_n] for row in predictions]

In [704]:
# Optional: Display the DataFrame
print(df_test_top[['Top_2_Emotions', 'Top_2_Probs']])

      Top_2_Emotions              Top_2_Probs
0    [Fear, Sadness]    [0.5473095, 0.342344]
1    [Fear, Sadness]    [0.5473095, 0.342344]
2    [Fear, Sadness]    [0.5473095, 0.342344]
3    [Fear, Sadness]    [0.5473095, 0.342344]
4    [Fear, Sadness]    [0.5473095, 0.342344]
..               ...                      ...
111  [Fear, Sadness]    [0.5473095, 0.342344]
112  [Fear, Sadness]    [0.5473095, 0.342344]
113  [Fear, Sadness]  [0.5473095, 0.34234402]
114  [Fear, Sadness]    [0.5473095, 0.342344]
115  [Fear, Sadness]    [0.5473095, 0.342344]

[116 rows x 2 columns]


In [706]:
df_test_top

Unnamed: 0,id,text,Top_2_Emotions,Top_2_Probs
0,eng_dev_track_a_00001,"My mouth fell open `` No, no, no... I..","[Fear, Sadness]","[0.5473095, 0.342344]"
1,eng_dev_track_a_00002,You can barely make out your daughter's pale f...,"[Fear, Sadness]","[0.5473095, 0.342344]"
2,eng_dev_track_a_00003,But after blinking my eyes for a few times lep...,"[Fear, Sadness]","[0.5473095, 0.342344]"
3,eng_dev_track_a_00004,Slowly rising to my feet I came to the conclus...,"[Fear, Sadness]","[0.5473095, 0.342344]"
4,eng_dev_track_a_00005,I noticed this months after moving in and doin...,"[Fear, Sadness]","[0.5473095, 0.342344]"
...,...,...,...,...
111,eng_dev_track_a_00112,"""ARcH stop your progression.","[Fear, Sadness]","[0.5473095, 0.342344]"
112,eng_dev_track_a_00113,"This 'star', starts to move across the sky.","[Fear, Sadness]","[0.5473095, 0.342344]"
113,eng_dev_track_a_00114,and my feet hurt.,"[Fear, Sadness]","[0.5473095, 0.34234402]"
114,eng_dev_track_a_00115,so i cried my eyes out and did the drawing.,"[Fear, Sadness]","[0.5473095, 0.342344]"
