In [3]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, Dense, Concatenate
from tensorflow.keras.models import Model
from sklearn.metrics import classification_report

In [1]:
# Load the CSV file into a pandas DataFrame
df = pd.read_excel('/Users/celinewu/Documents/GitHub/2024-25c-fai2-adsai-group-group16/Task_5/FINAL_DATASET.xlsx') 

# Give emotions a numerical value 
emotion_mapping = {
    "happiness": 0,
    "sadness": 1,
    "surprise": 2,
    "anger": 3,
    "disgust": 4,
    "fear": 5,
    "neutral": 6
}
df['label_num'] = df['Corrected_Emotion'].map(emotion_mapping)

# One-hot encode the labels
num_classes = len(emotion_mapping)
y = to_categorical(df['label_num'], num_classes=num_classes)

# 3. Preprocess the Text Data
max_words = 5000        
max_seq_length = 100     

tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df['Sentence'])
sequences = tokenizer.texts_to_sequences(df['Sentence'])
X_text = pad_sequences(sequences, maxlen=max_seq_length, padding='post', truncating='post')

# 4. Extract the Sentiment Feature Using VADER
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

def extract_sentiment(text):
    """Extract the compound sentiment score from text."""
    return sia.polarity_scores(text)['compound']

# Compute sentiment score for each text sample
df['sentiment'] = df['Sentence'].apply(extract_sentiment)
X_sentiment = np.array(df['sentiment']).reshape(-1, 1)

# 5. Split the Data into Training and Validation Sets
X_text_train, X_text_val, X_sentiment_train, X_sentiment_val, y_train, y_val = train_test_split(
    X_text, X_sentiment, y, test_size=0.2, random_state=42
)

# 6. Build the RNN Model with the Sentiment Feature
embedding_dim = 128   
rnn_units = 64       

# Input branch for text sequences
text_input = Input(shape=(max_seq_length,), name="text_input")
embedding_layer = Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_seq_length)(text_input)
rnn_output = SimpleRNN(rnn_units)(embedding_layer)

# Input branch for sentiment feature (a single numerical value)
sentiment_input = Input(shape=(1,), name="sentiment_input")

# Combine the outputs from both branches
combined = Concatenate()([rnn_output, sentiment_input])

# Add one or more Dense layers for classification
dense = Dense(64, activation='relu')(combined)
output = Dense(num_classes, activation='softmax')(dense)

# Define and compile the model
model = Model(inputs=[text_input, sentiment_input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

# 7. Train the Model
history = model.fit(
    [X_text_train, X_sentiment_train],
    y_train,
    epochs=10,              
    batch_size=32,          
    validation_data=([X_text_val, X_sentiment_val], y_val)
)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/celinewu/nltk_data...
2025-02-26 10:32:05.927319: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2025-02-26 10:32:05.927351: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-02-26 10:32:05.927355: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-02-26 10:32:05.927529: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-02-26 10:32:05.927542: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/10


2025-02-26 10:32:06.552947: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m886s[0m 9s/step - accuracy: 0.5221 - loss: 1.4356 - val_accuracy: 0.6459 - val_loss: 1.1471
Epoch 2/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m450s[0m 5s/step - accuracy: 0.6271 - loss: 1.2078 - val_accuracy: 0.6459 - val_loss: 1.2138
Epoch 3/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m449s[0m 5s/step - accuracy: 0.6192 - loss: 1.2538 - val_accuracy: 0.6459 - val_loss: 1.1423
Epoch 4/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m449s[0m 5s/step - accuracy: 0.6152 - loss: 1.2634 - val_accuracy: 0.6459 - val_loss: 1.1402
Epoch 5/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m449s[0m 5s/step - accuracy: 0.6069 - loss: 1.2662 - val_accuracy: 0.6459 - val_loss: 1.1578
Epoch 6/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m451s[0m 5s/step - accuracy: 0.6358 - loss: 1.2000 - val_accuracy: 0.6459 - val_loss: 1.1458
Epoch 7/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━

In [4]:
# ---------------------------
# 8. Evaluate the Model with F1-score
# ---------------------------
y_pred_probs = model.predict([X_text_val, X_sentiment_val])
y_pred = np.argmax(y_pred_probs, axis=1)  # Convert probabilities to class labels
y_true = np.argmax(y_val, axis=1)  # Convert one-hot encoded labels back to class labels

# Compute F1-score and classification report
report = classification_report(y_true, y_pred, target_names=emotion_mapping.keys(), digits=4)
print(report)

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 320ms/step
              precision    recall  f1-score   support

   happiness     0.6667    0.0149    0.0292       134
     sadness     0.0000    0.0000    0.0000        34
    surprise     0.0000    0.0000    0.0000        62
       anger     0.0000    0.0000    0.0000        12
     disgust     0.0000    0.0000    0.0000         5
        fear     0.0000    0.0000    0.0000        26
     neutral     0.6471    0.9980    0.7852       498

    accuracy                         0.6472       771
   macro avg     0.1877    0.1447    0.1163       771
weighted avg     0.5339    0.6472    0.5122       771



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
