In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.layers import Input, Embedding, Bidirectional, GRU, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import gensim.downloader as api

In [2]:
# Load dataset
file_path = "/Users/celinewu/Documents/GitHub/2024-25c-fai2-adsai-group-group16/Task_4/ver_2_FINAL_DATASET.xlsx"
df = pd.read_excel(file_path)

In [3]:
# Extract text and labels
sentences = df["Sentence"].astype(str).tolist()
labels = df["main_category"].astype(str).tolist()

# Extract and normalize Sentiment Scores
scaler = MinMaxScaler()
df["Sentiment_Score"] = scaler.fit_transform(df[["Sentiment_Score"]])

# Extract POS Tag Features
from collections import Counter
pos_categories = ['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON', 'DET', 'ADP', 'CONJ']

def extract_pos_features(pos_tags):
    pos_counts = Counter(pos_tags.split())
    return [pos_counts.get(pos, 0) for pos in pos_categories]

pos_features = np.array([extract_pos_features(pos) for pos in df["POS_Tags"]])
pos_df = pd.DataFrame(scaler.fit_transform(pos_features), columns=pos_categories)

# Extract and normalize TF-IDF Features
tfidf_features = np.array([np.fromstring(vec.strip("[]"), sep=' ') for vec in df["TF_IDF"]])
tfidf_features = scaler.fit_transform(tfidf_features)


  tfidf_features = np.array([np.fromstring(vec.strip("[]"), sep=' ') for vec in df["TF_IDF"]])


In [4]:
# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab_size = len(tokenizer.word_index) + 1

# Convert text to sequences
max_length = 30  
sequences = tokenizer.texts_to_sequences(sentences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding="post")


In [5]:
# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)

# Split dataset
X_train, X_test, y_train, y_test, X_train_sent, X_test_sent, X_train_tfidf, X_test_tfidf, X_train_pos, X_test_pos = train_test_split(
    padded_sequences, encoded_labels, df["Sentiment_Score"].values, tfidf_features, pos_df.values,
    test_size=0.2, random_state=42, stratify=encoded_labels
)


In [6]:
# Convert labels to categorical
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)

# Load GloVe embeddings
print("Loading GloVe embeddings...")
glove_model = api.load("glove-wiki-gigaword-50")
embedding_dim = 50
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in glove_model:
        embedding_matrix[i] = glove_model[word]


Loading GloVe embeddings...


In [7]:
# Define text input and embedding layer
text_input = Input(shape=(max_length,), name="text_input")
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, 
                      weights=[embedding_matrix], input_length=max_length, 
                      trainable=True)(text_input)  # Train embeddings

# Define Bidirectional GRU layers
rnn_layer = Bidirectional(GRU(128, return_sequences=True, recurrent_dropout=0.2))(embedding)
rnn_layer = Bidirectional(GRU(64, return_sequences=True, recurrent_dropout=0.2))(rnn_layer)
rnn_layer = Bidirectional(GRU(32, return_sequences=False, recurrent_dropout=0.2))(rnn_layer)  # Fix applied
dense_text = Dense(64, activation='relu')(rnn_layer)


# Define Sentiment Score input
sentiment_input = Input(shape=(1,), name="sentiment_input")
sentiment_dense = Dense(8, activation='relu')(sentiment_input)

# Define TF-IDF input
tfidf_input = Input(shape=(tfidf_features.shape[1],), name="tfidf_input")
tfidf_dense = Dense(32, activation='relu')(tfidf_input)

# Define POS Tags input
pos_input = Input(shape=(len(pos_categories),), name="pos_input")
pos_dense = Dense(16, activation='relu')(pos_input)



2025-03-06 12:22:43.557955: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2025-03-06 12:22:43.557989: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-03-06 12:22:43.557993: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-03-06 12:22:43.558013: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-03-06 12:22:43.558028: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [8]:
# Merge inputs
merged = Concatenate()([dense_text, sentiment_dense, tfidf_dense, pos_dense])
merged = Dropout(0.3)(merged)  # Apply dropout
output = Dense(num_classes, activation='softmax')(merged)  # Use softmax for classification

# Define final model
model = Model(inputs=[text_input, sentiment_input, tfidf_input, pos_input], outputs=output)
optimizer = Adam(learning_rate=0.0001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])


In [9]:
# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(encoded_labels), y=encoded_labels)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint("rnn_pos_tfidf_sent_it5.keras", monitor='val_accuracy', save_best_only=True)


In [10]:
# Train the model
print("Training model...")
history = model.fit(
    [X_train, X_train_sent, X_train_tfidf, X_train_pos], y_train,
    epochs=30, batch_size=64, validation_data=([X_test, X_test_sent, X_test_tfidf, X_test_pos], y_test),
    class_weight=class_weight_dict,
    callbacks=[early_stopping, model_checkpoint]
)


Training model...
Epoch 1/30


2025-03-06 12:22:45.756474: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m428s[0m 6s/step - accuracy: 0.1410 - loss: 2.0245 - val_accuracy: 0.1604 - val_loss: 1.9453
Epoch 2/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m418s[0m 6s/step - accuracy: 0.1580 - loss: 1.9661 - val_accuracy: 0.1658 - val_loss: 1.9378
Epoch 3/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m530s[0m 8s/step - accuracy: 0.1645 - loss: 1.9599 - val_accuracy: 0.1846 - val_loss: 1.9312
Epoch 4/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 5s/step - accuracy: 0.1774 - loss: 1.9404 - val_accuracy: 0.1935 - val_loss: 1.9262
Epoch 5/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m386s[0m 6s/step - accuracy: 0.1932 - loss: 1.9265 - val_accuracy: 0.2097 - val_loss: 1.9192
Epoch 6/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 6s/step - accuracy: 0.2145 - loss: 1.9185 - val_accuracy: 0.2213 - val_loss: 1.9134
Epoch 7/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━

In [11]:
# Evaluate the model
test_loss, test_acc = model.evaluate([X_test, X_test_sent, X_test_tfidf, X_test_pos], y_test)
print(f"Test Accuracy: {test_acc:.4f}")



[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 310ms/step - accuracy: 0.3463 - loss: 1.7314
Test Accuracy: 0.3145


In [12]:
# Save tokenizer
import pickle
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


In [13]:
# Generate F1-score report
from sklearn.metrics import classification_report

# Get predictions
y_pred = model.predict([X_test, X_test_sent, X_test_tfidf, X_test_pos])

# Convert predictions from one-hot encoding to class indices
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Generate classification report
print("\nClassification Report:")
print(classification_report(y_test_classes, y_pred_classes, target_names=label_encoder.classes_))

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 322ms/step

Classification Report:
              precision    recall  f1-score   support

       anger       0.27      0.31      0.29       159
     disgust       0.38      0.33      0.35       159
        fear       0.36      0.27      0.31       159
   happiness       0.50      0.49      0.50       160
     neutral       0.24      0.17      0.20       160
     sadness       0.27      0.33      0.29       159
    surprise       0.23      0.29      0.26       160

    accuracy                           0.31      1116
   macro avg       0.32      0.31      0.31      1116
weighted avg       0.32      0.31      0.31      1116

