In [25]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, Dense, Concatenate
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import classification_report


In [26]:
# Load the dataset
import pandas as pd
file_path = "/Users/celinewu/Documents/GitHub/2024-25c-fai2-adsai-group-group16/Task_4/ver_2_FINAL_DATASET.xlsx" 
df = pd.read_excel(file_path)

In [27]:
# Extract sentences and labels
sentences = df["Sentence"].astype(str).tolist()
labels = df["main_category"].astype(str).tolist()

In [28]:
# Extract Sentiment Scores
sentiment_scores = df["Sentiment_Score"].values.reshape(-1, 1)

# Normalize Sentiment Scores
scaler = MinMaxScaler()
sentiment_scores = scaler.fit_transform(sentiment_scores)

In [29]:
# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab_size = len(tokenizer.word_index) + 1

In [30]:
# Convert text to sequences
sequences = tokenizer.texts_to_sequences(sentences)
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding="post")

In [31]:
# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)

In [32]:
# Split dataset
X_train, X_test, y_train, y_test, X_train_sent, X_test_sent = train_test_split(
    padded_sequences, encoded_labels, sentiment_scores, 
    test_size=0.2, random_state=42, stratify=encoded_labels
)

In [33]:
# Convert labels to categorical
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)

In [34]:
# Build RNN Model
text_input = Input(shape=(max_length,), name="text_input")
embedding = Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length)(text_input)
rnn_layer = SimpleRNN(128, return_sequences=True)(embedding)
rnn_layer = SimpleRNN(64)(rnn_layer)
dense_text = Dense(64, activation='relu')(rnn_layer)




In [35]:
# Sentiment Score input
sentiment_input = Input(shape=(1,), name="sentiment_input")
sentiment_dense = Dense(8, activation='relu')(sentiment_input)  # Process sentiment score

# Merge both models
merged = Concatenate()([dense_text, sentiment_dense])
output = Dense(num_classes, activation='softmax')(merged)




In [36]:
# Define final model
model = tf.keras.Model(inputs=[text_input, sentiment_input], outputs=output)


In [37]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


# Define Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint("best_rnn_model.keras", monitor='val_accuracy', save_best_only=True)

# Train the model
history = model.fit(
    [X_train, X_train_sent], y_train,
    epochs=20, batch_size=32, validation_data=([X_test, X_test_sent], y_test),
    callbacks=[early_stopping, model_checkpoint]
)


Epoch 1/20


2025-03-03 12:20:27.799257: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 1s/step - accuracy: 0.1451 - loss: 2.0251 - val_accuracy: 0.1461 - val_loss: 2.0059
Epoch 2/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 1s/step - accuracy: 0.1427 - loss: 1.9753 - val_accuracy: 0.1326 - val_loss: 1.9725
Epoch 3/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 3s/step - accuracy: 0.1392 - loss: 1.9757 - val_accuracy: 0.1792 - val_loss: 1.9470
Epoch 4/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 1s/step - accuracy: 0.1587 - loss: 1.9536 - val_accuracy: 0.1470 - val_loss: 1.9435
Epoch 5/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 1s/step - accuracy: 0.1760 - loss: 1.9424 - val_accuracy: 0.1487 - val_loss: 1.9387
Epoch 6/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 1s/step - accuracy: 0.1743 - loss: 1.9427 - val_accuracy: 0.2231 - val_loss: 1.9450
Epoch 7/20
[1m140/140[0m [32m━

In [38]:
# Evaluate the model
test_loss, test_acc = model.evaluate([X_test, X_test_sent], y_test)
print(f"Test Accuracy: {test_acc:.4f}")

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 119ms/step - accuracy: 0.1812 - loss: 1.9211
Test Accuracy: 0.1765


In [39]:
# Get model predictions
y_pred = model.predict([X_test, X_test_sent])

# Convert predictions from one-hot encoding to class indices
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Generate classification report
print(classification_report(y_test_classes, y_pred_classes, target_names=label_encoder.classes_))


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 115ms/step
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00       159
     disgust       0.41      0.17      0.24       159
        fear       0.00      0.00      0.00       159
   happiness       0.16      0.93      0.27       160
     neutral       0.20      0.13      0.16       160
     sadness       0.00      0.00      0.00       159
    surprise       0.00      0.00      0.00       160

    accuracy                           0.18      1116
   macro avg       0.11      0.18      0.10      1116
weighted avg       0.11      0.18      0.10      1116



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [40]:
# Save the model and tokenizer
model.save("rnn_sentiment_score_model.keras")
# Save the tokenizer
import pickle
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)