# Import Libraries


In [1]:
import pickle
import re
import string
import emoji
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_tuner as kt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load Dataset


In [3]:
with open('./processed_objects/features_data.npy', 'rb') as file:
    train_data = np.load(file)
    val_data = np.load(file)
    test_data = np.load(file)

with open('./processed_objects/target_data.npy', 'rb') as file:
    train_target = np.load(file)
    val_target = np.load(file)
    test_target = np.load(file)

with open('./processed_objects/word_index.pkl', 'rb') as file:
    word_index = pickle.load(file)

with open('./processed_objects/pretrained_embedding_matrices.npy', 'rb') as file:
    pt_fasttext_Matrix = np.load(file)

# Helper Functions


In [4]:
max_len = 20


def embeddings_to_keras(matrix, max_len, train_embeddings=False):
    embedding_layer = tf.keras.layers.Embedding(
        input_dim=matrix.shape[0],
        output_dim=matrix.shape[1],
        weights=[matrix],
        mask_zero=True,
        trainable=train_embeddings
    )

    return embedding_layer


def casefold(text):
    return text.lower()


def replace_punctuations(text):
    punctuations = set(string.punctuation)
    for char in text:
        if char in punctuations:
            text = text.replace(char, ' ')
    return text


def clear_emoji(text):
    return emoji.replace_emoji(text, ' ')


def tokenize_text(text):
    text = word_tokenize(text)
    return text


def lemmatize_tokens(word_tokens, word_lemmatizer):
    lemmatized_tokens = [word_lemmatizer.lemmatize(
        word) for word in word_tokens]
    return lemmatized_tokens


def remove_stopwords(word_tokens, stopwords_set):
    text = ' '.join(
        [word for word in word_tokens if word not in stopwords_set])
    text = text.strip()
    return text


lemmatizer = WordNetLemmatizer()
stopwords_English = set(stopwords.words('english'))


def complete_clean(text):
    text = clear_emoji(text)
    text = casefold(text)
    text = re.sub(r'[0-9]+', ' ', text)
    text = text.replace("'", "'")
    text = text.replace("’", "'")
    text = text.replace("´", "'")
    text = text.replace("-", " ")
    text = text.replace('\n', ' ')
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = replace_punctuations(text)
    text = tokenize_text(text)
    text = lemmatize_tokens(text, word_lemmatizer=lemmatizer)
    text = remove_stopwords(text, stopwords_English)
    text = text.strip()
    return text

# Training and Validation


## Load Embeddings into Embedding Layers


In [5]:
pt_fasttext_Embedding = embeddings_to_keras(pt_fasttext_Matrix, max_len)

## Modelling with Hyperparameter Tuning


In [13]:
def build_model(hp):
    embedding = pt_fasttext_Embedding
    recurrent_units = 25

    first_dense_units = hp.Int(
        'first_dense_units', 16, 512, step=16, default=128)
    second_dense_units = hp.Int(
        'second_dense_units', 16, 512, step=16, default=64)

    optimizer_name = hp.Choice('optimizer',
                               values=['adam', 'adamw'],
                               default='adam')
    learning_rate = hp.Float('learning_rate', 1e-4, 1e-2,
                             sampling='log', default=1e-3)
    lr_multiplier = hp.Float('lr_multiplier', 0.5, 2.0, default=1.0)

    dropout_rate = hp.Float('dropout_rate', 0.2, 0.5,
                            step=0.05, default=0.5)

    rnn_type = hp.Choice('rnn_type', values=['LSTM', 'GRU'], default='LSTM')
    if rnn_type == 'LSTM':
        reccurent_layer = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(recurrent_units, return_sequences=False)
        )
    else:
        reccurent_layer = tf.keras.layers.Bidirectional(
            tf.keras.layers.GRU(recurrent_units, return_sequences=False)
        )

    model = tf.keras.Sequential([
        embedding,
        reccurent_layer,
        tf.keras.layers.Dense(
            first_dense_units, activation='relu'
        ),
        tf.keras.layers.Dense(
            second_dense_units, activation='relu'
        ),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(rate=dropout_rate),
        tf.keras.layers.Dense(
            5, activation='sigmoid'
        ),
    ])

    actual_lr = learning_rate * lr_multiplier
    if optimizer_name == 'adam':
        optimizer = tf.keras.optimizers.Adam(actual_lr)
    else:
        optimizer = tf.keras.optimizers.AdamW(actual_lr)

    loss_choice = hp.Choice('loss', ['binary_crossentropy', 'focal'],
                            default='binary_crossentropy')
    if loss_choice == 'focal':
        focal_gamma = hp.Float('focal_gamma', 0.5, 3.0, step=0.1, default=2.0)
        loss_fn = tf.keras.losses.BinaryFocalCrossentropy(
            gamma=focal_gamma, from_logits=False
        )
    else:
        loss_fn = 'binary_crossentropy'

    model.compile(
        optimizer=optimizer,
        loss=loss_fn,
        metrics=['binary_accuracy']
    )
    return model


def print_best_hyperparameters(best_hps):
    print("Best Hyperparameters:")
    print(f" - Embedding Type:           pt_fasttext_Embedding")
    print(f" - Recurrent Units (fixed):  25")
    print(f" - First Dense Units:        {best_hps.get('first_dense_units')}")
    print(f" - Second Dense Units:       {best_hps.get('second_dense_units')}")
    print(f" - Optimizer:                {best_hps.get('optimizer')}")
    print(f" - Learning Rate:            {best_hps.get('learning_rate')}")
    print(f" - LR Multiplier:            {best_hps.get('lr_multiplier')}")

    print(f" - RNN Type:                 {best_hps.get('rnn_type')}")
    print(f" - Loss Function:            {best_hps.get('loss')}")
    if best_hps.get('loss') == 'focal':
        print(f"    • Focal Gamma:           {best_hps.get('focal_gamma')}")

In [7]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [8]:
X_train = train_data
X_val = val_data
X_test = test_data

Y_train = train_target
Y_val = val_target
Y_test = test_target

In [9]:
tuner = kt.Hyperband(
    build_model,
    objective="binary_accuracy",
    max_epochs=20,
    factor=3,
    directory='my_dir',
    project_name='nn_multilabel'
)

In [14]:
tuner.search(X_train, Y_train, epochs=20, validation_data=(
    X_test, Y_test), callbacks=[stop_early])

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = tuner.get_best_models(num_models=1)[0]

print_best_hyperparameters(best_hps)

Best Hyperparameters:
 - Embedding Type:           pt_fasttext_Embedding
 - Recurrent Units (fixed):  25
 - First Dense Units:        32
 - Second Dense Units:       272
 - Optimizer:                adam
 - Learning Rate:            0.0054986817452001
 - LR Multiplier:            1.2170614207934312
 - RNN Type:                 LSTM
 - Loss Function:            focal
    • Focal Gamma:           1.5


# Validation


In [15]:
class val_accuracy_Callback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if logs.get('val_binary_accuracy') >= 0.94:
            self.model.stop_training = True

In [16]:
val_acc_Callback = val_accuracy_Callback()

In [17]:
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_binary_accuracy',
    patience=10,
    verbose=1,
    mode='max',
    restore_best_weights=True
)

In [18]:
best_model.fit(X_train, Y_train, epochs=50, validation_data=(
    X_test, Y_test), callbacks=[val_acc_Callback, early_stop])

Epoch 1/50
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 29ms/step - binary_accuracy: 0.9904 - loss: 0.0111 - val_binary_accuracy: 0.9352 - val_loss: 0.1220
Epoch 2/50
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 26ms/step - binary_accuracy: 0.9935 - loss: 0.0074 - val_binary_accuracy: 0.9367 - val_loss: 0.1466
Epoch 3/50
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 30ms/step - binary_accuracy: 0.9957 - loss: 0.0051 - val_binary_accuracy: 0.9364 - val_loss: 0.1637
Epoch 4/50
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 30ms/step - binary_accuracy: 0.9936 - loss: 0.0076 - val_binary_accuracy: 0.9341 - val_loss: 0.1714
Epoch 5/50
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 32ms/step - binary_accuracy: 0.9940 - loss: 0.0073 - val_binary_accuracy: 0.9391 - val_loss: 0.1547
Epoch 6/50
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 32ms/step - binary_accuracy: 0.9939 -

<keras.src.callbacks.history.History at 0x21929e089b0>

In [19]:
best_model.evaluate(X_test, Y_test)

[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - binary_accuracy: 0.9322 - loss: 0.1786


[0.1643703132867813, 0.9392739534378052]

In [20]:
best_model.summary()

In [21]:
best_model.save("./model.keras")

# Inference


In [22]:
sentences = ["DBS app is soo good!!", "Give me back my moneyy 🔥🔥", "They stole my money",
             "They are generous", "So many promo", "The app is so bad!!!", "app is terrible"]

In [23]:
cleaned_sentences = pd.DataFrame(sentences, columns=["sentence"]).apply(
    complete_clean, axis=1).values

  char = string[i]


In [24]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.word_index = word_index

In [25]:
padded_sentences = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(
    cleaned_sentences.tolist()), truncating="post", padding='post').astype(float, copy=False)

In [26]:
label_map = {
    0: "Negative",
    1: "Neutral",
    2: "Positive",
    3: "Very Negative",
    4: "Very Positive",
}

In [27]:
results = np.argmax(best_model.predict(padded_sentences), axis=1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 548ms/step


In [28]:
for i in range(len(sentences)):
    print(f"{sentences[i]}: {label_map[results[i]]}")

DBS app is soo good!!: Positive
Give me back my moneyy 🔥🔥: Neutral
They stole my money: Neutral
They are generous: Neutral
So many promo: Neutral
The app is so bad!!!: Neutral
app is terrible: Negative
