In [12]:
# conda activate tf_p39
import numpy as np
import os
import shutil
import tensorflow as tf

from sklearn.metrics import accuracy_score, confusion_matrix

In [13]:
tf.get_logger().setLevel('ERROR')

In [14]:
def clean_logs(data_dir):
    logs_dir = os.path.join(data_dir, "logs")
    shutil.rmtree(logs_dir, ignore_errors=True)
    return logs_dir

In [15]:
def download_and_read(url):
    local_file = url.split('/')[-1]
    local_file = local_file.replace("%20", " ")
    p = tf.keras.utils.get_file(local_file, url, 
        extract=True, cache_dir=".")
    local_folder = os.path.join("datasets", local_file.split('.')[0])
    labeled_sentences = []
    for labeled_filename in os.listdir(local_folder):
        if labeled_filename.endswith("_labelled.txt"):
            with open(os.path.join(local_folder, labeled_filename), "r") as f:
                for line in f:
                    sentence, label = line.strip().split('\t')
                    labeled_sentences.append((sentence, label))
    return labeled_sentences

In [16]:
# SentimentAnalysisModel(vocab_size=5271, max_seqlen=64)
class SentimentAnalysisModel(tf.keras.Model):
    def __init__(self, vocab_size, max_seqlen, **kwargs):
        super(SentimentAnalysisModel, self).__init__(**kwargs)
        """
        input_dim: Integer. Size of the vocabulary, i.e. maximum integer index + 1.
        output_dim: Integer. Dimension of the dense embedding.
        """
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=max_seqlen)
        """
        - units: Positive integer, dimensionality of the output space.
        """
        self.bilstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(units=max_seqlen)
        )
        """
        - units: Positive integer, dimensionality of the output space.
        """
        self.dense = tf.keras.layers.Dense(units=64, activation="relu")
        self.out = tf.keras.layers.Dense(units=1, activation="sigmoid")

    def call(self, x):
        x = self.embedding(x)
        x = self.bilstm(x)
        x = self.dense(x)
        x = self.out(x)
        return x

In [17]:
# set random seed
tf.random.set_seed(42)

# clean up log area
data_dir = "./data"
logs_dir = clean_logs(data_dir)

In [18]:
# download and read data into data structures
labeled_sentences = download_and_read(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip")
sentences = [s for (s, l) in labeled_sentences]
labels = [int(l) for (s, l) in labeled_sentences]

In [19]:
# labeled_sentences
# sentences
# labels

In [20]:
# tokenize sentences: Convert words into numbers
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab_size = len(tokenizer.word_counts)
print("vocabulary size: {:d}".format(vocab_size))

word2idx = tokenizer.word_index
idx2word = {v:k for (k, v) in word2idx.items()}

vocabulary size: 5271


In [21]:
# seq_lengths = np.array([len(s.split()) for s in sentences])
# print([(p, np.percentile(seq_lengths, p)) for p 
#     in [75, 80, 90, 95, 99, 100]])
# [(75, 16.0), (80, 18.0), (90, 22.0), (95, 26.0), (99, 36.0), (100, 71.0)]
max_seqlen = 64


In [22]:
# create dataset
# Each sequence has as its lenght the number of words.
sentences_as_ints = tokenizer.texts_to_sequences(sentences)
# All sequences has lenght max_seqlen.
sentences_as_ints = tf.keras.preprocessing.sequence.pad_sequences(
    sentences_as_ints, maxlen=max_seqlen)
labels_as_ints = np.array(labels)
dataset = tf.data.Dataset.from_tensor_slices(
    (sentences_as_ints, labels_as_ints))

2022-12-21 04:42:46.451885: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [23]:
# split into train and test
dataset = dataset.shuffle(10000)
test_size = len(sentences) // 3
val_size = (len(sentences) - test_size) // 10
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size + val_size)
print(train_dataset)

batch_size = 64
train_dataset = train_dataset.batch(batch_size)
print(train_dataset)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

<SkipDataset shapes: ((64,), ()), types: (tf.int32, tf.int64)>
<BatchDataset shapes: ((None, 64), (None,)), types: (tf.int32, tf.int64)>


In [24]:
# define model
# vocab_size + 1 to account for PAD character
model = SentimentAnalysisModel(vocab_size+1, max_seqlen)
model.build(input_shape=(batch_size, max_seqlen))
model.summary()

Model: "sentiment_analysis_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  337408    
                                                                 
 bidirectional (Bidirectiona  multiple                 66048     
 l)                                                              
                                                                 
 dense (Dense)               multiple                  8256      
                                                                 
 dense_1 (Dense)             multiple                  65        
                                                                 
Total params: 411,777
Trainable params: 411,777
Non-trainable params: 0
_________________________________________________________________


In [25]:
# compile
model.compile(
    loss="binary_crossentropy",
    optimizer="adam", 
    metrics=["accuracy"]
)


In [26]:
# train
best_model_file = os.path.join(data_dir, "best_model.h5")
checkpoint = tf.keras.callbacks.ModelCheckpoint(best_model_file,
    save_weights_only=True,
    save_best_only=True)
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=logs_dir)
num_epochs = 10
print(train_dataset)
history = model.fit(train_dataset, epochs=num_epochs, 
    validation_data=val_dataset,
    callbacks=[checkpoint, tensorboard])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
# evaluate with test set
best_model = SentimentAnalysisModel(vocab_size+1, max_seqlen)
best_model.build(input_shape=(batch_size, max_seqlen))
best_model.load_weights(best_model_file)
best_model.compile(
    loss="binary_crossentropy",
    optimizer="adam", 
    metrics=["accuracy"]
)

test_loss, test_acc = best_model.evaluate(test_dataset)
print("test loss: {:.3f}, test accuracy: {:.3f}".format(test_loss, test_acc))

test loss: 0.047, test accuracy: 0.985


In [28]:
# predict on batches
labels, predictions = [], []
idx2word[0] = "PAD"
is_first_batch = True
for test_batch in test_dataset:
    inputs_b, labels_b = test_batch
    pred_batch = best_model.predict(inputs_b)
    predictions.extend([(1 if p > 0.5 else 0) for p in pred_batch])
    labels.extend([l for l in labels_b])
    if is_first_batch:
        for rid in range(inputs_b.shape[0]):
            words = [idx2word[idx] for idx in inputs_b[rid].numpy()]
            words = [w for w in words if w != "PAD"]
            sentence = " ".join(words)
            print("{:d}\t{:d}\t{:s}".format(labels[rid], predictions[rid], sentence))
        is_first_batch = False

0	0	this is a chilly unremarkable movie about an author living working in a chilly abstruse culture
0	0	the worst piece of crap ever along with the verizon customer service
1	1	it's a gloriously fun fast paced and fairly accurate portrayal of the night of a raver
0	0	bad characters bad story and bad acting
1	1	i enjoyed reading this book to my children when they were little
1	1	i own 2 of these cases and would order another
1	1	we would recommend these to others
1	1	the selection on the menu was great and so were the prices
1	1	gets the job done
1	1	tom wilkinson broke my heart at the end and everyone else's judging by the amount of fumbling for hankies and hands going up to faces among males and females alike
1	1	mark my words this is one of those cult films like evil dead 2 or phantasm that people will still be discovering and falling in love with 20 30 40 years down the line
1	1	the film's dialogue is natural real to life
1	1	high quality chicken on the chicken caesar salad
1	1	work

In [29]:
print("accuracy score: {:.3f}".format(accuracy_score(labels, predictions)))
print("confusion matrix")
print(confusion_matrix(labels, predictions))


accuracy score: 0.980
confusion matrix
[[482  14]
 [  6 498]]
