In [1]:
import numpy as np
import os
import shutil
import tensorflow as tf
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
def download_and_read(url):
    local_file = url.split('/')[-1]
    local_file = local_file.replace("%20", " ")
    p = tf.keras.utils.get_file(local_file, url, extract = True, cache_dir = ".")
    local_folder = os.path.join("datasets", local_file.split('.')[0])
    labeled_sentences = []
    for labeled_filename in os.listdir(local_folder):
        if labeled_filename.endswith("_labelled.txt"):
            with open(os.path.join(local_folder, labeled_filename), "r") as f:
                for line in f:
                    sentence, label = line.strip().split("\t")
                    labeled_sentences.append((sentence, label))
    return labeled_sentences

labeled_sentences = download_and_read("https://archive.ics.uci.edu/ml/machine-learning-databases/" +
 "00331/sentiment%20labelled%20sentences.zip")
sentences = [s for (s,l) in labeled_sentences]
labels = [int(l) for (s,l) in labeled_sentences]

In [3]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab_size = len(tokenizer.word_counts)

print("Vocabulary size: {:d}".format(vocab_size))

word2idx = tokenizer.word_index
idx2word = {v:w for (w,v) in word2idx.items()}

Vocabulary size: 5271


In [4]:
"""An easy way to choose a good value for the maximum
sequence length is to look at the sentence length (in number of words) at different
percentile positions:
"""

seq_lengths = [len(s.split()) for s in sentences]
print([(p, np.percentile(seq_lengths, p)) for p in [75, 80, 90, 99, 100]])

[(75, 16.0), (80, 18.0), (90, 22.0), (99, 36.0), (100, 71.0)]


In [5]:
max_seqlen = 64
sentences_to_ints = tokenizer.texts_to_sequences(sentences)
sentences_to_ints = tf.keras.preprocessing.sequence.pad_sequences(sentences_to_ints, maxlen = max_seqlen)
labels_as_ints = np.array(labels)
dataset = tf.data.Dataset.from_tensor_slices((sentences_to_ints, labels_as_ints))

In [6]:
dataset = dataset.shuffle(10000)
test_size = len(sentences)//3
val_size = (len(sentences) - test_size)//10
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size + val_size)

batch_size = 64
train_dataset = train_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)
val_dataset = val_dataset.batch(batch_size)

In [9]:
for x, y in train_dataset.take(1):
    print(x.shape,y.shape)

(64, 64) (64,)


In [10]:
class SentimentAnalysisModel(tf.keras.Model):
    def __init__(self, max_seqlen, embedding_dim, vocab_sz, **kwargs):
        super(SentimentAnalysisModel, self).__init__(**kwargs)
        self.embedding = tf.keras.layers.Embedding(
            vocab_sz,
            embedding_dim
            )
        self.bilstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(max_seqlen)
        )
        self.dense = tf.keras.layers.Dense(64, activation = 'relu')
        self.out = tf.keras.layers.Dense(1, activation = 'sigmoid')
        
    def call(self, x):
        x = self.embedding(x)
        x = self.bilstm(x)
        x = self.dense(x)
        x = self.out(x)
        return x

In [11]:
EMBEDDING_DIM = 256
model = SentimentAnalysisModel(max_seqlen,EMBEDDING_DIM,vocab_size+1)
model.build(input_shape = (batch_size, max_seqlen))
model.summary()

Model: "sentiment_analysis_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  1349632   
_________________________________________________________________
bidirectional (Bidirectional multiple                  164352    
_________________________________________________________________
dense (Dense)                multiple                  8256      
_________________________________________________________________
dense_1 (Dense)              multiple                  65        
Total params: 1,522,305
Trainable params: 1,522,305
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.compile(loss = "binary_crossentropy", optimizer = 'adam', metrics = ['accuracy'])

In [14]:
data_dir = "./data"
logs_dir = "./logs"
best_model_file = os.path.join(data_dir, "best_model.h5")
checkpoint = tf.keras.callbacks.ModelCheckpoint(best_model_file, save_weights_only = True,
    save_best_only = True)
tensorboard = tf.keras.callbacks.TensorBoard(log_dir = logs_dir)
num_epochs = 10
history = model.fit(train_dataset, epochs = num_epochs, validation_data = val_dataset,
    callbacks = [checkpoint, tensorboard])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
best_model = SentimentAnalysisModel(max_seqlen,EMBEDDING_DIM,vocab_size+1)
best_model.build(input_shape=(batch_size, max_seqlen))
best_model.load_weights(best_model_file)
best_model.compile(
 loss="binary_crossentropy",
 optimizer="adam",
 metrics=["accuracy"]
)

In [16]:
test_loss, test_acc = best_model.evaluate(test_dataset)
print("test loss: {:.3f}, test accuracy: {:.3f}".format(
 test_loss, test_acc))

test loss: 0.017, test accuracy: 0.996


In [20]:
labels, predictions = [], []
idx2word[0] = "PAD"
is_first = True
print("LBL\tPRED\tSENT")
for test_batch in test_dataset:
    inputs_b, labels_b = test_batch
    pred_batch = best_model.predict(inputs_b)
    predictions.extend([(1 if p > 0.5 else 0) for p in pred_batch])
    labels.extend([l for l in labels_b])
    if is_first:
        for i in range(inputs_b.shape[0]-50):
            words = [idx2word[idx] for idx in inputs_b[i].numpy()]
            words = [w for w in words if w != "PAD"]
            sentences = " ".join(words)
            print("{:d}\t{:d}\t{:s}".format(labels[i], predictions[i], sentences))
        is_first = False

LBL	PRED	SENT
0	0	overall i was not impressed and would not go back
1	1	back to good bbq lighter fare reasonable pricing and tell the public they are back to the old ways
0	0	first of all it doesn't wear well
0	0	i will not return
0	0	so don't go there if you are looking for good food
0	0	not recommended
1	1	the fact is this film is a wonderful heartwarming tale about two people chasing their dreams
1	1	there still are good actors around
1	1	julian fellowes has triumphed again
0	0	no one at the table thought the food was above average or worth the wait that we had for it
0	0	they refuse to refund or replace
1	1	really really good rice all the time
0	0	but she is still a bad actress repeating her robotic face moves in each of her pictures
1	1	i received my headset in good time and was happy with it


In [21]:
print("accuracy score: {:.3f}".format(accuracy_score(labels, predictions)))

accuracy score: 0.998


In [22]:
print("confusion matrix")
print(confusion_matrix(labels, predictions))

confusion matrix
[[507   1]
 [  1 491]]
