Importing libraries

In [1]:
import keras
import tensorflow as tf
import numpy as np
import sklearn as sk
from keras import layers as L
import os
import string
import re

In [2]:
batch_size = 32

Create raw training, validation and testing datasets

In [3]:
raw_train = keras.utils.text_dataset_from_directory(
    "datasets/aclImdb/train",
    "inferred",
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=42,
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [4]:
raw_valid = keras.utils.text_dataset_from_directory(
    "datasets/aclImdb/train",
    "inferred",
    label_mode="int",
    batch_size=batch_size,
    validation_split=0.2,
    subset="validation",
    seed=42
)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [5]:
raw_test = keras.utils.text_dataset_from_directory(
    "datasets/aclImdb/train",
    batch_size=batch_size,
    seed=42
)

Found 25000 files belonging to 2 classes.


Checking it out

In [6]:
for item, label in raw_train.take(1):
    for i in range(2):
        print(item[i])
        print(label[i])

tf.Tensor(b'"Pandemonium" is a horror movie spoof that comes off more stupid than funny. Believe me when I tell you, I love comedies. Especially comedy spoofs. "Airplane", "The Naked Gun" trilogy, "Blazing Saddles", "High Anxiety", and "Spaceballs" are some of my favorite comedies that spoof a particular genre. "Pandemonium" is not up there with those films. Most of the scenes in this movie had me sitting there in stunned silence because the movie wasn\'t all that funny. There are a few laughs in the film, but when you watch a comedy, you expect to laugh a lot more than a few times and that\'s all this film has going for it. Geez, "Scream" had more laughs than this film and that was more of a horror film. How bizarre is that?<br /><br />*1/2 (out of four)', shape=(), dtype=string)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(b"David Mamet is a very interesting and a very un-equal director. His first movie 'House of Games' was the one I liked best, and it set a series of films with cha

To remove `<br>` tags (HTML), custom standartization should be created

In [7]:
def custom_std(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    return tf.strings.regex_replace(stripped_html, f"[{re.escape(string.punctuation)}]", "")

In [8]:
max_tokens = 20_000
embDimensions = 128
length = 500

In [9]:
vecL = L.TextVectorization(
    standardize=custom_std,
    split="whitespace",
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=length
)

text_only = raw_train.map(lambda x, y: x)
vecL.adapt(text_only)

Vectorizing the data

In [10]:
def text2vec(text, label):
    text = tf.expand_dims(text, -1)
    return vecL(text), label

In [11]:
train = raw_train.map(text2vec)
valid = raw_valid.map(text2vec)
test = raw_test.map(text2vec)

In [12]:
#Better GPU:
train = train.cache().prefetch(buffer_size=10)
valid = valid.cache().prefetch(buffer_size=10)
test = test.cache().prefetch(buffer_size=10)

Final model

In [13]:
inputs = keras.Input(dtype="int64", shape=(None, ))

'''
model = keras.Model(inputs, y)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
'''

x = L.Embedding(max_tokens, embDimensions)(inputs)
x = L.Dropout(0.5)(x)
x = L.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = L.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = L.GlobalMaxPooling1D()(x)
x = L.Dense(128, activation="relu")(x)
x = L.Dropout(0.5)(x)

predictions = L.Dense(1, activation="sigmoid", name="predictions")(x)

model = keras.Model(inputs, predictions)

In [14]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

Training and validation

In [15]:
epochs = 3

In [16]:
model.fit(train, validation_data=valid, epochs=epochs)

Epoch 1/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 77ms/step - accuracy: 0.5854 - loss: 0.6263 - val_accuracy: 0.8698 - val_loss: 0.3212
Epoch 2/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 61ms/step - accuracy: 0.8882 - loss: 0.2821 - val_accuracy: 0.8704 - val_loss: 0.3306
Epoch 3/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 60ms/step - accuracy: 0.9483 - loss: 0.1460 - val_accuracy: 0.8742 - val_loss: 0.4314


<keras.src.callbacks.history.History at 0x26617eee720>

In [17]:
model.evaluate(test)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 15ms/step - accuracy: 0.9820 - loss: 0.0575


[0.1163443848490715, 0.9656400084495544]

In [50]:
def owntext(text):
    text = tf.expand_dims(text, -1)
    return vecL(text)

In [80]:
str = owntext("Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner's character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he's better than anyone else around him and shows no signs of a cluttered closet. His only obstacle appears to be winning over Costner. Finally when we are well past the half way point of this stinker, Costner tells us all about Kutcher's ghosts. We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing. No magic here, it was all I could do to keep from turning it off an hour in.")
str
round(model.predict(str)[0][0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step


0

In [82]:
def predict_sentiment(model, text):
    text = [text]
    text = tf.expand_dims(text, -1)
    text = vecL(text)
    prediction = model.predict(text)
    return prediction[0][0]

text = "fuck this movie, will never watch it again 2/10"
sentiment_score = predict_sentiment(model, text)
print(f'Sentiment score for the text: {sentiment_score}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Sentiment score for the text: 0.2615630030632019


Adding LSTM (Bidirectional)

In [85]:
inputs2 = keras.Input(shape=(None,), dtype="int32")

x2 = L.Embedding(max_tokens, 128)(inputs2)
x2 = L.Bidirectional(L.LSTM(64, return_sequences=True))(x2)
x2 = L.Bidirectional(L.LSTM(64))(x2)
outputs2 = L.Dense(1, activation="sigmoid")(x2)

model2 = keras.Model(inputs2, outputs2)


In [86]:
model2.summary()

In [88]:
model2.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [91]:
model2.fit(train, validation_data=valid, batch_size=32, epochs=2)

Epoch 1/2
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m253s[0m 389ms/step - accuracy: 0.6817 - loss: 0.5816 - val_accuracy: 0.5326 - val_loss: 0.8761
Epoch 2/2
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m413s[0m 661ms/step - accuracy: 0.8401 - loss: 0.3843 - val_accuracy: 0.8652 - val_loss: 0.3507


<keras.src.callbacks.history.History at 0x26617f740b0>

In [93]:
predict_sentiment(model2, "so bad movie I have never watched something like this, creator is a dumb idiot")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step


0.3128151