In [10]:
import os, pathlib, shutil, random
# Dataset root plus the two split directories that the split cell reads from / writes to.
base_dir = pathlib.Path('aclImdb')
train_dir, val_dir = (base_dir / split_name for split_name in ('train', 'val'))

In [11]:
# Echo the path object to confirm where the training split is expected to live.
train_dir

WindowsPath('aclImdb/train')

In [12]:
# Carve a validation set out of the training data: for each class, move 20% of
# the files from <train_dir>/<class> into <val_dir>/<class>.
for category in ('neg', 'pos'):
    train_category_dir = train_dir / category
    val_category_dir = val_dir / category
    # Re-running this cell must not move a further 20% out of train, so skip
    # any class whose validation folder has already been populated.
    if val_category_dir.is_dir() and any(val_category_dir.iterdir()):
        continue
    os.makedirs(val_category_dir, exist_ok=True)
    # os.listdir() returns entries in arbitrary order; sort first so the
    # seeded shuffle picks the same files on every machine.
    files = sorted(os.listdir(train_category_dir))
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    # Slice from an explicit start index: files[-0:] would select *every* file
    # when num_val_samples rounds down to 0.
    val_files = files[len(files) - num_val_samples:]
    for fname in val_files:
        shutil.move(train_category_dir / fname, val_category_dir / fname)

In [14]:
from tensorflow import keras
batch_size = 32

# Build one batched (text, label) dataset per split directory, in the order
# train -> val -> test.
train_ds, val_ds, test_ds = (
    keras.utils.text_dataset_from_directory(
        directory=split_path,
        batch_size=batch_size
    )
    for split_path in ('aclImdb/train', 'aclImdb/val', 'aclImdb/test')
)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [17]:
# Look at the first batch only: shapes, dtypes and one raw sample with its label.
for inputs, targets in train_ds:
    first_batch_report = (
        f"input shape is {inputs.shape}",
        f"input dtype is {inputs.dtype}",
        f"target shape is {targets.shape}",
        f"target dtype is {targets.dtype}",
        f"input[0] is {inputs[0]}",
        f"target[0] is {targets[0]}",
    )
    print(*first_batch_report, sep="\n")
    break

input shape is (32,)
input dtype is <dtype: 'string'>
target shape is (32,)
target dtype is <dtype: 'int32'>
input[0] is b"Bela Lugosi as creepy insane scientist who uses orchids to woo brides in order to steal life essence for aged wife. The midget in this film is hilarious!! A lot of freaks, plus a lot of padding and no plot makes watching this film a nightmare. I loved how all the pieces fell together in the end in typical Hollywood fashion. The story never gets interesting, and you feel helpless as you watch.<br /><br />Usually I'd score bore flicks like this one low, but the midget added just enough creepiness and entertainent to gain a couple more points."
target[0] is 0


In [19]:
from keras.layers import TextVectorization
# Unigram bag-of-words vectorizer: the vocabulary is capped at 20000 tokens and
# each review is encoded as a single multi-hot vector of that length.
text_vectorization = TextVectorization(
    max_tokens=20000,
    output_mode='multi_hot'
)

In [20]:
def _drop_label(text, label):
    """Keep only the raw text of a (text, label) batch."""
    return text


# The vocabulary is learned from the training texts only (labels are not needed).
text_only_train_ds = train_ds.map(_drop_label)
text_vectorization.adapt(text_only_train_ds)

In [22]:
def _to_multi_hot_unigrams(text, label):
    """Vectorize the text of a batch and pass its labels through unchanged."""
    return text_vectorization(text), label


# Vectorize the text of every split (train, val, test) with the adapted layer.
binary_1gram_train_ds, binary_1gram_val_ds, binary_1gram_test_ds = (
    split_ds.map(_to_multi_hot_unigrams, num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
)

In [23]:
# Sanity-check the first vectorized batch: shapes, dtypes and one encoded sample.
for inputs, targets in binary_1gram_train_ds:
    for caption, value in (
        ("inputs.shape:", inputs.shape),
        ("inputs.dtype:", inputs.dtype),
        ("targets.shape:", targets.shape),
        ("targets.dtype:", targets.dtype),
        ("inputs[0]:", inputs[0]),
        ("targets[0]:", targets[0]),
    ):
        print(caption, value)
    break

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0]: tf.Tensor(1, shape=(), dtype=int32)


# Model 1: Dense model - unigram

In [25]:
from tensorflow import keras
from keras import layers

def get_model(max_tokens=20000, hidden_dim=16):
    """Build and compile a small dense binary classifier.

    Args:
        max_tokens: Width of the input vector (one feature per vocabulary entry).
        hidden_dim: Number of units in the single hidden Dense layer.

    Returns:
        A compiled Keras model: Dense(relu) -> Dropout(0.5) -> Dense(1, sigmoid),
        trained with binary cross-entropy, RMSprop and an accuracy metric.
    """
    features = layers.Input(shape=(max_tokens,))
    hidden = layers.Dense(hidden_dim, activation='relu')(features)
    hidden = layers.Dropout(0.5)(hidden)
    probability = layers.Dense(1, activation='sigmoid')(hidden)

    classifier = keras.models.Model(features, probability)
    classifier.compile(
        optimizer='rmsprop',
        loss=keras.losses.BinaryCrossentropy(),
        metrics=['accuracy']
    )
    return classifier

In [27]:
# Build the unigram model with the default sizes (20000 inputs, 16 hidden units)
# and print its layer / parameter summary.
model_1 = get_model()
model_1.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_2 (Dense)             (None, 16)                320016    
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the unigram model. The checkpoint directory must match the path that
# is passed to keras.models.load_model() when the model is reloaded
# ('imdb text classification models/...'); with save_best_only=True only the
# best epoch according to the monitored quantity is kept on disk.
callbacks = [keras.callbacks.ModelCheckpoint('imdb text classification models/model_1_dense', save_best_only=True)]
model_1.fit(
    # cache() keeps the vectorized batches after the first pass so the text is
    # not re-vectorized on every epoch.
    binary_1gram_train_ds.cache(),
    validation_data=binary_1gram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks
)

In [29]:
# Validation [loss, accuracy] of model_1 with the weights it holds right after
# fit() (the last epoch, not necessarily the checkpointed best one).
model_1.evaluate(binary_1gram_val_ds)



[0.42284566164016724, 0.876800000667572]

In [34]:
# Replace the in-memory model with the one saved on disk and keep its
# validation accuracy (second entry of evaluate()'s [loss, accuracy] result).
model_1 = keras.models.load_model('imdb text classification models/model_1_dense')
model_1_val_metrics = model_1.evaluate(binary_1gram_val_ds)
accuracy_model_1 = model_1_val_metrics[1]
accuracy_model_1



0.879800021648407

# Model 2: Dense model - bigram

In [None]:
# Bigram variant: re-create the vectorizer with ngrams=2, re-vectorize every
# split, then train a second dense model on the result.
text_vectorization = TextVectorization(
    output_mode='multi_hot',
    max_tokens=20000,
    ngrams=2
)
text_vectorization.adapt(text_only_train_ds)


def _to_multi_hot_bigrams(text, label):
    """Vectorize the text of a batch and pass its labels through unchanged."""
    return text_vectorization(text), label


binary_2gram_train_ds, binary_2gram_val_ds, binary_2gram_test_ds = (
    split_ds.map(_to_multi_hot_bigrams, num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
)

model_2 = get_model()
callbacks = [keras.callbacks.ModelCheckpoint('imdb text classification models/model_2_dense_bigram', save_best_only=True)]

model_2.fit(
    binary_2gram_train_ds.cache(),
    epochs=10,
    validation_data=binary_2gram_val_ds.cache(),
    callbacks=callbacks
)

In [32]:
# Validation [loss, accuracy] of model_2 with the weights it holds right after
# fit() (the last epoch, not necessarily the checkpointed best one).
model_2.evaluate(binary_2gram_val_ds)



[0.4033440053462982, 0.8888000249862671]

In [35]:
# Replace the in-memory model with the checkpoint saved on disk and keep its
# validation accuracy (second entry of evaluate()'s [loss, accuracy] result).
model_2 = keras.models.load_model('imdb text classification models/model_2_dense_bigram')
model_2_val_metrics = model_2.evaluate(binary_2gram_val_ds)
accuracy_model_2 = model_2_val_metrics[1]
accuracy_model_2



0.8962000012397766

# Model 3: Dense model - bigram - output_mode = tf_idf (TF-IDF)

In [36]:
# Bigram vectorizer with TF-IDF weighting. This rebinds `text_vectorization`,
# so cells below that use that name get this layer, not the multi-hot one.
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode='tf_idf'
)
# Learn the vocabulary (and the statistics the TF-IDF mode needs) from the training texts.
text_vectorization.adapt(text_only_train_ds)

In [37]:
def _to_tfidf_bigrams(text, label):
    """Vectorize the text of a batch and pass its labels through unchanged."""
    return text_vectorization(text), label


# Vectorize the text of every split (train, val, test) with the adapted layer.
tfidf_2gram_train_ds, tfidf_2gram_val_ds, tfidf_2gram_test_ds = (
    split_ds.map(_to_tfidf_bigrams, num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
)

In [None]:
# Train a fresh dense model on the TF-IDF bigram features; with
# save_best_only=True only the best epoch is kept in the checkpoint directory.
model_3 = get_model()
callbacks = [keras.callbacks.ModelCheckpoint('imdb text classification models/model_3_dense_tfidf', save_best_only=True)]

model_3.fit(
    # cache() keeps the vectorized batches after the first pass over the data.
    tfidf_2gram_train_ds.cache(),
    validation_data=tfidf_2gram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks
)

In [40]:
# Validation [loss, accuracy] of model_3 with the weights it holds right after
# fit() (the last epoch, not necessarily the checkpointed best one).
model_3.evaluate(tfidf_2gram_val_ds)



[0.3734765946865082, 0.8880000114440918]

In [42]:
# Replace the in-memory model with the checkpoint saved on disk and keep its
# validation accuracy (second entry of evaluate()'s [loss, accuracy] result).
model_3 = keras.models.load_model('imdb text classification models/model_3_dense_tfidf')
model_3_val_metrics = model_3.evaluate(tfidf_2gram_val_ds)
accuracy_model_3 = model_3_val_metrics[1]
accuracy_model_3



0.8885999917984009

# Exporting a model that processes raw strings

In [43]:
import tensorflow as tf
# Chain the vectorizer and the classifier into one model that accepts raw strings.
inputs = layers.Input(shape=(1,), dtype=tf.string)  # one string per sample
# `text_vectorization` is whatever layer the name refers to when this cell runs;
# in top-to-bottom order that is the TF-IDF bigram vectorizer whose features
# model_3 was trained on. Re-running this cell after the name has been rebound
# to a different vectorizer would feed model_3 the wrong kind of input.
processed_input = text_vectorization(inputs)
outputs = model_3(processed_input)
inference_model = keras.models.Model(inputs, outputs)

In [97]:
raw_text = "That was an excellent movie, I loved it."
# inference_model takes one string per sample (Input shape=(1,)), so a single
# review is wrapped as a 1x1 batch.
processed_raw_text = tf.convert_to_tensor([[raw_text]])
predictions = inference_model(processed_raw_text)
# predictions has shape (1, 1). Index all the way down to the scalar before
# calling float(): converting a 1-element array of ndim > 0 to a Python scalar
# is deprecated in recent NumPy versions.
print(f"[{float(predictions[0, 0] * 100):.2f}] percent positive")

[89.67] percent positive


In [96]:
# def prediction():
#     global text_vectorization, inference_model
#     raw_text = input_text.get()
#     processed_text = text_vectorization(raw_text)
#     prediction = inference_model(processed_text)
#     pred_display.config(text=f"[{prediction[0]} Percent Positive]", font=('Aerial', 12))
#
#
#
# from tkinter import *
# window = Tk('sentiment')
# window.title('Sentiment')
# window.minsize(width=500, height=414)
# window.config(padx=20, pady=20)
#
#
# label_1 = Label(text="Insert text:", font=('Aerial', 12))
# label_1.grid(row=1, column=0, columnspan=2)
# label_1.config(padx=0, pady=0)
#
# input_text = Entry()
# input_text.grid(row=2, column=0, columnspan=2)
# input_text.config(width=60)
#
# pred_display = Label()
# pred_display.grid(row=3, column=0, columnspan=2)
# pred_display.config(padx=2, pady=2, width=60)
#
# button = Button(text='Check', command=prediction)
# button.grid(row=4, column=0, columnspan=2)
# button.config(width=50, pady=0, padx=0)
#
# window.mainloop()

# Fundamentals of sequence models

In [98]:
# Sequence-model preprocessing: instead of one bag-of-words vector per review,
# produce a sequence of integer token indices of fixed length.
max_length = 600    # output_sequence_length: every review is padded/truncated to this many tokens
max_tokens = 20000  # vocabulary size cap; also used as the Embedding input_dim further down
# Rebinds `text_vectorization` once more; the int datasets below are built from this layer.
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode='int',
    output_sequence_length=max_length
)

text_vectorization.adapt(text_only_train_ds)

In [99]:
def _to_int_sequence(text, label):
    """Vectorize the text of a batch and pass its labels through unchanged."""
    return text_vectorization(text), label


# Vectorize the text of every split (train, val, test) with the adapted layer.
int_train_ds, int_val_ds, int_test_ds = (
    split_ds.map(_to_int_sequence, num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
)

In [None]:
# inputs = layers.Input(shape=(None,), dtype='int64')
# embedded = tf.one_hot(inputs, depth=max_tokens)
# x = layers.Bidirectional(layers.LSTM(32))(embedded)
# x = layers.Dropout(0.5)(x)
# outputs = layers.Dense(1, activation='sigmoid')(x)
# model_4 = keras.Model(inputs, outputs)
#
# callbacks = [tf.keras.callbacks.ModelCheckpoint('imdb text classification models/model_4_first_sequence_model', save_best_only=True)]
#
# model_4.compile(
#     loss='binary_crossentropy',
#     optimizer='adam',
#     metrics=['accuracy']
# )
#
# model_4.fit(
#     int_train_ds,
#     validation_data=int_val_ds,
#     epochs=10,
#     callbacks=callbacks
# )

In [None]:
# Sequence model: learned 256-d word embeddings fed to a bidirectional LSTM,
# followed by dropout and a single-unit classification head.
inputs = layers.Input(shape=(None,), dtype='int64')
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
# The head must emit a probability in [0, 1] for binary_crossentropy (which is
# used here with its default from_logits=False). The previous 'relu' head is
# unbounded above and can get stuck at exactly 0, so it is replaced by
# 'sigmoid', matching the dense models built by get_model().
outputs = layers.Dense(1, activation='sigmoid')(x)
model_4 = keras.models.Model(inputs, outputs)

model_4.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# save_best_only=True keeps only the best epoch on disk, consistent with the
# checkpoints of the other models; without it every epoch overwrites the last.
callbacks = [tf.keras.callbacks.ModelCheckpoint('imdb text classification models/model_4_embedding_simple', save_best_only=True)]

model_4.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=10,
    callbacks=callbacks
)

NameError: name 'max_length' is not defined