In [1]:
# Silence TensorFlow's native log output: level '3' hides INFO, WARNING and
# ERROR messages. The variable is set in this cell, before the cell that
# imports tensorflow.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [2]:
import tensorflow as tf
import random
import shutil

In [3]:
# Fetch the IMDB sentiment archive. With cache_dir="." the archive is stored
# under ./datasets, and extract=True unpacks it next to the download.
URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
file_path = tf.keras.utils.get_file(cache_dir=".", extract=True, origin=URL)
print(file_path)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
./datasets/aclImdb_v1.tar.gz


In [4]:
# Inspect the top-level contents of the extracted archive.
!ls ./datasets/aclImdb/

imdbEr.txt  imdb.vocab	README	test  train


In [5]:
# Create the validation folders, one per sentiment class; exist_ok makes the
# cell safe to re-run.
for val_dir in ("./datasets/aclImdb/val/pos", "./datasets/aclImdb/val/neg"):
    os.makedirs(val_dir, exist_ok=True)

In [6]:
# Carve a validation set out of the official test split: 7,500 of the 12,500
# reviews of each class are moved from test/ to val/, leaving 5,000 per class
# for testing.
#
# Reproducibility and re-run safety:
#   * file names are sorted and shuffled with a dedicated, seeded RNG, so a
#     fresh run always selects the same files (os.listdir order is arbitrary);
#   * a class whose val/ folder is already populated is skipped, so running
#     this cell again neither trips the size assertion nor moves more files.
split_rng = random.Random(42)

neg_reviews = sorted(os.listdir("./datasets/aclImdb/test/neg"))
pos_reviews = sorted(os.listdir("./datasets/aclImdb/test/pos"))
split_rng.shuffle(neg_reviews)
split_rng.shuffle(pos_reviews)

for test_dir, val_dir, file_names in (
    ("./datasets/aclImdb/test/neg", "./datasets/aclImdb/val/neg", neg_reviews),
    ("./datasets/aclImdb/test/pos", "./datasets/aclImdb/val/pos", pos_reviews),
):
    if os.listdir(val_dir):
        # The split was already performed for this class on an earlier run.
        continue

    # Only an untouched test folder may be split.
    assert len(file_names) == 12500, (
        f"expected 12500 files in {test_dir}, found {len(file_names)}"
    )
    for file_name in file_names[:7500]:
        shutil.move(os.path.join(test_dir, file_name), val_dir)

In [7]:
# List the training folder; the pos/ and neg/ sub-folders are the ones read by
# the dataset builders below.
!ls ./datasets/aclImdb/train

labeledBow.feat  pos	unsupBow.feat  urls_pos.txt
neg		 unsup	urls_neg.txt   urls_unsup.txt


In [8]:
def create_dataset_1(base_dir):
    """Build an in-memory (review, sentiment) dataset from a split directory.

    Approach 1: read all reviews into a list and use from_tensor_slices.

    Args:
        base_dir: directory name like "./datasets/aclImdb/train"; it must
            contain a "pos" and a "neg" sub-folder with one review per file.

    Returns:
        A tf.data.Dataset of (review, sentiment) pairs, where review is the
        complete text of one file and sentiment is 1.0 for "pos" and 0.0 for
        "neg". All "pos" files come first, then all "neg" files.
    """
    reviews = []
    sentiments = []
    for sentiment in ["pos", "neg"]:
        directory = os.path.join(base_dir, sentiment)
        label = 1.0 if sentiment == "pos" else 0.0
        for file_name in os.listdir(directory):
            # read() keeps every review as ONE string. readlines() would append
            # a list of lines per file, which from_tensor_slices turns into an
            # extra dimension (or rejects when files differ in line count).
            with open(os.path.join(directory, file_name), "r", encoding="utf-8") as file:
                reviews.append(file.read())
            sentiments.append(label)

    return tf.data.Dataset.from_tensor_slices((reviews, sentiments))

In [9]:
def create_dataset_2(base_dir):
    """Stream labelled review lines from a split directory.

    base_dir: directory name like "./datasets/aclImdb/train"; its "pos" and
    "neg" sub-folders are listed for review files.

    Returns a dataset of (line, label) pairs: every line read from the "pos"
    files with label 1.0, followed by every line read from the "neg" files
    with label 0.0.
    """
    def labelled_lines(sentiment, label):
        # One TextLineDataset over all files of a class, reading 4 files in
        # parallel, with the constant class label attached to each line.
        folder = os.path.join(base_dir, sentiment)
        paths = [os.path.join(base_dir, sentiment, name) for name in os.listdir(folder)]
        lines = tf.data.TextLineDataset(paths, num_parallel_reads=4)
        return lines.map(lambda review: (review, label))

    positives = labelled_lines("pos", 1.0)
    negatives = labelled_lines("neg", 0.0)
    return positives.concatenate(negatives)

In [10]:
# Peek at elements 12,499 and 12,500 of the unshuffled training dataset: the
# point where the label switches from 1.0 to 0.0.
ds = create_dataset_2("./datasets/aclImdb/train")
for review, sentiment in ds.skip(12499).take(2):
    print(review, sentiment, "*" * 50, sep="\n")

tf.Tensor(b'The story centers around Barry McKenzie who must go to England if he wishes to claim his inheritance. Being about the grossest Aussie shearer ever to set foot outside this great Nation of ours there is something of a culture clash and much fun and games ensue. The songs of Barry McKenzie(Barry Crocker) are highlights.', shape=(), dtype=string)
tf.Tensor(1.0, shape=(), dtype=float32)
**************************************************
tf.Tensor(b'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary

In [11]:
BATCH_SIZE = 512

# create_dataset_2 yields every positive review before the first negative one.
# A shuffle buffer smaller than the dataset therefore produces batches that are
# mostly positive early in an epoch and mostly negative at the end. Holding the
# whole 25,000-review training set in the buffer gives a uniform shuffle.
SHUFFLE_BUFFER_SIZE = 25_000

train_ds = (
    create_dataset_2("./datasets/aclImdb/train")
    .shuffle(buffer_size=SHUFFLE_BUFFER_SIZE, seed=42)
    .batch(BATCH_SIZE)
    .prefetch(1)
)

# Validation and test data are only evaluated, so they are batched in file
# order without shuffling.
val_ds = (
    create_dataset_2("./datasets/aclImdb/val")
    .batch(BATCH_SIZE)
    .prefetch(1)
)
test_ds = (
    create_dataset_2("./datasets/aclImdb/test")
    .batch(BATCH_SIZE)
    .prefetch(1)
)

In [12]:
# Bag-of-words encoder: each review becomes a 0/1 vector with one slot per
# vocabulary entry, limited to VOCAB_SIZE entries.
VOCAB_SIZE = 10_000
multi_hot_layer = tf.keras.layers.TextVectorization(
    output_mode="multi_hot", max_tokens=VOCAB_SIZE
)

In [13]:
# Learn the vocabulary from the training reviews only (labels are dropped).
multi_hot_layer.adapt(
    train_ds.map(lambda text, _label: text)
)

In [14]:
# First 20 vocabulary entries of the adapted layer.
print(multi_hot_layer.get_vocabulary()[:20])

['[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i', 'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but', 'film']


In [15]:
# Try the encoder on two hand-written sentences; the second one ends with a
# made-up word.
multi_hot_layer(["The movie was great and a the the the", "Terrible sjqfljqfsdj"])

<tf.Tensor: shape=(2, 10000), dtype=float32, numpy=
array([[0., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [16]:
def get_model(conversion_layer):
    """Build a binary sentiment classifier on top of a text-encoding layer.

    Args:
        conversion_layer: layer placed first in the model; it receives the raw
            review strings and must output one fixed-size vector per review
            (e.g. an adapted TextVectorization layer in "multi_hot" or
            "tf_idf" mode).

    Returns:
        An uncompiled tf.keras.Sequential model: conversion_layer, two
        16-unit ReLU Dense layers with He-uniform initialisation, and a
        single sigmoid output unit.
    """
    model = tf.keras.Sequential()
    model.add(conversion_layer)

    # No input_shape is passed to the Dense layers: that argument has no
    # effect on a layer that is not the first one in a Sequential model, and
    # the kernel size is inferred from conversion_layer's output anyway.
    for _ in range(2):
        model.add(tf.keras.layers.Dense(
            units=16,
            activation="relu",
            kernel_initializer="he_uniform",
        ))

    model.add(tf.keras.layers.Dense(
        units=1,
        activation="sigmoid",
    ))

    return model

In [17]:
# Classifier fed by the multi-hot bag-of-words encoding.
model = get_model(multi_hot_layer)

In [18]:
# Smoke test on a single batch: a batch of raw strings goes in, one prediction
# per review comes out.
x, y = next(iter(train_ds.take(1)))
print(x.shape)
print(model(x).shape)

(512,)
(512, 1)


In [19]:
# Layer-by-layer overview with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 10000)            0         
 torization)                                                     
                                                                 
 dense (Dense)               (None, 16)                160016    
                                                                 
 dense_1 (Dense)             (None, 16)                272       
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160,305
Trainable params: 160,305
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Stop training once validation accuracy has not improved by at least 0.001
# for 3 consecutive epochs, and roll back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    patience=3,
    min_delta=0.001,
    restore_best_weights=True,
)

In [21]:
# Binary classification setup: cross-entropy loss, accuracy as reported metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

In [22]:
# epochs=100 is only an upper bound; early stopping ends training sooner.
history = model.fit(
    train_ds,
    epochs=100,
    callbacks=[early_stopping],
    validation_data=val_ds,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


In [23]:
# Loss and accuracy on the training data (not a held-out set).
model.evaluate(train_ds)



[0.2502068877220154, 0.9226800203323364]

In [24]:
# Same vocabulary size as before, but each slot now holds a TF-IDF weight
# instead of a 0/1 flag. The layer is adapted on the training reviews only.
VOCAB_SIZE = 10_000
tf_idf_layer = tf.keras.layers.TextVectorization(
    output_mode="tf_idf", max_tokens=VOCAB_SIZE
)
tf_idf_layer.adapt(
    train_ds.map(lambda text, _label: text)
)

In [25]:
# Same architecture, now fed by the TF-IDF encoding.
model2 = get_model(tf_idf_layer)

In [26]:
# Train the TF-IDF model with the same loss, optimizer and early-stopping
# callback as the multi-hot model.
model2.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

history = model2.fit(
    train_ds,
    epochs=100,
    callbacks=[early_stopping],
    validation_data=val_ds,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


In [27]:
# TF-IDF encoding of the same two sample sentences used for the multi-hot demo.
tf_idf_layer(["The movie was great and a the the the", "Terrible sjqfljqfsdj"])

<tf.Tensor: shape=(2, 10000), dtype=float32, numpy=
array([[0.       , 2.7894142, 0.7110562, ..., 0.       , 0.       ,
        0.       ],
       [5.562553 , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]], dtype=float32)>

In [28]:
# Loss and accuracy of the TF-IDF model on the training data (not held-out).
model2.evaluate(train_ds)



[0.1636025756597519, 0.9486799836158752]

In [29]:
# Integer encoder: each review becomes a sequence of vocabulary indices, to be
# consumed by an embedding layer. Adapted on the training reviews only.
VOCAB_SIZE = 10_000
int_layer = tf.keras.layers.TextVectorization(
    output_mode="int", max_tokens=VOCAB_SIZE
)
int_layer.adapt(
    train_ds.map(lambda text, _label: text)
)

In [30]:
# Token ids for two sentences of different length; the shorter one is padded
# with 0 up to the length of the longer one.
int_layer(["It was a terrible movie", "Super!"])

<tf.Tensor: shape=(2, 5), dtype=int64, numpy=
array([[   9,   14,    4,  384,   18],
       [1566,    0,    0,    0,    0]])>

In [32]:
# In "int" mode index 0 is the padding token '' and index 1 is '[UNK]'.
int_layer.get_vocabulary()[:10]

['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it']

In [31]:
# Tiny 3-dimensional embedding, only to inspect shapes: every token id
# (including the padding id 0) is mapped to its own vector.
embed_layer = tf.keras.layers.Embedding(
    output_dim=3,
    input_dim=len(int_layer.get_vocabulary()),
)
embed_layer(int_layer(["It was a terrible movie", "Super!"]))

<tf.Tensor: shape=(2, 5, 3), dtype=float32, numpy=
array([[[ 0.006118  ,  0.01008576, -0.02373931],
        [ 0.03846511, -0.02238265,  0.0028962 ],
        [ 0.04528674,  0.03774228, -0.02687869],
        [-0.01549911,  0.03653402, -0.03963671],
        [ 0.02001765, -0.04948144, -0.02445948]],

       [[-0.0430169 , -0.00814094, -0.01168038],
        [ 0.01671696,  0.04243027, -0.04490953],
        [ 0.01671696,  0.04243027, -0.04490953],
        [ 0.01671696,  0.04243027, -0.04490953],
        [ 0.01671696,  0.04243027, -0.04490953]]], dtype=float32)>

In [43]:
# Scaling factor: square root of the number of non-zero (non-padding) token
# ids per sentence, kept as a column of shape (batch, 1).
tf.math.sqrt(
    tf.math.count_nonzero(
        int_layer(["It was a terrible movie", "Super!"]),
        dtype=tf.float32,
        keepdims=True,
        axis=-1,
    )
)

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[2.2360678],
       [1.       ]], dtype=float32)>

In [51]:
# Multiply by the multiplier: a mask that is 1.0 for real tokens and 0.0 for
# the padding id 0, expanded with a trailing axis so it broadcasts over the
# embedding dimension and zeroes out the padding positions.
tf.expand_dims(
    tf.where(
        int_layer(["It was a terrible movie", "Super!"]) != 0,
        1.0,
        0.0,
    ),
    -1,
) * embed_layer(int_layer(["It was a terrible movie", "Super!"]))

<tf.Tensor: shape=(2, 5, 3), dtype=float32, numpy=
array([[[ 0.006118  ,  0.01008576, -0.02373931],
        [ 0.03846511, -0.02238265,  0.0028962 ],
        [ 0.04528674,  0.03774228, -0.02687869],
        [-0.01549911,  0.03653402, -0.03963671],
        [ 0.02001765, -0.04948144, -0.02445948]],

       [[-0.0430169 , -0.00814094, -0.01168038],
        [ 0.        ,  0.        , -0.        ],
        [ 0.        ,  0.        , -0.        ],
        [ 0.        ,  0.        , -0.        ],
        [ 0.        ,  0.        , -0.        ]]], dtype=float32)>

In [55]:
# Manual check: add up the first embedding component of the five tokens of the
# first sentence, to compare with the reduce_sum result.
0.006118 + 0.03846511 + 0.04528674 + (-0.01549911) +  0.02001765

0.09438839

In [54]:
# Sum the masked embeddings over the sequence axis: one vector per sentence,
# with padding positions contributing nothing.
tf.reduce_sum(
    tf.expand_dims(
        tf.where(
            int_layer(["It was a terrible movie", "Super!"]) != 0,
            1.0,
            0.0,
        ),
        -1,
    ) * embed_layer(int_layer(["It was a terrible movie", "Super!"])),
    axis=1,
)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[ 0.09438838,  0.01249797, -0.11181799],
       [-0.0430169 , -0.00814094, -0.01168038]], dtype=float32)>

In [61]:
class MeanEmbeddingLayer(tf.keras.layers.Layer):
    """Embed token ids and pool them into a single vector per sequence.

    Token id 0 is treated as padding and excluded. The embeddings of the
    remaining tokens are summed and divided by the square root of the number
    of non-padding tokens. A sequence that contains only padding yields a
    zero vector.

    Args:
        input_dim: number of distinct token ids (size of the vocabulary).
        output_dim: dimensionality of the embedding vectors.
        **kwargs: forwarded to tf.keras.layers.Layer.
    """

    def __init__(self, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.embedding_layer = tf.keras.layers.Embedding(
            input_dim=self.input_dim,
            output_dim=self.output_dim)

    def call(self, ints):
        """Pool the embeddings of `ints`, shaped (batch_size, max_sequence_length).

        Returns a float tensor of shape (batch_size, output_dim).
        """
        # (batch, max_sequence_length, 1): 1.0 for real tokens, 0.0 for padding
        multiplier = tf.expand_dims(tf.where(ints != 0, 1.0, 0.0), -1)

        # (batch, 1): sqrt of the number of non-padding tokens per sequence
        scaling_factor = tf.math.sqrt(tf.math.count_nonzero(
            ints,
            axis=-1,
            keepdims=True,
            dtype=tf.float32)
        )
        # An all-padding sequence (e.g. an empty review) has a factor of 0 and
        # a zero sum; dividing them would give NaN. Every non-empty sequence
        # has a factor >= 1, so clamping at 1 changes nothing for those.
        scaling_factor = tf.maximum(scaling_factor, 1.0)

        # (batch, max_seq_length, self.output_dim)
        embeddings = self.embedding_layer(ints)

        # (batch, self.output_dim)
        unscaled_sum = tf.reduce_sum(multiplier * embeddings, axis=1)

        return unscaled_sum / scaling_factor

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        base_config = super().get_config()
        return {**base_config,
                "input_dim": self.input_dim,
                "output_dim": self.output_dim}

In [62]:
# Small instance (3-dimensional embeddings) to try the custom layer by hand.
mean_embed_layer = MeanEmbeddingLayer(
    output_dim=3,
    input_dim=len(int_layer.get_vocabulary()),
)

In [63]:
# One pooled 3-dimensional vector per sentence, regardless of sentence length.
mean_embed_layer(int_layer(["It was a terrible movie", "Super!"]))

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[ 0.00643829, -0.00560611, -0.01583122],
       [-0.00760835, -0.02578654, -0.02047758]], dtype=float32)>

In [69]:
def get_model_with_embedding(vectorization_layer, output_dim):
    """Assemble a sentiment classifier that pools learned token embeddings.

    vectorization_layer: first layer of the model; it must expose
        get_vocabulary(), whose length sets the embedding's input_dim.
    output_dim: size of the embedding vectors produced by MeanEmbeddingLayer.

    Returns an uncompiled tf.keras.Sequential model: vectorization_layer,
    a MeanEmbeddingLayer, two 16-unit ReLU Dense layers with He-uniform
    initialisation, and a single sigmoid output unit.
    """
    pooled_embedding = MeanEmbeddingLayer(
        output_dim=output_dim,
        input_dim=len(vectorization_layer.get_vocabulary()),
    )

    hidden_layers = [
        tf.keras.layers.Dense(
            units=16,
            activation="relu",
            kernel_initializer="he_uniform",
        )
        for _ in range(2)
    ]

    output_layer = tf.keras.layers.Dense(units=1, activation="sigmoid")

    return tf.keras.Sequential(
        [vectorization_layer, pooled_embedding, *hidden_layers, output_layer]
    )

In [73]:
# Embedding-based classifier on the integer encoding, with 16-dimensional
# embeddings.
model3 = get_model_with_embedding(int_layer, output_dim=16)

In [74]:
# Smoke test on a single batch: a batch of raw strings goes in, one prediction
# per review comes out.
X, y = next(iter(train_ds.take(1)))
print(X.shape)
print(model3(X).shape)

(512,)
(512, 1)


In [75]:
# Layer-by-layer overview with output shapes and parameter counts.
model3.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_2 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 mean_embedding_layer_4 (Mea  (None, 16)               160000    
 nEmbeddingLayer)                                                
                                                                 
 dense_9 (Dense)             (None, 16)                272       
                                                                 
 dense_10 (Dense)            (None, 16)                272       
                                                                 
 dense_11 (Dense)            (None, 1)                 17        
                                                                 
Total params: 160,561
Trainable params: 160,561
Non-tr

In [76]:
# Train the embedding model with the same loss, optimizer and early-stopping
# callback as the two bag-of-words models.
model3.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

history = model3.fit(
    train_ds,
    epochs=100,
    callbacks=[early_stopping],
    validation_data=val_ds,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


In [77]:
# Loss and accuracy on the validation split (also used for early stopping).
model3.evaluate(val_ds)



[0.27490878105163574, 0.885533332824707]

In [78]:
# Loss and accuracy on the test split, which was not used during training.
model3.evaluate(test_ds)



[0.2677277624607086, 0.8881000280380249]