# Word Embeddings

In [None]:
import io
import os
import re
import shutil
import string
import tensorflow as tf

from datetime import datetime
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

%load_ext tensorboard

## Download Data

In [6]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz", url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


['imdbEr.txt', 'test', 'imdb.vocab', 'README', 'train']

In [8]:
train_dir = os.path.join(dataset_dir, "train")
os.listdir(train_dir)

['urls_unsup.txt',
 'neg',
 'urls_pos.txt',
 'unsup',
 'urls_neg.txt',
 'pos',
 'unsupBow.feat',
 'labeledBow.feat']

In [10]:
remove_dir = os.path.join(train_dir, "unsup")
shutil.rmtree(remove_dir)

## Load Dataset

In [7]:
batch_size = 1024
seed = 123

In [8]:
train_ds = tf.keras.preprocessing.text_dataset_from_directory(
            "aclImdb/train", batch_size=batch_size, validation_split=0.2,
            subset="training", seed=seed)

val_ds = tf.keras.preprocessing.text_dataset_from_directory("aclImdb/train",
                                                           batch_size=batch_size, validation_split=0.2,
                                                           subset="validation", seed=seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [6]:
for text_batch, label_batch in train_ds.take(1):
    for i in range(5):
        print(label_batch[i].numpy(), text_batch.numpy()[i], end='\n')
        print(" ")

1 b"The original animated Dark Knight returns in this ace adventure movie that rivals Mask of Phantasm in its coolness. There's a lot of style and intelligence in Mystery of the Batwoman, so much more than Batman Forever or Batman and Robin.<br /><br />There's a new crime-fighter on the streets of Gotham. She dresses like a bat but she's not a grown-up Batgirl. And Batman is denying any affiliation with her. Meanwhile Bruce Wayne has to deal with the usual romances and detective work. But the Penguin, Bain and the local Mob makes things little more complicated.<br /><br />I didn't have high hopes for this 'un since being strongly let down but the weak Batman: Sub Zero (Robin isn't featured so much here!)but I was delighted with the imaginative and exciting set pieces, the clever plot and a cheeky sense of humor. This is definitely a movie no fan of Batman should be without. Keep your ears open for a really catchy song called 'Betcha Neva' which is featured prominently through-out.<br /

In [14]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

## Embedding Layer Example

In [9]:
embedding_layer = tf.keras.layers. Embedding(1000, 5)

In [10]:
result = embedding_layer(tf.constant([1,2,3]))
result.numpy()

array([[-0.02723383, -0.04862229, -0.03262519,  0.03796724, -0.01996624],
       [ 0.02354953, -0.0430707 , -0.01734068,  0.00312878, -0.02443675],
       [ 0.00325755, -0.0085156 , -0.04519049,  0.00444851, -0.00714653]],
      dtype=float32)

In [11]:
result = embedding_layer(tf.constant([[0,1,2],[3,4,5]]))
result.shape

TensorShape([2, 3, 5])

## Text Preprocessing

Here we tokenize the text dataset.

In [13]:
def custom_standarization(input_data):
    
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, ' <br />', '')
    return tf.strings.regex_replace(stripped_html, '[%s]'%re.escape(string.punctuation), '')

In [14]:
vocab_size = 10000
sequence_length = 100

In [15]:
vectorize_layer = TextVectorization(standardize=custom_standarization,
                                   max_tokens=vocab_size, output_mode="int",
                                   output_sequence_length=sequence_length)

text_ds = train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)

In [20]:
vectorize_layer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'in',
 'it',
 'i',
 'this',
 'that',
 'was',
 'as',
 'br',
 'with',
 'for',
 'movie',
 'but',
 'film',
 'on',
 'not',
 'you',
 'are',
 'his',
 'have',
 'be',
 'he',
 'one',
 'its',
 'at',
 'all',
 'by',
 'an',
 'they',
 'who',
 'from',
 'so',
 'like',
 'her',
 'just',
 'or',
 'about',
 'has',
 'if',
 'out',
 'some',
 'there',
 'what',
 'good',
 'when',
 'more',
 'very',
 'even',
 'she',
 'my',
 'up',
 'no',
 'would',
 'only',
 'which',
 'time',
 'really',
 'story',
 'their',
 'see',
 'were',
 'had',
 'can',
 'me',
 'we',
 'than',
 'much',
 'well',
 'been',
 'will',
 'get',
 'also',
 'people',
 'into',
 'other',
 'do',
 'first',
 'bad',
 'great',
 'because',
 'how',
 'most',
 'him',
 'dont',
 'made',
 'then',
 'movies',
 'make',
 'could',
 'way',
 'films',
 'any',
 'them',
 'after',
 'too',
 'characters',
 'think',
 'watch',
 'being',
 'two',
 'many',
 'seen',
 'character',
 'never',
 'little',
 'where',
 'plot',
 'acting',
 'be

## Classification Model

A simple sequential model in Keras.

In [30]:
embedding_dim = 16
model = Sequential([
    vectorize_layer,
    Embedding(vocab_size, embedding_dim, name="embedding"),
    GlobalAveragePooling1D(),
    Dense(16, activation="relu"),
    Dense(1)
])

## Compile and Train The Model

In [31]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [32]:
model.compile(optimizer="adam", loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
             metrics=["accuracy"])

In [33]:
model.fit(train_ds,
         validation_data=val_ds,
         epochs=25,
         callbacks=[tensorboard_callback])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7fc6d6ed1f40>

In [34]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 100)               0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 16)           160000    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


## Retreive and save the model

In [35]:
weights = model.get_layer("embedding").get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [36]:
out_v = io.open("vectors.tsv", "w", encoding ="utf-8")
out_m = io.open("metadata.tsv", "w", encoding="utf-8")

for index, word in enumerate(vocab):
    if index==0:
        continue # Padding
    
    vec = weights[index]
    out_v.write("\t".join([str(x) for x in vec])+"\n")
    out_m.write(word+"\n")
out_v.close()
out_m.close()

In [37]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 96963), started 0:12:44 ago. (Use '!kill 96963' to kill it.)