In [75]:
import io
import re
import string
import tqdm
import numpy as np 
import matplotlib.pyplot as plt
import tensorflow as tf
import datetime

In [76]:
from tensorflow import keras
from keras import datasets, layers
from keras import models, losses
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing.sequence import make_sampling_table
from keras.utils import pad_sequences

# Launch tensorboard session
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [77]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
SEQUENCE_LENGTH = 4
# Set the number of negative samples per positive context.
NUM_NS = 4
WINDOW_SIZE = 2
SEED = 42
EMBEDDING_DIM = 64

## 2.1 The Dataset

In [78]:
# Loading data 
file_path = '../Homework10/bible.txt'

with open(file_path, "r") as f:
    text = f.read().splitlines()

for line in text[:20]:
    print(line) 

# Preparing Data for Model
text_ds = tf.data.TextLineDataset(file_path).filter(lambda x: tf.cast(tf.strings.length(x), bool))

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth.

1:2 And the earth was without form, and void; and darkness was upon
the face of the deep. And the Spirit of God moved upon the face of the
waters.

1:3 And God said, Let there be light: and there was light.

1:4 And God saw the light, that it was good: and God divided the light
from the darkness.

1:5 And God called the light Day, and the darkness he called Night.
And the evening and the morning were the first day.

1:6 And God said, Let there be a firmament in the midst of the waters,
and let it divide the waters from the waters.



In [79]:
vocabulary = dict((x, i) for i, x in enumerate(np.unique(list(text))))
vocab_size = len(vocabulary)
print(vocab_size, '\n')

72192 



## 2.2 Word Embeddings
    Take care of the following:
    • Convert to lower case, remove new-line characters and special characters
    • Tokenize the string into word-tokens by using one of the word-level tokenizers from tensorflow-text e.g. this one.
    • For performance purposes, we advise that you work with only a subset of all the words that are in corpus. We recommend starting out with only the 10000 most common words 3.
    Next, you need to create the input-target pairs, including a word (input) and a context word (target). We suggest a context window of (but if you want to go bigger and have the computational resources to spare, go for it). Let’s take the sentence ”The cat climbed the tree” for instance. For the input word ”climbed” we want to predict ”the, cat, the, tree”. The resulting input-target pairs to feed to the network will be (climbed, the), (climbed, cat), (climbed, the), (climbed, tree). Note that you leave out the index 0 when creating the pairs, so there is no pair (climbed, climbed). Create a data set from these pairs and batch and shuffle it.

In [80]:
# Vectorize sentences from the corpus
# Now, create a custom standardization function to lowercase the text and
# remove special characters and punctuation.
def custom_standardization(input_data: str):
    lowercase = tf.strings.lower(input_data)
    # lowercase = lowercase.split()
    return tf.strings.regex_replace(lowercase,
                                    '[%s]' % re.escape(string.punctuation), '')

In [81]:
# Use the `TextVectorization` layer to normalize, split, and map strings to
# integers. Set the `output_sequence_length` length to pad all samples to the
# same length.
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH)

In [82]:
# Create the vocabulary for text dataset
vectorize_layer.adapt(text_ds.batch(BATCH_SIZE))
# Returns a list of all vocabulary tokens sorted (descending) by their frequency
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20], '\n')

['', '[UNK]', 'the', 'and', 'of', 'to', 'that', 'in', 'he', 'shall', 'unto', 'for', 'i', 'his', 'a', 'lord', 'they', 'be', 'is', 'him'] 



In [83]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE).map(vectorize_layer).unbatch()

In [84]:
# Flatten the dataset into a list of sentence vector sequences
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences), '\n')
# Inspect a few examples from sequences
for seq in sequences[:7]:
    print(f'{seq} => {[inverse_vocab[i] for i in seq]}')
print('\n')

74644 

[  2 219 404   4] => ['the', 'first', 'book', 'of']
[1003    7    2  684] => ['11', 'in', 'the', 'beginning']
[1002    3    2  111] => ['12', 'and', 'the', 'earth']
[  2 230   4   2] => ['the', 'face', 'of', 'the']
[302   0   0   0] => ['waters', '', '', '']
[1000    3   28   32] => ['13', 'and', 'god', 'said']
[999   3  28 181] => ['14', 'and', 'god', 'saw']




In [85]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, 
                            vocab_size, seed):
    # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []
    # Build the sampling table for 'vocab_size' tokens.
    sampling_table = make_sampling_table(vocab_size)

    # Iterate over all sequences (sentences) in the dataset.
    for sequence in tqdm.tqdm(sequences):
        # Generate positive skip-gram pairs for a sequence (sentence).
        positive_skip_grams, _ = skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size, 
            negative_samples=0
        )        

        # Iterate over each positive skip-gram pair to produce training examples
        # with a positive context word and negative samples.
        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(tf.constant([context_word], dtype='int64'), 1)
            negative_sample_candidate, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=seed,
                name='negative_sampling'
            )
        
            # Build context and label vectors (for one target word)
            context = tf.concat([tf.squeeze(context_class, 1), negative_sample_candidate], 0)
            label = tf.constant([1] + [0] * num_ns, dtype='int64')

            # Append each element from the training example to global lists.
            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels

In [86]:
# Generate training examples from sequences
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=WINDOW_SIZE,
    num_ns=NUM_NS,
    vocab_size=vocab_size,
    seed=SEED)
    
targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

100%|███████████████████████████████████████████████████████████████████████████| 74644/74644 [04:13<00:00, 294.83it/s]


In [87]:
print('\n')
print(f'targets shape: {targets.shape}')
print(f'contexts shape: {contexts.shape}')
print(f'labels shape: {labels.shape}')
print('\n')



targets shape: (98085,)
contexts shape: (98085, 5)
labels shape: (98085, 5)




In [88]:
# Configure the dataset for performance
def config_dataset(targets, contexts, labels, 
                    buffer_size, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
    dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)
    # Apply Dataset.cache and Dataset.prefetch to improve performance:
    dataset = dataset.cache().prefetch(buffer_size = tf.data.AUTOTUNE)

    return dataset

In [89]:
dataset = config_dataset(targets, contexts, labels, BUFFER_SIZE, BATCH_SIZE)
print(dataset)

<PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


## 2.3 The model
    Implement a SkipGram model to create the word embeddings. There are multiple ways of implementing a Skip Gram in TensorFlow.
    
    Subclassed word2vec model
    Use the Keras Subclassing API to define my Skip model with the following layers:
    • target_embedding: A tf.keras.layers.Embedding layer, which looks up the embedding of a word when it appears as a target word. The number of parameters in this layer are (vocab_size * embedding_dim).
    • context_embedding: Another tf.keras.layers.Embedding layer, which looks up the embedding of a word when it appears as a context word. The number of parameters in this layer are the same as those in target_embedding, i.e. (vocab_size * embedding_dim).
    • dots: A tf.keras.layers.Dot layer that computes the dot product of target and context embeddings from a training pair.
    • flatten: A tf.keras.layers.Flatten layer to flatten the results of dots layer into logits.
    With the subclassed model, i can define the call() function that accepts (target, context) pairs which can then be passed into their corresponding embedding layer. Reshape the context_embedding to perform a dot product with target_embedding and return the flattened result. 

In [90]:
# Create Skip Gram model class
class SkipGram(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, num_ns):
        super(SkipGram, self).__init__()
        self.target_embedding = layers.Embedding(
            vocab_size, embedding_dim, input_length=1, name="w2v_embedding")
        self.context_embedding = layers.Embedding(
            vocab_size, embedding_dim, input_length=num_ns)

    def call(self, pair):
        target, context = pair
        # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
        # context: (batch, context)
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        # target: (batch,)
        word_emb = self.target_embedding(target)
        # word_emb: (batch, embed)
        context_emb = self.context_embedding(context)
        # context_emb: (batch, context, embed)
        dots = tf.einsum('be,bce->bc', word_emb, context_emb)
        # dots: (batch, context)
        return dots

In [91]:
# Define loss function
def custom_loss(x_logit, y_true):
    return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)

In [92]:
skip_gram = SkipGram(vocab_size, EMBEDDING_DIM, NUM_NS)
skip_gram.compile(
    optimizer='adam',
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics = ['accuracy'])

# Define a callback to log training statistics for TensorBoard
EXPERIMENT_NAME = 'Word embedding'
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(f'./logs/{EXPERIMENT_NAME}/{current_time}')

## 2.4 Training 

In [93]:
# Train the model
skip_gram.fit(
        dataset, 
        epochs = 20,
        callbacks=[tensorboard_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2045ca678b0>

In [94]:
# Let's take a look at a summary of Skip Gram model
skip_gram.summary()

Model: "skip_gram"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 w2v_embedding (Embedding)   multiple                  4620288   
                                                                 
 embedding (Embedding)       multiple                  4620288   
                                                                 
Total params: 9,240,576
Trainable params: 9,240,576
Non-trainable params: 0
_________________________________________________________________


In [95]:
%tensorboard --logdir logs

In [96]:
# Obtain the weights from the model using Model.get_layer and Layer.get_weights.
weights = skip_gram.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [97]:
# Create and save the vectors and metadata files
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
    if index == 0:
        continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")
out_v.close()
out_m.close()