##### Copyright 2020 The TensorFlow Authors.

In [55]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [9]:
import io
import re
import string
import tqdm
import pandas as pd

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

In [2]:
# Load the TensorBoard notebook extension
%reload_ext tensorboard

In [3]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [4]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [13]:
df2 = pd.read_csv('archive/ready_poems.csv')

In [14]:
df2.head()

Unnamed: 0.1,Unnamed: 0,title,author,content,tags
0,0,Do not go gentle into that good night,Dylan Thomas,do not go gentle into that good night NEWLINE ...,[]
1,1,How Do I Love Thee? (Sonnet 43),Elizabeth Barrett Browning,how do i love thee let me count the ways NEWLI...,[]
2,2,Shall I compare thee to a summer’s day? (Sonne...,William Shakespeare,shall i compare thee to a summers day NEWLINE ...,[]
3,3,If—,Rudyard Kipling,if you can keep your head when all about you N...,[]
4,4,Nothing Gold Can Stay,Robert Frost,natures first green is gold NEWLINE her hardes...,[]


In [15]:
text = list(df2['content'])

In [16]:
text = '\n\n'.join(text)
text = text.replace('NEWLINE','\n')

In [17]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

text = " ".join([ps.stem(word) for word in text.split(" ")])

In [18]:
print(text[:120000])

do not go gentl into that good night 
 old age should burn and rave at close of day 
 rage rage against the die of the light 
 though wise men at their end know dark is right 
 becaus their word had fork no lightn they 
 do not go gentl into that good night 
 good men the last wave by cri how bright 
 their frail deed might have danc in a green bay 
 rage rage against the die of the light 
 wild men who caught and sang the sun in flight 
 and learn too late they griev it on it way 
 do not go gentl into that good night 
 grave men near death who see with blind sight 
 blind eye could blaze like meteor and be gay 
 rage rage against the die of the light 
 and you my father there on the sad height 
 curs bless me now with your fierc tear i pray 
 do not go gentl into that good night 
 rage rage against the die of the light

how do i love thee let me count the way 
 i love thee to the depth and breadth and height 
 my soul can reach when feel out of sight 
 for the end of be and ideal gra

In [19]:
with open('Poetry_content.txt', 'w') as f:
    f.write(text)

In [20]:
with open('Poetry_content.txt') as f:
  lines = f.read().splitlines()
for line in lines[:20]:
  print(line)

do not go gentl into that good night 
 old age should burn and rave at close of day 
 rage rage against the die of the light 
 though wise men at their end know dark is right 
 becaus their word had fork no lightn they 
 do not go gentl into that good night 
 good men the last wave by cri how bright 
 their frail deed might have danc in a green bay 
 rage rage against the die of the light 
 wild men who caught and sang the sun in flight 
 and learn too late they griev it on it way 
 do not go gentl into that good night 
 grave men near death who see with blind sight 
 blind eye could blaze like meteor and be gay 
 rage rage against the die of the light 
 and you my father there on the sad height 
 curs bless me now with your fierc tear i pray 
 do not go gentl into that good night 
 rage rage against the die of the light



In [21]:
text_ds = tf.data.TextLineDataset('Poetry_content.txt').filter(lambda x: tf.cast(tf.strings.length(x), bool))

In [22]:
text_ds

<_FilterDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

### Vectorize sentences from the corpus

In [23]:
# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')


# Define the vocabulary size and the number of words in a sequence.
vocab_size = 15000
sequence_length = 10

# Use the `TextVectorization` layer to normalize, split, and map strings to
# integers. Set the `output_sequence_length` length to pad all samples to the
# same length.
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [24]:
vectorize_layer.adapt(text_ds.batch(1024))

In [25]:
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'the', 'and', 'of', 'a', 'to', 'i', 'in', 'it', 'is', 'that', 'you', 'my', 'with', 'not', 'on', 'for', 'as', 'hi']


In [26]:
len(inverse_vocab)

15000

In [27]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

### Obtain sequences from the dataset

In [28]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

988980


In [29]:
for seq in sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[ 50  15  89 974  54  11 166  83   0   0] => ['do', 'not', 'go', 'gentl', 'into', 'that', 'good', 'night', '', '']
[ 106  440  218  281    3 4608   29  265    4   65] => ['old', 'age', 'should', 'burn', 'and', 'rave', 'at', 'close', 'of', 'day']
[858 858 230   2 152   4   2  81   0   0] => ['rage', 'rage', 'against', 'the', 'die', 'of', 'the', 'light', '', '']
[147 885 156  29  42 184  66 119  10 224] => ['though', 'wise', 'men', 'at', 'their', 'end', 'know', 'dark', 'is', 'right']
[ 149   42  126   69 2151   45 1169   33    0    0] => ['becaus', 'their', 'word', 'had', 'fork', 'no', 'lightn', 'they', '', '']


### Generate training examples from sequences

In [30]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")


100%|█████████████████████████████████| 988980/988980 [03:40<00:00, 4485.70it/s]




targets.shape: (2798005,)
contexts.shape: (2798005, 5)
labels.shape: (2798005, 5)


### Configure the dataset for performance

In [31]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<_BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [32]:
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<_PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


## Model and training

In [33]:
class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = layers.Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1)

  def call(self, pair):
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

In [34]:
targets

array([974, 974, 974, ..., 106, 106, 106])

### Define loss function and compile model


In [36]:
embedding_dim = 64
num_ns = 4
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

Also define a callback to log training statistics for TensorBoard:

In [37]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

Train the model on the `dataset` for some number of epochs:

In [38]:
dataset

<_PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>

In [39]:
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x16390a890>

TensorBoard now shows the word2vec model's accuracy and loss:

In [40]:
#docs_infra: no_execute


%tensorboard --logdir logs --port=6005

<!-- <img class="tfo-display-only-on-site" src="images/word2vec_tensorboard.png"/> -->

## Embedding lookup and analysis

Obtain the weights from the model using `Model.get_layer` and `Layer.get_weights`. The `TextVectorization.get_vocabulary` function provides the vocabulary to build a metadata file with one token per line.

In [41]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [42]:
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

In [43]:
weights.shape

(15000, 64)

In [44]:
vocab[0] = '[PAD]'
vocab

['[PAD]',
 '[UNK]',
 'the',
 'and',
 'of',
 'a',
 'to',
 'i',
 'in',
 'it',
 'is',
 'that',
 'you',
 'my',
 'with',
 'not',
 'on',
 'for',
 'as',
 'hi',
 'he',
 'we',
 'wa',
 'from',
 'thi',
 'but',
 'me',
 'all',
 'like',
 'at',
 'be',
 'her',
 'are',
 'they',
 'or',
 'have',
 'your',
 'one',
 'what',
 'by',
 'will',
 'when',
 'their',
 'she',
 'so',
 'no',
 'there',
 'our',
 'who',
 'out',
 'do',
 'would',
 'if',
 'an',
 'into',
 'love',
 'up',
 'where',
 'then',
 'now',
 'were',
 'am',
 'them',
 'us',
 'how',
 'day',
 'know',
 'him',
 'say',
 'had',
 'time',
 'which',
 'down',
 'can',
 'through',
 'come',
 'eye',
 'see',
 'over',
 'hand',
 'more',
 'light',
 'could',
 'night',
 'back',
 'man',
 'look',
 'still',
 'make',
 'go',
 'some',
 'here',
 'did',
 'way',
 'never',
 'long',
 'ha',
 'life',
 'other',
 'onli',
 'thing',
 'live',
 'even',
 'heart',
 'each',
 'world',
 'old',
 'want',
 'said',
 'about',
 'face',
 'too',
 'than',
 'god',
 'let',
 'bodi',
 'just',
 'befor',
 'white'

In [45]:
import pandas as pd
Emb_df = pd.DataFrame(weights)
Emb_df.shape

(15000, 64)

In [46]:
Emb_df['Word'] = vocab

In [47]:
Emb_df.to_csv('WordEmbeddings_TF_64.csv',index = False)