In [1]:
pip install -q "tensorflow-text==2.8.*"

Note: you may need to restart the kernel to use updated packages.


In [53]:
#Libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
from gensim.models import Word2Vec
import warnings
#Silence every warning for the rest of the session
#One 'ignore' filter is enough: filterwarnings() puts each new filter in front of the
#existing ones, so an 'always' filter registered just before it would never be consulted
warnings.filterwarnings('ignore')

In [3]:
#Open data
#Read the whole text file into a single string; the with-block closes the file handle
#as soon as the read is done instead of leaving it open until garbage collection
with open('Data.txt') as text_file:
    data_file = text_file.read()
data_file

'The important thing is not to stop questioning. Curiosity has its own reason for existence. One cannot help but be in awe when he contemplates the mysteries of eternity, of life, of the marvelous structure of reality. It is enough if one tries merely to comprehend a little of this mystery each day.'

In [4]:
#Lowercase the raw text, then cut it into sentence strings at every '.'
#NOTE: a text that ends with '.' leaves one trailing empty string in the list
lowered_text = data_file.lower()
data = lowered_text.split('.')

#Keras tokenizer with default settings; it is fitted on the sentences in the next cell
tokenizer = Tokenizer()

In [5]:
#Build the vocabulary from the sentences
#word_index maps every word to a unique integer >= 1; 0 stays free for padding
tokenizer.fit_on_texts(data)

#Vocabulary size, plus one slot for the reserved padding index 0
total_words = 1 + len(tokenizer.word_index)

#Encode each sentence as the list of its word indices
input_sequences = tokenizer.texts_to_sequences(data)

#Show the vocabulary, the encoded sentences and the vocabulary size, each followed by a blank line
for item in (tokenizer.word_index, input_sequences, total_words):
    print(item, '\n')

{'of': 1, 'the': 2, 'is': 3, 'to': 4, 'one': 5, 'important': 6, 'thing': 7, 'not': 8, 'stop': 9, 'questioning': 10, 'curiosity': 11, 'has': 12, 'its': 13, 'own': 14, 'reason': 15, 'for': 16, 'existence': 17, 'cannot': 18, 'help': 19, 'but': 20, 'be': 21, 'in': 22, 'awe': 23, 'when': 24, 'he': 25, 'contemplates': 26, 'mysteries': 27, 'eternity': 28, 'life': 29, 'marvelous': 30, 'structure': 31, 'reality': 32, 'it': 33, 'enough': 34, 'if': 35, 'tries': 36, 'merely': 37, 'comprehend': 38, 'a': 39, 'little': 40, 'this': 41, 'mystery': 42, 'each': 43, 'day': 44} 

[[2, 6, 7, 3, 8, 4, 9, 10], [11, 12, 13, 14, 15, 16, 17], [5, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2, 27, 1, 28, 1, 29, 1, 2, 30, 31, 1, 32], [33, 3, 34, 35, 5, 36, 37, 4, 38, 39, 40, 1, 41, 42, 43, 44], []] 

45 



In [7]:
#Bring every sequence to the same fixed length; shorter ones get trailing zeros
#NOTE: sequences longer than the limit are cut with pad_sequences' default
#truncating='pre', i.e. their leading tokens are dropped (the 22-token sentence loses its first two)
MAX_SEQUENCE_LENGTH = 20
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    input_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    value=0,
)
input_sequences

array([[ 2,  6,  7,  3,  8,  4,  9, 10,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0],
       [11, 12, 13, 14, 15, 16, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0],
       [19, 20, 21, 22, 23, 24, 25, 26,  2, 27,  1, 28,  1, 29,  1,  2,
        30, 31,  1, 32],
       [33,  3, 34, 35,  5, 36, 37,  4, 38, 39, 40,  1, 41, 42, 43, 44,
         0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0]], dtype=int32)

In [8]:
#Make sure the padded sequences are held in a NumPy array, then show its shape and contents
input_sequences = np.array(input_sequences)
for view in (input_sequences.shape, input_sequences):
    print(view)

(5, 20)
[[ 2  6  7  3  8  4  9 10  0  0  0  0  0  0  0  0  0  0  0  0]
 [11 12 13 14 15 16 17  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [19 20 21 22 23 24 25 26  2 27  1 28  1 29  1  2 30 31  1 32]
 [33  3 34 35  5 36 37  4 38 39 40  1 41 42 43 44  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]


In [9]:
#Create targets: the target of each row is the row that follows it
#Rolling by -1 along axis 0 moves every row up by one for any number of rows and any
#row length; a flat roll by a fixed element count only does that for one specific shape
#np.roll already returns a new array, so no extra copy is needed
#NOTE(review): this pairs each sentence with the next sentence, not each token with the
#next token - confirm that this is the intended target
output_sequences = np.roll(input_sequences, -1, axis=0)

#The roll wraps the first row round to the bottom; blank it because the last row has no successor
output_sequences[-1] = 0
print(output_sequences.shape)
print(output_sequences)

(5, 20)
[[11 12 13 14 15 16 17  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [19 20 21 22 23 24 25 26  2 27  1 28  1 29  1  2 30 31  1 32]
 [33  3 34 35  5 36 37  4 38 39 40  1 41 42 43 44  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]


In [18]:
#Per-sample input shape: the shape of input_sequences without its leading (sentence) axis
input_shape = input_sequences.shape[1:]
input_shape

(20,)

In [60]:
#Per-sample target shape: the shape of output_sequences without its leading (sentence) axis
#NOTE(review): the saved output of this cell shows a bare 20, but a [1:] slice of a shape
#is still a sequence - the output looks stale, re-run the cell to confirm
output_shape = output_sequences.shape[1:]
output_shape

20

In [36]:
#Turn the inputs and the targets into TensorFlow tensors (same names, new type)
input_sequences, output_sequences = map(
    tf.convert_to_tensor, (input_sequences, output_sequences)
)

In [95]:
#Convert to embedding vectors

#Number of word indices in one padded sentence
sequence_length = input_sequences.shape[1]

#One padded sentence per sample; Keras adds the batch axis itself, so the shape is a
#1-tuple holding the sentence length only
#(no layer is bound to the name `input`: that would shadow the builtin and was never part of the model)
word_input = keras.layers.Input(shape = (sequence_length,))

#Look up a 512-dimensional vector for every word index; index 0 (padding) is masked
embedding_layer = keras.layers.Embedding(input_dim = total_words, output_dim = 512, mask_zero = True, input_length = sequence_length)(word_input)

#Concatenate the word vectors of a sentence into one flat row
#Kept under its own name so that word_vectors below only ever refers to the predicted array
flattened_embeddings = keras.layers.Flatten()(embedding_layer)

embedding_model = keras.Model(word_input, flattened_embeddings)

#NOTE(review): the model is only used for predict() in this cell, so this loss/metric
#pair is never exercised here - revisit both before calling fit()
embedding_model.compile(loss=keras.losses.BinaryCrossentropy(), optimizer=keras.optimizers.Adam(), metrics=[keras.metrics.SparseCategoricalAccuracy()])

#Write a diagram of the model to model.png
keras.utils.plot_model(embedding_model,to_file='model.png', show_shapes=True,expand_nested=True)

#Run the sentences through the embedding; the model has not been trained in this cell,
#so the vectors come from the layer's initial weights
word_vectors = embedding_model.predict(input_sequences)
word_vectors

array([[-0.0454743 ,  0.02437179, -0.01975303, ..., -0.04093369,
        -0.01609544,  0.01044757],
       [ 0.03825822,  0.02885522,  0.03115452, ..., -0.04093369,
        -0.01609544,  0.01044757],
       [ 0.03394261,  0.01523543,  0.03496403, ...,  0.04506305,
        -0.02487865, -0.00269923],
       [-0.00072331,  0.04929391,  0.00960779, ..., -0.04093369,
        -0.01609544,  0.01044757],
       [ 0.02883116,  0.02487377, -0.02857069, ..., -0.04093369,
        -0.01609544,  0.01044757]], dtype=float32)

In [None]:
#Calculate positional encoding
