In [2]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample text data
corpus = [
    'I love programming',
    'Programming is fun',
    'Deep learning is interesting',
    'Natural language processing is exciting'
]

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

print(total_words)

13


In [24]:
# Get the mapping of indices to words
index_to_word = tokenizer.index_word

# Print the actual words for the all indices
for index in range(0, total_words+1):
    word = index_to_word.get(index, 'Unknown')
    print(f"Index {index}: Word '{word}'")

Index 0: Word 'Unknown'
Index 1: Word 'is'
Index 2: Word 'programming'
Index 3: Word 'i'
Index 4: Word 'love'
Index 5: Word 'fun'
Index 6: Word 'deep'
Index 7: Word 'learning'
Index 8: Word 'interesting'
Index 9: Word 'natural'
Index 10: Word 'language'
Index 11: Word 'processing'
Index 12: Word 'exciting'
Index 13: Word 'Unknown'


In [25]:
# Create input sequences and labels
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

input_sequences

[[3, 4],
 [3, 4, 2],
 [2, 1],
 [2, 1, 5],
 [6, 7],
 [6, 7, 1],
 [6, 7, 1, 8],
 [9, 10],
 [9, 10, 11],
 [9, 10, 11, 1],
 [9, 10, 11, 1, 12]]

In [26]:
# Pad sequences for equal length
max_sequence_length = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

input_sequences

array([[ 0,  0,  0,  3,  4],
       [ 0,  0,  3,  4,  2],
       [ 0,  0,  0,  2,  1],
       [ 0,  0,  2,  1,  5],
       [ 0,  0,  0,  6,  7],
       [ 0,  0,  6,  7,  1],
       [ 0,  6,  7,  1,  8],
       [ 0,  0,  0,  9, 10],
       [ 0,  0,  9, 10, 11],
       [ 0,  9, 10, 11,  1],
       [ 9, 10, 11,  1, 12]], dtype=int32)

In [37]:
input_sequences[2]

array([0, 0, 0, 2, 1], dtype=int32)

In [36]:

# Create input features and labels
X, y = input_sequences[:, :-1], input_sequences[:, -1]
print(X[2], y[2])
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

print(X[2], y[2])

[0 0 0 2] 1
[0 0 0 2] [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [None]:
''' PARAMETERS OF EMBEDDING() LAYER FUNCTION
total_words: The first parameter specifies the size of the vocabulary, i.e., the total number of unique words in your corpus.
In this case, it is calculated as len(tokenizer.word_index) + 1, where tokenizer.word_index contains the mapping of words to
indices.
50 (vector size): The second parameter is the dimensionality of the word vectors. In this case, it is set to 50. Each word
in the vocabulary will be represented as a vector of length 50 in the embedding space.
input_length=max_sequence_length-1: The third parameter is the length of input sequences that the embedding layer will receive.
 It should match the length of your input sequences, which is set to max_sequence_length-1 to exclude the last word, which is
 the target word.
'''

In [38]:
# Build a simple LSTM model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(total_words, 50, input_length=max_sequence_length-1),
    tf.keras.layers.LSTM(100),
    tf.keras.layers.Dense(total_words, activation='softmax')
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x78cd1afa99f0>

In [41]:
# Assuming you have trained your model and named it 'model'

# Get the weights of the embedding layer
embedding_weights = model.layers[0].get_weights()[0]  # Assuming Embedding layer is the first layer

# Get the vocabulary size and embedding dimension
vocab_size, embedding_dim = embedding_weights.shape

# Create a dictionary to map words to their corresponding vectors
word_vectors = {}
word_index = tokenizer.word_index  # Assuming you have a tokenizer for your text data
for word, index in word_index.items():
    # Skip the padding token if present
    if index == 0:
        continue
    # Get the embedding vector for the word
    embedding_vector = embedding_weights[index]
    # Store the word vector in the dictionary
    word_vectors[word] = embedding_vector

# Now you can access the word vectors using the word as key
print(word_vectors['love'])  # Replace 'word' with the actual word you want to get the vector for


[ 0.09916745  0.14381759  0.10741768 -0.06482245 -0.14453943  0.05845503
  0.11049834 -0.08802968 -0.09631979  0.13130462  0.05166156  0.10901786
 -0.13262478  0.16100314  0.07237987 -0.00364961 -0.1476928   0.03410875
 -0.0566395   0.05281119  0.1128921  -0.09121816 -0.10135016 -0.01110511
 -0.03484494  0.15506019  0.10438553 -0.06440771 -0.12294479  0.13778603
 -0.14966315  0.08684783  0.150789   -0.0264182   0.13509698 -0.03096593
  0.11994029  0.10678666 -0.14893937  0.11432906  0.12786056 -0.09751885
 -0.1112318  -0.08798535 -0.06698811 -0.1169666   0.09707578 -0.11540285
  0.04978655 -0.14818183]
