In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

Define Training Sentences

In [2]:
# Tiny toy corpus used to demonstrate how the tokenizer builds its vocabulary.
Train_data = [
    "It is a sunny day",
    "It is a cloudy day",
    "Will it rain today or not?",
]

Set up tokenizer

In [3]:
# Create a tokenizer (num_words caps the vocabulary used when encoding) and
# let it learn the vocabulary of the toy corpus.
tokenizer = Tokenizer(num_words =100)
tokenizer.fit_on_texts(Train_data)
# word -> integer-id mapping learned from Train_data
word_index = tokenizer.word_index

In [4]:
# Display the learned word -> id mapping (words are lower-cased, ids start at 1).
word_index

{'it': 1,
 'is': 2,
 'a': 3,
 'day': 4,
 'sunny': 5,
 'cloudy': 6,
 'will': 7,
 'rain': 8,
 'today': 9,
 'or': 10,
 'not': 11}

Create Sequences

In [5]:
# Encode each training sentence as the list of integer ids of its words.
sequences = tokenizer.texts_to_sequences(Train_data)

In [6]:
# Display the encoded sentences; note that they have different lengths.
sequences

[[1, 2, 3, 5, 4], [1, 2, 3, 6, 4], [7, 1, 8, 9, 10, 11]]

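The tokenizer only knows the words it saw while fitting. Words of a new sentence that are not in the vocabulary are not encoded; they are simply left out of the resulting sequence.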

In [7]:
# Only words that are present in the training vocabulary get encoded: "fine" was
# never seen during fitting, so it is silently dropped from the result.
# To keep a placeholder for unseen words, create the tokenizer with an oov_token,
# e.g. Tokenizer(num_words=100, oov_token="<oov>"); out-of-vocabulary words are
# then encoded as 1.
new_sentences = ["Sunny day is fine"]
new_sequences = tokenizer.texts_to_sequences(new_sentences)
new_sequences

[[5, 4, 2]]

In [8]:
# Training sequences must all have the same number of tokens; this is achieved
# by padding the short ones and/or truncating the long ones.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# With the default settings every sequence is padded to the length of the longest one.
padded_seq = pad_sequences(sequences)

# Only the last expression of a cell is displayed: show the raw, unequal-length
# sequences here for comparison; the padded array is displayed in the next cell.
sequences

[[1, 2, 3, 5, 4], [1, 2, 3, 6, 4], [7, 1, 8, 9, 10, 11]]

In [9]:
# Display the padded array: by default the zeros are added in front of the tokens.
padded_seq

array([[ 0,  1,  2,  3,  5,  4],
       [ 0,  1,  2,  3,  6,  4],
       [ 7,  1,  8,  9, 10, 11]])

In [10]:
# Customising the padding: padding="post" puts the zeros after the tokens
# instead of in front of them.
padded_new = pad_sequences(sequences,padding = "post") # other options to try: maxlen=5, truncating="post"
padded_new

array([[ 1,  2,  3,  5,  4,  0],
       [ 1,  2,  3,  6,  4,  0],
       [ 7,  1,  8,  9, 10, 11]])

# Word Embedding - dense numerical vector representations of words
They capture semantic meaning: "king" and "queen" will have similar embeddings.

In [11]:
import numpy as np
import tensorflow_datasets as tfds
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

2.19.0


In [12]:
# Load the IMDB movie-review dataset through TensorFlow Datasets.
# with_info=True additionally returns the dataset metadata (`info`);
# as_supervised=True makes every example a (text, label) pair.
data,info = tfds.load("imdb_reviews", with_info =True, as_supervised = True)

In [13]:
# Display the dataset metadata (description, features, splits).
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset. This is a dataset for binary sentiment
    classification containing substantially more data than previous benchmark
    datasets. We provide a set of 25,000 highly polar movie reviews for training,
    and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_dir='C:\\Users\\Manny\\tensorflow_datasets\\imdb_reviews\\plain_text\\1.0.0',
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
        'text': Text(shape=(), dtype=string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    nondeterministic_order=False,
    splits={
        'test': <

In [14]:
# Handles to the two predefined splits of the dataset.
train_data = data['train']
test_data = data['test']

# Containers for the raw review texts and their labels.
train_sentences, test_sentences = [], []
train_labels, test_labels = [], []

In [15]:
def _texts_and_labels(dataset):
    """Collect the review texts and labels of a (text, label) dataset.

    Args:
        dataset: iterable yielding ``(sentence, label)`` tensor pairs.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists: decoded Python
        strings and the corresponding label values.
    """
    texts, labels = [], []
    for sentence, label in dataset:
        # .numpy() converts the tf.Tensor to a NumPy object (raw bytes for text);
        # .decode('utf8') turns those bytes into a Python string.
        texts.append(sentence.numpy().decode('utf8'))
        labels.append(label.numpy())
    return texts, labels


# Build fresh lists on every execution instead of appending to lists created in
# another cell: re-running this cell can then neither duplicate the data nor fail
# because the label containers were already converted to NumPy arrays.
train_sentences, train_labels = _texts_and_labels(train_data)
test_sentences, test_labels = _texts_and_labels(test_data)

In [16]:
# Convert the label lists to NumPy arrays before they are handed to model.fit().
train_labels, test_labels = np.array(train_labels), np.array(test_labels)

### Data Preparation - Setting up tokenizer

In [17]:
# --- Settings for tokenisation, padding and the embedding model ---
vocab_size = 10000     # passed to the tokenizer as num_words and to the embedding layer
oov_tok = '<oov>'      # placeholder token for words outside the vocabulary
embedding_dim = 15     # size of each learned word vector
max_length = 150       # every review is padded/truncated to this many tokens
truncating = 'post'    # how pad_sequences shortens reviews longer than max_length

# Fit the vocabulary on the training reviews only.
# NOTE: this rebinds `tokenizer` and `word_index`, replacing the toy objects
# created at the top of the notebook.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Encode both splits with that same vocabulary ...
train_seq = tokenizer.texts_to_sequences(train_sentences)
test_seq = tokenizer.texts_to_sequences(test_sentences)

# ... and bring every review to exactly max_length tokens. Padding uses the
# pad_sequences default, i.e. zeros are added in front of short reviews.
train_padded = pad_sequences(train_seq, maxlen=max_length, truncating=truncating)
test_padded = pad_sequences(test_seq, maxlen=max_length, truncating=truncating)

In [18]:
# Invert the vocabulary so that integer ids can be mapped back to words.
reverse_word_index = {idx: word for word, idx in word_index.items()}


def decode_review(text):
    """Turn a sequence of token ids back into a space-separated string.

    Ids that have no entry in ``reverse_word_index`` (e.g. the padding value 0)
    are rendered as '?'.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)

# Show one review in its three forms: the raw text, its padded id sequence,
# and the text reconstructed from those ids.
print(train_sentences[1])
print(train_padded[1])
print(decode_review(train_padded[1]))

I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was constant. Constantly slow and boring. Things seemed to happen, but with no explanation of what was causing them or why. I admit, I may have missed part of the film, but i watched the majority of it and everything just seemed to happen of its own accord without any real concern for anything else. I cant recommend this film at all.
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0   11   26   75  571
    6  805 2354  313  106   19   12    7  629  686    6    4 2219    5
  181  584   64 1454  110 2263    3 3951   21    2    1    3  258   41
 4677    4  174  188   21  

### Define the neural network with embedding layer
1. Use the Sequential API
2. Add an embedding input layer of input size equal to vocab size
3. Add a flatten layer and 2 dense layers

In [19]:
# Seed Python, NumPy and TensorFlow so that the weight initialisation below and
# the shuffling performed during training are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

model = tf.keras.Sequential([
    # One trainable embedding_dim-sized vector per vocabulary id.
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    # Concatenate the per-token vectors of a review into one flat feature vector.
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    # Single sigmoid unit: probability of the positive class.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Compile the model with loss function, optimizer and metrics.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Build with the fixed review length so that summary() can report the shapes.
model.build(input_shape=(None, max_length))
model.summary()

In [21]:
# Number of padded test reviews; should equal the number of test labels.
len(test_padded)

25000

In [22]:
# Number of test labels; should equal the number of padded test reviews.
len(test_labels)

25000

In [23]:
# Sanity check: print the container types and the shapes of the training
# features and labels.
print(type(train_padded), type(train_labels))
print(np.shape(train_padded), np.shape(train_labels))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(25000, 150) (25000,)


### Training Model

In [20]:
# Features and labels must line up one-to-one. Instead of silently slicing the
# features to a hard-coded 25000 rows (which hides stale kernel state such as a
# data-extraction cell that was executed twice), fail loudly with a clear message.
if len(train_padded) != len(train_labels):
    raise ValueError(
        f"train_padded has {len(train_padded)} rows but train_labels has "
        f"{len(train_labels)}; restart the kernel and re-run the data-preparation cells."
    )

num_epochs = 10

# Keep the History object so the per-epoch metrics can be inspected afterwards.
# NOTE: in the recorded run below the validation loss rises after the first epoch
# while the training accuracy approaches 1.0, i.e. the model overfits quickly.
history = model.fit(
    train_padded,
    train_labels,
    epochs=num_epochs,
    validation_data=(test_padded, test_labels),
)
history

Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.6318 - loss: 0.6110 - val_accuracy: 0.8362 - val_loss: 0.3681
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9078 - loss: 0.2485 - val_accuracy: 0.8258 - val_loss: 0.3940
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9711 - loss: 0.1143 - val_accuracy: 0.8220 - val_loss: 0.4594
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9942 - loss: 0.0355 - val_accuracy: 0.8112 - val_loss: 0.5626
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9980 - loss: 0.0118 - val_accuracy: 0.8112 - val_loss: 0.6362
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9997 - loss: 0.0045 - val_accuracy: 0.8101 - val_loss: 0.6916
Epoch 7/10
[1m782/782[0m 

<keras.src.callbacks.history.History at 0x1709de1aab0>

### Deriving weights from the embedding layer

In [25]:
# The embedding layer is the first layer of the Sequential model.
ll = model.layers[0]

# Extract the learned weights: get_weights() returns a list whose element 0 is
# the embedding matrix, one row per vocabulary id and one column per embedding
# dimension, i.e. (vocab_size, embedding_dim).
weights = ll.get_weights()[0]
print(weights.shape)
print(weights)

(10000, 15)
[[-0.00970541  0.04358897  0.05723758 ...  0.05473125 -0.00485928
  -0.02095112]
 [ 0.05941346  0.07290929  0.07515416 ...  0.06891727  0.01599611
  -0.07985418]
 [-0.01457466  0.13918164  0.14539869 ...  0.17153877  0.00908752
  -0.01000997]
 ...
 [-0.13791926 -0.11145771 -0.00449798 ... -0.02784112 -0.15250528
  -0.07114182]
 [-0.04097483  0.00964436  0.05315474 ...  0.02515131 -0.16377777
   0.07427062]
 [-0.07930727 -0.0995072   0.01404452 ... -0.14835927 -0.05205519
  -0.19359782]]
