# Natural language processing

## Encoding text

- Bag of words

In [1]:
vocab = dict()
word_encoding = 1
def bag_of_words(text):
    global word_encoding

    words = text.lower().split(' ')
    bag = dict()
    for word in words:
        if word in vocab:
            encoding = vocab[word]
        else:
            vocab[word] = word_encoding
            encoding = word_encoding
            word_encoding += 1
        
        bag.setdefault(encoding, 0)
        bag[encoding] += 1
        
    return bag

text = "this is a test to see if this test will work is is test a a"
bag = bag_of_words(text)
print(bag)
print(vocab)

{1: 2, 2: 3, 3: 3, 4: 3, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}
{'this': 1, 'is': 2, 'a': 3, 'test': 4, 'to': 5, 'see': 6, 'if': 7, 'will': 8, 'work': 9}


Bag of words doesn't encode the order of the words

In [2]:
positive_review = "I thought the movie was going to be bad but it was actually amazing"
negative_review = "I thought the movie was going to be amazing but it was actually bad"

pos_bag = bag_of_words(positive_review)
neg_bag = bag_of_words(negative_review)

print("Positive:", pos_bag)
print("Negative:", neg_bag)

Positive: {10: 1, 11: 1, 12: 1, 13: 1, 14: 2, 15: 1, 5: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1}
Negative: {10: 1, 11: 1, 12: 1, 13: 1, 14: 2, 15: 1, 5: 1, 16: 1, 21: 1, 18: 1, 19: 1, 20: 1, 17: 1}


- Integer encoding

In [3]:
vocab = dict()
word_encoding = 1
def one_hot_encoding(text):
    global word_encoding

    words = text.lower().split(' ')
    encoding = []

    for word in words:
        if word in vocab:
            code = vocab[word]
            encoding.append(code)
        else:
            vocab[word] = word_encoding
            encoding.append(word_encoding)
            word_encoding += 1
    
    return encoding

text = "this is a test to see if this test will work is is test a a"
encoding = one_hot_encoding(text)
print(encoding)
print(vocab)

[1, 2, 3, 4, 5, 6, 7, 1, 4, 8, 9, 2, 2, 4, 3, 3]
{'this': 1, 'is': 2, 'a': 3, 'test': 4, 'to': 5, 'see': 6, 'if': 7, 'will': 8, 'work': 9}


In [4]:
positive_review = "I thought the movie was going to be bad but it was actually amazing"
negative_review = "I thought the movie was going to be amazing but it was actually bad"

pos_encode = one_hot_encoding(positive_review)
neg_encode = one_hot_encoding(negative_review)

print("Positive:", pos_encode)
print("Negative:", neg_encode)

Positive: [10, 11, 12, 13, 14, 15, 5, 16, 17, 18, 19, 14, 20, 21]
Negative: [10, 11, 12, 13, 14, 15, 5, 16, 21, 18, 19, 14, 20, 17]


It's better

- Word enbedings

word -> vector
It's a layer, pretrined models exist.

## Recurrent neural networks

- simple RNN layer
- LSTM

## Sentiment analysis

Import the movie review dataset

In [5]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

VOCAB_SIZE = 88584

MAXLEN = 250
BATCH_SIZE = 64

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words = VOCAB_SIZE)

Let's take a look at a review

In [6]:
train_data[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

Preprocessing

In [7]:
train_data = sequence.pad_sequences(train_data, MAXLEN)
test_data = sequence.pad_sequences(test_data, MAXLEN)

In [8]:
train_data[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     1,    14,    22,    16,
          43,   530,   973,  1622,  1385,    65,   458,  4468,    66,
        3941,     4,   173,    36,   256,     5,    25,   100,    43,
         838,   112,    50,   670, 22665,     9,    35,   480,   284,
           5,   150,     4,   172,   112,   167, 21631,   336,   385,
          39,     4,   172,  4536,  1111,    17,   546,    38,    13,
         447,     4,   192,    50,    16,     6,   147,  2025,    19,
          14,    22,     4,  1920,  4613,   469,     4,    22,    71,
          87,    12,    16,    43,   530,    38,    76,    15,    13,
        1247,     4,    22,    17,   515,    17,    12,    16,   626,
          18, 19193,     5,    62,   386,    12,     8,   316,     8,
         106,     5,

Creating the model

In [9]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 32),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

In [10]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          2834688   
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


Training

In [11]:
model.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=['acc'])

history = model.fit(train_data, train_labels, epochs=3, validation_split=0.2)

Epoch 1/3
Epoch 2/3
Epoch 3/3


Evaluate the model

In [12]:
results = model.evaluate(test_data, test_labels)
print(results)

[0.35722780227661133, 0.8623200058937073]


## Making predictions

In [13]:
word_index = imdb.get_word_index()

def encode_text(text):
  tokens = keras.preprocessing.text.text_to_word_sequence(text)
  tokens = [word_index[word] if word in word_index else 0 for word in tokens]
  return sequence.pad_sequences([tokens], MAXLEN)[0]

text = "that movie was just amazing, so amazing"
encoded = encode_text(text)
print(encoded)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0  12  17  13  4

In [14]:
reverse_word_index = {value: key for (key, value) in word_index.items()}

def decode_integers(integers):
    PAD = 0
    text = ""
    for num in integers:
      if num != PAD:
        text += reverse_word_index[num] + " "

    return text[:-1]
  
print(decode_integers(encoded))

that movie was just amazing so amazing


In [15]:
def predict(text):
  encoded_text = encode_text(text)
  pred = np.zeros((1,250))
  pred[0] = encoded_text
  result = model.predict(pred) 
  print(result[0])

positive_review = "That movie was! really loved it and would great watch it again because it was amazingly great"
predict(positive_review)

negative_review = "that movie really sucked. I hated it and wouldn't watch it again. Was one of the worst things I've ever watched"
predict(negative_review)

[0.81147456]
[0.28104258]


## RNN play generator

In [3]:
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

Dataset

In [4]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Loading your own data is a possibility

Let's look at the contents of the file

In [5]:
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')

print('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [6]:
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



Encoding

In [7]:
vocab = sorted(set(text))

char2idx = {u:i for i,u in enumerate(vocab)}

idx2char = np.array(vocab)

def text_to_int(text):
  return np.array([char2idx[c] for c in text])

text_as_int = text_to_int(text)

In [8]:
def int_to_text(ints):
  try:
    ints = ints.numpy()
  except:
    pass
  return ''.join(idx2char[ints])

print(int_to_text(text_as_int[:13]))

First Citizen


Creating training examples

In [9]:
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1)

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [10]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

In [11]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

In [12]:
for x, y in dataset.take(2):
  print("\n\nEXAMPLE\n")
  print("INPUT")
  print(int_to_text(x))
  print("\nOUTPUT")
  print(int_to_text(y))



EXAMPLE

INPUT
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

OUTPUT
irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


EXAMPLE

INPUT
are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you 

OUTPUT
re all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k


Making training batches

In [13]:
BATCH_SIZE = 64
VOCAB_SIZE = len(vocab)
EMBEDDING_DIM = 256
RNN_UNITS = 1024

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

Building the model

In [14]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    tf.keras.layers.LSTM(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])
  return model

model = build_model(VOCAB_SIZE,EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 lstm (LSTM)                 (64, None, 1024)          5246976   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


## Creating a loss function

Let's look at the output

In [15]:
for input_example_batch, target_example_batch in data.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [16]:
print(len(example_batch_predictions))
print(example_batch_predictions)

64
tf.Tensor(
[[[-3.2729932e-04  3.1751557e-03 -5.9531932e-03 ... -1.2554172e-03
    1.8108833e-04 -6.3380776e-03]
  [ 2.1914975e-03 -1.4303051e-03 -3.8781504e-03 ... -4.2909747e-03
   -1.8989781e-03  2.0868718e-03]
  [ 1.5592678e-03 -7.0976710e-04 -6.2429840e-03 ...  4.3531740e-04
   -2.0877537e-03  7.5717107e-05]
  ...
  [ 1.5653133e-03  3.4541423e-03  9.0903407e-03 ... -2.0071308e-03
    7.2882895e-04  1.0070583e-03]
  [ 1.5111169e-03  4.2188480e-03  5.4279929e-03 ... -3.2007727e-03
    2.3605190e-03 -2.1470897e-03]
  [ 8.4742554e-05  3.7038261e-03  2.9059593e-03 ... -2.3623868e-03
   -5.3060410e-04  1.9151808e-04]]

 [[-4.1277669e-03  2.9524842e-03 -5.8646305e-03 ... -2.6433694e-04
   -5.4154010e-03 -5.5851704e-03]
  [-1.6012628e-02  2.0003407e-03 -7.0246425e-03 ...  1.2613363e-03
   -1.2339876e-02 -5.7949251e-03]
  [-1.5033163e-02  9.5824944e-05 -9.9991895e-03 ...  7.1492803e-04
   -8.1328349e-03 -4.1670986e-03]
  ...
  [ 9.1039389e-03  4.9270736e-03 -5.5722287e-03 ... -8.9185238e

In [17]:
pred = example_batch_predictions[0]
print(len(pred))
print(pred) # notice this is a 2d array of length 100, where each interior array is the prediction for the next character at each time step

100
tf.Tensor(
[[-3.2729932e-04  3.1751557e-03 -5.9531932e-03 ... -1.2554172e-03
   1.8108833e-04 -6.3380776e-03]
 [ 2.1914975e-03 -1.4303051e-03 -3.8781504e-03 ... -4.2909747e-03
  -1.8989781e-03  2.0868718e-03]
 [ 1.5592678e-03 -7.0976710e-04 -6.2429840e-03 ...  4.3531740e-04
  -2.0877537e-03  7.5717107e-05]
 ...
 [ 1.5653133e-03  3.4541423e-03  9.0903407e-03 ... -2.0071308e-03
   7.2882895e-04  1.0070583e-03]
 [ 1.5111169e-03  4.2188480e-03  5.4279929e-03 ... -3.2007727e-03
   2.3605190e-03 -2.1470897e-03]
 [ 8.4742554e-05  3.7038261e-03  2.9059593e-03 ... -2.3623868e-03
  -5.3060410e-04  1.9151808e-04]], shape=(100, 65), dtype=float32)


In [18]:
time_pred = pred[0]
print(len(time_pred))
print(time_pred) # 65 values representing the probabillity of each character occuring next

65
tf.Tensor(
[-0.0003273   0.00317516 -0.00595319  0.00124646  0.00307925  0.00133406
 -0.00358953 -0.00159095  0.00034507  0.00336984 -0.0025455  -0.00380446
  0.00356946  0.00287322  0.00102077 -0.00145042  0.00325365  0.00844909
 -0.00371635 -0.00042692 -0.0031486  -0.0045115   0.00071089  0.00400226
  0.00641349 -0.00235978 -0.00122275  0.00155455  0.00073546 -0.00080741
 -0.004888   -0.00663896  0.00125374 -0.00385286  0.00483243 -0.00159594
  0.0009032  -0.00142138 -0.00044307 -0.0013528  -0.0053758   0.00347007
 -0.00421145 -0.00187597 -0.00158415  0.00551064  0.00290984  0.0013058
  0.00152392  0.00107584  0.00633888  0.00520467 -0.00531372  0.00354684
 -0.00378028  0.00167322  0.00182533 -0.00495441 -0.01033763 -0.00143316
 -0.0061624  -0.0042495  -0.00125542  0.00018109 -0.00633808], shape=(65,), dtype=float32)


In [19]:
# If we want to determine the predicted character we need to sample the output distribution (pick a value based on probabillity)
sampled_indices = tf.random.categorical(pred, num_samples=1)

# now we can reshape that array and convert all the integers to numbers to see the actual characters
sampled_indices = np.reshape(sampled_indices, (1, -1))[0]
predicted_chars = int_to_text(sampled_indices)

predicted_chars  # and this is what the model predicted for training sequence 1

"DcnqjoKz$LffXhRTvp!SMr,UyUG Al:WjlWxGJDE!vIU-'LtW$oySTZeqGz X.TDp!qy?RMBpszy3arz3oQcry$,CJL:yTlE'J,g"

In [20]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

Compiling the model

In [21]:
model.compile(optimizer='adam', loss=loss)

Creating checkpoints

In [22]:
# Directory where the checkpoints will be saved
checkpoint_dir = 'D:/.temp/training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

Training

In [23]:
# history = model.fit(data, epochs=50, callbacks=[checkpoint_callback])

Loading the model

In [24]:
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)

Find last checkpoint

In [25]:
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

Load any checkpoint

In [26]:
# checkpoint_num = 10
# model.load_weights(tf.train.load_checkpoint("./training_checkpoints/ckpt_" + str(checkpoint_num)))
# model.build(tf.TensorShape([1, None]))

Generating text

In [27]:
def generate_text(model, start_string):
  # Evaluation step (generating text using the learned model)

  # Number of characters to generate
  num_generate = 800

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Empty string to store our results
  text_generated = []

  # Low temperatures results in more predictable text.
  # Higher temperatures results in more surprising text.
  # Experiment to find the best setting.
  temperature = 1.0

  # Here batch size == 1
  model.reset_states()
  for i in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
    
      predictions = tf.squeeze(predictions, 0)

      # using a categorical distribution to predict the character returned by the model
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted character as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [29]:
inp = input("Type a starting string: ")
print(generate_text(model, inp))

Come to me, for I had then last
with mine own folly--
And hear the sequel of your arms in loving to a happy evil:
Hich I have infect his realm as now Marcius dignift's reband?

MONTAGUE:
Many a minutes could
not back again to enter.

MISTRESS OVERDONE:
Well, well; therefore plantagenet mistrust
A trial of our slavish richard by the nose,
They be true heels, like unfolds all men ll GRICHARD III:
Faith, serve me, sonisted how to marry other.
Go men our noble ancestry athere's himself
Against both, which 'longs embraces him.

First Servingman:
Why learns that way, my lord?

PRINCE:
Sir, these abject
Shall will either perfection and their betterness, I
ngied to repent our majesty.

ADLAND:
Yes, I know him; 'tis a man of Pisa; by rinners love;
And I beseech your majesty to come again.

ROMEO:
I do beseec
