In [2]:
vocab = {}  # maps word to integer representing it
word_encoding = 1
def bag_of_words(text):
  global word_encoding

  words = text.lower().split(" ")  # create a list of all of the words in the text, well assume there is no grammar in our text for this example
  bag = {}  # stores all of the encodings and their frequency

  for word in words:
    if word in vocab:
      encoding = vocab[word]  # get encoding from vocab
    else:
      vocab[word] = word_encoding
      encoding = word_encoding  # so to keep a count of the no of times a word was used
      word_encoding += 1   # this line and the above line isn' related to each other

    if encoding in bag:  #this keeps a count here to check and add the frequency
      bag[encoding] += 1
    else:
      bag[encoding] = 1

  return bag

text = "this is a test to see if this test will work is is test a a"
bag = bag_of_words(text)
print(bag)
print(vocab)

{1: 2, 2: 3, 3: 3, 4: 3, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}
{'this': 1, 'is': 2, 'a': 3, 'test': 4, 'to': 5, 'see': 6, 'if': 7, 'will': 8, 'work': 9}


In [3]:
positive_review = "I thought the movie was going to be bad but it was actually amazing"
negative_review = "I thought the movie was going to be amazing but it was actually bad"

pos_bag = bag_of_words(positive_review)
neg_bag = bag_of_words(negative_review)

print("Positive:", pos_bag)
print("Negative:", neg_bag)

Positive: {10: 1, 11: 1, 12: 1, 13: 1, 14: 2, 15: 1, 5: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1}
Negative: {10: 1, 11: 1, 12: 1, 13: 1, 14: 2, 15: 1, 5: 1, 16: 1, 21: 1, 18: 1, 19: 1, 20: 1, 17: 1}


In [4]:
vocab = {}
word_encoding = 1
def one_hot_encoding(text):
  global word_encoding

  words = text.lower().split(" ")
  encoding = []

  for word in words:
    if word in vocab:
      code = vocab[word]
      encoding.append(code)
    else:
      vocab[word] = word_encoding
      encoding.append(word_encoding)
      word_encoding += 1

  return encoding

text = "this is a test to see if this test will work is is test a a"
encoding = one_hot_encoding(text)
print(encoding)
print(vocab)

[1, 2, 3, 4, 5, 6, 7, 1, 4, 8, 9, 2, 2, 4, 3, 3]
{'this': 1, 'is': 2, 'a': 3, 'test': 4, 'to': 5, 'see': 6, 'if': 7, 'will': 8, 'work': 9}


In [5]:
positive_review = "I thought the movie was going to be bad but it was actually amazing"
negative_review = "I thought the movie was going to be amazing but it was actually bad"

pos_encode = one_hot_encoding(positive_review)
neg_encode = one_hot_encoding(negative_review)

print("Positive:", pos_encode)
print("Negative:", neg_encode)

Positive: [10, 11, 12, 13, 14, 15, 5, 16, 17, 18, 19, 14, 20, 21]
Negative: [10, 11, 12, 13, 14, 15, 5, 16, 21, 18, 19, 14, 20, 17]


In [6]:
%tensorflow_version 2.x  # this line is not required unless you are in a notebook
from keras.datasets import imdb
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

VOCAB_SIZE = 88584

MAXLEN = 250
BATCH_SIZE = 64

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words = VOCAB_SIZE)

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [7]:
# Lets look at one review
len(train_data[1])

189

In [8]:
train_data = sequence.pad_sequences(train_data, MAXLEN)
test_data = sequence.pad_sequences(test_data, MAXLEN) # keras has this function that add and trim off the len if it's not 250

In [9]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 32),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

In [10]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          2834688   
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 2843041 (10.85 MB)
Trainable params: 2843041 (10.85 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
model.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=['acc'])

history = model.fit(train_data, train_labels, epochs=1, validation_split=0.2)  # not for too mab]ny epoch here cause it's taking time otherwise more epoch is great



In [12]:
results = model.evaluate(test_data, test_labels)
print(results)  # two outputs one is loss and one is accuracy

[0.31473198533058167, 0.8667200207710266]


In [14]:
word_index = imdb.get_word_index()

def encode_text(text):
  tokens = keras.preprocessing.text.text_to_word_sequence(text)
  tokens = [word_index[word] if word in word_index else 0 for word in tokens]
  return sequence.pad_sequences([tokens], MAXLEN)[0]

text = "that movie was just amazing, so amazing"
encoded = encode_text(text)
print(encoded)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0  12  17  13  4

In [15]:
# while were at it lets make a decode function

reverse_word_index = {value: key for (key, value) in word_index.items()}

def decode_integers(integers):
    PAD = 0
    text = ""
    for num in integers:
      if num != PAD:
        text += reverse_word_index[num] + " "

    return text[:-1]  # this simply removes the last " "

print(decode_integers(encoded))

that movie was just amazing so amazing


In [18]:
# now time to make a prediction

def predict(text):
  encoded_text = encode_text(text)
  pred = np.zeros((1,250)) # make a pred variable for all zeros of shape 1x250 cause the data we expect is of some number of entries x 250
  pred[0] = encoded_text # here we just join both of them the pred with the encoded_text (formed a list of encoded text)
  result = model.predict(pred)
  print(result[0]) # list in a list to access the first element of that list we need the indexing as to be 0
  # print(pred)
  if(result[0] > 0.5):
    print('Positive review detected!')
  else:
    print('Negative review detected!')

positive_review = "That movie was! really loved it and would great watch it again because it was amazingly great"
predict(positive_review)

negative_review = "that movie really sucked. I hated it and wouldn't watch it again. Was one of the worst things I've ever watched"
predict(negative_review)


[0.8717071]
Positive review detected!
[0.4540866]
Negative review detected!


In [2]:
%tensorflow_version 2.x  # this line is not required unless you are in a notebook
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [3]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [16]:
from google.colab import files
path_to_file = list(files.upload().keys())[0]

In [5]:
# Read, then decode for py2 compat.
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
# length of text is the number of characters in it
print ('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [6]:
# Take a look at the first 250 characters in text
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [7]:
vocab = sorted(set(text))
# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

def text_to_int(text):
  return np.array([char2idx[c] for c in text])

text_as_int = text_to_int(text)

In [8]:
# lets look at how part of our text is encoded
print("Text:", text[:13])
print("Encoded:", text_to_int(text[:13]))

Text: First Citizen
Encoded: [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [9]:
def int_to_text(ints):
  try:
    ints = ints.numpy()
  except:
    pass
  return ''.join(idx2char[ints])

print(int_to_text(text_as_int[:13]))

First Citizen


In [10]:
seq_length = 100  # length of sequence for a training example
examples_per_epoch = len(text)//(seq_length+1)  # this basically tells the number of traning examples that could be taken for the text for one epoch

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
print(char_dataset) # what this did is basically sliced all the string data into stream of characters to all that 1.1 million data characters (text) is shakespeare

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>


In [12]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True) # let's say we have 105 characters in our len(text) for one time run we are going to trip it to 101
# batch(seq_length is the length for each sequence) 101 is what is required rn (this we do because we want to map on to this afterwards in the next function)

In [13]:
def split_input_target(chunk):  # for the example: hello
    input_text = chunk[:-1]  # hell
    target_text = chunk[1:]  # ello
    return input_text, target_text  # hell, ello

dataset = sequences.map(split_input_target)  # we use map to apply the above function to every entry
# we can do the sequences by mapping them to the function

In [17]:
for x, y in dataset.take(2): # two represents the sequences of trainable data that's 2 and x and y works for input and output statements
  print("\n\nEXAMPLE\n")
  print("INPUT")
  print(int_to_text(x))
  print("\nOUTPUT")
  print(int_to_text(y))



EXAMPLE

INPUT
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

OUTPUT
irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


EXAMPLE

INPUT
are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you 

OUTPUT
re all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k


In [18]:
BATCH_SIZE = 64
VOCAB_SIZE = len(vocab)  # vocab is number of unique characters
EMBEDDING_DIM = 256
RNN_UNITS = 1024

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True) # shuffle the data is basically the data we get at a time every epoch and then batch size them

In [19]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    tf.keras.layers.LSTM(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])
  return model

model = build_model(VOCAB_SIZE,EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 lstm (LSTM)                 (64, None, 1024)          5246976   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 5330241 (20.33 MB)
Trainable params: 5330241 (20.33 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
for input_example_batch, target_example_batch in data.take(1): # creating a loss function
  example_batch_predictions = model(input_example_batch)  # ask our model for a prediction on our first batch of training data (64 entries)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")  # print out the output shape

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [21]:
# we can see that the predicition is an array of 64 arrays, one for each entry in the batch
print(len(example_batch_predictions)) # 64 total predictions
print(example_batch_predictions) # each of them prediction of what character could come next that is 65 character's probability of being the next character

64
tf.Tensor(
[[[-6.6562621e-03 -6.2509393e-03  6.5973583e-03 ...  6.8919417e-03
   -2.9821561e-03  1.2209292e-03]
  [-1.5615004e-03 -6.5179034e-03  5.1708366e-03 ...  3.6799298e-03
   -3.3513114e-03  2.2133882e-03]
  [ 2.6552146e-03 -1.0529354e-02  1.1287365e-02 ... -1.4594041e-03
   -3.3430716e-03  4.5694029e-03]
  ...
  [ 4.0199356e-03 -7.0387921e-03 -1.2453422e-03 ...  1.7238604e-03
   -4.5831208e-03 -1.1145329e-03]
  [ 2.5631809e-03 -6.1245002e-03  8.1057753e-04 ...  2.8249256e-03
   -5.0391606e-03 -1.7902120e-03]
  [ 2.3985710e-03 -5.0807828e-03  1.3089085e-03 ...  7.8770462e-03
   -4.7758734e-03 -2.6597013e-04]]

 [[-3.9371885e-03 -3.7110418e-03 -1.6933642e-03 ...  2.2689081e-03
    7.5603893e-04 -2.9975565e-03]
  [-6.4293495e-03  7.1477832e-04 -3.2383362e-03 ... -7.1461825e-04
    4.5869015e-03 -2.6686373e-03]
  [-2.7207448e-03 -4.8810616e-05 -1.4804297e-03 ...  4.0148608e-03
    3.1729159e-03  6.6614198e-04]
  ...
  [-4.1151149e-03 -5.2351104e-03 -5.1607755e-03 ...  1.0226725e

In [22]:
# lets examine one prediction
pred = example_batch_predictions[0]
print(len(pred))
print(pred)
# notice this is a 2d array of length 100, where each interior array is the prediction for the next character at each time step

100
tf.Tensor(
[[-0.00665626 -0.00625094  0.00659736 ...  0.00689194 -0.00298216
   0.00122093]
 [-0.0015615  -0.0065179   0.00517084 ...  0.00367993 -0.00335131
   0.00221339]
 [ 0.00265521 -0.01052935  0.01128737 ... -0.0014594  -0.00334307
   0.0045694 ]
 ...
 [ 0.00401994 -0.00703879 -0.00124534 ...  0.00172386 -0.00458312
  -0.00111453]
 [ 0.00256318 -0.0061245   0.00081058 ...  0.00282493 -0.00503916
  -0.00179021]
 [ 0.00239857 -0.00508078  0.00130891 ...  0.00787705 -0.00477587
  -0.00026597]], shape=(100, 65), dtype=float32)


In [23]:
# and finally well look at a prediction at the first timestep
time_pred = pred[0]
print(len(time_pred))
print(time_pred)
# and of course its 65 values representing the probabillity of each character occuring next

65
tf.Tensor(
[-6.6562621e-03 -6.2509393e-03  6.5973583e-03  3.2110182e-03
  3.6946475e-04 -3.7371381e-03  2.4953049e-03  1.3800042e-03
 -2.4038402e-04  8.3821360e-04  4.4876309e-03  1.0372584e-03
 -1.5536032e-04  3.9805616e-03 -1.5067154e-03 -9.6787128e-04
 -1.4555521e-04 -5.1938826e-03  2.6772509e-04 -2.1575596e-03
  3.4398115e-03  8.2159491e-04  2.2287644e-03 -2.5640614e-03
  6.5980450e-04  1.3191384e-03  8.8796054e-04  1.0127691e-03
  5.3809648e-03 -5.0393012e-03  6.6760695e-05  3.4107747e-03
 -9.6277483e-03  3.1510717e-04 -6.8989764e-03 -2.4129963e-03
  5.2518877e-03  3.9425981e-03  1.8721852e-03 -4.5279245e-04
  3.5685652e-03 -1.6247621e-03  6.5710105e-04  2.5349073e-03
 -3.3043926e-03 -1.5282098e-03  2.8020819e-03 -9.2649140e-04
 -1.9404087e-03  6.1734521e-05  5.2349092e-03  6.4825905e-03
 -3.6591985e-03  2.7226366e-04  2.6539369e-03  3.2993773e-04
  2.1286928e-03 -1.1044986e-03 -9.7449170e-04  1.8042559e-04
 -4.9414858e-03  1.9832575e-03  6.8919417e-03 -2.9821561e-03
  1.220929

In [25]:
# If we want to determine the predicted character we need to sample the output distribution (pick a value based on probabillity)
sampled_indices = tf.random.categorical(pred, num_samples=1) # we took the random character

# now we can reshape that array and convert all the integers to numbers to see the actual characters
sampled_indices = np.reshape(sampled_indices, (1, -1))[0]
predicted_chars = int_to_text(sampled_indices) # we sampled it cause it's not always that the most common is the one that's going to be the next character

predicted_chars  # and this is what the model predicted for training sequence 1 # this isn't trained that's why the result here is awful

"Fy'uYIBYsoWcBmZvh:zPaAzwotP Kfn,e,OiZt;HBLse'PT,&oqy.zvQZsTLiOLoktJ;jTRRNnG$R,TuXBuUkb&CbHgMha&dDYTj"

In [28]:
def loss(labels, logits): # build in loss function from keras (logits is the probability distribution)
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [29]:
model.compile(optimizer='adam', loss=loss)

In [31]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix, # checkpoint
    save_weights_only=True)
# This will allow us to load our model from a checkpoint & continue training it cause we have created a epoch of 50 and it can go further to 100 as wellwhich will take alot of time

In [34]:
history = model.fit(data, epochs=1, callbacks=[checkpoint_callback])

In [None]:
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)

In [None]:
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir)) #loading checkpoint the latest one
model.build(tf.TensorShape([1, None]))

In [None]:
checkpoint_num = 10
model.load_weights(tf.train.load_checkpoint("./training_checkpoints/ckpt_" + str(checkpoint_num)))
model.build(tf.TensorShape([1, None])) # here loading checkpoint any intermedite one (shape is basically one sequence and the length of sequence is not defined)

In [None]:
def generate_text(model, start_string):
  # Evaluation step (generating text using the learned model)

  # Number of characters to generate
  num_generate = 800

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0) # increase the dimention of it cause that's what it expects so earlier it was [1,2,4,5,6] not it's [[1,2,4,5,6]]

  # Empty string to store our results
  text_generated = []

  # Low temperatures results in more predictable text.
  # Higher temperatures results in more surprising text.
  # Experiment to find the best setting.
  temperature = 1.0

  # Here batch size == 1
  model.reset_states()  # reset it cause when we rebuild the model it had saved or stored the data of last state that it remembered when it was training
  for i in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension

      predictions = tf.squeeze(predictions, 0) # removes the extra outer dimention of the prediction that we'll get

      # using a categorical distribution to predict the character returned by the model
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy() # will give us the that random which it feels is to be given (categorical distribution)
      # predicted_id is the sampled output by the model which it has learned in it's training

      # We pass the predicted character as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0) # and that we'll going to add that to the input eval which is basically (inp) itself in idx form

      text_generated.append(idx2char[predicted_id]) # then append it to an empty list and convert it to the character it is

  return (start_string + ''.join(text_generated)) # then join it to the starting string that's (inp)

In [None]:
inp = input("Type a starting string: ")
print(generate_text(model, inp))