<a href="https://colab.research.google.com/github/BlackCurrantDS/DeepLearning/blob/main/IDL_Assignments6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference- Last assignment+https://www.tensorflow.org/tutorials/text/text_generation

In [14]:
import tensorflow as tf
import numpy as np
import keras

In [15]:
!python prepare_data2.py shk_input.txt skp \\n -m 500

2020-12-03 20:42:59.441724: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
Split input into 40000 sequences...
Longest sequence is 65 characters. If this seems unreasonable, consider using the maxlen argument!
Removing sequences longer than 500 characters...
40000 sequences remaining.
Longest remaining sequence has length 65.
Removing length-0 sequences...
40000 sequences remaining.
Serialized 100 sequences...
Serialized 200 sequences...
Serialized 300 sequences...
Serialized 400 sequences...
Serialized 500 sequences...
Serialized 600 sequences...
Serialized 700 sequences...
Serialized 800 sequences...
Serialized 900 sequences...
Serialized 1000 sequences...
Serialized 1100 sequences...
Serialized 1200 sequences...
Serialized 1300 sequences...
Serialized 1400 sequences...
Serialized 1500 sequences...
Serialized 1600 sequences...
Serialized 1700 sequences...
Serialized 1800 sequences...
Serialized 1900 sequences...
S

In [16]:
#Getting the data
from prepare_data2 import parse_seq
import pickle

# this is just a datasets of "bytes" (not understandable)
data = tf.data.TFRecordDataset("skp.tfrecords")

# this maps a parser function that properly interprets the bytes over the dataset
# (with fixed sequence length 200)
# if you change the sequence length in preprocessing you also need to change it here
data = data.map(parse_seq) #since changing seq lengths

# a map from characters to indices
vocab = pickle.load(open("skp_vocab", mode="rb"))
vocab_size = len(vocab)
# inverse mapping: indices to characters
ind_to_ch = {ind: ch for (ch, ind) in vocab.items()}

print(vocab)
print(vocab_size)

{'e': 3, 'j': 4, 'i': 5, 'm': 6, 'p': 7, 's': 8, 'u': 9, 'L': 10, 'x': 11, '&': 12, '$': 13, 'f': 14, 'Q': 15, 'v': 16, 'R': 17, 'D': 18, 'h': 19, ',': 20, 'w': 21, 'C': 22, 'M': 23, '\n': 24, "'": 25, 'g': 26, 'T': 27, '?': 28, 'W': 29, 'J': 30, 'y': 31, 'U': 32, '3': 33, 'F': 34, 'o': 35, 'z': 36, '.': 37, 'A': 38, 'l': 39, 'b': 40, '!': 41, 'q': 42, ':': 43, 'd': 44, 'H': 45, 'V': 46, 'G': 47, 'N': 48, 'r': 49, 'P': 50, 'X': 51, 'S': 52, '-': 53, 'Z': 54, 'k': 55, ' ': 56, 'O': 57, 'c': 58, 'K': 59, 't': 60, 'B': 61, 'a': 62, 'I': 63, 'E': 64, ';': 65, 'n': 66, 'Y': 67, '<PAD>': 0, '<S>': 1, '</S>': 2}
68


In [17]:
#for current and expected time stamps
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

data = data.map(split_input_target)

In [18]:
# Batch size
BATCH_SIZE = 128

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

data = data.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, drop_remainder=True) #to make the batch size of eqal since sequence is of variable lengths

In [19]:
def build_model(vocab_size, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, 256, batch_input_shape=[BATCH_SIZE, None]),
        tf.keras.layers.LSTM(rnn_units,
                            return_sequences=True,
                            stateful=True,
                            recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model

In [20]:
model = build_model(
    vocab_size=68,
    rnn_units=512,
    batch_size=BATCH_SIZE)

In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 256)          17408     
_________________________________________________________________
lstm (LSTM)                  (128, None, 512)          1574912   
_________________________________________________________________
dense (Dense)                (128, None, 68)           34884     
Total params: 1,627,204
Trainable params: 1,627,204
Non-trainable params: 0
_________________________________________________________________


In [22]:

#running it
epoch = 50
opt = tf.keras.optimizers.Adam()
for e in range(epoch):
  for batch_num, (batch_data,y) in enumerate(data): #this is already batched of 128 batch size
    total_loss = 0.0 #total loss over sequence 
    
    with tf.GradientTape(persistent=True) as tape:
      actual_chars = tf.TensorArray(tf.int64, size=BATCH_SIZE)
      for char_pos, which_seq in enumerate(batch_data): #goign over the sequence in each batch
            actual_chars = actual_chars.write(char_pos, tf.math.count_nonzero(which_seq))
      mask = tf.sequence_mask(actual_chars.stack(), dtype=tf.float32)
      logits = model(batch_data)
      loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = y, logits = logits)
      loss = loss * mask
        
    grads = tape.gradient(loss, model.trainable_variables) #gradients against the parametes/weights and biasas
    opt.apply_gradients(zip(grads, model.trainable_variables))
    total_loss = tf.reduce_sum(loss)/float(tf.reduce_sum(actual_chars.stack())) # this is total loss over the whole batch
    if batch_num % 100 == 0:
          print('Epoch {} Batch {} Loss {}'.format(e, batch_num, total_loss))
    
      
  model.reset_states()

Epoch 0 Batch 0 Loss 4.21833610534668
Epoch 0 Batch 100 Loss 2.5078413486480713
Epoch 0 Batch 200 Loss 2.231553316116333
Epoch 0 Batch 300 Loss 2.0322964191436768
Epoch 1 Batch 0 Loss 2.3398070335388184
Epoch 1 Batch 100 Loss 1.8988629579544067
Epoch 1 Batch 200 Loss 1.844454288482666
Epoch 1 Batch 300 Loss 1.761122465133667
Epoch 2 Batch 0 Loss 1.9163339138031006
Epoch 2 Batch 100 Loss 1.6964704990386963
Epoch 2 Batch 200 Loss 1.6497725248336792
Epoch 2 Batch 300 Loss 1.658891201019287
Epoch 3 Batch 0 Loss 1.7464948892593384
Epoch 3 Batch 100 Loss 1.5683289766311646
Epoch 3 Batch 200 Loss 1.5081709623336792
Epoch 3 Batch 300 Loss 1.5550062656402588
Epoch 4 Batch 0 Loss 1.5588258504867554
Epoch 4 Batch 100 Loss 1.485350489616394
Epoch 4 Batch 200 Loss 1.4992015361785889
Epoch 4 Batch 300 Loss 1.4667127132415771
Epoch 5 Batch 0 Loss 1.4824469089508057
Epoch 5 Batch 100 Loss 1.4501445293426514
Epoch 5 Batch 200 Loss 1.402001976966858
Epoch 5 Batch 300 Loss 1.3822622299194336
Epoch 6 Batc

In [23]:
tf.saved_model.save(model, "/content/drive/MyDrive/lstm_folder")

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: /content/drive/MyDrive/lstm_folder/assets


In [None]:
import os
# Directory where the checkpoints will be saved
checkpoint_dir = '/content/drive/MyDrive/chkpoint_folder'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)


In [None]:
model.load_weights(checkpoint_dir)

In [33]:
tf.train.latest_checkpoint(checkpoint_dir)

In [43]:
model = build_model(
    vocab_size=68,
    rnn_units=512,
    batch_size=1)

model.build(tf.TensorShape([1, None]))

In [44]:
# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

In [45]:
def generate_text(model, start_string):
    # Evaluation step (generating text using the learned model)

    # Number of characters to generate
    num_generate = 1000

    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Empty string to store our results
    text_generated = []

    # Low temperature results in more predictable text.
    # Higher temperature results in more surprising text.
    # Experiment to find the best setting.
    temperature = 1.0

    # Here batch size == 1
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # using a categorical distribution to predict the character returned by the model
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # Pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [46]:
print(generate_text(model, start_string=u"ROMEO: "))

InvalidArgumentError: ignored

In [None]:
for i in data.take(1):
  out = model(i)
  prob_out = tf.nn.softmax(out)
  for i in prob_out:
    index = np.random.choice(a=vocab_size, size=1, p=prob_out.numpy()[0])
    for char in vocab.keys():
      if vocab[char] == index:
        output_string += char
  print(output_string)
  