<a href="https://colab.research.google.com/github/AIKevin/Deep-Learning/blob/master/Language_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import time
import numpy as np
import tensorflow as tf

In [0]:
!mkdir data
!wget -q -O data/ptb.zip https://ibm.box.com/shared/static/z2yvmhbskc45xd2a9a4kkn6hg4g4kj5r.zip
!unzip -o data/ptb.zip -d data
!cp data/ptb/reader.py .
import reader

Archive:  data/ptb.zip
   creating: data/ptb/
  inflating: data/ptb/reader.py      
   creating: data/__MACOSX/
   creating: data/__MACOSX/ptb/
  inflating: data/__MACOSX/ptb/._reader.py  
  inflating: data/__MACOSX/._ptb     


In [0]:
!wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz 
!tar xzf simple-examples.tgz -C data/

--2019-02-19 00:38:15--  http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
Resolving www.fit.vutbr.cz (www.fit.vutbr.cz)... 147.229.9.23, 2001:67c:1220:809::93e5:917
Connecting to www.fit.vutbr.cz (www.fit.vutbr.cz)|147.229.9.23|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34869662 (33M) [application/x-gtar]
Saving to: ‘simple-examples.tgz’

simple-examples.tgz   8%[>                   ]   2.85M   827KB/s    eta 42s    --2019-02-19 00:38:15--  http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
Resolving www.fit.vutbr.cz (www.fit.vutbr.cz)... 147.229.9.23, 2001:67c:1220:809::93e5:917
Connecting to www.fit.vutbr.cz (www.fit.vutbr.cz)|147.229.9.23|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34869662 (33M) [application/x-gtar]
Saving to: ‘simple-examples.tgz’

simple-examples.tgz   8%[>                   ]   2.85M   827KB/s    eta 42s    --2019-02-19 00:38:15--  http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-exampl

In [0]:
hpm={
#Initial weight scale
'init_scale' : 0.1,
#Initial learning rate
'learning_rate' : 1.0,
#Maximum permissible norm for the gradient (For gradient clipping -- another measure against Exploding Gradients)
'max_grad_norm' : 5,
#The number of layers in our model
'num_layers' : 2,
#The total number of recurrence steps, also known as the number of layers when our RNN is "unfolded"
'num_steps' : 20,
#The number of processing units (neurons) in the hidden layers
'hidden_size' : 256,
#The maximum number of epochs trained with the initial learning rate
'max_epoch' : 8,
#The total number of epochs in training
'max_max_epoch' : 8,
#The probability for keeping data in the Dropout Layer (This is an optimization, but is outside our scope for this notebook!)
#At 1, we ignore the Dropout Layer wrapping.
'keep_prob' : 1,
#The decay for the learning rate
'decay' : 0.5,
#The size for each batch of data
'batch_size' : 30,
#The size of our vocabulary
'vocab_size' : 10000,
#Training flag to separate training from testing
'is_training' : 1,
#Data directory for our dataset
'data_dir' : "data/simple-examples/data/"
}

In [0]:
class PTBModel():

    def __init__(self, is_training, hpm):
      
      self.hpm=hpm
      #Placelholders
      self._input_data = tf.placeholder(tf.int32, [self.hpm['batch_size'], self.hpm['num_steps']], name='input_data') #[30#20]
      self._targets = tf.placeholder(tf.int32, [self.hpm['batch_size'], self.hpm['num_steps']], name='targets') #[30#20]
      
      #
      lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.hpm['hidden_size'], forget_bias=0.0)
      
      
      if is_training and self.hpm['keep_prob'] < 1:
            lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=self.hpm['keep_prob'])
          
      stacked_lstm = tf.contrib.rnn.MultiRNNCell([lstm_cell] * self.hpm['num_layers'])
      self._initial_state = stacked_lstm.zero_state(self.hpm['batch_size'], tf.float32)
      with tf.device("/gpu:0"):
        embedding = tf.get_variable("embedding", [hpm['vocab_size'], self.hpm['hidden_size']])  #[10000x200]
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)
        
      if is_training and self.hpm['keep_prob'] < 1:
            inputs = tf.nn.dropout(inputs, keep_prob)
          
      outputs, state = tf.nn.dynamic_rnn(stacked_lstm, inputs, initial_state=self._initial_state)
      output_id= tf.identity(outputs, name='output')
      output = tf.reshape(outputs, [-1, self.hpm['hidden_size']])
      softmax_w = tf.get_variable("softmax_w", [self.hpm['hidden_size'], self.hpm['vocab_size']]) #[200x1000]
      softmax_b = tf.get_variable("softmax_b", [self.hpm['vocab_size']]) #[1x1000]
      logits = tf.matmul(output, softmax_w) + softmax_b
      
      logits=tf.identity(logits, name= 'output_fc')
      
      loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [tf.reshape(self._targets, [-1])],
                                                      [tf.ones([self.hpm['batch_size'] * self.hpm['num_steps']])])
      self._cost = cost = tf.reduce_sum(loss) / self.hpm['batch_size']
      self._final_state = state

      #Everything after this point is relevant only for training
      if not is_training:
          return

      self._lr = tf.Variable(0.0, trainable=False)
      tvars = tf.trainable_variables()
      grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), self.hpm['max_grad_norm'])
      optimizer = tf.train.GradientDescentOptimizer(self._lr)
      self._train_op = optimizer.apply_gradients(zip(grads, tvars))
      
    def assign_lr(self, session, lr_value):
      session.run(tf.assign(self._lr, lr_value))

In [0]:
raw_data = reader.ptb_raw_data(hpm['data_dir'])
train_data, valid_data, test_data, _, _ = raw_data

In [0]:
def run_epoch(session, m, data, eval_op, verbose=False):

    #Define the epoch size based on the length of the data, batch size and the number of steps
    epoch_size = ((len(data) // m.hpm['batch_size']) - 1) // m.hpm['num_steps']
    start_time = time.time()
    costs = 0.0
    iters = 0
    #state = m.initial_state.eval()
    #m.initial_state = tf.convert_to_tensor(m.initial_state) 
    #state = m.initial_state.eval()
    state = session.run(m._initial_state)
    
    #For each step and data point
    for step, (x, y) in enumerate(reader.ptb_iterator(data, m.hpm['batch_size'], m.hpm['num_steps'])):
 
        #Evaluate and return cost, state by running cost, final_state and the function passed as parameter
        cost, state, _ = session.run([m._cost, m._final_state, eval_op],
                                     {m._input_data: x,
                                      m._targets: y,
                                      m._initial_state: state})
        
        #Add returned cost to costs (which keeps track of the total costs for this epoch)
        costs += cost
        
        #Add number of steps to iteration counter
        iters += m.hpm['num_steps']

        if verbose and step % (epoch_size // 10) == 10:
            print("%.3f perplexity: %.3f speed: %.0f wps" % (step * 1.0 / epoch_size, np.exp(costs / iters),
              iters * m.hpm['batch_size'] / (time.time() - start_time)))

    # Returns the Perplexity rating for us to keep track of how the model is evolving
    return np.exp(costs / iters)

In [0]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [0]:

#Initializes the Execution Graph and the Session
with tf.Graph().as_default(), tf.Session() as session:
    initializer = tf.random_uniform_initializer(-hpm['init_scale'],hpm['init_scale'])
    
    # Instantiates the model for training
    # tf.variable_scope add a prefix to the variables created with tf.get_variable
    with tf.variable_scope("model", reuse=None, initializer=initializer):
        m = PTBModel(True, hpm)
    saver= tf.train.Saver()
    # Reuses the trained parameters for the validation and testing models
    # They are different instances but use the same variables for weights and biases, they just don't change when data is input
    with tf.variable_scope("model", reuse=True, initializer=initializer):
        mvalid = PTBModel(False, hpm)
        mtest = PTBModel(False, hpm)

    #Initialize all variables
    tf.global_variables_initializer().run()

    for i in range(m.hpm['max_max_epoch']):
        # Define the decay for this epoch
        lr_decay = m.hpm['decay'] ** max(i - m.hpm['max_epoch'], 0.0)
        
        # Set the decayed learning rate as the learning rate for this epoch
        m.assign_lr(session, m.hpm['learning_rate'] * lr_decay)

        print("Epoch %d : Learning rate: %.3f" % (i + 1, session.run(m._lr)))
        
        # Run the loop for this epoch in the training model
        train_perplexity = run_epoch(session, m, train_data, m._train_op,
                                   verbose=True)
        print("Epoch %d : Train Perplexity: %.3f" % (i + 1, train_perplexity))
        
        # Run the loop for this epoch in the validation model
        valid_perplexity = run_epoch(session, mvalid, valid_data, tf.no_op())
        print("Epoch %d : Valid Perplexity: %.3f" % (i + 1, valid_perplexity))
    
    # Run the loop in the testing model to see how effective was our training
    test_perplexity = run_epoch(session, mtest, test_data, tf.no_op())
    
    print("Test Perplexity: %.3f" % test_perplexity)
    save_path= saver.save(session, "/gdrive/My Drive/Colab Notebooks/Checkpoints/LMmodel.ckpt")

Epoch 1 : Learning rate: 1.000
0.006 perplexity: 7494.461 speed: 10055 wps
0.106 perplexity: 1105.156 speed: 14202 wps
0.205 perplexity: 812.629 speed: 14301 wps
0.305 perplexity: 647.097 speed: 14356 wps
0.404 perplexity: 534.577 speed: 14385 wps
0.504 perplexity: 468.034 speed: 14408 wps
0.603 perplexity: 419.276 speed: 14411 wps
0.702 perplexity: 383.488 speed: 14417 wps
0.802 perplexity: 356.537 speed: 14417 wps
0.901 perplexity: 331.548 speed: 14417 wps
Epoch 1 : Train Perplexity: 312.051
Epoch 1 : Valid Perplexity: 186.312
Epoch 2 : Learning rate: 1.000
0.006 perplexity: 206.086 speed: 11974 wps
0.106 perplexity: 182.886 speed: 14178 wps
0.205 perplexity: 175.493 speed: 14266 wps
0.305 perplexity: 167.841 speed: 14317 wps
0.404 perplexity: 159.845 speed: 14334 wps
0.504 perplexity: 156.608 speed: 14350 wps
0.603 perplexity: 153.043 speed: 14364 wps
0.702 perplexity: 149.914 speed: 14371 wps
0.802 perplexity: 147.705 speed: 14383 wps
0.901 perplexity: 144.022 speed: 14391 wps
Epoc