In [1]:
from __future__ import print_function
import os
import numpy as np
import tensorflow as tf
import random 
import string
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [2]:
# Base URL that archive names are appended to when downloading.
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    """Download `filename` from `url` if it is not present locally, then verify its size.

    Args:
      filename: archive name; used both as the local path and as the URL suffix.
      expected_bytes: size in bytes the local file must have to be accepted.

    Returns:
      The local path of the verified file.

    Raises:
      Exception: if the file size differs from `expected_bytes` (the actual
        size is printed first).
    """
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        # Show the size we actually got to make the mismatch easy to diagnose.
        print(actual_bytes)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified %s' % filename)
    return filename
# Fetch (or reuse) the text8 archive; 31344016 is the byte size the file is verified against.
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
    """Return the contents of the first member of a zip archive as a string.

    Args:
      filename: path to the zip archive.

    Returns:
      The first member's bytes converted with `tf.compat.as_str`, or None if
      the archive has no members.
    """
    # The context manager closes the archive even though we return from inside
    # the loop; previously the trailing close() call was never reached.
    with zipfile.ZipFile(filename) as archive:
        for name in archive.namelist():
            return tf.compat.as_str(archive.read(name))
    return None
# Load the corpus as a single string and report its length in characters.
text = read_data(filename)
print('Data size %d' % len(text))    

Data size 1000


In [4]:
# Hold out the first `valid_size` characters for validation; everything after
# them is used for training.
valid_size = 100
valid_text, train_text = text[:valid_size], text[valid_size:]
train_size = len(train_text)

# Show the size and the first 60 characters of each split.
print(train_size, train_text[:60])
print(valid_size, valid_text[:60])

950 d against early working class radicals including the diggers of 
50  anarchism originated as a term of abuse first use


In [5]:
# Vocabulary: id 0 is the space character, ids 1..26 are the letters a..z.
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
  """Map a character to its vocabulary id.

  Lowercase ASCII letters map to 1..26 and a space maps to 0. Any other
  character is reported on stdout and also mapped to 0.
  """
  if char in string.ascii_lowercase:
    return ord(char) - first_letter + 1
  if char != ' ':
    print('Unexpected character: %s' % char)
  return 0
  
def id2char(dictid):
  """Map a vocabulary id back to its character.

  Ids greater than 0 are offset from `first_letter` (so 1 is the first
  lowercase letter); any id that is 0 or negative yields a space.
  """
  return chr(dictid + first_letter - 1) if dictid > 0 else ' '

# Sanity-check the mapping in both directions, including one character outside the vocabulary.
print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

Unexpected character: ï
1 26 0 0
a z  


In [6]:
# batch_size: number of text positions read in parallel per batch.
# num_unrollings: number of new batches produced per call to BatchGenerator.next().
batch_size=5
num_unrollings=7

class BatchGenerator(object):
  """Produces arrays of 1-hot encoded character batches from a text.

  The text is divided into `batch_size` equal segments and one cursor walks
  each segment, wrapping around at the end of the text. Every batch holds one
  character per cursor, encoded as a row of length `vocabulary_size`.
  """

  def __init__(self, text, batch_size, num_unrollings):
    """Set up the cursors and pre-generate the first batch.

    Args:
      text: the string to read characters from.
      batch_size: number of cursors, i.e. rows per batch.
      num_unrollings: number of new batches returned by each call to next().
    """
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    # Start each cursor at the beginning of its own segment of the text.
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    # Builtin float is what the np.float alias referred to; the alias itself
    # was removed in NumPy 1.24 and raises AttributeError there.
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=float)
    for b in range(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      # Advance this row's cursor, wrapping at the end of the text.
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    # Remember the final batch so the next array starts where this one ended.
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
  """Decode each row of a 1-hot encoding or probability distribution into its
  most likely character, returning one character per row."""
  most_likely_ids = np.argmax(probabilities, 1)
  return list(map(id2char, most_likely_ids))

def batches2string(batches):
  """Decode a sequence of batches into strings, one string per batch row,
  by appending each batch's most likely character to its row."""
  rows = [''] * batches[0].shape[0]
  for batch in batches:
    rows = [prefix + char for prefix, char in zip(rows, characters(batch))]
  return rows

# Training generator uses the notebook-level batch settings; the validation
# generator reads one position at a time and yields 2 new batches per call.
train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 2)

# Decode a few arrays of batches back to text to eyeball the generators.
print(batches2string(train_batches.next()))
print('-'*80)
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))


['d agains', 'cribe an', 'reek wit', 'at this ', 'does not']
--------------------------------------------------------------------------------
[' an']
['nar']


In [7]:
def logprob(predictions, labels):
  """Average negative log-probability of the true labels in a predicted batch.

  Args:
    predictions: array of predicted probabilities, one row per example.
    labels: array of the same shape holding the (1-hot) true labels.

  Returns:
    The sum of labels * -log(predictions) divided by the number of rows in
    `labels`. Probabilities below 1e-10 are treated as 1e-10.
  """
  # Floor tiny probabilities so log() stays finite. np.maximum builds a new
  # array, so the caller's predictions are no longer overwritten in place.
  clipped = np.maximum(predictions, 1e-10)
  return np.sum(np.multiply(labels, -np.log(clipped))) / labels.shape[0]

def sample_distribution(distribution):
  """Draw one index from a distribution given as normalized probabilities.

  A uniform random threshold in [0, 1] is drawn and the first index whose
  cumulative probability reaches it is returned. If no index reaches the
  threshold, the last index is returned.
  """
  threshold = random.uniform(0, 1)
  cumulative = 0
  for index, probability in enumerate(distribution):
    cumulative += probability
    if cumulative >= threshold:
      return index
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples.

  Draws one index from the first row of `prediction` with
  sample_distribution and returns an array of shape [1, vocabulary_size]
  that is 1.0 at that index and 0.0 elsewhere.
  """
  # Builtin float is what the np.float alias referred to; the alias itself
  # was removed in NumPy 1.24 and raises AttributeError there.
  p = np.zeros(shape=[1, vocabulary_size], dtype=float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Return a random [1, vocabulary_size] array whose row sums to 1."""
  weights = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  row_totals = np.sum(weights, 1)[:, None]
  return weights / row_totals

LSTM WITH BIGRAMS


In [8]:
# One-layer LSTM language model whose inputs are embedded character bigrams.
# num_nodes is the width of the LSTM output/state; embedding_size is the width
# of each bigram embedding vector.
num_nodes=90
embedding_size= 128

graph=tf.Graph()
with graph.as_default():
    # Input data: num_unrollings+1 one-hot character batches per training step.
    train_data=list()
    for _ in range(num_unrollings+1):
        train_data.append(
                         tf.placeholder(tf.float32,shape=[batch_size,vocabulary_size]))
    
    # Adjacent pairs (t, t+1) taken from the first num_unrollings placeholders
    # are the bigram inputs; the label for each pair is the placeholder at t+2.
    train_chars=train_data[:num_unrollings]
    train_inputs=zip(train_chars[:-1], train_chars[1:])
    train_labels=train_data[2:]
        
 
    # One embedding row per possible bigram (vocabulary_size * vocabulary_size rows).
    vocabulary_embeddings=tf.Variable(tf.random_uniform([vocabulary_size*vocabulary_size,embedding_size],
                                                       -1.0,1.0))
    
        
    # Cell parameters. For each gate: *x multiplies the input embedding,
    # *h multiplies the previous output, *b is the bias.
    # NOTE(review): the positional arguments after the shape in
    # tf.truncated_normal are (mean, stddev), so these calls use mean=-0.1 and
    # stddev=0.1 rather than a [-0.1, 0.1] range -- confirm this is intended.
    # Input gate
    ix = tf.Variable(tf.truncated_normal([embedding_size,num_nodes],-0.1,0.1))
    ih=tf.Variable(tf.truncated_normal([num_nodes,num_nodes],-0.1,0.1))
    ib=tf.Variable(tf.zeros([1,num_nodes]))
    
    # Candidate state update (the tanh branch, `g` in lstm_cell)
    gx=tf.Variable(tf.truncated_normal([embedding_size,num_nodes],-0.1,0.1))
    gh=tf.Variable(tf.truncated_normal([num_nodes,num_nodes],-0.1,0.1))
    gb=tf.Variable(tf.zeros([1,num_nodes]))
    
    # Forget gate
    fx=tf.Variable(tf.truncated_normal([embedding_size,num_nodes],-0.1,0.1))
    fh=tf.Variable(tf.truncated_normal([num_nodes,num_nodes],-0.1,0.1))
    fb=tf.Variable(tf.zeros([1,num_nodes]))
    
    # Output gate
    ox=tf.Variable(tf.truncated_normal([embedding_size,num_nodes],-0.1,0.1))
    oh=tf.Variable(tf.truncated_normal([num_nodes,num_nodes],-0.1,0.1))
    ob=tf.Variable(tf.zeros([1,num_nodes]))
   
    # Classifier parameters: project the LSTM output onto the vocabulary.
    w=tf.Variable(tf.truncated_normal([num_nodes,vocabulary_size],-0.1,0.1))
    b=tf.Variable(tf.zeros([vocabulary_size]))
    
    '''note that this LSTM architechture solves vanishing gradient problem via its amazing inner 
      properties of propagating error backward smoothly between many time steps.'''
    
    # LSTM cell computation
    def lstm_cell(input,prev_h,prev_s):
        """Run one LSTM step.

        Args:
          input: input tensor for this step (multiplied by the *x matrices).
          prev_h: output of the previous step.
          prev_s: cell state of the previous step.

        Returns:
          (output, state) for this step.
        """
        g=tf.tanh( tf.matmul(input,gx) + tf.matmul(prev_h,gh) + gb )
        input_gate=tf.sigmoid( tf.matmul(input,ix) + tf.matmul(prev_h,ih) + ib )
        
        forget_gate=tf.sigmoid( tf.matmul(input,fx) + tf.matmul(prev_h,fh) + fb )
        
        # New state: gated candidate plus the gated previous state.
        state= g*input_gate + forget_gate*prev_s
        
        output_gate=tf.sigmoid( tf.matmul(input,ox) + tf.matmul(prev_h,oh) + ob )
        
        output = tf.tanh(state) * output_gate
        return output,state
    
        
    # Non-trainable variables that carry the output and state from one
    # training step (one unrolled sequence) to the next.
    saved_value_output=tf.Variable(tf.zeros([batch_size,num_nodes]),trainable=False)
    saved_value_state=tf.Variable(tf.zeros([batch_size,num_nodes]),trainable=False)
     

        
    # Unrolled LSTM loop
    outputs=list()  # per-step LSTM outputs (after dropout) fed to the classifier
    output=saved_value_output
    state=saved_value_state
    
    for i in train_inputs:
        # Flatten the (first char, second char) pair into a single embedding row index.
        bigram_index= tf.arg_max(i[0],dimension=1) + vocabulary_size* tf.arg_max(i[1],dimension=1)
        iembed=tf.nn.embedding_lookup(vocabulary_embeddings,bigram_index)
        
        '''dropout applied in LSTM only in depth(non recurrent connections)'''
        
        dropout_i=tf.nn.dropout(iembed,0.5)  # dropout on the cell input
        
        # The recurrent path uses the un-dropped output and state.
        output,state = lstm_cell(dropout_i,output,state)
        
        drop_output=tf.nn.dropout(output,0.5)   # dropout on the output passed to the classifier
        outputs.append(drop_output)
      
        
        
    # Save the final output and state so the next sequence can continue from them;
    # the control dependency makes the logits/loss depend on those assignments.
    with tf.control_dependencies([saved_value_output.assign(output),
                                  saved_value_state.assign(state)]):
        
        logits=tf.nn.xw_plus_b(tf.concat(0,outputs),w,b)
        
        # L2 regularization on the classifier weights and bias
        regularizer=tf.nn.l2_loss(w) + tf.nn.l2_loss(b)
        
        loss=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits,
                                                             tf.concat(0,train_labels)))
        loss+=5e-4*regularizer
        
    # Optimizer: gradient descent with a learning rate that starts at
    # starter_lr and is multiplied by 0.1 every 5000 steps (staircase decay).
    global_step=tf.Variable(0,trainable=False)
    starter_lr=10.0
    learning_rate=tf.train.exponential_decay(starter_lr,global_step,5000,0.1,
                                                staircase=True)
        
    optimizer=tf.train.GradientDescentOptimizer(learning_rate)
    gradients,v = zip(*optimizer.compute_gradients(loss))
        
    # Clip by global norm to handle the exploding gradient problem.
    gradients,_=tf.clip_by_global_norm(gradients,1.30)
        
    # From here on `optimizer` names the training op, not the optimizer object.
    optimizer=optimizer.apply_gradients(zip(gradients,v),global_step=global_step)
        
        
    # Predictions for the training batch
    train_preds=tf.nn.softmax(logits)
    
    # Sampling / validation path: batch of 1, a single step, no dropout.
    
    # Two one-hot (or probability) rows forming the input bigram.
    sample_input=list()
    for i in range(2):
        sample_input.append(tf.placeholder(tf.float32,shape=[1,vocabulary_size]) )
        
    sample_biindex=tf.arg_max(sample_input[0],dimension=1) + vocabulary_size* tf.arg_max(sample_input[1],
                                                                                        dimension=1)
    
    sample_embed=tf.nn.embedding_lookup(vocabulary_embeddings,sample_biindex)
    
    # Output and state carried between successive sampling steps.
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    
    # Op that zeroes the carried sampling output and state.
    reset_sample_state = tf.group(
                   saved_sample_output.assign(tf.zeros([1, num_nodes])),
                         saved_sample_state.assign(tf.zeros([1, num_nodes])))
    
    sample_output,sample_state=lstm_cell(sample_embed,saved_sample_output,saved_sample_state)
    
    # Evaluating valid_preds also stores the new output/state for the next step.
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
        valid_preds=tf.nn.softmax(tf.nn.xw_plus_b(sample_output,w,b))
    
       

In [9]:

import collections
# num_steps: total training steps; summary_frequency: steps between progress reports.
num_steps = 100000
summary_frequency = 1000

# Fresh validation generator: one position at a time, 2 new batches per call
# (so each call yields 3 batches: the carried-over one plus 2 new ones).
valid_batches = BatchGenerator(valid_text, 1, 2)
with tf.Session(graph=graph) as se:
    tf.initialize_all_variables().run()
    
    mean_loss=0
    for step in range(num_steps):
        # Feed one array of num_unrollings+1 batches into the training placeholders.
        batche=train_batches.next()
        feed_dict=dict()
        for i in range(num_unrollings+1):
            feed_dict[train_data[i]]=batche[i]
        
        _,l,lr,predictions = se.run([optimizer,loss,learning_rate,train_preds],feed_dict=feed_dict)
        
        mean_loss+=l
        if step% summary_frequency ==0:
            
            # At step 0 mean_loss is just the first step's loss, so it is not averaged.
            if step>0:
                mean_loss=mean_loss/summary_frequency
                
            print(
                        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0    
            
            # Labels start at index 2, matching train_labels in the graph.
            labels = np.concatenate(list(batche)[2:])
            
            print('Minibatch perplexity: %.2f' % float(
                   np.exp(logprob(predictions, labels))))
            
            if step% (summary_frequency*10) ==0:
                
                # Generate 7 sample sentences, each seeded with two random
                # characters and extended by 79 sampled characters.
                print('='*100)
                for _ in range(7):
                    # Sliding window holding the two most recent characters.
                    feed = collections.deque(maxlen=2)
                    for _ in range(2):  
                        feed.append(random_distribution())
                        
                    sentence = characters(feed[0])[0] + characters(feed[1])[0]    
                    reset_sample_state.run()
                    
                    for _ in range(79):
                        prediction = valid_preds.eval({
                                   sample_input[0]: feed[0],
                                   sample_input[1]: feed[1]
                                                           })
                        # Appending drops the oldest entry, so feed[1] is the new sample.
                        feed.append(sample(prediction))
                        sentence += characters(feed[1])[0]
                        
                    print(sentence)    
                print('=' * 80)  
            
            # Validation set perplexity
            # NOTE(review): each valid_batches.next() call advances the cursor by
            # two characters (b[0] is the batch carried over from the previous
            # call), so successive validation bigrams do not overlap, unlike the
            # overlapping pairs used in training and in sample generation above
            # -- confirm this is intended.
            reset_sample_state.run()
            valid_logprob = 0
            for i in range(valid_size):
                b=valid_batches.next()
                predictions = valid_preds.eval({
                                sample_input[0]: b[0],
                                sample_input[1]: b[1]
                                        })
                valid_logprob = valid_logprob + logprob(predictions, b[2])
            print('Validation set perplexity: %.2f' % float(np.exp(
                                   valid_logprob / valid_size)))    

         

Average loss at step 0: 3.305193 learning rate: 10.000000
Minibatch perplexity: 27.06
aslr hnvhregtmxruztovtnxaaptn i  ayiatllktc  mppdoarph i ayyjw wludtcyt ttptdtgi 
fkr xwat ah ztd n ntojaubtnanv  ju pqtiumgturvcmaoryekejhd lr pvoa guvpaeas xtac 
hwa lcnt ama ull somos uata or ntqcah ap lr lvta   jyutyaolauksgtomktkah s oje it
x h ttxtw qttqzsnhztjaat ttdtvit ayjc  ntr ctaorwxll p  aratp t  x  hbnyn ntsmt  
gkbht    o ma ftge zqpqtadhtmp  c  paeqtard rmkta wathjpmk ao eyc jg t i t ctdte 
gitndter axstib i hlq po sr g  lc ttqrl skrbra vwteovtssfpmba  etsas e ctz n wsg 
aleqq bq ab d rs aztaaquuatrta aa yreltr qtq watsmtjuij dtqsxwtodtvk y aatlsagtca
Validation set perplexity: 40.74
Average loss at step 100: 3.277808 learning rate: 10.000000
Minibatch perplexity: 17.58
Validation set perplexity: 27.95
Average loss at step 200: 3.140219 learning rate: 10.000000
Minibatch perplexity: 15.48
Validation set perplexity: 11.95
Average loss at step 300: 3.033390 learning rate: 10.000000
Minib