# Text generator based on RNN

## Brief
Generate fake abstract with RNN model under tensorflow r1.3.

### Import libraries

In [1]:
import tensorflow as tf
import numpy as np
import random
import os

### Configurations 

In [2]:
vocab = (" $%'()+,-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ"
            "\\^_abcdefghijklmnopqrstuvwxyz{|}\n")
graph_path=r"./graphs"
test_text_path=os.path.normpath(r"../DataSet/arvix_abstracts.txt")
batch_size=1
model_param_path=os.path.normpath(r"./model_checkpoints")

### Data encoding
#### Basic Assumption

* A full string sequence consists $START$ & $STOP$ signal with characters in the middle. 

#### Encoding policy
* A set $\mathcal{S}$ that consists of many characters is utilized to encode the characters.
* The $1^{st}$ entry of the vector corresponds to $UNKNOWN$ characters(l.e. characters that are beyond $\mathcal{S}$). 
* The last entry of the vector corresponds to $STOP$ signal of the sequence. 
* The entries in the middle corresponds to the indices of the characters within $\mathcal{S}$. 
* The $START$ signal is represented as a zero vector. 

In [3]:
class TextCodec:
    def __init__(self, vocab):
        self.__vocab = vocab
        
    def encode(self, string, sess = None, start=True, stop=True):
        """
        Encode string.
        Each character is represented as a N-dimension one hot vector. 
        N = len(self.__vocab)+ 2
        
        Note:
        The first entry of the vector corresponds to unknown character. 
        The last entry of the vector corresponds to STOP signal of the sequence. 
        The entries in the middle corresponds to the index of the character. 
        The START signal is represented as a zero vector. 
        """
        tensor=[vocab.find(ch)+1 for ch in string]
        if stop:
             tensor.append(len(vocab)+1)  # String + STOP
        tensor=tf.one_hot(tensor,depth=len(vocab)+2,on_value=1.0,off_value=0.0,axis=-1, dtype=tf.float32)
        if start:
            tensor=tf.concat([tf.zeros([1, len(vocab)+2],dtype=tf.float32),tensor],axis=0)  # String + START
        if sess is None:
            with tf.Session() as sess:
                nparray=tensor.eval()
        elif type(sess) == tf.Session:
            nparray = tensor.eval(session=sess)
        else:
            raise TypeError('"sess" must be {}, got {}'.format(tf.Session, type(sess)))    
        return nparray

    def decode(self, nparray, default="[UNKNOWN]",start="[START]",stop="[STOP]",strip=False):
        text_list=[]
        indices=np.argmax(nparray,axis=1)
        for v, ch_i in zip(nparray,indices):
            if np.all(v==0):
                text_list.append(start if not strip else "")
            elif ch_i==0:
                text_list.append(default)
            elif ch_i==len(self.__vocab)+1:
                text_list.append(stop if not strip else "")
            else:
                text_list.append(vocab[ch_i-1])
        return "".join(text_list)

### Test codec

In [4]:
test_codec=TextCodec(vocab)
test_text_encoded=test_codec.encode("Hello world!")
print("Encoded text looks like:\n{}".format(test_text_encoded))
test_text_decoded=test_codec.decode(nparray=test_text_encoded,strip=False)
print("Decoded text looks like:\n{}".format(test_text_decoded))

Encoded text looks like:
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  1.]]
Decoded text looks like:
[START]Hello world[UNKNOWN][STOP]


### Define batch generator

In [5]:
def batch_generator(file_path, batch_size, sequence_length,overlap=0.5):
    step_size = int(sequence_length * (1-overlap))
    with open(os.path.normpath(file_path),"r") as f:
        text = "".join(f.readlines())
    batch = []
    for i in range(0,len(text)-sequence_length+1,step_size):
        batch.append(text[i:i+sequence_length])
        if len(batch) == batch_size:
            yield batch
            batch = []

In [6]:
i=0
for batch in batch_generator(test_text_path,2,30,0.5):
    print(batch)
    i+=1
    if i == 5:
        break

['In science and engineering, in', 'engineering, intelligent proce']
['telligent processing of comple', 'ssing of complex signals such ']
['x signals such as images, soun', 'as images, sound or language i']
['d or language is often perform', 's often performed by a paramet']
['ed by a parameterized hierarch', 'erized hierarchy of nonlinear ']


### Define model class

In [7]:
class MyRNN():
    def __init__(self, input_vector_dim, model_checkpoint_path, state_size=100, scope="RNN", output_vector_dim=None):
        if output_vector_dim is None:
            output_vector_dim=input_vector_dim
        self.__input_vector_dim=input_vector_dim
        self.__output_vector_dim=output_vector_dim
        self.__state_size=state_size
        self.__mdl_ckpt_path=model_checkpoint_path
        with tf.variable_scope(scope) as model_scope:
            with tf.variable_scope("structure") as structure_scope:
                # One batch at a time
                self.__inputs = tf.placeholder(dtype=tf.float32,shape=[None,None,input_vector_dim])
                self.__expected_outputs = tf.placeholder(dtype=tf.float32,shape=[None,None,output_vector_dim])
                self.__cell = tf.nn.rnn_cell.GRUCell(num_units=state_size)
                self.__init_state = tf.placeholder_with_default(tf.zeros([1,state_size],dtype=tf.float32),
                                                              [None,state_size])
                self.__wI2S = tf.get_variable(name="Weight_i2s",shape=[input_vector_dim,state_size],
                                            initializer=tf.truncated_normal_initializer())
                self.__bI2S = tf.get_variable(name="Bias_i2s",shape=[1,state_size],
                            initializer=tf.truncated_normal_initializer())
                self.__rnn_inputs=tf.add(tf.einsum("aij,jk->aik",self.__inputs,self.__wI2S), self.__bI2S)  
                # Didn't know why the broadcast mechanism work on tf.matmul(self.__inputs,self.__wI2S)
                
                # S20:State to output
                self.__wS2O = tf.get_variable(name="Weight_s2o",shape=[state_size,output_vector_dim],
                                            initializer=tf.truncated_normal_initializer())
                self.__bS2O = tf.get_variable(name="Bias_s2o",shape=[1,output_vector_dim],
                                            initializer=tf.truncated_normal_initializer())
                self.__state_output, self.final_state = tf.nn.dynamic_rnn(cell=self.__cell,inputs=self.__rnn_inputs,
                                                     dtype=tf.float32,initial_state=self.init_state)
                
                self.net_outputs=tf.add(tf.einsum("aij,jk->aik",self.__state_output,self.__wS2O), self.__bS2O)
                print(self.net_outputs.shape)
                # Same reason for usint tf.einsum here.
            with tf.name_scope("training"):
                # Need to utilize new loss function
                tf.softmax_cross_entropy
                loss=tf.losses.softmax_cross_entropy(logits=self.net_outputs,onehot_labels=self.outputs)
                loss=tf.reduce_mean(loss,name="loss")
                self.__global_step=tf.Variable(0,dtype=tf.int32,trainable=False,name='globalStep')
                self.optimizer=tf.train.AdamOptimizer(learning_rate=0.001,
                                                      name="optimizer").minimize(loss,global_step=self.__global_step)
            with tf.name_scope("summary") as sumScope:
                lossSum=tf.summary.scalar(tensor=loss,name="loss")
                self.summary_op=tf.summary.merge(tf.get_collection(tf.GraphKeys.SUMMARIES,sumScope))

    def online_inference(self,timeSteps=100,state=None,seed=None,modelParamPath=None):
        feedDict={}
        saver=tf.train.Saver()
        if model_param_path is None:
            model_param_path=self.__mdl_ckpt_path
        if state is not None:
            feedDict[self.init_state]=state
        if seed is None:
            #Create a random seed
            seed=tf.one_hot([random.randint(0,self.__input_vector_dim)],
                            depth=self.__input_vector_dim, on_value=1.0, off_value=0.0, axis=-1, dtype=tf.float32)
            with tf.Session() as sess:
                seed=sess.run(seed)
        feed_dict[self.__inputs]=seed
        seq = []
        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(self.__mdl_ckpt_path)
            print("Loading model")
            if ckpt and ckpt.model_checkpoint_path:  
                saver.restore(sess,ckpt.model_checkpoint_path)
                print("Successfully loaded")
            for length in range(timeSteps):
                input_,state=sess.run(fetches=[self.netOutputs,self.finalState],
                                      feed_dict=feedDict)
                #  Since the input vectors are discrete, so we need to convert it into a
                #  new one hot vector
                input_=tf.one_hot(indices=[int(np.argmax(input_[0]))],depth=self.__outputVetorDim,
                                  on_value=1.0,off_value=0.0,axis=-1,dtype=tf.float32).eval()
                seq.append(input_)
                feedDict[self.inputs],feedDict[self.initState]=input_,state
            seq=tf.concat(values=seq,axis=0).eval()
        return seq,state
    
    @property
    def input_vector_dim(self):
        return self.__input_vector_dim
    
    @property
    def output_vector_dim(self):
        return self.__output_vector_dim
    
    @property
    def global_step(self):
        return self.__global_step
    
    @property
    def inputs(self):
        return self.__inputs
    
    @property
    def expected_outputs(self):
        return self.__expected_outputs
    
    @property
    def init_state(self):
        return self.__init_state
    
    @classmethod
    def train(cls, model, n_epochs, batch_generator, model_param_path, graph_path):  # To be fixed.
        with tf.Session() as sess:
            writer=tf.summary.FileWriter(logdir=graph_path,graph=sess.graph)
            saver=tf.train.Saver()
            sess.run(tf.global_variables_initializer())
            ckpt=tf.train.get_checkpoint_state(model_param_path)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess,ckpt.model_checkpoint_path)
                iteration=Test.globalStep.eval()
                print("Successfully loaded session at iteration {}. Continue training now. ".format(iteration))
            else:
                iteration=0
            for i in range(n_epochs):
                with open(testTextPath,'r') as fileObj:
                    feed_dict={}
                    for line in fileObj:
                        sample=line.strip()
                        if len(sample)<2:
                            continue
                        X=Encode(sample) 
                        Y=X[1:]
                        X=X[:-1]
                        iteration+=1
                        feed_dict[Test.inputs]=X
                        feed_dict[Test.outputs]=Y
                        _,summary,state=sess.run([model.optimizer,model.summary_op,model.final_state],feed_dict=feed_dict)
                        if (iteration+1)%50==0:
                            writer.add_summary(summary,global_step=iteration)
                            saver.save(sess=sess,save_path=model_param_path+r"\rnnMDL",global_step=iteration)
                saver.save(sess=sess,save_path=modelParamPath+r"\rnnMDL",global_step=iteration)
                result=sess.run(Test.net_outputs,{Test.inputs:X}) #  See what happens if fed a zero state
            print("Session terminated. ")
            writer.close()
            saver.save(sess=sess,save_path=model_param_path+r"\rnnMDL", global_step=model.global_step)

### Make an instance

In [8]:
test_rnn=MyRNN(scope="RNN",input_vector_dim=len(vocab)+2,model_checkpoint_path=model_param_path,state_size=200)
del test_rnn

(?, ?, 86)


AttributeError: 'MyRNN' object has no attribute 'outputs'

### Training

In [None]:
n_epochs=6
MyRNN.train()

### Test online inference

In [None]:
result,_=Test.onlineInference(timeSteps=500)
print(Decode(result))

### Evaluate transition performance of the model

In [None]:
with tf.Session() as sess:
    print("Loading model")
    ckpt=tf.train.get_checkpoint_state(modelParamPath)
    if ckpt and ckpt.model_checkpoint_path:  
        saver.restore(sess,ckpt.model_checkpoint_path)
        print("Successfully loaded")
    result=sess.run(Test.netOutputs,{Test.inputs:X})
print(Decode(result))