In [14]:
from __future__ import absolute_import, division, print_function

import datetime
import os
import pickle
import random
import re
import sys
import time
import unicodedata

import numpy as np
import chainer as ch
import tensorflow as tf
import theano
import theano.tensor as T
import lasagne
from lasagne.layers import InputLayer, EmbeddingLayer, LSTMLayer, Gate, SliceLayer, DenseLayer, flatten
from lasagne import init

import LoggerYN as YN
from data import load_imdb
from data import load_ptb, load_ptb_vocab

In [15]:
def run_imdb(n_epochs):

    # Keep a reference to whatever np.load currently is so the wrapper below
    # can delegate to it.
    np_load_old = np.load

    # Rebind np.load for the whole process to a wrapper that always passes
    # allow_pickle=True to the saved callable.
    # NOTE(review): these lines do not restore np.load themselves. If np.load is
    # already this wrapper when they run, allow_pickle is supplied twice and the
    # call raises TypeError.
    np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)


    # In[3]:


    def imdb_dataset(train, vocabulary_size, seq_len):
        """Load one IMDB split and return padded token ids, a padding mask and labels.

        Args:
            train: forwarded to ``load_imdb`` (presumably selects the train or
                test split -- confirm against ``data.load_imdb``).
            vocabulary_size: forwarded to ``load_imdb``.
            seq_len: width every sequence is right-padded to with zeros.

        Returns:
            ``(x, mask, y)`` where ``x`` is an int32 array with ``seq_len``
            columns, ``mask`` is a boolean array that is True wherever
            ``x != 0``, and ``y`` is an int32 array of labels.

        Raises:
            ValueError: from ``np.pad`` if a sequence is longer than ``seq_len``
                (the pad width would be negative).
        """
        x, y = load_imdb(train, vocabulary_size, seq_len)
        x = np.array([np.pad(xi, (0, seq_len - len(xi)), 'constant') for xi in x], dtype=np.int32)
        # Use the builtin bool: the np.bool alias was deprecated in NumPy 1.20
        # and removed in 1.24, where it raises AttributeError.
        mask = (x != 0).astype(bool)
        y = np.array(y, dtype=np.int32)
        return x, mask, y

    def imdb_generator(data, batch_size, shuffle=False):
        """Yield full mini-batches ``(x, mask, y)`` drawn from ``data``.

        ``data`` is the ``(x, mask, y)`` triple; the number of samples is taken
        from ``len(y)``. When ``shuffle`` is true the sample order is permuted
        in place with the global NumPy RNG before batching. A trailing group of
        fewer than ``batch_size`` samples is not yielded.
        """
        x, mask, y = data
        n_samples = len(y)
        order = np.arange(n_samples)
        if shuffle:
            np.random.shuffle(order)

        last_start = n_samples - batch_size
        for start in range(0, last_start + 1, batch_size):
            chosen = order[start:start + batch_size]
            yield tuple(part[chosen] for part in (x, mask, y))


    # In[4]:


    def imdb_lstm(input_var, mask_var, seq_len, vocabulary_size, embedding_size, hidden_size):
        """Build the Lasagne graph: embedding -> masked LSTM -> last step -> sigmoid unit.

        Returns the flattened (``outdim=1``) output layer, so the network emits
        one sigmoid value per input row.
        """
        def make_gate(**overrides):
            # All four gates share Glorot input weights and orthogonal recurrent weights.
            return Gate(W_in=init.GlorotUniform(), W_hid=init.Orthogonal(), **overrides)

        tokens = lasagne.layers.InputLayer(shape=(None, seq_len), input_var=input_var)
        mask = lasagne.layers.InputLayer(shape=(None, seq_len), input_var=mask_var)
        embedded = EmbeddingLayer(tokens, vocabulary_size, embedding_size,
                                  W=init.Uniform(1.0))
        recurrent = LSTMLayer(
            embedded, hidden_size, mask_input=mask,
            ingate=make_gate(),
            # Forget-gate bias starts at 1.0.
            forgetgate=make_gate(b=init.Constant(1.0)),
            cell=make_gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=make_gate())
        # Keep only the output at the final time step (index -1 on axis 1).
        final_step = SliceLayer(recurrent, -1, axis=1)
        probability = DenseLayer(final_step, num_units=1,
                                 nonlinearity=lasagne.nonlinearities.sigmoid)
        return lasagne.layers.flatten(probability, outdim=1)


    # In[5]:


    def imdb_train(model, data_gen, func, epoch, print_every=50):
        """Run one training pass, printing the mean loss every ``print_every`` batches.

        ``data_gen`` is called once to obtain an iterable of
        ``(inputs, mask, labels)`` batches; ``func`` is invoked on each batch and
        its return value is treated as the loss. ``model`` is not used here.
        """
        window = []
        for step, batch in enumerate(data_gen(), 1):
            inputs, mask, labels = batch
            window.append(func(inputs, mask, labels))
            if step % print_every != 0:
                continue
            print('[%d, %5d] train loss: %.3f' % (epoch, step, np.mean(window)))
            sys.stdout.flush()
            # Start a fresh averaging window after each report.
            window = []


    def imdb_test(model, data_gen, func, epoch):
        """Evaluate on every batch from ``data_gen()`` and print mean loss and accuracy (%).

        ``func`` must return ``(loss, preds)``; ``preds`` is summed to count
        correct predictions and its first dimension is the batch size.
        ``model`` is not used here.
        """
        batch_losses = []
        hits = 0
        seen = 0
        for inputs, mask, labels in data_gen():
            batch_loss, preds = func(inputs, mask, labels)
            batch_losses.append(batch_loss)
            hits += preds.sum().item()
            seen += preds.shape[0]
        accuracy_pct = hits / seen * 100
        print('[%d] test loss: %.3f accuracy: %.3f' % (epoch, np.mean(batch_losses), accuracy_pct))
        sys.stdout.flush()

    def imdb_run(n_epochs, vocabulary_size, seq_len, batch_size, embedding_size, hidden_size):
        """Train and evaluate the IMDB LSTM classifier, then pickle the model.

        Seeds NumPy with 1, loads both splits through ``imdb_dataset``, compiles
        Theano train/test functions (binary cross-entropy, Adam updates) and runs
        one train pass plus one test pass per epoch. The loop ends after
        ``n_epochs`` epochs or once more than 86400 seconds have elapsed (checked
        between epochs only). Finally the Lasagne output layer is pickled to
        ``Theano_CPU_IMDB_LSTM_model`` in the current working directory.

        Args:
            n_epochs: maximum number of epochs.
            vocabulary_size, seq_len: forwarded to ``imdb_dataset`` / ``imdb_lstm``.
            batch_size: mini-batch size for both generators.
            embedding_size, hidden_size: forwarded to ``imdb_lstm``.
        """
        np.random.seed(1)

        train_data = imdb_dataset(True, vocabulary_size, seq_len)
        test_data = imdb_dataset(False, vocabulary_size, seq_len)
        # Factories so every epoch gets a fresh (re-shuffled for train) generator.
        train_gen = lambda: imdb_generator(train_data, batch_size, shuffle=True)
        test_gen = lambda: imdb_generator(test_data, batch_size, shuffle=False)

        input_var = T.imatrix('inputs')
        mask_var = T.matrix('mask')
        labels_var = T.ivector('labels')
        variables = [input_var, mask_var, labels_var]
        model = imdb_lstm(input_var, mask_var, seq_len, vocabulary_size, embedding_size, hidden_size)

        preds = lasagne.layers.get_output(model)
        loss = lasagne.objectives.binary_crossentropy(preds, labels_var).mean()
        # Per-sample correctness flag: thresholded prediction equals the label.
        correct = T.eq(preds >= .5, labels_var)

        params = lasagne.layers.get_all_params(model, trainable=True)
        updates = lasagne.updates.adam(loss, params)
        train_func = theano.function(variables, loss, updates=updates)
        test_func = theano.function(variables, [loss, correct])

        memT,cpuT,gpuT = YN.StartLogger("Theano","IMDB")
        start = time.time()
        time_consumed = 0.0
        epoch = 1

        while (time_consumed <= 86400 and epoch <= n_epochs):
            imdb_train(model, train_gen, train_func, epoch)
            imdb_test(model, test_gen, test_func, epoch)
            time_consumed = time.time() - start
            print("Time since beginning ", str(datetime.timedelta(seconds=time_consumed)) )
            sys.stdout.flush()
            epoch += 1
        YN.EndLogger(memT,cpuT,gpuT)
        print("\n\nTotal Time Consumed ", str(datetime.timedelta(seconds=time_consumed)))
        sys.stdout.flush()

        # `cPickle` does not exist on Python 3 and was never imported here; use
        # the stdlib `pickle` module and let the context manager close the file.
        with open('Theano_CPU_IMDB_LSTM_model', 'wb') as f:
            pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)


    # In[6]:


    try:
        imdb_run(n_epochs, vocabulary_size=5000, seq_len=500, batch_size=64, embedding_size=32, hidden_size=100)
    finally:
        # Undo the process-wide np.load patch installed at the top of run_imdb.
        # Without this, a second run_imdb() call wraps the wrapper and np.load
        # fails with "got multiple values for keyword argument 'allow_pickle'".
        np.load = np_load_old







In [16]:
def run_ptb(n_epochs):
    
    class PtbIterator:
        """Re-iterable batcher over a PTB token sequence.

        Every batch row is a window of ``seq_len`` tokens; the target row is the
        same window shifted one token to the right. Consecutive rows start
        ``skip_step`` tokens apart. Iteration stops as soon as a row's shifted
        window would run past the end of the data, discarding the rows already
        filled for that batch.
        """

        def __init__(self, train, batch_size, seq_len, skip_step=5):
            self.data = load_ptb(train)
            self.batch_size = batch_size
            self.seq_len = seq_len
            self.skip_step = skip_step
            self.reset()

        def reset(self):
            """Rewind the read position to the start of the data."""
            self.cur_idx = 0

        def __iter__(self):
            # Each `for` loop over the iterator starts again from the beginning.
            self.reset()
            return self

        def __next__(self):
            """Return ``(x, y)``: ``x`` is (batch_size, seq_len) int32, ``y`` is the flattened targets."""
            n_rows, width = self.batch_size, self.seq_len
            x = np.zeros((n_rows, width), dtype=np.int32)
            y = np.zeros((n_rows, width), dtype=np.int32)

            for row in range(n_rows):
                lo = self.cur_idx
                hi = lo + width
                # The target window needs one token beyond `hi`.
                if hi >= len(self.data):
                    raise StopIteration
                x[row, :] = self.data[lo:hi]
                y[row, :] = self.data[lo + 1:hi + 1]
                self.cur_idx = lo + self.skip_step

            return x, y.ravel()


    # In[3]:


    def ptb_lstm(input_var, vocabulary_size, hidden_size, seq_len, num_layers, dropout, batch_size):
        """Build the PTB language model: embedding -> stacked LSTMs -> dropout -> dense -> softmax.

        The dense layer keeps the two leading axes, then the output is reshaped
        to ``(batch_size * seq_len, vocabulary_size)`` before the softmax, so
        each row is a distribution for one token position.

        Returns:
            The final Lasagne ``NonlinearityLayer``.
        """
        # The original referenced an alias `L` that is never imported anywhere
        # in the file (NameError); bind it locally to the layers module.
        L = lasagne.layers

        def make_gate(**overrides):
            return L.Gate(W_in=init.GlorotUniform(), W_hid=init.Orthogonal(), **overrides)

        l_input = L.InputLayer(shape=(batch_size, seq_len), input_var=input_var)
        l_embed = L.EmbeddingLayer(l_input, vocabulary_size, hidden_size,
                                   W=init.Uniform(1.0))
        l_prev = l_embed
        for _ in range(num_layers):
            l_prev = L.LSTMLayer(l_prev, hidden_size,
                                 ingate=make_gate(),
                                 forgetgate=make_gate(b=init.Constant(1.0)),
                                 cell=make_gate(W_cell=None,
                                                nonlinearity=lasagne.nonlinearities.tanh),
                                 outgate=make_gate())
        # NOTE: with num_layers == 0 the original raised IndexError on
        # l_lstms[-1]; here dropout would be applied to the embedding instead.
        l_drop = L.DropoutLayer(l_prev, dropout)
        # NOTE(review): DenseLayer's default nonlinearity is rectify, so the
        # softmax below sees ReLU'd scores. Confirm this is intended; pass
        # nonlinearity=None for linear logits.
        l_out = L.DenseLayer(l_drop, num_units=vocabulary_size, num_leading_axes=2)
        l_out = L.ReshapeLayer(l_out, (l_out.output_shape[0] * l_out.output_shape[1], l_out.output_shape[2]))
        l_out = L.NonlinearityLayer(l_out, nonlinearity=lasagne.nonlinearities.softmax)
        return l_out


    # In[4]:


    def ptb_train(model, data_iter, func, epoch, print_every=50):
        """Run one training pass over ``data_iter``, reporting the mean loss periodically.

        ``func`` is called on every ``(inputs, labels)`` batch and its return
        value is treated as the loss. Every ``print_every`` batches the mean of
        the losses since the last report is printed. stdout is flushed after
        every batch. ``model`` is not used here.
        """
        recent = []
        for step, (inputs, labels) in enumerate(data_iter, 1):
            recent.append(func(inputs, labels))
            if step % print_every == 0:
                print('[%d, %5d] train loss: %.3f' % (epoch, step, np.mean(recent)))
                recent = []
            sys.stdout.flush()



    def ptb_test(model, data_iter, func, epoch):
        """Evaluate over ``data_iter`` and print the mean loss and mean batch accuracy.

        ``func`` must return ``(loss, accuracy)`` for each ``(inputs, labels)``
        batch. ``model`` is not used here.
        """
        batch_losses = []
        acc_total = 0
        n_batches = 0
        for inputs, labels in data_iter:
            batch_loss, batch_acc = func(inputs, labels)
            batch_losses.append(batch_loss)
            acc_total += batch_acc
            n_batches += 1

        mean_loss = np.mean(batch_losses)
        # Perplexity is computed as in the original but is not reported.
        perplexity = np.exp(mean_loss)
        print('[%d] test loss: %.3f accuracy: %.3f' % (epoch, mean_loss, acc_total / n_batches))
        sys.stdout.flush()



    def ptb_run(n_epochs,hidden_size, batch_size, seq_len, dropout, num_layers):
        """Train and evaluate the PTB LSTM language model, then pickle it.

        Seeds NumPy with 1, builds the model with ``ptb_lstm`` and compiles
        Theano train/test functions (categorical cross-entropy, Adadelta).
        Runs one train pass and one test pass per epoch until ``n_epochs``
        epochs are done or more than 86400 seconds have elapsed (checked between
        epochs only). The model is pickled to ``Theano_CPU_PTB_LSTM_model`` in
        the current working directory.

        Fixes relative to the previous version:
          * the evaluation iterator was built with ``train=True`` and therefore
            scored the training data; it now requests ``train=False``;
          * evaluation used the stochastic graph (dropout active); the test
            function now uses ``deterministic=True``;
          * ``cPickle`` was undefined; the stdlib ``pickle`` module is used;
          * the model was written to the IMDB model's file name, overwriting it.
        """
        np.random.seed(1)

        ptb_vocab = load_ptb_vocab()
        vocabulary_size = len(ptb_vocab)

        train_iter = PtbIterator(train=True, batch_size=batch_size, seq_len=seq_len)
        test_iter = PtbIterator(train=False, batch_size=batch_size, seq_len=seq_len)

        input_var = T.imatrix('inputs')
        labels_var = T.ivector('labels')
        variables = [input_var, labels_var]
        model = ptb_lstm(input_var, vocabulary_size, hidden_size, seq_len, num_layers, dropout, batch_size)

        # Training graph: dropout active.
        preds = lasagne.layers.get_output(model)
        loss = lasagne.objectives.categorical_crossentropy(preds, labels_var).mean()

        # Evaluation graph: deterministic=True disables dropout.
        eval_preds = lasagne.layers.get_output(model, deterministic=True)
        eval_loss = lasagne.objectives.categorical_crossentropy(eval_preds, labels_var).mean()
        test_acc = T.mean(T.eq(T.argmax(eval_preds, axis=1), labels_var),
                          dtype=theano.config.floatX)

        params = lasagne.layers.get_all_params(model, trainable=True)
        updates = lasagne.updates.adadelta(loss, params)
        train_func = theano.function(variables, loss, updates=updates)
        test_func = theano.function(variables, [eval_loss, test_acc])

        memT,cpuT,gpuT = YN.StartLogger("Theano_CPU", "PTB")

        start = time.time()
        time_consumed = 0.0
        epoch = 1

        while(epoch <= n_epochs and time_consumed <= 86400 ):
            ptb_train(model, train_iter, train_func, epoch)
            ptb_test(model, test_iter, test_func, epoch)
            epoch += 1
            time_consumed = time.time() - start
            print("Time since beginning ", str(datetime.timedelta(seconds=time_consumed)) )
            sys.stdout.flush()

        with open('Theano_CPU_PTB_LSTM_model', 'wb') as f:
            pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)
        YN.EndLogger(memT,cpuT,gpuT)
        print("\n\nTotal Time Consumed ", str(datetime.timedelta(seconds=time_consumed)))


    # In[ ]:


    # Launch the PTB benchmark with this notebook's fixed hyper-parameters;
    # only the epoch budget comes from the caller of run_ptb.
    ptb_run(n_epochs, hidden_size = 200, batch_size = 20, seq_len = 30, dropout = 0.5, num_layers = 2)


    # In[ ]:






In [17]:
def run_manythings(n_epochs):

    # Strips accents from a string: decompose to NFD, then drop combining marks.
    def unicode_to_ascii(s):
        """Return ``s`` NFD-normalized with every 'Mn' (nonspacing mark) character removed.

        Characters that have no decomposition are kept as they are, so the
        result is not guaranteed to be pure ASCII.
        """
        decomposed = unicodedata.normalize('NFD', s)
        kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
        return ''.join(kept)


    def preprocess_sentence(w):
        """Normalize one sentence and wrap it in ``<start>`` / ``<end>`` tokens.

        Steps: lower-case and strip, remove accents via ``unicode_to_ascii``,
        put spaces around ``? . ! , ¿``, collapse runs of spaces/double quotes
        into one space, replace every other non-letter run with a space, strip,
        then add the start/end markers.

        Relies on the module-level ``import re``; the file previously used
        ``re`` without importing it, which raised NameError on first call.
        """
        w = unicode_to_ascii(w.lower().strip())

        # creating a space between a word and the punctuation following it
        # eg: "he is a boy." => "he is a boy ."
        # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
        w = re.sub(r"([?.!,¿])", r" \1 ", w)
        w = re.sub(r'[" "]+', " ", w)

        # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",", "¿")
        w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

        # strip() already trims both ends; the extra rstrip() was redundant.
        w = w.strip()

        # adding a start and an end token to the sentence
        # so that the model knows when to start and stop predicting.
        w = '<start> ' + w + ' <end>'
        return w

    # 1. Remove the accents
    # 2. Clean the sentences
    # 3. Return word pairs in the format: [ENGLISH, SPANISH]
    def create_dataset(path):
        """Read a tab-separated parallel corpus and return preprocessed sentence groups.

        Args:
            path: UTF-8 text file with one example per line, fields separated by tabs.

        Returns:
            A list with one inner list per line; every tab-separated field has
            been passed through ``preprocess_sentence``.
        """
        # Context manager so the file handle is closed deterministically
        # (the previous version never closed it).
        with open(path, encoding='UTF-8') as handle:
            lines = handle.read().strip().split('\n')

        word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines]

        return word_pairs

    # This class creates a word -> index mapping (e.g., "dad" -> 5) and vice versa
    # (e.g., 5 -> "dad") for each language.
    class LanguageIndex():
        """Vocabulary built from an iterable of space-separated phrases.

        After construction ``vocab`` is the sorted list of distinct tokens,
        ``word2idx`` maps ``'<pad>'`` to 0 and each token to its 1-based rank in
        ``vocab``, and ``idx2word`` is the inverse mapping.
        """

        def __init__(self, lang):
            self.lang = lang
            self.word2idx = {}
            self.idx2word = {}
            self.vocab = set()

            self.create_index()

        def create_index(self):
            """Populate ``vocab``, ``word2idx`` and ``idx2word`` from ``self.lang``."""
            for sentence in self.lang:
                for token in sentence.split(' '):
                    self.vocab.add(token)

            self.vocab = sorted(self.vocab)

            # Index 0 is reserved for padding; real tokens start at 1.
            self.word2idx['<pad>'] = 0
            self.word2idx.update(
                (token, rank) for rank, token in enumerate(self.vocab, 1))

            self.idx2word.update(
                (rank, token) for token, rank in self.word2idx.items())
    def max_length(tensor):
        """Return the length of the longest element of ``tensor`` (ValueError if it is empty)."""
        return max(map(len, tensor))


    def load_dataset(path):
        """Load the corpus at ``path`` and turn it into padded index matrices.

        Each pair from ``create_dataset`` is unpacked as ``(en, sp)``. The
        second element is used as model input and the first as target.

        Returns:
            ``(input_tensor, target_tensor, inp_lang, targ_lang,
            max_length_inp, max_length_tar)``: the two post-padded index
            matrices, their ``LanguageIndex`` vocabularies, and the longest
            sentence length on each side.
        """
        # Cleaned sentence pairs.
        pairs = create_dataset(path)

        # One vocabulary per side, built with the LanguageIndex class.
        inp_lang = LanguageIndex(sp for en, sp in pairs)
        targ_lang = LanguageIndex(en for en, sp in pairs)

        def encode(sentence, index):
            # Map every space-separated token to its vocabulary id.
            return [index.word2idx[token] for token in sentence.split(' ')]

        # Second element of each pair -> input ids, first element -> target ids.
        input_tensor = [encode(sp, inp_lang) for en, sp in pairs]
        target_tensor = [encode(en, targ_lang) for en, sp in pairs]

        # Pad each side to its own longest sentence.
        max_length_inp = max_length(input_tensor)
        max_length_tar = max_length(target_tensor)

        pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
        input_tensor = pad_sequences(input_tensor, maxlen=max_length_inp, padding='post')
        target_tensor = pad_sequences(target_tensor, maxlen=max_length_tar, padding='post')

        return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar
    def acc_function(real, pred):
        """Debug helper: print the shapes of ``real`` and ``pred``; returns None."""
        for label, value in (("real.shape", real), ("pred.shape", pred)):
            print(label, value.shape)
        return None


    # In[3]:


    def create_db(path_to_file):
        """Load the corpus and split it 80/20 into training and validation sets.

        The previous version called ``train_test_split``, which is never
        imported in this file (NameError). The split is now done with NumPy:
        a permutation drawn from a private ``RandomState(42)`` (so the global
        RNG is not disturbed), the first ``ceil(0.2 * n)`` shuffled samples go
        to validation and the rest to training.

        Returns:
            ``(input_tensor_train, input_tensor_val, target_tensor_train,
            target_tensor_val, vocab_inp_size, vocab_tar_size,
            max_length_inp, max_length_targ)``.
        """
        input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(path_to_file)

        input_tensor = np.asarray(input_tensor)
        target_tensor = np.asarray(target_tensor)

        # Deterministic 80/20 shuffle split.
        n_samples = len(input_tensor)
        n_val = int(np.ceil(0.2 * n_samples))
        order = np.random.RandomState(42).permutation(n_samples)
        val_idx, train_idx = order[:n_val], order[n_val:]

        input_tensor_train, input_tensor_val = input_tensor[train_idx], input_tensor[val_idx]
        target_tensor_train, target_tensor_val = target_tensor[train_idx], target_tensor[val_idx]

        vocab_inp_size = len(inp_lang.word2idx)
        vocab_tar_size = len(targ_lang.word2idx)
        return input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val,vocab_inp_size,vocab_tar_size,max_length_inp, max_length_targ


    # In[7]:


    def model_run(vocab_size_enc,vocab_size_dec,embedding_dim,units,batch_sz,input_train,target_train,input_val, target_val,max_length_inp,max_length_targ,epochs):
        """Build, train and validate the LSTM encoder-decoder on the translation data.

        The encoder's last hidden state initializes the decoder LSTM. The loss
        is the mean categorical cross-entropy over all target positions,
        optimized with Adam (learning rate 1e-4). Each epoch runs every full
        training batch, then every full validation batch, and prints validation
        perplexity and accuracy.

        Side effects: sets ``theano.config.exception_verbosity`` and
        ``theano.config.optimizer`` globally, starts/stops the ``YN`` logger and
        prints progress.

        Fixes relative to the previous version:
          * ``adam`` was an undefined name; ``lasagne.updates.adam`` is used;
          * the wall-clock ``start`` was overwritten by the batch offset, so the
            reported total time was meaningless; the timer has its own variable;
          * validation perplexity divided by the last loop index (0 when there
            is a single validation batch) and used base 2 although the loss is
            in nats; it is now ``exp(mean loss)`` over the batches evaluated;
          * dead ``np.argmax`` statements and a validation bound check against
            the training-set size were removed.
        """
        theano.config.exception_verbosity = 'high'
        theano.config.optimizer = 'fast_compile'

        def make_gate(**overrides):
            return Gate(W_in=init.GlorotUniform(), W_hid=init.Orthogonal(), **overrides)

        # ---- encoder ----
        input_enc = T.imatrix("input_enc")
        l_in = lasagne.layers.InputLayer(shape=(None,max_length_inp), input_var=input_enc)
        l_embed_enc = EmbeddingLayer(l_in, vocab_size_enc, output_size=embedding_dim,W=init.Uniform(1.0))
        l_lstm_enc = LSTMLayer(l_embed_enc, num_units=units,
                               ingate=make_gate(),
                               forgetgate=make_gate(b=init.Constant(1.0)),
                               cell=make_gate(W_cell=None,
                                              nonlinearity=lasagne.nonlinearities.tanh),
                               outgate=make_gate())
        # Last encoder time step is handed to the decoder as its initial hidden state.
        hidden_enc = SliceLayer(l_lstm_enc,indices = -1, axis=1)

        # ---- decoder ----
        # NOTE(review): the decoder is fed `target_in` and is also scored
        # against the same, unshifted `target_in`, so at every step it sees the
        # token it must predict. Confirm whether a one-step shift was intended.
        target_in = T.imatrix("traget_in")
        dec_in = lasagne.layers.InputLayer(shape=(None,max_length_targ), input_var=target_in)
        l_embed_dec =  EmbeddingLayer(dec_in, vocab_size_dec, output_size = embedding_dim)
        l_lstm_dec = LSTMLayer(l_embed_dec, num_units = units, hid_init = hidden_enc)
        # NOTE(review): DenseLayer defaults to a rectify nonlinearity, so the
        # softmax below receives ReLU'd scores. Confirm this is intended.
        dec_out = DenseLayer(l_lstm_dec,num_units = vocab_size_dec,num_leading_axes=-1)
        output = lasagne.layers.get_output(dec_out)
        output = T.reshape(output,(-1,vocab_size_dec))
        output = T.nnet.softmax(output)
        output_reshaped = T.flatten(target_in)

        loss = lasagne.objectives.categorical_crossentropy(output,output_reshaped).mean()

        test_acc = T.mean(T.eq(T.argmax(output, axis=1), output_reshaped),
                          dtype=theano.config.floatX)

        params = lasagne.layers.get_all_params(dec_out, trainable=True)
        update = lasagne.updates.adam(loss, params, learning_rate=0.0001)

        train = theano.function([input_enc,target_in],loss, updates=update ,allow_input_downcast=True)
        test = theano.function([input_enc,target_in],[loss,test_acc],allow_input_downcast=True)

        # Only full batches are used; any remainder is dropped.
        instances = len(input_train)
        val_instances = len(input_val)
        batches = instances // batch_sz
        batches_val = val_instances // batch_sz
        memT,cpuT,gpuT = YN.StartLogger("Theano_GPU","Manythings")

        run_start = time.time()
        epoch = 1

        while (epoch <= epochs):
            print("\n\nEpoch : ",epoch)

            for batch in range(batches):
                offset = batch * batch_sz
                inpu = input_train[offset:offset + batch_sz]
                targ = target_train[offset:offset + batch_sz]
                loss_train = train(inpu, targ)

                if(batch % 300 == 0):
                    print("Batch {} Loss {}".format(batch,loss_train))

            loss_val = 0
            t_acc = 0
            count = 0
            for batch in range(batches_val):
                offset = batch * batch_sz
                inpu = input_val[offset:offset + batch_sz]
                targ = target_val[offset:offset + batch_sz]
                batch_loss, batch_acc = test(inpu, targ)
                loss_val += batch_loss
                t_acc += batch_acc
                count += 1

            if count:
                # The cross-entropy is in nats, so perplexity is exp(mean loss).
                print("Validation Perplexity :",np.exp(loss_val/count))
                print("Validation        Acc :",t_acc/count)
            else:
                print("Validation skipped: fewer than", batch_sz, "validation samples")
            epoch += 1
            sys.stdout.flush()

        end = time.time()
        YN.EndLogger(memT,cpuT,gpuT)
        print("\nTotal Time Consumed ", str(datetime.timedelta(seconds=end-run_start)))


    # In[8]:


    def run(BATCH_SIZE, embedding_dim, units, epochs):
        """Download the Spanish-English corpus, prepare it and run the seq2seq benchmark.

        Args:
            BATCH_SIZE: mini-batch size passed to ``model_run``.
            embedding_dim: embedding width for encoder and decoder.
            units: number of LSTM units.
            epochs: number of training epochs.

        Side effects: downloads and extracts ``spa-eng.zip`` through
        ``tf.keras.utils.get_file`` (network access, cached by Keras).
        """
        path_to_zip = tf.keras.utils.get_file('spa-eng.zip', origin='http://download.tensorflow.org/data/spa-eng.zip', extract=True)
        # Build the path with os.path.join rather than string concatenation.
        path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng", "spa.txt")
        (input_tensor_train, input_tensor_val,
         target_tensor_train, target_tensor_val,
         vocab_inp_size, vocab_tar_size,
         max_length_inp, max_length_targ) = create_db(path_to_file)

        # Give all four splits the same integer dtype (previously only the
        # training arrays were converted). Unused size locals were dropped.
        input_tensor_train = np.array(input_tensor_train, dtype='int')
        target_tensor_train = np.array(target_tensor_train, dtype='int')
        input_tensor_val = np.array(input_tensor_val, dtype='int')
        target_tensor_val = np.array(target_tensor_val, dtype='int')

        model_run(vocab_inp_size,vocab_tar_size,embedding_dim,units,BATCH_SIZE,input_tensor_train,target_tensor_train,input_tensor_val,target_tensor_val,max_length_inp, max_length_targ,epochs)


    # In[ ]:


    # Honour the caller's epoch budget: the previous call hard-coded epochs = 100
    # and silently ignored run_manythings' n_epochs argument.
    run(BATCH_SIZE = 128, embedding_dim = 256, units = 256, epochs = n_epochs)


    # In[ ]:





# In[ ]:






In [18]:
# Entry point: only the translation benchmark is enabled. Uncomment one of the
# lines below to run the IMDB or PTB benchmark instead.
#run_imdb(n_epochs=50)
#run_ptb(n_epochs=50)
run_manythings(n_epochs=100)

NameError: name 'unicodedata' is not defined