In [8]:
from __future__ import absolute_import, division, print_function

import datetime
import os
import random
import re
import sys
import time
import unicodedata
import urllib.request
import zipfile

import chainer
import chainer as ch
import numpy as np
from chainer import functions as F
from chainer import initializers as init
from chainer import links as L
from chainer import serializers
from chainer.dataset import Iterator
from chainer.datasets import TupleDataset

import LoggerYN as YN
from data import load_imdb
from data import load_ptb, load_ptb_vocab


In [4]:
def run_imdb(n_epochs):
    """Define the IMDB dataset, model and train/test loops, then run them.

    Args:
        n_epochs: Upper bound on the number of epochs, forwarded to the
            nested ``imdb_run`` call at the end of this function.

    Side effects:
        Replaces ``np.load`` process-wide with a wrapper whose
        ``allow_pickle`` defaults to True. The patch is not undone.
    """
    # Make np.load default to allow_pickle=True so that data loading works
    # without passing the flag explicitly.
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    # The marker attribute keeps the patch idempotent: wrapping an already
    # wrapped loader would pass allow_pickle twice and raise TypeError.
    if not getattr(np.load, '_allow_pickle_default', False):
        # save np.load
        np_load_old = np.load

        def _load_allowing_pickle(*a, **k):
            # setdefault (not a hard-coded kwarg) lets callers still override.
            k.setdefault('allow_pickle', True)
            return np_load_old(*a, **k)

        _load_allowing_pickle._allow_pickle_default = True
        np.load = _load_allowing_pickle


    # In[3]:


    class ImdbDataset(TupleDataset):
        """Tuple dataset of (padded sequence, original length, label)."""

        def __init__(self, train, vocabulary_size, seq_len):
            """Load the data via ``load_imdb`` and pad every sequence to ``seq_len``.

            All three arguments are forwarded unchanged to ``load_imdb``.
            """
            reviews, targets = load_imdb(train, vocabulary_size, seq_len)

            # Remember the unpadded lengths; the model uses them to strip padding.
            true_lengths = list(map(len, reviews))
            as_arrays = [np.array(review) for review in reviews]
            padded = F.pad_sequence(as_arrays, length=seq_len)
            super().__init__(padded, true_lengths, targets)

    def collate_sequences(batch):
        """Turn a list of (sequence, length, label) samples into model inputs.

        Samples are ordered by length, longest first, then stacked and moved
        to the GPU.

        Args:
            batch: Iterable of ``(sequence, length, label)`` triples.

        Returns:
            ``((sequences, lengths), labels)`` where ``sequences`` and
            ``labels`` are GPU-resident Variables and ``lengths`` is a tuple.
        """
        sorted_batch = sorted(batch, key=lambda elem: elem[1], reverse=True)
        sequences, lengths, labels = zip(*sorted_batch)
        sequences = F.stack(sequences)
        sequences.to_gpu()
        # Use the names that are actually imported: the bare `Variable` and
        # `cuda` used previously were never brought into scope.
        labels = ch.Variable(ch.backends.cuda.to_gpu(np.array(labels)))
        return (sequences, lengths), labels


    # In[4]:


    class ImdbLstm(ch.Chain):
        """Embedding -> LSTM -> linear head emitting one value per sequence."""

        def __init__(self, vocabulary_size, embedding_dim, hidden_size):
            super().__init__()
            with self.init_scope():
                self.embed = L.EmbedID(
                    in_size=vocabulary_size,
                    out_size=embedding_dim,
                    initialW=init.Uniform(1.0),
                )
                self.lstm = L.LSTM(
                    in_size=embedding_dim,
                    out_size=hidden_size,
                    upward_init=init.GlorotUniform(),
                    lateral_init=init.Orthogonal(),
                )
                self.fc = L.Linear(
                    in_size=hidden_size,
                    out_size=1,
                    initialW=init.GlorotUniform(),
                )

        def forward(self, inputs):
            """Score a batch given as ``(padded_sequences, lengths)``.

            Returns the flattened output of the linear head applied to each
            sequence's last LSTM output.
            """
            padded, lengths = inputs
            embedded = self.embed(padded)

            # Cut each embedded sequence back to its own length, then switch to
            # a per-time-step layout so the LSTM link can be fed step by step.
            trimmed = [seq[:n] for seq, n in zip(embedded, lengths)]
            steps = F.transpose_sequence(trimmed)

            self.lstm.reset_state()
            hidden_steps = [self.lstm(step) for step in steps]

            # Back to a per-sequence layout; keep only the final output of each.
            per_sequence = F.transpose_sequence(hidden_steps)
            last_hidden = F.stack([states[-1] for states in per_sequence])
            return F.flatten(self.fc(last_hidden))


    # In[5]:


    def imdb_train(model, data_iter, criterion, optimizer, epoch, print_every=150):
        """Run one training epoch and periodically print loss and accuracy.

        Args:
            model: Callable taking the ``(sequences, lengths)`` inputs.
            data_iter: Iterator exposing ``reset()``, ``next()`` and
                ``is_new_epoch``.
            criterion: Loss function called as ``criterion(outputs, labels)``.
            optimizer: Optimizer whose ``update()`` is called after backprop.
            epoch: Epoch number, used only in the printed messages.
            print_every: Number of iterations between progress reports; the
                running loss and accuracy are reset after each report.
        """
        i = 0
        losses = []
        # Running accuracy counters. These were previously read without ever
        # being initialised or updated, which raised on the first report.
        correct, total = 0, 0
        data_iter.reset()
        while not data_iter.is_new_epoch:
            inputs, labels = collate_sequences(data_iter.next())

            model.cleargrads()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.update()
            loss.to_cpu()
            losses.append(loss.array)

            # Same decision rule as imdb_test: output >= 0 means class 1.
            hits = ch.backends.cuda.to_cpu((outputs.array >= 0.0) == labels.array)
            correct += int(hits.sum())
            total += hits.shape[0]

            if (i + 1) % print_every == 0:
                print('[%d, %5d] train loss: %.3f' % (epoch, i + 1, np.mean(losses)))
                print('[%d, %5d] train  acc: %.3f' % (epoch, i + 1, correct / total))
                losses = []
                correct, total = 0, 0
            i += 1


    def imdb_test(model, data_iter, criterion, epoch):
        """Evaluate ``model`` over one pass of ``data_iter`` and print the results.

        Prints the mean batch loss and the accuracy as a percentage, where an
        output counts as correct when ``output >= 0`` equals the label.
        ``epoch`` is used only in the printed messages.
        """
        batch_losses = []
        n_correct = 0
        n_seen = 0
        with ch.no_backprop_mode():
            data_iter.reset()
            while not data_iter.is_new_epoch:
                batch = data_iter.next()
                inputs, labels = collate_sequences(batch)

                logits = model(inputs)
                batch_loss = criterion(logits, labels)
                batch_loss.to_cpu()
                batch_losses.append(batch_loss.array)

                # Compare on the host.
                logits.to_cpu()
                labels.to_cpu()
                matches = (logits.array >= 0.0) == labels.array
                n_correct += matches.sum()
                n_seen += matches.shape[0]

        print("\n")
        print('[%d] test loss: %.3f' % (epoch, np.mean(batch_losses)))
        print('[%d] accuracy: %.3f' % (epoch, n_correct / n_seen * 100))
        print("\n\n")




    def imdb_run(n_epochs, vocabulary_size, seq_len, batch_size, embedding_size, hidden_size):
        """Build the IMDB pipeline and train/evaluate it on the GPU.

        Training stops after ``n_epochs`` epochs or once the wall-clock budget
        is exceeded, whichever comes first. The model is serialized after
        every tenth completed epoch.

        Args:
            n_epochs: Maximum number of epochs.
            vocabulary_size: Forwarded to ``ImdbDataset`` and the embedding.
            seq_len: Forwarded to ``ImdbDataset``.
            batch_size: Batch size for both iterators.
            embedding_size: Embedding dimension of the model.
            hidden_size: LSTM output size of the model.
        """
        max_seconds = 86400  # wall-clock budget: 24 hours
        save_every = 10      # checkpoint period, in completed epochs

        random.seed(1)
        np.random.seed(1)

        train_dataset = ImdbDataset(train=True, vocabulary_size=vocabulary_size, seq_len=seq_len)
        test_dataset = ImdbDataset(train=False, vocabulary_size=vocabulary_size, seq_len=seq_len)
        train_iter = ch.iterators.SerialIterator(train_dataset, batch_size=batch_size)
        test_iter = ch.iterators.SerialIterator(test_dataset, batch_size=batch_size, shuffle=False)

        model = ImdbLstm(vocabulary_size, embedding_size, hidden_size)
        model.to_gpu()

        criterion = F.sigmoid_cross_entropy
        optimizer = ch.optimizers.Adam().setup(model)

        # YN is a project-local helper; presumably it records resource usage
        # between StartLogger and EndLogger -- confirm against LoggerYN.
        memT, cpuT, gpuT = YN.StartLogger("Chainer_GPU", "IMDB")
        start = time.time()
        time_consumed = 0.0
        epoch = 1

        while time_consumed <= max_seconds and epoch <= n_epochs:
            imdb_train(model, train_iter, criterion, optimizer, epoch)
            imdb_test(model, test_iter, criterion, epoch)

            # Checkpoint on the epoch that just finished. Checking after the
            # increment saved after epochs 9, 19, ... and skipped the last one.
            if epoch % save_every == 0:
                serializers.save_npz('Chainer_GPU_IMDB_LSTM_model', model)

            epoch += 1
            time_consumed = time.time() - start
            print("Time since beginning ", str(datetime.timedelta(seconds=time_consumed)))
            sys.stdout.flush()

        YN.EndLogger(memT, cpuT, gpuT)
        print("\n\nTotal Time Consumed ", str(datetime.timedelta(seconds=time_consumed)))


    imdb_run(n_epochs, vocabulary_size=5000, seq_len=500, batch_size=64, embedding_size=32, hidden_size=100)



In [5]:
def run_manythings(n_epochs):
    


    # Strips accents from a unicode string
    def unicode_to_ascii(s):
        """Return ``s`` NFD-normalized with all non-spacing marks removed.

        Characters of Unicode category 'Mn' (e.g. combining accents) are
        dropped; every other character is kept as is.
        """
        decomposed = unicodedata.normalize('NFD', s)
        kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
        return ''.join(kept)


    def preprocess_sentence(w):
        """Normalize one sentence and wrap it in ``<start>`` / ``<end>`` tokens.

        The sentence is lower-cased, stripped of accents, has its punctuation
        separated from words, and loses every character outside
        ``a-zA-Z?.!,¿``.
        """
        text = unicode_to_ascii(w.lower().strip())

        substitutions = (
            # Put a space on both sides of punctuation,
            # e.g. "he is a boy." => "he is a boy ."
            # Reference: https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
            (r"([?.!,¿])", r" \1 "),
            # Collapse runs of spaces and double quotes into one space.
            (r'[" "]+', " "),
            # Replace everything except (a-z, A-Z, ".", "?", "!", ",", "¿") with a space.
            (r"[^a-zA-Z?.!,¿]+", " "),
        )
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)

        text = text.strip()

        # Start and end tokens tell the model where to begin and stop predicting.
        return '<start> ' + text + ' <end>'

    # 1. Remove the accents
    # 2. Clean the sentences
    # 3. Return word pairs in the format: [ENGLISH, SPANISH]
    def create_dataset(path, num_examples=100000):
        """Read a tab-separated parallel corpus and preprocess every field.

        Args:
            path: UTF-8 text file with one tab-separated sentence pair per line.
            num_examples: Maximum number of leading lines to use; ``None``
                uses the whole file. Defaults to the previous fixed cap.

        Returns:
            A list with one entry per line, each a list of the preprocessed
            tab-separated fields of that line.
        """
        # The context manager closes the handle; it was previously leaked.
        with open(path, encoding='UTF-8') as corpus:
            lines = corpus.read().strip().split('\n')

        word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_examples]]

        return word_pairs

    # This class creates a word -> index mapping (e.g., "dad" -> 5) and vice versa
    # (e.g., 5 -> "dad") for each language.
    class LanguageIndex():
        """Bidirectional word/index vocabulary built from an iterable of phrases.

        Index 0 is reserved for ``'<pad>'``; the remaining words are numbered
        from 1 in sorted order.
        """

        def __init__(self, lang):
            self.lang = lang
            self.word2idx = {}
            self.idx2word = {}
            self.vocab = set()

            self.create_index()

        def create_index(self):
            """Populate ``vocab``, ``word2idx`` and ``idx2word`` from ``self.lang``."""
            # Phrases are split on single spaces only.
            for phrase in self.lang:
                self.vocab |= set(phrase.split(' '))

            self.vocab = sorted(self.vocab)

            self.word2idx['<pad>'] = 0
            self.word2idx.update(
                (word, position) for position, word in enumerate(self.vocab, start=1)
            )

            # The reverse table mirrors the forward one.
            self.idx2word.update(
                (position, word) for word, position in self.word2idx.items()
            )

    def max_length(tensor):
        """Return the length of the longest element of ``tensor``."""
        return max(map(len, tensor))


    def load_dataset(path):
        """Load the corpus at ``path`` and turn it into padded index arrays.

        Returns:
            ``(input_tensor, target_tensor, inp_lang, targ_lang,
            max_length_inp, max_length_tar)``. The two tensors are int32
            arrays of shape ``(n_pairs, max_length)``, right-padded with 0
            (the ``'<pad>'`` index); the second element of each pair is the
            input and the first is the target.
        """
        def _pad_post(sequences, maxlen):
            # Right-pad with zeros into an int32 matrix. Replaces the previous
            # tf.keras pad_sequences(padding='post') call, which relied on a
            # `tf` name that is never imported here.
            padded = np.zeros((len(sequences), maxlen), dtype='int32')
            for row, seq in enumerate(sequences):
                padded[row, :len(seq)] = seq
            return padded

        # creating cleaned input, output pairs
        pairs = create_dataset(path)

        # index language using the LanguageIndex class
        inp_lang = LanguageIndex(sp for en, sp in pairs)
        targ_lang = LanguageIndex(en for en, sp in pairs)

        # Vectorize the input and target languages

        # Spanish sentences
        input_tensor = [[inp_lang.word2idx[s] for s in sp.split(' ')] for en, sp in pairs]

        # English sentences
        target_tensor = [[targ_lang.word2idx[s] for s in en.split(' ')] for en, sp in pairs]

        # Pad both sides to the longest sentence in the dataset, so nothing
        # is ever truncated.
        max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)

        input_tensor = _pad_post(input_tensor, max_length_inp)
        target_tensor = _pad_post(target_tensor, max_length_tar)

        return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar


    # In[3]:


    def create_db(path_to_file):
        """Load the dataset and split it 80/20 into training and validation sets.

        Returns:
            ``(input_tensor_train, input_tensor_val, target_tensor_train,
            target_tensor_val, vocab_inp_size, vocab_tar_size,
            max_length_inp, max_length_targ)``.
        """
        (input_tensor, target_tensor, inp_lang, targ_lang,
         max_length_inp, max_length_targ) = load_dataset(path_to_file)
        input_tensor = np.asarray(input_tensor)
        target_tensor = np.asarray(target_tensor)

        # Seeded shuffle split done with numpy, because `train_test_split` is
        # never imported in this notebook. The first ceil(20%) of a
        # RandomState(42) permutation is held out for validation.
        # NOTE(review): intended to match scikit-learn's
        # train_test_split(test_size=0.2, random_state=42); verify if exact
        # parity with earlier runs matters.
        n_samples = len(input_tensor)
        n_val = int(np.ceil(0.2 * n_samples))
        order = np.random.RandomState(42).permutation(n_samples)
        val_idx, train_idx = order[:n_val], order[n_val:]

        input_tensor_train, input_tensor_val = input_tensor[train_idx], input_tensor[val_idx]
        target_tensor_train, target_tensor_val = target_tensor[train_idx], target_tensor[val_idx]

        vocab_inp_size = len(inp_lang.word2idx)
        vocab_tar_size = len(targ_lang.word2idx)
        return input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val,vocab_inp_size,vocab_tar_size,max_length_inp, max_length_targ


    # In[4]:


    def loss_function(real, pred):
        """Compute the loss and raw accuracy counts for one batch.

        Args:
            real: Target ids; flattened before use.
            pred: Scores with one row per flattened target.

        Returns:
            A list ``[loss, n_correct, n_targets]`` where ``loss`` is the
            softmax cross entropy, ``n_correct`` counts rows whose argmax
            equals the target, and ``n_targets`` is the flattened target size.
        """
        targets = F.flatten(real)
        predicted_ids = F.argmax(pred, axis=1)
        n_correct = (predicted_ids.array == targets.array).sum()
        loss = F.softmax_cross_entropy(pred, targets)
        return [loss, n_correct, targets.size]


    # In[5]:


    class Model(chainer.Chain):
        """Encoder-decoder built from two single-layer NStepLSTM links."""

        def __init__(self, batch_sz, units, embedding_dim, vocab_enc, vocab_dec):
            super().__init__()
            self.batch_sz = batch_sz
            self.units = units
            with self.init_scope():
                self.embedding_enc = L.EmbedID(
                    in_size=vocab_enc, out_size=embedding_dim, initialW=init.Uniform(1.0))
                self.LSTM_enc = L.NStepLSTM(
                    in_size=embedding_dim, out_size=units, n_layers=1, dropout=0.0)
                self.embedding_dec = L.EmbedID(
                    in_size=vocab_dec, out_size=embedding_dim, initialW=init.Uniform(1.0))
                self.LSTM_dec = L.NStepLSTM(
                    in_size=embedding_dim, out_size=units, n_layers=1, dropout=0.0)
                self.fc = L.Linear(units, vocab_dec, initialW=init.GlorotUniform())

        def forward(self, inp, targ):
            """Encode ``inp``, decode ``targ`` from the encoder state, return scores.

            The decoder outputs of all sequences are concatenated along axis 0
            before the linear layer, so the result has one row per target
            position.

            NOTE(review): the decoder is fed the full, unshifted target and the
            caller scores the output against that same target -- confirm this
            is intended rather than a one-step shift.
            """
            # NStepLSTM takes a list with one entry per sequence.
            source_seqs = list(self.embedding_enc(inp))
            enc_h, enc_c, _ = self.LSTM_enc(None, None, source_seqs)

            target_seqs = list(self.embedding_dec(targ[:, :]))
            _, _, decoded = self.LSTM_dec(enc_h, enc_c, target_seqs)

            stacked = F.concat(decoded, axis=0)
            return self.fc(stacked)


    # In[6]:


    def train(model, train_iterator, optimizer):
        """Run one pass over ``train_iterator``, updating ``model`` on GPU 0.

        Every 300 batches the current loss and the accuracy accumulated since
        the start of the pass are printed. The iterator is reset afterwards.
        """
        to_gpu = chainer.backends.cuda.to_gpu
        running_correct = 0
        running_total = 0
        for batch, data in enumerate(train_iterator):
            model.cleargrads()

            # Each sample is an (input, target) pair; batch them and move to GPU 0.
            targ = to_gpu(np.array([pair[1] for pair in data], dtype='int'), device=0)
            inp = to_gpu(np.array([pair[0] for pair in data], dtype='int'), device=0)

            predictions = model(inp, targ)
            loss, n_correct, n_targets = loss_function(targ[:, :], predictions)
            running_correct += n_correct
            running_total += n_targets

            loss.backward()
            optimizer.update()

            if batch % 300 == 0:
                print('\nBatch: {} loss: {}'.format(batch, loss))
                print('Batch: {} acc: {}'.format(batch, running_correct / running_total))
                sys.stdout.flush()

        train_iterator.reset()


    # In[7]:


    def test(model, test_iterator):
        """Evaluate ``model`` over one pass of ``test_iterator`` on GPU 0.

        Prints the perplexity (``exp`` of the mean per-batch loss), the summed
        batch loss, and the accuracy, then resets the iterator.
        """
        total_loss = 0.0
        n_batches = 0
        t_correct = 0
        total = 0
        # Evaluation needs no gradients: skip building the backprop graph and
        # run in test configuration.
        with chainer.no_backprop_mode(), chainer.using_config('train', False):
            for data in test_iterator:
                inp = np.array([x for x, _ in data], dtype='int')
                targ = np.array([x for _, x in data], dtype='int')
                targ = chainer.backends.cuda.to_gpu(targ, device=0)
                inp = chainer.backends.cuda.to_gpu(inp, device=0)
                predictions = model(inp, targ)
                loss, correct, count = loss_function(targ[:, :], predictions)
                t_correct += correct
                total += count
                # Accumulate as a host float so the report needs no device array.
                total_loss += float(loss.array)
                n_batches += 1

        if n_batches == 0:
            print('\nTest iterator produced no batches; nothing to report.')
        else:
            # softmax_cross_entropy is a natural-log loss, so perplexity is
            # exp(mean loss). The mean is over the number of batches; the old
            # code divided by the last batch index (off by one, and zero when
            # there was a single batch) and used base 2.
            print('\nPerplexity :', np.exp(total_loss / n_batches))
            print('Test Loss :', total_loss)
            print('Test Acc :', t_correct / total)
        sys.stdout.flush()
        test_iterator.reset()



    # In[8]:


    def run(n_epochs, BATCH_SIZE, embedding_dim, units):
        """Download the spa-eng corpus, then train and evaluate the seq2seq model.

        Training stops after ``n_epochs`` epochs or once the wall-clock budget
        is exceeded, whichever comes first. The model is serialized after
        every tenth completed epoch.

        Args:
            n_epochs: Maximum number of epochs.
            BATCH_SIZE: Batch size for both iterators.
            embedding_dim: Embedding dimension of encoder and decoder.
            units: LSTM output size of encoder and decoder.
        """
        max_seconds = 86400  # wall-clock budget: 24 hours
        save_every = 10      # checkpoint period, in completed epochs

        def _fetch_corpus():
            # Stdlib replacement for tf.keras.utils.get_file(..., extract=True);
            # `tf` is never imported in this notebook. It uses the same
            # ~/.keras/datasets location and file names the old code read
            # from, so an existing download is reused.
            cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
            corpus_path = os.path.join(cache_dir, 'spa-eng', 'spa.txt')
            if not os.path.exists(corpus_path):
                os.makedirs(cache_dir, exist_ok=True)
                archive_path = os.path.join(cache_dir, 'spa-eng.zip')
                urllib.request.urlretrieve(
                    'http://download.tensorflow.org/data/spa-eng.zip', archive_path)
                with zipfile.ZipFile(archive_path) as archive:
                    archive.extractall(cache_dir)
            return corpus_path

        path_to_file = _fetch_corpus()
        (input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val,
         vocab_inp_size, vocab_tar_size, _, _) = create_db(path_to_file)

        # create dataset
        dataset_train = chainer.datasets.TupleDataset(input_tensor_train, target_tensor_train)
        train_iterator = chainer.iterators.SerialIterator(dataset_train, BATCH_SIZE, repeat=False, shuffle=False, order_sampler=None)
        # Evaluate on the held-out split; this was previously built from the
        # training tensors, so the "test" metrics were training metrics.
        dataset_test = chainer.datasets.TupleDataset(input_tensor_val, target_tensor_val)
        test_iterator = chainer.iterators.SerialIterator(dataset_test, BATCH_SIZE, repeat=False, shuffle=False, order_sampler=None)

        model = Model(BATCH_SIZE, units, embedding_dim, vocab_inp_size, vocab_tar_size)
        model.to_gpu()
        optimizer = chainer.optimizers.Adam(0.00001).setup(model)

        start = time.time()
        time_consumed = 0.0
        # YN is a project-local helper; presumably it records resource usage
        # between StartLogger and EndLogger -- confirm against LoggerYN.
        memT, cpuT, gpuT = YN.StartLogger("Chainer_GPU", "Manythings")
        epoch = 1
        while epoch <= n_epochs and time_consumed <= max_seconds:
            print("Epoch", epoch)
            train(model, train_iterator, optimizer)
            test(model, test_iterator)

            # Checkpoint on the epoch that just finished. Checking after the
            # increment saved after epochs 9, 19, ... and skipped the last one.
            if epoch % save_every == 0:
                serializers.save_npz('Chainer_GPU_ManyThings_LSTM_model', model)

            epoch += 1
            time_consumed = time.time() - start
            print("Time since beginning ", str(datetime.timedelta(seconds=time_consumed)))
            sys.stdout.flush()

        YN.EndLogger(memT, cpuT, gpuT)
        print("\n\nTotal Time Consumed ", str(datetime.timedelta(seconds=time_consumed)))


    # In[ ]:


    run(n_epochs, BATCH_SIZE=128, embedding_dim=256, units=256)








In [6]:
def run_ptb(n_epochs):


    class PtbIterator(Iterator):
        """Sliding-window batch iterator over the sequence returned by ``load_ptb``.

        Each batch row is a window of ``seq_len`` items; consecutive rows
        start ``skip_step`` items apart. Targets are the inputs shifted by one.
        """

        def __init__(self, train, batch_size, seq_len, skip_step=5):
            self.data = load_ptb(train)
            self.batch_size = batch_size
            self.seq_len = seq_len
            self.skip_step = skip_step
            self.reset()

        def __next__(self):
            """Return ``(inputs, targets)`` for the next batch.

            ``inputs`` is a Variable of shape ``(batch_size, seq_len)``;
            ``targets`` is the transposed target matrix, flattened.
            """
            shape = (self.batch_size, self.seq_len)
            inputs = np.zeros(shape, dtype=np.int32)
            targets = np.zeros(shape, dtype=np.int32)

            for row in range(self.batch_size):
                window_end = self.cur_idx + self.seq_len + 1
                if window_end >= len(self.data):
                    # Out of data: flag the epoch boundary and wrap around.
                    self.epoch += 1
                    self.is_new_epoch = True
                    self.cur_idx = 0
                begin = self.cur_idx
                inputs[row, :] = self.data[begin:begin + self.seq_len]
                targets[row, :] = self.data[begin + 1:begin + self.seq_len + 1]
                self.cur_idx += self.skip_step
            return ch.Variable(inputs), F.flatten(targets.T)

        def reset(self):
            """Rewind to the start of the data and clear the epoch bookkeeping."""
            self.cur_idx = 0
            self.is_new_epoch = False
            self.epoch = 0


    # In[3]:


    class PtbLstm(ch.Chain):
        """Embedding -> stacked LSTM layers -> dropout -> linear vocabulary scores."""

        def __init__(self, vocabulary_size, hidden_size, num_layers, dropout):
            super().__init__()
            self.dropout = dropout
            with self.init_scope():
                self.embed = L.EmbedID(
                    in_size=vocabulary_size,
                    out_size=hidden_size,
                    initialW=init.Uniform(1.0),
                )
                # Layers are registered as lstm_1..lstm_N and also kept in a
                # plain list for ordered iteration.
                self.lstms = []
                for layer_no in range(1, num_layers + 1):
                    layer = L.LSTM(
                        in_size=hidden_size,
                        out_size=hidden_size,
                        upward_init=init.GlorotUniform(),
                        lateral_init=init.Orthogonal(),
                    )
                    setattr(self, 'lstm_' + str(layer_no), layer)
                    self.lstms.append(layer)
                self.fc = L.Linear(
                    in_size=hidden_size,
                    out_size=vocabulary_size,
                    initialW=init.GlorotUniform(),
                )

        def reset_state(self):
            """Reset the recurrent state of every LSTM layer."""
            for layer in self.lstms:
                layer.reset_state()

        def forward(self, x):
            """Return per-step scores for ``x``, concatenated along axis 0.

            ``x`` is transposed before embedding, and the LSTM state is reset
            on every call.
            """
            embedded = self.embed(x.T)
            self.reset_state()
            scores = []
            for step in embedded:
                hidden = step
                for layer in self.lstms:
                    hidden = layer(hidden)
                # Dropout is applied once, after the last LSTM layer.
                hidden = F.dropout(hidden, self.dropout)
                scores.append(self.fc(hidden))
            return F.concat(scores, axis=0)


    # In[4]:


    def ptb_train(model, data_iter, criterion, optimizer, epoch, print_every=5000):
        """Run one training epoch on the GPU with periodic progress reports.

        Every ``print_every`` iterations the mean loss and the accuracy since
        the previous report are printed and the counters are reset.
        ``epoch`` is used only in the printed messages.
        """
        data_iter.reset()
        window_losses = []
        hits = 0
        seen = 0
        step = 0
        while not data_iter.is_new_epoch:
            step += 1
            inputs, labels = data_iter.next()
            inputs.to_gpu()
            labels.to_gpu()

            model.cleargrads()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.update()

            predicted = F.argmax(outputs, axis=1)
            hits += (predicted.array == labels.array).sum()
            seen += labels.size

            # .get() copies the device scalar to the host.
            window_losses.append(loss.array.get())
            if step % print_every == 0:
                print('[%d, %5d] train loss: %.3f' % (epoch, step, np.mean(window_losses)))
                print('[%d, %5d] train accu: %.3f' % (epoch, step, hits / seen))
                sys.stdout.flush()

                window_losses = []
                hits = 0
                seen = 0


    def ptb_test(model, data_iter, criterion, epoch):
        """Evaluate ``model`` over one pass of ``data_iter`` on the GPU.

        Prints the mean batch loss, the perplexity (``exp`` of that mean) and
        the accuracy. ``epoch`` is used only in the printed messages.
        """
        losses = []
        t_correct = 0
        total = 0
        # using_config('train', False) disables F.dropout inside the model.
        # Without it, dropout stayed active during evaluation and distorted
        # the reported loss, perplexity and accuracy.
        with ch.using_config('train', False), ch.no_backprop_mode():
            data_iter.reset()
            while not data_iter.is_new_epoch:
                inputs, labels = data_iter.next()
                inputs.to_gpu()
                labels.to_gpu()

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                # .get() copies the device scalar to the host.
                losses.append(loss.array.get())

                y_pred = F.argmax(outputs, axis=1)
                t_correct += (y_pred.array == labels.array).sum()
                total += labels.size

        loss = np.mean(losses)
        perplexity = np.exp(loss)
        print('[%d] test loss: %.3f perplexity: %.3f' % (epoch, loss, perplexity))
        print('[%d] test accu: %.3f' % (epoch, t_correct / total))
        sys.stdout.flush()



    def ptb_run(n_epochs, hidden_size, batch_size, seq_len, dropout, num_layers):
        """Build the PTB language-model pipeline and train/evaluate it on the GPU.

        Training stops after ``n_epochs`` epochs or once the wall-clock budget
        is exceeded, whichever comes first. The model is serialized after
        every tenth completed epoch.

        Args:
            n_epochs: Maximum number of epochs.
            hidden_size: Embedding and LSTM size of the model.
            batch_size: Rows per batch for both iterators.
            seq_len: Window length for both iterators.
            dropout: Dropout ratio passed to the model.
            num_layers: Number of stacked LSTM layers.
        """
        max_seconds = 86400  # wall-clock budget: 24 hours
        save_every = 10      # checkpoint period, in completed epochs

        random.seed(1)
        np.random.seed(1)

        ptb_vocab = load_ptb_vocab()
        vocabulary_size = len(ptb_vocab)

        train_iter = PtbIterator(train=True, batch_size=batch_size, seq_len=seq_len)
        test_iter = PtbIterator(train=False, batch_size=batch_size, seq_len=seq_len)

        model = PtbLstm(vocabulary_size, hidden_size, num_layers, dropout)
        model.to_gpu()

        criterion = F.softmax_cross_entropy
        optimizer = ch.optimizers.AdaDelta().setup(model)

        # YN is a project-local helper; presumably it records resource usage
        # between StartLogger and EndLogger -- confirm against LoggerYN.
        memT, cpuT, gpuT = YN.StartLogger("Chainer_GPU", "PTB")
        start = time.time()
        time_consumed = 0.0
        epoch = 1

        while epoch <= n_epochs and time_consumed <= max_seconds:
            ptb_train(model, train_iter, criterion, optimizer, epoch)
            ptb_test(model, test_iter, criterion, epoch)

            # Checkpoint on the epoch that just finished. Checking after the
            # increment saved after epochs 9, 19, ... and skipped the last one.
            if epoch % save_every == 0:
                serializers.save_npz('Chainer_GPU_PTB_model', model)

            epoch += 1
            time_consumed = time.time() - start
            print("Time since beginning ", str(datetime.timedelta(seconds=time_consumed)))
            sys.stdout.flush()

        YN.EndLogger(memT, cpuT, gpuT)
        print("\n\nTotal Time Consumed ", str(datetime.timedelta(seconds=time_consumed)))



    ptb_run(n_epochs, hidden_size=200, batch_size=20, seq_len=30, dropout=0.5, num_layers=2)


    # In[ ]:







In [7]:
#run_imdb(n_epochs=50)
#run_manythings(n_epochs=100)
#run_ptb(n_epochs=50)


KeyboardInterrupt: 