In [52]:
import keras
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras import initializers as init
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import LoggerYN as YN
import warnings
import tensorflow as tf
import unicodedata
import time
import datetime
from data import load_imdb 
import sys
from keras import optimizers as optim
from keras import utils
from sklearn.model_selection import train_test_split
import random
import time
import re
import os
import datetime
from keras.callbacks import ModelCheckpoint
from keras.models import Model
import sys
from data import load_ptb, load_ptb_vocab
from keras.layers import Input, LSTM, Embedding, Dense
from keras.callbacks import ModelCheckpoint





# Silence every Python warning for the rest of the session. Note that this
# also hides deprecation notices emitted by the imported libraries.
warnings.filterwarnings('ignore')




In [53]:

def run_imdb(n_epochs, checkpoint_path="/home/wahab/DLBench_Addl/IMDB/weights_GPU_Keras_IMDB.hdf5"):
    """Train an Embedding -> LSTM -> sigmoid classifier on the IMDB data and log the run.

    Args:
        n_epochs: number of epochs passed to ``model.fit``.
        checkpoint_path: file the ``ModelCheckpoint`` callback writes to.
            Defaults to the location this notebook originally hard-coded.

    Side effects:
        Starts/stops the ``YN`` resource logger, prints training progress and
        the total wall-clock time, writes the checkpoint file and saves the
        final weights to ``model_GPU_IMDB_Keras.h5`` in the working directory.
    """

    def imdb_data(train, vocabulary_size, seq_len):
        """Load one split through ``load_imdb`` and post-pad it to ``seq_len``."""
        x, y = load_imdb(train, vocabulary_size, seq_len)
        x = pad_sequences(x, maxlen=seq_len, padding='post')
        return x, y

    def imdb_lstm(vocabulary_size, embedding_size, seq_len, hidden_size):
        """Build the (uncompiled) Embedding -> LSTM -> Dense(1, sigmoid) model."""
        model = Sequential()
        # mask_zero=True makes the LSTM skip the zero padding added by imdb_data.
        model.add(Embedding(vocabulary_size, embedding_size, mask_zero=True, input_length=seq_len,
                            embeddings_initializer=init.RandomUniform(-1.0, 1.0)))
        model.add(LSTM(hidden_size, kernel_initializer='glorot_uniform',
                       recurrent_initializer='orthogonal', bias_initializer='zeros'))
        model.add(Dense(1, activation='sigmoid'))
        return model

    class ImdbLogCallback(keras.callbacks.Callback):
        """Print the running mean training loss and the per-epoch test metrics."""

        def __init__(self, test, print_every=50):
            # Let the Keras base class initialise its own bookkeeping attributes.
            super().__init__()
            self.x_test, self.y_test = test
            self.print_every = print_every

        # ``logs=None`` instead of ``logs={}``: a dict default would be one
        # shared object reused across every call.
        def on_train_begin(self, logs=None):
            self.losses = []

        def on_epoch_begin(self, epoch, logs=None):
            self.epoch = epoch

        def on_batch_end(self, batch, logs=None):
            self.losses.append((logs or {}).get('loss'))
            if (batch + 1) % self.print_every == 0:
                # Mean over the batches seen since the previous print.
                print('[%d, %5d] train loss: %.3f' % (self.epoch, batch + 1, np.mean(self.losses)))
                sys.stdout.flush()
                self.losses = []

        def on_epoch_end(self, epoch, logs=None):
            loss, accuracy = self.model.evaluate(self.x_test, self.y_test, verbose=2)
            print('[%d] test loss: %.3f accuracy: %.3f' % (self.epoch, loss, accuracy))
            sys.stdout.flush()

    def imdb_run(n_epochs, vocabulary_size, seq_len, batch_size, embedding_size, hidden_size):
        """Load the data, train for ``n_epochs`` and save the resulting weights."""
        np.random.seed(1)

        x_train, y_train = imdb_data(train=True, vocabulary_size=vocabulary_size, seq_len=seq_len)
        x_test, y_test = imdb_data(train=False, vocabulary_size=vocabulary_size, seq_len=seq_len)

        model = imdb_lstm(vocabulary_size, embedding_size, seq_len, hidden_size)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        memT, cpuT, gpuT = YN.StartLogger("Keras_GPU", "IMDB")
        start = time.time()

        # Unconditional checkpoint every 10 epochs (save_best_only=False).
        checkpoint = ModelCheckpoint(checkpoint_path, verbose=1, save_best_only=False, period=10)

        model.fit(x_train, y_train, batch_size=batch_size, epochs=n_epochs,
                  verbose=2, callbacks=[ImdbLogCallback((x_test, y_test)), checkpoint])

        end = time.time()

        YN.EndLogger(memT, cpuT, gpuT)
        print("total time", str(datetime.timedelta(seconds=end-start)))
        model.save_weights("model_GPU_IMDB_Keras.h5")
        sys.stdout.flush()

    # Force allow_pickle=True on np.load while the run is active (presumably
    # because load_imdb reads pickled arrays -- TODO confirm). The original
    # function is restored afterwards: leaving the wrapper installed made a
    # second call wrap the wrapper and fail with "multiple values for keyword
    # argument 'allow_pickle'", and silently changed np.load for the whole kernel.
    np_load_old = np.load
    np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)
    try:
        imdb_run(n_epochs, vocabulary_size=5000, seq_len=500, batch_size=64,
                 embedding_size=32, hidden_size=100)
    finally:
        np.load = np_load_old








In [54]:

def run_ptb(n_epochs):
    """Train a stacked-LSTM language model on the PTB data and log the run.

    All helpers are nested inside this function, mirroring ``run_imdb`` and
    ``run_manythings``. (Previously the ``def`` line had no body and the
    helpers sat at module level, so the cell could not be compiled.)

    Args:
        n_epochs: number of epochs passed to ``model.fit_generator``.

    Side effects:
        Starts/stops the ``YN`` resource logger and prints training progress,
        per-epoch test metrics and the total wall-clock time.
    """

    class PtbGenerator:
        """Endless batch generator of (input ids, one-hot next-token targets)."""

        def __init__(self, train, batch_size, seq_len, vocabulary_size, skip_step=5):
            self.data = load_ptb(train)
            self.batch_size = batch_size
            self.seq_len = seq_len
            self.vocabulary_size = vocabulary_size
            self.skip_step = skip_step  # stride between consecutive window starts
            self.cur_idx = 0

        def __len__(self):
            """Number of batches treated as one pass over the data."""
            return (len(self.data) - self.seq_len - 1) // (self.skip_step * self.batch_size)

        def generate(self):
            """Yield ``(x, y)`` forever, wrapping to the start when the data runs out.

            ``x`` has shape (batch_size, seq_len); ``y`` has shape
            (batch_size, seq_len, vocabulary_size) and holds the one-hot
            encoding of the tokens shifted one step ahead of ``x``.
            """
            while True:
                # Allocate fresh arrays for every batch. fit_generator prefetches
                # batches into a queue from a background thread, so reusing one
                # buffer would overwrite batches that are queued but not yet consumed.
                x = np.zeros((self.batch_size, self.seq_len), dtype=np.int32)
                y = np.zeros((self.batch_size, self.seq_len, self.vocabulary_size), dtype=np.int32)
                for i in range(self.batch_size):
                    # The target window needs one token beyond the input window.
                    if self.cur_idx + self.seq_len >= len(self.data):
                        self.cur_idx = 0
                    x[i, :] = self.data[self.cur_idx:self.cur_idx+self.seq_len]
                    y_batch = self.data[self.cur_idx+1:self.cur_idx+self.seq_len+1]
                    y[i, :, :] = utils.to_categorical(y_batch, num_classes=self.vocabulary_size)
                    self.cur_idx += self.skip_step
                yield x, y

    def ptb_lstm(vocabulary_size, hidden_size, seq_len, num_layers, dropout):
        """Build the (uncompiled) Embedding -> LSTM x num_layers -> Dropout -> softmax model."""
        model = Sequential()
        model.add(Embedding(vocabulary_size, hidden_size, input_length=seq_len,
                            embeddings_initializer=init.RandomUniform(-1.0, 1.0)))
        for _ in range(num_layers):
            # return_sequences=True: a prediction is made at every time step.
            model.add(LSTM(hidden_size, return_sequences=True, kernel_initializer='glorot_uniform',
                           recurrent_initializer='orthogonal', bias_initializer='zeros'))
        model.add(Dropout(dropout))
        model.add(Dense(vocabulary_size, activation='softmax'))
        return model

    class PtbLogCallback(keras.callbacks.Callback):
        """Print the running mean training loss and per-epoch test loss/perplexity/accuracy."""

        def __init__(self, test_generator, print_every=4500):
            # Let the Keras base class initialise its own bookkeeping attributes.
            super().__init__()
            self.test_generator = test_generator
            self.print_every = print_every

        # ``logs=None`` instead of ``logs={}``: a dict default would be one
        # shared object reused across every call.
        def on_train_begin(self, logs=None):
            self.losses = []

        def on_epoch_begin(self, epoch, logs=None):
            self.epoch = epoch

        def on_batch_end(self, batch, logs=None):
            self.losses.append((logs or {}).get('loss'))
            if (batch + 1) % self.print_every == 0:
                # Mean over the batches seen since the previous print.
                print('[%d, %5d] train loss: %.3f' % (self.epoch, batch + 1, np.mean(self.losses)))
                self.losses = []
                sys.stdout.flush()

        def on_epoch_end(self, epoch, logs=None):
            loss, accuracy = self.model.evaluate_generator(self.test_generator.generate(),
                                                           steps=len(self.test_generator),
                                                           verbose=2)
            # Perplexity is reported as exp(loss).
            print('[%d] test loss: %.3f perplexity: %.3f' % (self.epoch, loss, np.exp(loss)))
            print('[%d] accuracy: %.3f' % (self.epoch, accuracy))
            print("\n")
            sys.stdout.flush()

    def ptb_run(n_epochs, hidden_size, batch_size, seq_len, dropout, num_layers):
        """Build the generators and the model, train for ``n_epochs`` and print the elapsed time."""
        np.random.seed(1)

        ptb_vocab = load_ptb_vocab()
        vocabulary_size = len(ptb_vocab)

        train_generator = PtbGenerator(train=True, batch_size=batch_size,
                                       seq_len=seq_len, vocabulary_size=vocabulary_size)
        test_generator = PtbGenerator(train=False, batch_size=batch_size,
                                      seq_len=seq_len, vocabulary_size=vocabulary_size)

        model = ptb_lstm(vocabulary_size, hidden_size, seq_len, num_layers, dropout)
        model.compile(loss='categorical_crossentropy',
                      optimizer=optim.Adadelta(lr=1.0, rho=0.95, epsilon=1e-6),
                      metrics=['accuracy'])

        memT, cpuT, gpuT = YN.StartLogger("Keras_GPU", "PTB")
        start = time.time()

        # NOTE(review): a ModelCheckpoint (period=5) used to be constructed here
        # but was never passed to fit_generator, so no checkpoint was ever
        # written. The unused object was dropped; add it to ``callbacks`` if
        # checkpointing is actually wanted.
        model.fit_generator(train_generator.generate(), steps_per_epoch=len(train_generator),
                            epochs=n_epochs, verbose=2, callbacks=[PtbLogCallback(test_generator)])

        end = time.time()
        YN.EndLogger(memT, cpuT, gpuT)

        print(str(datetime.timedelta(seconds=end-start)))

    ptb_run(n_epochs, hidden_size=200, batch_size=20, seq_len=30, dropout=0.5, num_layers=2)






In [55]:
def run_manythings(n_epochs):
    """Train an LSTM encoder-decoder on the spa-eng sentence pairs and log the run.

    Args:
        n_epochs: number of epochs passed to ``model.fit_generator``.

    Side effects:
        Fetches ``spa-eng.zip`` through ``tf.keras.utils.get_file``,
        starts/stops the ``YN`` resource logger, prints the total wall-clock
        time and saves the final weights to ``Keras_GPU_model.h5``.
    """

    def unicode_to_ascii(s):
        """Drop combining marks (category 'Mn') after NFD normalisation, i.e. strip accents."""
        return ''.join(c for c in unicodedata.normalize('NFD', s)
                       if unicodedata.category(c) != 'Mn')

    def preprocess_sentence(w):
        """Lower-case, de-accent and tokenise a sentence, then wrap it in start/end tokens."""
        w = unicode_to_ascii(w.lower().strip())

        # Put a space around punctuation, e.g. "he is a boy." => "he is a boy ."
        # Reference: https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
        w = re.sub(r"([?.!,¿])", r" \1 ", w)
        w = re.sub(r'[" "]+', " ", w)

        # Replace everything except (a-z, A-Z, ".", "?", "!", ",", "¿") with a space.
        w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

        w = w.strip()

        # Start/end tokens tell the model when to start and stop predicting.
        return '<start> ' + w + ' <end>'

    def create_dataset(path):
        """Read tab-separated sentence pairs from ``path`` and clean both sides.

        Only the first 100000 lines are used. Each returned pair keeps the
        column order of the file; callers unpack it as ``en, sp``.
        """
        # The context manager closes the file handle deterministically.
        with open(path, encoding='UTF-8') as handle:
            lines = handle.read().strip().split('\n')

        return [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:100000]]

    class LanguageIndex():
        """Word -> index (e.g. "dad" -> 5) and index -> word mappings for one language.

        Index 0 is reserved for ``'<pad>'``; the remaining words are numbered
        from 1 in sorted order.
        """

        def __init__(self, lang):
            self.lang = lang  # iterable of phrases; consumed once by create_index
            self.word2idx = {}
            self.idx2word = {}
            self.vocab = set()

            self.create_index()

        def create_index(self):
            for phrase in self.lang:
                self.vocab.update(phrase.split(' '))
            self.vocab = sorted(self.vocab)
            self.word2idx['<pad>'] = 0
            for index, word in enumerate(self.vocab):
                self.word2idx[word] = index + 1
            for word, index in self.word2idx.items():
                self.idx2word[index] = word

    def max_length(tensor):
        """Length of the longest sequence in ``tensor``."""
        return max(len(t) for t in tensor)

    def load_dataset(path):
        """Vectorise and post-pad both languages.

        Returns:
            (input_tensor, target_tensor, inp_lang, targ_lang,
             max_length_inp, max_length_tar), where the input side is the
            second column of each pair (``sp``) and the target side the first (``en``).
        """
        pairs = create_dataset(path)

        inp_lang = LanguageIndex(sp for en, sp in pairs)
        targ_lang = LanguageIndex(en for en, sp in pairs)

        input_tensor = [[inp_lang.word2idx[s] for s in sp.split(' ')] for en, sp in pairs]
        target_tensor = [[targ_lang.word2idx[s] for s in en.split(' ')] for en, sp in pairs]

        # Pad every sequence to the longest sentence found in the dataset.
        max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)

        input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor,
                                                                     maxlen=max_length_inp,
                                                                     padding='post')
        target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor,
                                                                      maxlen=max_length_tar,
                                                                      padding='post')

        return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar

    def create_db(path_to_file):
        """Load the dataset and split it 80/20 into train and validation sets (fixed seed)."""
        input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(path_to_file)
        input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
            input_tensor, target_tensor, test_size=0.2, random_state=42)
        vocab_inp_size = len(inp_lang.word2idx)
        vocab_tar_size = len(targ_lang.word2idx)
        return (input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val,
                vocab_inp_size, vocab_tar_size, max_length_inp, max_length_targ)

    def generate_batch(X, y, batch_size, max_length_inp, max_length_targ, vocab_tar_size):
        """Yield ``([encoder_input, decoder_input], decoder_target)`` batches forever.

        ``decoder_input`` is the target sequence without its last token and
        ``decoder_target`` is the one-hot target sequence shifted one step to
        the left (so it does not contain the start token).
        """
        while True:
            for j in range(0, len(X), batch_size):
                batch_x = X[j:j+batch_size]
                batch_y = y[j:j+batch_size]
                # Size the arrays by the rows actually present: the final slice
                # can be shorter than batch_size, and allocating batch_size rows
                # padded it with all-zero fake samples.
                n_rows = len(batch_x)
                encoder_input_data = np.zeros((n_rows, max_length_inp), dtype='float32')
                decoder_input_data = np.zeros((n_rows, max_length_targ), dtype='float32')
                decoder_target_data = np.zeros((n_rows, max_length_targ, vocab_tar_size), dtype='float32')
                for i, (input_text, target_text) in enumerate(zip(batch_x, batch_y)):
                    encoder_input_data[i, :len(input_text)] = input_text
                    n_steps = len(target_text) - 1
                    if n_steps > 0:
                        # Decoder input: every token except the last one.
                        decoder_input_data[i, :n_steps] = target_text[:-1]
                        # Decoder target: tokens 1..end, one-hot, at positions 0..n_steps-1.
                        decoder_target_data[i, np.arange(n_steps), target_text[1:]] = 1.
                yield([encoder_input_data, decoder_input_data], decoder_target_data)

    def perplexity(y_true, y_pred):
        """Backend metric: exp of the mean categorical cross-entropy.

        Uses backend ops throughout (the previous ``np.power(2, ...)`` applied
        NumPy to a backend tensor) and base e to match the natural-log
        cross-entropy. Not currently passed to ``model.compile``.
        """
        cross_entropy = keras.losses.categorical_crossentropy(y_true, y_pred)
        return keras.backend.exp(keras.backend.mean(cross_entropy))

    def create_model(embedding_dim, units, vocab_inp_size, vocab_tar_size):
        """Build the (uncompiled) encoder-decoder training model."""
        # Encoder: only the final LSTM states are kept.
        encoder_inputs = Input(shape=(None,))
        enc_emb = Embedding(vocab_inp_size, embedding_dim, mask_zero=True)(encoder_inputs)
        encoder_lstm = LSTM(units, return_state=True, kernel_initializer='glorot_uniform',
                            recurrent_initializer='orthogonal', bias_initializer='zeros')
        encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
        encoder_states = [state_h, state_c]

        # Decoder: initialised with the encoder states, predicts a token per step.
        decoder_inputs = Input(shape=(None,))
        dec_emb_layer = Embedding(vocab_tar_size, embedding_dim, mask_zero=True)
        dec_emb = dec_emb_layer(decoder_inputs)
        decoder_lstm = LSTM(units, return_sequences=True, return_state=True,
                            kernel_initializer='glorot_uniform',
                            recurrent_initializer='orthogonal', bias_initializer='zeros')
        decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                             initial_state=encoder_states)
        decoder_dense = Dense(vocab_tar_size, activation='softmax')
        decoder_outputs = decoder_dense(decoder_outputs)
        return Model([encoder_inputs, decoder_inputs], decoder_outputs)

    def run(epochs, BATCH_SIZE, embedding_dim, units):
        """Fetch the data, train for ``epochs`` and save the resulting weights."""
        path_to_zip = tf.keras.utils.get_file('spa-eng.zip', origin='http://download.tensorflow.org/data/spa-eng.zip', extract=True)
        path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng", "spa.txt")
        (input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val,
         vocab_inp_size, vocab_tar_size, max_length_inp, max_length_targ) = create_db(path_to_file)
        N_BATCH = len(input_tensor_train)//BATCH_SIZE
        val_samples = len(input_tensor_val)
        model = create_model(embedding_dim, units, vocab_inp_size, vocab_tar_size)
        # NOTE(review): an Adam(lr=0.0001) optimizer used to be constructed here
        # but was never passed to compile(); training has always used the
        # default 'adam' settings. Pass an optimizer instance below if the
        # lower learning rate was intended.
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        memT, cpuT, gpuT = YN.StartLogger("Keras_GPU", "Manythings")

        start = time.time()
        model.fit_generator(verbose=2,
                            generator=generate_batch(input_tensor_train, target_tensor_train, BATCH_SIZE,
                                                     max_length_inp, max_length_targ, vocab_tar_size),
                            steps_per_epoch=N_BATCH, epochs=epochs,
                            validation_data=generate_batch(input_tensor_val, target_tensor_val, BATCH_SIZE,
                                                           max_length_inp, max_length_targ, vocab_tar_size),
                            validation_steps=val_samples//BATCH_SIZE)
        end = time.time()
        YN.EndLogger(memT, cpuT, gpuT)
        print("total time", str(datetime.timedelta(seconds=end-start)))
        model.save_weights("Keras_GPU_model.h5")
        sys.stdout.flush()

    run(n_epochs, BATCH_SIZE=128, embedding_dim=256, units=256)




In [56]:
#run_imdb(n_epochs=50)
#run_ptb(n_epochs=50)
#run_manythings(n_epochs=50)

Epoch 1/50
  5/625 [..............................] - ETA: 21:48 - loss: 9.2850 - perplexity: 426.9775 - acc: 0.0061   

KeyboardInterrupt: 