In [9]:

import datetime
import math
import os
import re
import sys
import time
import unicodedata

import numpy as np
import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as utils
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

import LoggerYN as YN
from data import load_imdb
from data import load_ptb, load_ptb_vocab



In [10]:
def run_imdb(n_epochs):

    # Make np.load default to allow_pickle=True (presumably required by
    # load_imdb -- TODO confirm against the data module).
    # NOTE: this patch is process-wide and is not undone when run_imdb returns.
    # SECURITY: allow_pickle=True can execute arbitrary code while loading, so
    # only files from a trusted source must be read through it.
    #
    # Unwrap a previously installed patch first: stacking wrappers would pass
    # allow_pickle twice and raise TypeError on the second run of this code.
    np_load_old = getattr(np.load, '_np_load_unpatched', np.load)

    def _np_load_allow_pickle(*a, **k):
        # setdefault (instead of a hard-coded keyword) keeps an explicit
        # allow_pickle=... supplied by the caller working.
        k.setdefault('allow_pickle', True)
        return np_load_old(*a, **k)

    _np_load_allow_pickle._np_load_unpatched = np_load_old
    np.load = _np_load_allow_pickle


    # In[3]:


    class ImdbDataset(Dataset):
        """IMDB reviews kept in memory as one zero-padded tensor.

        Attributes:
            x: LongTensor of token ids, padded with zeros to the longest review
               returned by ``load_imdb`` (batch-first layout).
            lens: LongTensor with the unpadded length of every review.
            y: FloatTensor of labels, one per review.
        """

        def __init__(self, train, vocabulary_size, seq_len):
            # The three arguments are forwarded unchanged to load_imdb.
            reviews, sentiments = load_imdb(train, vocabulary_size, seq_len)

            review_tensors = [torch.LongTensor(review) for review in reviews]
            # Lengths are recorded before padding so they can later be used to
            # pack the padded batch.
            self.lens = torch.LongTensor(list(map(len, reviews)))
            self.x = pad_sequence(review_tensors, batch_first=True)
            self.y = torch.FloatTensor(sentiments)

        def __len__(self):
            """Number of reviews."""
            return self.x.shape[0]

        def __getitem__(self, idx):
            """Return ``(padded_tokens, length, label)`` for review ``idx``."""
            return self.x[idx], self.lens[idx], self.y[idx]

    def collate_sequences(batch):
        """Collate ``(tokens, length, label)`` samples, longest sequence first.

        Returns:
            ``((sequences, lengths), labels)`` where ``sequences`` is the stacked
            token tensor, ``lengths`` a LongTensor and ``labels`` a FloatTensor,
            all ordered by decreasing length (the order pack_padded_sequence
            expects by default).
        """
        def _length_of(sample):
            return sample[1]

        # sorted() is stable, so samples of equal length keep their batch order.
        ordered = sorted(batch, key=_length_of, reverse=True)

        sequences = torch.stack([sample[0] for sample in ordered])
        lengths = torch.LongTensor([sample[1] for sample in ordered])
        labels = torch.FloatTensor([sample[2] for sample in ordered])
        return (sequences, lengths), labels


    # In[4]:


    class ImdbLstm(nn.Module):
        """Embedding -> single-layer LSTM -> linear -> sigmoid binary classifier."""

        def __init__(self, vocabulary_size, embedding_size, hidden_size):
            super().__init__()

            # Layers are created and randomly initialised in a fixed order
            # because every random init draws from the global torch RNG.
            self.embed = nn.Embedding(vocabulary_size, embedding_size)
            nn.init.uniform_(self.embed.weight, a=-1.0, b=1.0)

            self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
            nn.init.xavier_uniform_(self.lstm.weight_ih_l0)
            nn.init.orthogonal_(self.lstm.weight_hh_l0)
            # Biases are zero except the second hidden_size-wide chunk of
            # bias_ih, which is set to one (the forget gate in PyTorch's
            # input/forget/cell/output bias layout).
            forget_chunk = slice(hidden_size, 2 * hidden_size)
            with torch.no_grad():
                self.lstm.bias_ih_l0.zero_()
                self.lstm.bias_ih_l0[forget_chunk].fill_(1.0)
                self.lstm.bias_hh_l0.zero_()

            self.fc = nn.Linear(hidden_size, 1)
            nn.init.xavier_uniform_(self.fc.weight)
            nn.init.zeros_(self.fc.bias)

        def forward(self, inputs):
            """Return one probability per sequence.

            Args:
                inputs: ``(tokens, lengths)`` pair; ``tokens`` is a batch-first
                    padded id tensor and ``lengths`` the unpadded lengths, in
                    decreasing order (pack_padded_sequence is called with its
                    default ``enforce_sorted``).
            """
            tokens, lengths = inputs
            packed = pack_padded_sequence(self.embed(tokens), lengths, batch_first=True)
            _, (final_hidden, _) = self.lstm(packed)
            # final_hidden[-1] is the last layer's hidden state after the last
            # real (unpadded) step of each sequence.
            logits = self.fc(final_hidden[-1])
            return torch.sigmoid(logits).flatten()


    # In[5]:


    def imdb_train(model, data_loader, criterion, optimizer, epoch, print_every=50):
        """Run one training pass over ``data_loader``.

        Every ``print_every`` batches the mean loss of the batches seen since the
        previous report is printed; ``epoch`` is only used in that message.
        """
        model.train()

        window = []  # losses accumulated since the last report
        for step, (inputs, labels) in enumerate(data_loader, start=1):
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
            window.append(loss.item())

            if step % print_every != 0:
                continue

            print('[%d, %5d] train loss: %.3f' % (epoch, step, np.mean(window)))
            window = []
            sys.stdout.flush()


    def imdb_test(model, data_loader, criterion, epoch):
        """Evaluate ``model`` on ``data_loader`` and print loss and accuracy.

        The reported loss is the mean of the per-batch losses; accuracy is the
        percentage of outputs that land on the correct side of 0.5.
        """
        model.eval()

        batch_losses = []
        n_correct = 0
        n_seen = 0
        with torch.no_grad():
            for inputs, labels in data_loader:
                probs = model(inputs)
                batch_losses.append(criterion(probs, labels).item())

                hits = (probs >= 0.5).float() == labels
                n_correct += hits.sum().item()
                n_seen += hits.size(0)

        accuracy_pct = n_correct / n_seen * 100
        print('[%d] test loss: %.3f accuracy: %.3f' % (epoch, np.mean(batch_losses), accuracy_pct))
        sys.stdout.flush()


    def imdb_run(n_epochs, vocabulary_size, seq_len, batch_size, embedding_size, hidden_size):
        """Train and evaluate the IMDB LSTM, logging resource usage via LoggerYN.

        Training stops after ``n_epochs`` epochs or once the 24-hour wall-clock
        budget is exceeded (checked between epochs), whichever comes first.
        A checkpoint is written every 20 completed epochs and once more at the
        end so the final weights are never lost.

        Args:
            n_epochs: number of epochs to train.
            vocabulary_size, seq_len: forwarded to the dataset loader;
                ``vocabulary_size`` also sizes the embedding table.
            batch_size: mini-batch size for both loaders.
            embedding_size, hidden_size: model dimensions.
        """
        torch.manual_seed(1)

        train_dataset = ImdbDataset(train=True, vocabulary_size=vocabulary_size, seq_len=seq_len)
        test_dataset = ImdbDataset(train=False, vocabulary_size=vocabulary_size, seq_len=seq_len)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_sequences)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_sequences)

        model = ImdbLstm(vocabulary_size, embedding_size, hidden_size)
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters())

        memT,cpuT,gpuT = YN.StartLogger("PyTorch_CPU","IMDB")

        time_budget_s = 86400  # 24 hours
        # Named after the CPU run, matching the logger tag and the other
        # benchmarks in this file (was mislabelled 'Pytorch_GPU_...').
        checkpoint_path = 'Pytorch_CPU_IMDB_LSTM_model'

        start = time.time()
        time_consumed = time.time() - start
        epoch = 0

        # `epoch` counts completed epochs; `<` (not `<=`) so exactly n_epochs run.
        while time_consumed <= time_budget_s and epoch < n_epochs:
            print("\n\nEpoch ",epoch)
            imdb_train(model, train_loader, criterion, optimizer, epoch)
            imdb_test(model, test_loader, criterion, epoch)
            epoch += 1
            time_consumed = time.time() - start
            print("Time since beginning ", str(datetime.timedelta(seconds=time_consumed)) )
            sys.stdout.flush()
            if epoch % 20 == 0:
                torch.save(model.state_dict(), checkpoint_path)

        YN.EndLogger(memT,cpuT,gpuT)
        print("\n\nTotal Time Consumed ", str(datetime.timedelta(seconds=time_consumed)))

        # Persist the final weights unless no epoch ran at all.
        if epoch > 0:
            torch.save(model.state_dict(), checkpoint_path)


    # In[6]:


    imdb_run(n_epochs, vocabulary_size = 5000, seq_len = 500, batch_size = 64, embedding_size = 32, hidden_size = 100)


    # In[ ]:






In [11]:
def run_manythings(n_epochs):
    

    # Converts the unicode file to ascii
    def unicode_to_ascii(s):
        """Remove accents: NFD-decompose ``s`` and drop combining marks ('Mn').

        Characters that have no decomposition are kept as they are, so the
        result is not guaranteed to be pure ASCII.
        """
        decomposed = unicodedata.normalize('NFD', s)
        kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
        return ''.join(kept)


    def preprocess_sentence(w):
        """Normalise one sentence and wrap it in ``<start>`` / ``<end>`` tokens.

        Steps: lower-case, trim, strip accents, put spaces around ``? . ! , ¿``,
        collapse runs of spaces/double quotes, replace every other run of
        non-letters with a single space, trim again.
        """
        text = unicode_to_ascii(w.lower().strip())

        # Pad punctuation with spaces, e.g. "he is a boy." => "he is a boy ."
        # Reference: https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
        text = re.sub(r"([?.!,¿])", r" \1 ", text)
        text = re.sub(r'[" "]+', " ", text)

        # Anything that is not a letter or one of ? . ! , ¿ becomes a space.
        text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
        text = text.strip()

        # Start/end markers tell the model where a sentence begins and stops.
        return '<start> ' + text + ' <end>'

    # 1. Remove the accents
    # 2. Clean the sentences
    # 3. Return word pairs in the format: [ENGLISH, SPANISH]
    def create_dataset(path):
        """Read a tab-separated parallel corpus and preprocess every field.

        Args:
            path: UTF-8 text file with one tab-separated sentence pair per line.

        Returns:
            A list with one entry per line; each entry is the list of
            preprocessed fields of that line, in file order.
        """
        # `with` closes the file handle deterministically (it was leaked before).
        with open(path, encoding='UTF-8') as corpus:
            lines = corpus.read().strip().split('\n')

        word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines]

        return word_pairs

        # This class creates a word -> index mapping (e.g,. "dad" -> 5) and vice-versa 
    # (e.g., 5 -> "dad") for each language,
    class LanguageIndex():
        """Word <-> index vocabulary built from an iterable of phrases.

        Index 0 is reserved for ``'<pad>'``; the remaining words are numbered
        from 1 in sorted order.

        Attributes:
            lang: list of the phrases the index was built from.
            vocab: sorted list of distinct space-separated tokens.
            word2idx: token -> index mapping.
            idx2word: index -> token mapping.
        """

        def __init__(self, lang):
            # Materialise the phrases: callers pass one-shot generators, which
            # would otherwise leave `self.lang` exhausted after indexing.
            self.lang = list(lang)
            self.word2idx = {}
            self.idx2word = {}
            self.vocab = set()

            self.create_index()

        def create_index(self):
            """(Re)build ``vocab``, ``word2idx`` and ``idx2word`` from ``lang``."""
            # Collect into a local set so the method can safely be called again
            # (self.vocab is a list after the first call).
            words = set()
            for phrase in self.lang:
                words.update(phrase.split(' '))

            self.vocab = sorted(words)

            self.word2idx = {'<pad>': 0}
            for index, word in enumerate(self.vocab, start=1):
                self.word2idx[word] = index

            self.idx2word = {index: word for word, index in self.word2idx.items()}

    def max_length(tensor):
        """Return the length of the longest element of ``tensor``."""
        return max(map(len, tensor))


    def load_dataset(path):
        """Load the corpus at ``path`` and turn it into padded index matrices.

        The second field of each pair (named ``sp`` here) is the model input,
        the first (``en``) is the target.

        Returns:
            ``(input_tensor, target_tensor, inp_lang, targ_lang,
            max_length_inp, max_length_tar)`` where the tensors are post-padded
            to the longest sentence on their side and the ``*_lang`` objects are
            the corresponding LanguageIndex vocabularies.
        """
        # Cleaned [en, sp] pairs.
        pairs = create_dataset(path)

        # One vocabulary per side.
        inp_lang = LanguageIndex(sp for en, sp in pairs)
        targ_lang = LanguageIndex(en for en, sp in pairs)

        # Map every token to its vocabulary index.
        input_tensor = []
        target_tensor = []
        for en, sp in pairs:
            input_tensor.append([inp_lang.word2idx[tok] for tok in sp.split(' ')])
            target_tensor.append([targ_lang.word2idx[tok] for tok in en.split(' ')])

        # Pad each side up to its own longest sentence.
        max_length_inp = max_length(input_tensor)
        max_length_tar = max_length(target_tensor)

        pad = tf.keras.preprocessing.sequence.pad_sequences
        input_tensor = pad(input_tensor, maxlen=max_length_inp, padding='post')
        target_tensor = pad(target_tensor, maxlen=max_length_tar, padding='post')

        return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar


    # In[14]:


    class Encoder(nn.Module):
        """Embedding followed by a single-layer, batch-first LSTM."""

        def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
            super().__init__()
            self.batch_sz = batch_sz
            self.enc_units = enc_units

            # Construction/initialisation order is fixed: random inits consume
            # the global torch RNG.
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
            self.LSTM = nn.LSTM(embedding_dim, enc_units, batch_first=True)
            nn.init.xavier_uniform_(self.LSTM.weight_ih_l0)
            nn.init.orthogonal_(self.LSTM.weight_hh_l0)
            # All biases zero, except the second enc_units-wide chunk of bias_ih
            # (the forget gate in PyTorch's bias layout), which is one.
            with torch.no_grad():
                self.LSTM.bias_ih_l0.zero_()
                self.LSTM.bias_ih_l0[enc_units:2 * enc_units].fill_(1.0)
                self.LSTM.bias_hh_l0.zero_()

        def forward(self, x, hidden):
            """Encode ``x``; returns ``(output, (h_n, c_n))``.

            ``hidden`` is accepted for interface compatibility but not used:
            the LSTM always starts from its default (zero) state.
            """
            return self.LSTM(self.embedding(x))

        def initialize_hidden_state(self):
            """Return a ``(batch_sz, enc_units)`` tensor of zeros."""
            return torch.zeros(self.batch_sz, self.enc_units)
    class Decoder(nn.Module):
        """Embedding -> single-layer LSTM -> linear projection onto the vocabulary."""

        def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
            super().__init__()
            self.batch_sz = batch_sz
            self.dec_units = dec_units

            # All three layers are built before the LSTM is re-initialised so
            # the global torch RNG is consumed in a fixed order.
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
            self.LSTM = nn.LSTM(embedding_dim, dec_units, batch_first=True)
            self.fc = nn.Linear(dec_units, vocab_size)

            nn.init.xavier_uniform_(self.LSTM.weight_ih_l0)
            nn.init.orthogonal_(self.LSTM.weight_hh_l0)
            # Zero biases, with the forget-gate chunk of bias_ih set to one.
            with torch.no_grad():
                self.LSTM.bias_ih_l0.zero_()
                self.LSTM.bias_ih_l0[dec_units:2 * dec_units].fill_(1.0)
                self.LSTM.bias_hh_l0.zero_()

        def forward(self, x, hidden, enc_output):
            """Decode ``x`` starting from ``hidden``; returns ``(logits, state)``.

            ``enc_output`` is accepted but not used (there is no attention).
            """
            embedded = self.embedding(x)
            output, state = self.LSTM(embedded, hidden)
            return self.fc(output), state

        def initialize_hidden_state(self):
            """Return a ``(batch_sz, dec_units)`` tensor of zeros."""
            return torch.zeros(self.batch_sz, self.dec_units)



    # In[15]:


    class Encap(nn.Module):
        """Bundle an encoder and a decoder into one module.

        ``forward`` runs a teacher-forced pass and returns the loss and token
        accuracy for the batch.
        """

        def __init__(self, encoder,decoder):
            super(Encap, self).__init__()
            self.encoder = encoder
            self.decoder = decoder

        def forward(self, inp,targ, hidden, BATCH_SIZE,vocab_tar_size):
            """Return ``[loss, accuracy]`` for one batch.

            Args:
                inp: batch of source index sequences.
                targ: batch of target index sequences; ``targ[:, :-1]`` is fed
                    to the decoder and ``targ[:, 1:]`` is what it must predict.
                hidden: passed to the encoder as ``[hidden, hidden]``.
                BATCH_SIZE: unused; kept for interface compatibility.
                vocab_tar_size: size of the target vocabulary.
            """
            enc_output, enc_hidden = self.encoder(inp, [hidden,hidden])
            # The decoder starts from the encoder's final (h, c) state.
            dec_hidden = enc_hidden
            dec_input = targ[:,:-1]

            predictions, dec_hidden = self.decoder(dec_input, dec_hidden, enc_output)
            loss = loss_function(targ[:,1:], predictions,vocab_tar_size)
            accuracy=acc_function(targ[:,1:], predictions,vocab_tar_size)
            # No per-batch printing here: callers decide what to report
            # (a leftover debug print used to flood the output).

            return [loss,accuracy]


    def create_db(path_to_file):
        """Load the corpus and split it 80/20 into training and validation sets.

        Returns:
            ``(input_train, input_val, target_train, target_val,
            vocab_inp_size, vocab_tar_size, max_length_inp, max_length_targ)``.
        """
        (input_tensor, target_tensor, inp_lang, targ_lang,
         max_length_inp, max_length_targ) = load_dataset(path_to_file)

        # Fixed random_state keeps the split reproducible.
        split = train_test_split(input_tensor, target_tensor, test_size=0.2,random_state=42)
        input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = split

        return (
            input_tensor_train,
            input_tensor_val,
            target_tensor_train,
            target_tensor_val,
            len(inp_lang.word2idx),
            len(targ_lang.word2idx),
            max_length_inp,
            max_length_targ,
        )

    def loss_function(real, pred,vocab):
        """Mean cross-entropy between logits ``pred`` and target indices ``real``.

        Args:
            real: integer tensor of target indices (any shape).
            pred: logits whose last dimension has size ``vocab`` and whose
                leading dimensions match ``real``.
            vocab: number of classes.

        Returns:
            Scalar tensor: natural-log cross-entropy averaged over all positions.
            Every position counts, including any filled with the pad index.
        """
        # reshape (not view) also accepts non-contiguous inputs.
        logits = pred.reshape(-1, vocab)
        targets = real.reshape(-1)
        # cross_entropy == log_softmax over classes followed by nll_loss.
        return F.cross_entropy(logits, targets)


    def acc_function(real, pred,vocab):
        """Fraction of positions where the arg-max of ``pred`` equals ``real``.

        Args:
            real: integer tensor of target indices (any shape).
            pred: logits whose last dimension has size ``vocab`` and whose
                leading dimensions match ``real``.
            vocab: number of classes.

        Returns:
            Python float in [0, 1]. Every position counts, including any
            filled with the pad index.
        """
        logits = pred.reshape(-1, vocab)
        targets = real.reshape(-1)
        # Softmax is monotonic, so the arg-max can be taken on the raw logits.
        _, predicted = torch.max(logits, 1)
        correct = (predicted == targets).sum().item()
        return correct / targets.size(0)



    def train(model,epoch,my_dataloader,hidden,BATCH_SIZE,vocab_tar_size,optimizer):
        """Run one optimisation pass over ``my_dataloader``.

        ``model`` must return ``[loss, accuracy]`` for a batch. Loss and
        accuracy are printed for every 300th batch, starting with batch 0.
        ``epoch`` is not used.
        """
        for step, pair in enumerate(my_dataloader):
            inp, targ = pair
            model.zero_grad()

            loss, acc = model(inp, targ, hidden, BATCH_SIZE, vocab_tar_size)[:2]
            loss.backward()
            optimizer.step()

            if step % 300:
                continue

            print('Batch {} Loss {}'.format(step, loss))
            print('Batch {} Acc {}'.format(step, acc))
            sys.stdout.flush()


    def test(model,val_dataloader,hidden,BATCH_SIZE,vocab_tar_size):
        t_loss = 0
        t_acc = 0
        for (batch, (inp, targ)) in enumerate(val_dataloader):

            result = model(inp,targ,hidden,BATCH_SIZE,vocab_tar_size)
            loss=result[0]
            acc=result[1]
            t_loss +=loss
            t_acc +=acc
        print('\n\nValidation Acc :{}'.format(t_acc/batch))
        print('Validation Loss {}'.format(t_loss/batch))
        print('Validation Perplexity {}'.format(math.pow(2,(t_loss/batch))))
        sys.stdout.flush()


    # In[21]:


    def run_tr(epochs,BATCH_SIZE,  embedding_dim, units):
        """Download the spa-eng corpus, then train and validate the seq2seq model.

        Training stops after ``epochs`` epochs or once the 24-hour wall-clock
        budget is exceeded (checked between epochs). A checkpoint is written
        whenever the epoch counter reaches a multiple of 10 and once more at
        the end so the final weights are kept.

        Args:
            epochs: number of epochs to train.
            BATCH_SIZE: mini-batch size for both data loaders.
            embedding_dim: embedding size of encoder and decoder.
            units: LSTM hidden size of encoder and decoder.
        """
        path_to_zip = tf.keras.utils.get_file('spa-eng.zip', origin='http://download.tensorflow.org/data/spa-eng.zip', extract=True)
        path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng", "spa.txt")
        torch.manual_seed(1)
        (input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val,
         vocab_inp_size, vocab_tar_size, _, _) = create_db(path_to_file)

        def _as_long_tensor(array):
            # Explicit int64: 'int' is 32-bit on some platforms, while the
            # embedding and loss layers expect 64-bit indices.
            return torch.from_numpy(np.asarray(array, dtype=np.int64))

        tensor_x = _as_long_tensor(input_tensor_train)
        tensor_y = _as_long_tensor(target_tensor_train)
        val_x = _as_long_tensor(input_tensor_val)
        val_y = _as_long_tensor(target_tensor_val)

        my_dataset = utils.TensorDataset(tensor_x,tensor_y)
        my_dataloader = utils.DataLoader(my_dataset,batch_size=BATCH_SIZE)
        val_dataset = utils.TensorDataset(val_x,val_y)
        val_dataloader = utils.DataLoader(val_dataset,batch_size=BATCH_SIZE)

        # Model: encoder and decoder wrapped in one module.
        encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
        decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
        model = Encap(encoder,decoder)

        optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()),lr=0.0001)

        memT,cpuT,gpuT = YN.StartLogger("PyTorch_CPU","Manythings")

        hidden = encoder.initialize_hidden_state()
        checkpoint_path = 'Pytorch_CPU_Manythings_LSTM_model'
        time_budget_s = 86400  # 24 hours
        epoch=1

        start = time.time()
        time_consumed = time.time() - start

        while (time_consumed <= time_budget_s and epoch <= epochs):
            print("Epoch ",epoch)
            train(model,epoch,my_dataloader,hidden,BATCH_SIZE,vocab_tar_size,optimizer)
            test(model,val_dataloader,hidden,BATCH_SIZE,vocab_tar_size)
            epoch += 1
            time_consumed=(time.time())-start
            print("Time since beginning: ", str(datetime.timedelta(seconds=time_consumed)) )

            sys.stdout.flush()

            if epoch % 10 == 0:
                torch.save(model.state_dict(), checkpoint_path)

        YN.EndLogger(memT,cpuT,gpuT)
        print("\n\nTotal Time Consumed: ", str(datetime.timedelta(seconds=time_consumed)))
        sys.stdout.flush()

        # Persist the final weights (the periodic checkpoint alone would miss
        # the last epochs) unless no epoch ran at all.
        if epoch > 1:
            torch.save(model.state_dict(), checkpoint_path)


    # In[22]:


    run_tr(n_epochs, BATCH_SIZE = 128,  embedding_dim = 256, units = 256)



In [12]:
def run_ptb(n_epochs):
    
    class PtbIterator:
        """Iterate over a token stream in batches of overlapping windows.

        Each batch row is a window of ``seq_len`` tokens; consecutive rows
        start ``skip_step`` tokens apart. The target of a window is the same
        window shifted by one token.
        """

        def __init__(self, train, batch_size, seq_len, skip_step=5):
            self.data = load_ptb(train)
            self.batch_size = batch_size
            self.seq_len = seq_len
            self.skip_step = skip_step
            self.reset()

        def __iter__(self):
            # Every new iteration starts again from the beginning of the data.
            self.reset()
            return self

        def __next__(self):
            """Return ``(x, y)``: inputs ``(batch_size, seq_len)`` and flattened targets.

            Raises:
                StopIteration: when a full batch can no longer be filled; rows
                    already gathered for that incomplete batch are discarded.
            """
            # Integer buffers: token ids no longer round-trip through float64.
            x = np.zeros((self.batch_size, self.seq_len), dtype=np.int64)
            y = np.zeros((self.batch_size, self.seq_len), dtype=np.int64)

            for i in range(self.batch_size):
                window_end = self.cur_idx + self.seq_len
                # The target window needs one token beyond the input window.
                if window_end >= len(self.data):
                    raise StopIteration
                x[i, :] = self.data[self.cur_idx:window_end]
                y[i, :] = self.data[self.cur_idx + 1:window_end + 1]
                self.cur_idx += self.skip_step
            return torch.from_numpy(x), torch.from_numpy(y.ravel())

        def reset(self):
            """Rewind to the first token."""
            self.cur_idx = 0


    # In[3]:


    class PtbLstm(nn.Module):
        """Language model: embedding -> stacked LSTM -> dropout -> linear logits."""

        def __init__(self, vocabulary_size, hidden_size, num_layers, dropout):
            super().__init__()

            # Layers are built and randomly initialised in a fixed order since
            # each random init draws from the global torch RNG.
            self.embed = nn.Embedding(vocabulary_size, hidden_size)
            nn.init.uniform_(self.embed.weight, a=-1.0, b=1.0)

            self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers, batch_first=True)
            forget_chunk = slice(hidden_size, 2 * hidden_size)
            for layer in range(num_layers):
                suffix = 'l%d' % layer
                nn.init.xavier_uniform_(getattr(self.lstm, 'weight_ih_' + suffix))
                nn.init.orthogonal_(getattr(self.lstm, 'weight_hh_' + suffix))
                bias_ih = getattr(self.lstm, 'bias_ih_' + suffix)
                bias_hh = getattr(self.lstm, 'bias_hh_' + suffix)
                # Zero biases, with the forget-gate chunk of bias_ih set to one.
                with torch.no_grad():
                    bias_ih.zero_()
                    bias_ih[forget_chunk].fill_(1.0)
                    bias_hh.zero_()

            self.dropout = nn.Dropout(dropout)
            self.linear = nn.Linear(hidden_size, vocabulary_size)
            nn.init.xavier_uniform_(self.linear.weight)
            nn.init.zeros_(self.linear.bias)

        def forward(self, x):
            """Return logits of shape ``(batch * seq_len, vocabulary_size)``."""
            outputs, _ = self.lstm(self.embed(x))
            outputs = self.dropout(outputs)
            # Merge the batch and time dimensions so every position is one row.
            flat = outputs.reshape(outputs.size(0) * outputs.size(1), outputs.size(2))
            return self.linear(flat)


    # In[4]:


    def ptb_train(model, data_iter, criterion, optimizer, epoch, print_every=500):
        """Run one training pass over ``data_iter``.

        Every ``print_every`` batches the mean loss and the accuracy over the
        batches seen since the previous report are printed, then the running
        statistics start over. ``epoch`` is only used in the messages.
        """
        model.train()

        window_losses = []
        hits = 0
        seen = 0
        for step, (inputs, labels) in enumerate(data_iter, start=1):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            window_losses.append(loss.item())
            predicted = torch.max(outputs.data, 1)[1]
            seen += labels.size(0)
            hits += (predicted == labels).sum().item()

            if step % print_every != 0:
                continue

            print('[%d, %5d] train loss: %.3f' % (epoch, step, np.mean(window_losses)))
            print('[%d, %5d] train acc: %.3f' % (epoch, step, (hits / seen)))
            sys.stdout.flush()
            window_losses = []
            hits = 0
            seen = 0

    def ptb_test(model, data_iter, criterion, epoch):
        """Evaluate ``model`` on ``data_iter``; print loss, perplexity and accuracy.

        The loss is the mean of the per-batch losses and the perplexity is its
        exponential. ``epoch`` is only used in the messages.
        """
        model.eval()

        batch_losses = []
        hits = 0
        seen = 0
        with torch.no_grad():
            for inputs, labels in data_iter:
                outputs = model(inputs)
                batch_losses.append(criterion(outputs, labels).item())
                predicted = torch.max(outputs.data, 1)[1]
                seen += labels.size(0)
                hits += (predicted == labels).sum().item()

        mean_loss = np.mean(batch_losses)
        print('[%d] test loss: %.3f perplexity: %.3f' % (epoch, mean_loss, np.exp(mean_loss)))
        print('[%d] test acc: %.3f ' % (epoch, (hits / seen)))
        sys.stdout.flush()



    def ptb_run(n_epochs, hidden_size, batch_size, seq_len, dropout, num_layers):
        """Train and evaluate the PTB language model, logging resource usage.

        Training stops after ``n_epochs`` epochs or once 86400 seconds of
        wall-clock time have passed (checked between epochs). The model is
        checkpointed after every epoch and once more when training ends.
        """
        torch.manual_seed(1)
        vocabulary_size = len(load_ptb_vocab())

        train_iter = PtbIterator(train=True, batch_size=batch_size, seq_len=seq_len)
        test_iter = PtbIterator(train=False, batch_size=batch_size, seq_len=seq_len)

        model = PtbLstm(vocabulary_size, hidden_size, num_layers, dropout)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adadelta(model.parameters())

        memT,cpuT,gpuT = YN.StartLogger("PyTorch_CPU", "PTB")
        start = time.time()
        epoch = 1
        time_consumed = time.time() - start

        while not (time_consumed > 86400 or epoch > n_epochs):
            ptb_train(model, train_iter, criterion, optimizer, epoch)
            ptb_test(model, test_iter, criterion, epoch)
            epoch += 1

            time_consumed = time.time() - start
            print("Time since beginning: ", str(datetime.timedelta(seconds=time_consumed)) )
            sys.stdout.flush()

            # Checkpoint after every epoch.
            torch.save(model.state_dict(), 'Pytorch_CPU_PTB_LSTM_model')

        YN.EndLogger(memT,cpuT,gpuT)
        print("\n\nTotal Time Consumed: ", str(datetime.timedelta(seconds=time_consumed)))
        sys.stdout.flush()
        torch.save(model.state_dict(), 'Pytorch_CPU_PTB_LSTM_model')


    # In[5]:


    ptb_run(n_epochs, hidden_size = 200, batch_size = 20, seq_len = 30, dropout = 0.5, num_layers = 2)


    # In[ ]:


In [None]:
# Benchmark selection: only the PTB run is enabled here; the IMDB and
# Manythings runs are kept commented out and can be enabled instead.
#run_imdb(n_epochs = 50)
#run_manythings(n_epochs = 100)
run_ptb(n_epochs = 50)

[1,   500] train loss: 6.861
[1,   500] train acc: 0.061
