In [1]:
import torch, json, string
import torch.nn as nn
import torch.utils.data as data
import numpy as np
import nltk
import random
import math
import os
from nltk.tokenize import word_tokenize
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm
from ipywidgets import IntProgress
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [2]:
torch.cuda.device_count()

8

In [3]:
NR_GPUS = 8

In [4]:
torch.cuda.is_available()

True

In [5]:
print("CUDA device: , CUDA capability:, CUDA device properties: \n")
print(torch.cuda.get_device_name(1), torch.cuda.get_device_capability(1), torch.cuda.get_device_properties(1))

CUDA device: , CUDA capability:, CUDA device properties: 

GeForce GTX 1080 Ti (6, 1) _CudaDeviceProperties(name='GeForce GTX 1080 Ti', major=6, minor=1, total_memory=11178MB, multi_processor_count=28)


In [6]:
device = torch.device("cuda")
device

device(type='cuda')

In [7]:
fIn = open("/home/alexandru/html/htmlTemplate/inputTest.txt", "r")
files = []
files_sizes = 0
nr = 0
for i in fIn.readlines():
    if "index" in i:
        if nr>10:
            break
        nr +=1
        files_sizes += os.path.getsize("/home/alexandru/html/htmlTemplate/test/"+i[:-1])
        files.append(i[:-1])
fIn.close()

In [8]:
kB = files_sizes/(1024)
mB = kB/1024
print("kB: ", kB)
print("mB: ", mB)

kB:  1.53515625
mB:  0.001499176025390625


In [9]:
PERCENT_OF_TRAINING_DATA = 80
PERCENT_OF_VALIDATION_DATA = 20

In [10]:
nr_training_samples = math.ceil((len(files) * PERCENT_OF_TRAINING_DATA)/100)
nr_validation_samples = len(files) - nr_training_samples

In [11]:
nr_training_samples, nr_validation_samples

(9, 2)

In [12]:
training_html_file_names = files[:nr_training_samples]
validation_html_file_names = files[-nr_validation_samples:]

In [13]:
len(training_html_file_names)

9

In [14]:
len(validation_html_file_names)

2

In [15]:
class HTMLDataset(data.Dataset):
    def __init__(self, file_names, vocabulary=None):
        self.file_names = file_names
        
        if not vocabulary:
            self.build_vocabulary()
        else:
            self.vocabulary = vocabulary
    
    def build_vocabulary(self, vocabulary_size=67):
        letters = dict()
        
        for i in range(len(self.file_names)):
            file_name = "/home/alexandru/html/htmlTemplate/test/" + self.file_names[i]


            fIn = open(file_name, "r")

            for line in fIn.readlines():
                line = line.strip()
                line = line.replace('\n', ' ')
                line = line.replace('\t', ' ')
                line = line.replace('  ', ' ')
                line = list(line.lower())

                for character in line:
                    letters[character] = letters.get(character, 0) + 1
            fIn.close()
        
        sorted_letters = sorted(list(letters.items()), key=lambda x: -x[1])
        most_frequent_letters = [w for (w, c) in sorted_letters[:vocabulary_size]]
        letter2id = {w: ( index + 1 ) for (index, w) in enumerate(most_frequent_letters)}
        
        letter2id['[START]'] = len(letter2id) + 1
        letter2id['[UNK]'] = len(letter2id) + 1
        letter2id['[END]'] = len(letter2id) + 1
        
        
        id2letter = {index: letter for (letter, index) in letter2id.items()}
        
        self.vocabulary = {'letter2id':letter2id, 'id2letter':id2letter}        
        
    def html2ids(self, html_file):
        letter2id = self.vocabulary['letter2id']
        
        html_ids = []
        
        file_name = "/home/alexandru/html/htmlTemplate/test/" + html_file
        
        fIn = open(file_name, "r")
        
        for line in fIn.readlines():
            line = line.strip()
            line = line.replace('\n', ' ')
            line = line.replace('\t', ' ')
            line = line.replace('  ', ' ')
            line = list(line.lower())

            for character in line:
                html_ids.append(letter2id.get(character, letter2id['[UNK]']))
        
        fIn.close()
        
        html_ids.insert(0, letter2id['[START]'])
        html_ids.append(letter2id['[END]'])
        
        return torch.LongTensor(html_ids)
    
    
    def ids2letters(self, html_ids):
        id2letter = self.vocabulary['id2letter']
        return "".join([id2letter[i] for i in html_ids])
    
    def __getitem__(self, index):
        html_file = self.file_names[index]
        return self.html2ids(html_file)
    
    def __len__(self):
        return len(self.file_names)

In [16]:
trainData = HTMLDataset(training_html_file_names)

In [17]:
validationData = HTMLDataset(validation_html_file_names, vocabulary=trainData.vocabulary)

In [18]:
id2letter = trainData.vocabulary['id2letter']
trainData.ids2letters(trainData[0].tolist())

'[START]<!doctype html><html><head><title>page title</title></head><body><h1>this is a heading</h1><p>this is a paragraph.</p></body></html>[END]'

In [19]:
def customBatchBuilder(samples):
    
    seqLengths = [len(seq) for seq in samples]
    maxSeqLength = max(seqLengths)
    sorted_list = sorted(zip(samples, seqLengths), key = lambda x: -x[1])
    
    captionSeqs, seqLengths = zip(*sorted_list)
    
    trainPaddedSeqs = torch.LongTensor(len(seqLengths), maxSeqLength-1) # input and output are shifted from one another
    trainPaddedSeqs.fill_(0)
    targetPaddedSeqs = torch.LongTensor(len(seqLengths), maxSeqLength-1)
    targetPaddedSeqs.fill_(0)
    
    lengths = []
    
    for (i, seq) in enumerate(captionSeqs):
        length = len(seq) - 1
        trainPaddedSeqs[i, :length] = seq[:-1]
        lengths.append(length)
        targetPaddedSeqs[i, :length] = seq[1:]
  
    
    return trainPaddedSeqs, targetPaddedSeqs, lengths

In [20]:
trainLoader = data.DataLoader(trainData,
                             batch_size = 8,
                             shuffle = True,
                             num_workers = 0,
                             collate_fn = customBatchBuilder)

In [21]:
validationLoader = data.DataLoader(validationData,
                                  batch_size = 1,
                                  shuffle = False,
                                  num_workers = 0,
                                  collate_fn = customBatchBuilder)

In [22]:
for i in enumerate(trainLoader):
    print(i)

(0, (tensor([[25,  1, 21,  ..., 16, 11,  2],
        [25,  1, 21,  ..., 16, 11,  2],
        [25,  1, 21,  ..., 16, 11,  2],
        ...,
        [25,  1, 21,  ..., 16, 11,  2],
        [25,  1, 21,  ..., 16, 11,  2],
        [25,  1, 21,  ..., 16, 11,  2]]), tensor([[ 1, 21,  9,  ..., 11,  2, 27],
        [ 1, 21,  9,  ..., 11,  2, 27],
        [ 1, 21,  9,  ..., 11,  2, 27],
        ...,
        [ 1, 21,  9,  ..., 11,  2, 27],
        [ 1, 21,  9,  ..., 11,  2, 27],
        [ 1, 21,  9,  ..., 11,  2, 27]]), [133, 133, 133, 133, 133, 133, 133, 133]))
(1, (tensor([[25,  1, 21,  9, 14, 22,  3, 15, 10,  6,  7,  4,  3, 16, 11,  2,  1,  4,
          3, 16, 11,  2,  1,  4,  6,  5,  9,  2,  1,  3,  8,  3, 11,  6,  2, 10,
          5, 17,  6,  7,  3,  8,  3, 11,  6,  1, 12,  3,  8,  3, 11,  6,  2,  1,
         12,  4,  6,  5,  9,  2,  1, 18, 14,  9, 15,  2,  1, 10,  2,  3,  4,  8,
         13,  7,  8, 13,  7,  5,  7, 10,  5, 20,  5, 17, 20,  5, 10,  4,  1, 12,
         10,  2,  1,  4, 19,  2,

In [23]:
class Model(nn.Module):
    def __init__(self,
                vocabulary_size,
                hidden_size,
                num_layers,
                rnn_dropout,
                embedding_size,
                dropout,
                num_directions):
        super(Model, self).__init__()
        
        self.vocabulary_size = vocabulary_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn_dropout = rnn_dropout
        self.dropout = dropout
        self.num_directions = num_directions
        self.embedding_size = embedding_size
        
        
        self.embeddings = nn.Embedding(self.vocabulary_size, self.embedding_size)
        self.rnn = nn.GRU(self.embedding_size,
                          self.hidden_size,
                          num_layers=self.num_layers,
                          bidirectional=True if self.num_directions==2 else False,
                          dropout=self.rnn_dropout,
                          batch_first=True)
        self.linear = nn.Linear(self.hidden_size*self.num_directions, self.vocabulary_size)
        
        
        

        
    def forward(self, paddedSeqs):
        
        batchSequenceLength = paddedSeqs.size(1)
        
        batchSize = paddedSeqs.size(0)
        
        lengths = paddedSeqs.ne(0).sum(dim=1)
                  
        embeddingVectors = self.embeddings(paddedSeqs)
        
        #print("Embedding vector shape is: ", embeddingVectors.shape)
        
        x = torch.nn.utils.rnn.pack_padded_sequence(embeddingVectors, lengths, batch_first=True)
        
        self.rnn.flatten_parameters()
        
        #print("After pack padded seq, input shape is: ", x.shape)
        hidden = self.init_hidden(paddedSeqs)
        x,hid = self.rnn(x, hidden)
        
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True, padding_value=0, total_length=batchSequenceLength)
        
        
        #print("Output after pad packed sequence: ", output.shape)
        
        predictions = self.linear(output)
        
        #print("Predictions after the linear layer have shape: ", predictions.shape)
        
        #return predictions.view(batchSize, self.vocabulary_size, batchSequenceLength), hid
        
        return predictions.view(batchSize, self.vocabulary_size, batchSequenceLength), hid
                
        #return predictions, hidden
        
        #return predictions.view(-1, self.vocabularySize), hidden
    
        #return predictions, hidden
    
    
    def init_hidden(self, paddedSeqs):
        #nr = 3
        #if paddedSeqs.size(0) == 12:
        #    nr=2
        #elif paddedSeqs.size(0)<12:
        #    nr=1
        #Hidden (num_layers * num_directions, batch_size, hidden_size
        hidden = Variable(torch.zeros(self.num_layers*self.num_directions,
                                    paddedSeqs.size(0),#2, if paddedSeqs.size(0)==NR_GPUS else int(paddedSeqs.size(0)%NR_GPUS),
                                    self.hidden_size).to(device))
        return hidden

In [24]:
class ModelOne(Model) :
    def __init__(self,
                vocabulary_size,
                hidden_size,
                num_layers,
                rnn_dropout,
                embedding_size,
                dropout,
                num_directions):
        super(Model, self).__init__()
        
        self.vocabulary_size = vocabulary_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn_dropout = rnn_dropout
        self.dropout = dropout
        self.num_directions = num_directions
        self.embedding_size = embedding_size
        
        
        self.embeddings = nn.Embedding(self.vocabulary_size, self.embedding_size)
        self.rnn = nn.GRU(self.embedding_size,
                          self.hidden_size,
                          num_layers=self.num_layers,
                          bidirectional=True if self.num_directions==2 else False,
                          dropout=self.rnn_dropout,
                          batch_first=True)
        self.linear = nn.Linear(self.hidden_size*self.num_directions, self.vocabulary_size)
        
        
        

        
    def forward(self, paddedSeqs, hidden):
        
        batchSequenceLength = paddedSeqs.size(1)
        
        batchSize = paddedSeqs.size(0)
        
        lengths = paddedSeqs.ne(0).sum(dim=1)
                  
        embeddingVectors = self.embeddings(paddedSeqs)
        
        #print("Embedding vector shape is: ", embeddingVectors.shape)
        
        x = torch.nn.utils.rnn.pack_padded_sequence(embeddingVectors, lengths, batch_first=True)
        
        self.rnn.flatten_parameters()
        
        #print("After pack padded seq, input shape is: ", x.shape)
        #hidden = self.init_hidden(paddedSeqs)
        x,hid = self.rnn(x, hidden)
        
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True, padding_value=0, total_length=batchSequenceLength)
        
        
        #print("Output after pad packed sequence: ", output.shape)
        
        predictions = self.linear(output)
        
        #print("Predictions after the linear layer have shape: ", predictions.shape)
        
        #return predictions.view(batchSize, self.vocabulary_size, batchSequenceLength), hid
        
        return predictions.view(batchSize, self.vocabulary_size, batchSequenceLength), hid
                
        #return predictions, hidden
        
        #return predictions.view(-1, self.vocabularySize), hidden
    
        #return predictions, hidden
    
    
    def init_hidden(self, paddedSeqs):
        #nr = 3
        #if paddedSeqs.size(0) == 12:
        #    nr=2
        #elif paddedSeqs.size(0)<12:
        #    nr=1
        #Hidden (num_layers * num_directions, batch_size, hidden_size
        hidden = Variable(torch.zeros(self.num_layers*self.num_directions,
                                    1,#paddedSeqs.size(0),#2, if paddedSeqs.size(0)==NR_GPUS else int(paddedSeqs.size(0)%NR_GPUS),
                                    self.hidden_size).to(device))
        return hidden

In [25]:
a = torch.Tensor([4])
a%8

tensor([4.])

In [47]:
vocabularySize = len(trainData.vocabulary['letter2id'])+1
model = Model(vocabulary_size=vocabularySize,
             hidden_size=100,
             embedding_size=50,
             num_layers=3,
             rnn_dropout=0.0,
             dropout=0,
             num_directions=2).to(device)
model = nn.DataParallel(model)

In [48]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [49]:
print(count_parameters(model))

460628


In [51]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [52]:
train_losses = []
train_accuracy = []
valid_losses = []
valid_accuracy = []
train_epochs = []
validation_epochs = []

In [53]:
def accuracy(prediction, label, mask):
    correct = 0
    for (pred, target, msk) in zip(prediction, label, mask):
        correct += pred[0][0:msk].eq(target[0][0:msk]).to(device).sum()
    return correct

In [54]:
def plot_results(plot_title, folder_name, x_label, y_label, x_data, y_data):
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    
    plt.plot(x_data, y_data, color = "r")
    plt.grid()
    plt.savefig("/home/alexandru/workspace/" + folder_name + "/" + plot_title + ".png", bbox_inches = "tight")

In [55]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [56]:
def train(epoch):
    model.train()
    
    correct = 0
    counter = 0
    cummulative_loss = 0
    total_loss = 0
    total_accuracy = 0
    
    for index, (input, target, lengths) in enumerate(trainLoader):
        
        input = Variable(input.to(device))
        target = Variable(target.to(device))
        
        #hidden = model.module.init_hidden(input)
        
        lengths = Variable(torch.tensor(lengths).to(device))
        
        #hidden = repackage_hidden(hidden)
             
        
        output, _ = model(input)
        
        optimizer.zero_grad()        
        
        loss = criterion(output, target)
        
        cummulative_loss += loss.item()
        
        pred = output.data.max(1, keepdim=True)[1]
        
        correct += accuracy(pred, target.data.view_as(pred), lengths)
        
        total_loss = cummulative_loss/(index+1)
        
        counter += output.size(0) * output.size(2)
        
        total_accuracy = 100. * correct/counter
        
        train_losses.append(total_loss)
        train_accuracy.append(total_accuracy)
        train_epochs.append(epoch)
        
        loss.backward()
        
        optimizer.step()
        
        
    print("[Epoch: %d][Batch:%d][Training Accuracy: %g %%][Loss: %g]"%(epoch,
                                                            index+1,
                                                            total_accuracy,
                                                            cummulative_loss/(index+1)))
        
        

In [57]:
def evaluate(epoch):
    model.eval()
    correct = 0
    test_loss = 0
    total_loss = 0
    total_accuracy = 0
    counter = 0
    with torch.no_grad():
        for index, (input, target, lengths) in enumerate(validationLoader):
            
            input = Variable(input.to(device))
            target = Variable(target.to(device))
            lengths = Variable(torch.tensor(lengths).to(device))
            #hidden = model.module.init_hidden(input)
                        
            output, _ = model(input)
            
            loss = criterion(output, target)
            
            test_loss += loss.item()
            
            pred = output.data.max(1, keepdim = True)[1]
            
            correct += accuracy(pred, target.data.view_as(pred), lengths)
            
            counter += output.size(0) * output.size(2)
            
            total_loss = test_loss/(index+1)
            
            total_accuracy = 100. * correct/counter
            
            valid_losses.append(total_loss)
            
            valid_accuracy.append(total_accuracy)
            
            validation_epochs.append(epoch)
            
        print("\n[Epoch: %d] Validation accuracy: %g %%, Validation loss: %g "%(epoch, total_accuracy, total_loss))
            

In [58]:
for epoch in range(1000):
    train(epoch)
    evaluate(epoch)
    #name = "/home/alexandru/html/model/html/IndexTemplateHiddenGpu8Layer3Hidden500Embed50Dropout1Epoch" + str(epoch)
    #torch.save(model.module.state_dict(), name)

[Epoch: 0][Batch:2][Training Accuracy: 2 %][Loss: 3.31049]

[Epoch: 0] Validation accuracy: 19 %, Validation loss: 3.21676 
[Epoch: 1][Batch:2][Training Accuracy: 19 %][Loss: 3.18871]

[Epoch: 1] Validation accuracy: 21 %, Validation loss: 3.09214 
[Epoch: 2][Batch:2][Training Accuracy: 20 %][Loss: 3.06809]

[Epoch: 2] Validation accuracy: 21 %, Validation loss: 2.97131 
[Epoch: 3][Batch:2][Training Accuracy: 20 %][Loss: 2.953]

[Epoch: 3] Validation accuracy: 21 %, Validation loss: 2.84647 
[Epoch: 4][Batch:2][Training Accuracy: 20 %][Loss: 2.82727]

[Epoch: 4] Validation accuracy: 21 %, Validation loss: 2.70004 
[Epoch: 5][Batch:2][Training Accuracy: 20 %][Loss: 2.68764]

[Epoch: 5] Validation accuracy: 24 %, Validation loss: 2.57101 
[Epoch: 6][Batch:2][Training Accuracy: 22 %][Loss: 2.57276]

[Epoch: 6] Validation accuracy: 25 %, Validation loss: 2.46699 
[Epoch: 7][Batch:2][Training Accuracy: 24 %][Loss: 2.47099]

[Epoch: 7] Validation accuracy: 30 %, Validation loss: 2.3439 
[Epo

In [59]:
torch.save(model.module.state_dict(), "/home/alexandru/html/model/html/Test")

In [60]:
vocabularySize = len(trainData.vocabulary['letter2id'])+1
modelOne =ModelOne(vocabulary_size=vocabularySize,
             hidden_size=100,
             embedding_size=50,
             num_layers=3,
             rnn_dropout=0.0,
             dropout=0,
             num_directions=2).to(device)
modelOne.load_state_dict(torch.load("/home/alexandru/html/model/html/Test"))

In [65]:
def sample_sentence():
    words = list()
    count = 0
    modelOne.eval()
    with torch.no_grad():
        # Setup initial input state, and input word (we use "the").
        previousWord = torch.LongTensor(1, 1).fill_(trainData.vocabulary['letter2id']['[START]'])
        hidden =  Variable(torch.zeros(6, 1, 100).to(device))


        while True:
            # Predict the next word based on the previous hidden state and previous word.
            inputWord = torch.autograd.Variable(previousWord.to(device))
            
            #print("Prediction: ")
            
            predictions, newHidden = modelOne(inputWord, hidden)
            
            hidden = newHidden
            
            #print("After prediction")
            
            pred = torch.nn.functional.softmax(predictions.squeeze()).data.cpu().numpy().astype('float64')
            
            pred = pred/np.sum(pred)
            
            #print("After normalization")
            
            nextWordId = np.random.multinomial(1, pred, 1).argmax()

            #print("After nextWordId computation")
            
            if nextWordId == 0:
                continue
            
            #print("After nextWordId zero value check")

            words.append(trainData.vocabulary['id2letter'][nextWordId])
            # Setup the inputs for the next round.
            previousWord.fill_(nextWordId)


            # Keep adding words until the [END] token is generated.
            if nextWordId == trainData.vocabulary['letter2id']['[END]']:
                break

            if count>20000:
                break
            count += 1
        words.insert(0, '[START]')


        return words

In [66]:
def generate_sample_html():
    html = sample_sentence()
    print(len(html))
    print("".join(html))

In [84]:
generate_sample_html()

57
[START]..<a<a<a<a<aa<ttp11111b11111b11111111b11b1bbbb<btttn111[END]


