In [1]:
#Importing the required libraries

import torch
import torchvision
import torch.nn as nn
from torch.utils.data import Dataset , DataLoader
import pdb
import os
import tqdm
from torch.autograd import Variable
import torch.optim as optim
from torchvision import datasets , models , transforms
import numpy as np
from PIL import Image
from shutil import copyfile
import json




# Splitting the dataset 


In [2]:
from DatasetUtil import SplitDataset

In [3]:
data_dir = './web/all_data'
output_path = './data/'

In [4]:
#Calling function which will split the folders into train , val ,test folders
SplitDataset(data_dir , output_path)

Output path Exists : ./data/
Path already exists : ./data/data_train


100%|██████████████████████████████████████████████████████████████████████████████| 1400/1400 [01:23<00:00, 16.69it/s]


Path already exists : ./data/data_val


100%|████████████████████████████████████████████████████████████████████████████████| 175/175 [00:09<00:00, 18.22it/s]


Path already exists : ./data/data_test


100%|████████████████████████████████████████████████████████████████████████████████| 175/175 [00:09<00:00, 18.49it/s]


# Creating Tokens from DSL

In [2]:
import BuildVocab as BV

In [3]:

vocab_path = './bootstrap.vocab'

In [4]:
#lets build vocabulary 

vocab = BV.build_vocab(vocab_path)
vocab_size = len(vocab)

Created vocabulary of 19 items


# Defining the Hyperparams

In [5]:
#Params for the Nets
# Training schedule: number of passes over the training dataloader and
# the number of samples per batch handed to DataLoader.
epochs = 100
batch_size = 4
# Model sizes: embed_size is passed to both EncoderNet and DecoderNet;
# hidden_size and num_layers are passed to DecoderNet only.
embed_size = 256
hidden_size = 512
# Learning rate given to the Adam optimizer.
learning_rate = 0.001
num_layers = 1
# Despite the name, images are resized (not cropped) to crop_size x crop_size.
crop_size = 224
# Passed as the `shuffle` flag of the training DataLoader.
shuffle = True
# Only referenced by the commented-out checkpointing code in the training loop.
save_after_epochs = 50
# The training loop prints the loss every `log_step` epochs (first batch only).
log_step = 5
# Directory for saved model weights (assigned again in the training cell).
model_path = "./models/"

In [6]:
# See https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning
# This function returns the type of batch output we want from the dataloader
def collate_fn (data):
    """Merge a list of (image, caption) pairs into one zero-padded mini-batch.

    Args:
        data: sequence of ``(image, caption)`` pairs. All images must share the
            same shape (they are stacked); each caption is a 1D tensor of
            token ids.

    Returns:
        images: tensor with every image stacked along a new leading batch axis,
            ordered by caption length (longest first).
        targets: LongTensor of shape ``(batch, longest_caption)``; shorter
            captions are right-padded with zeros.
        lengths: list of the true caption lengths, in descending order, as
            required by ``pack_padded_sequence``.

    Raises:
        ValueError: if ``data`` is empty.
    """
    if not data:
        raise ValueError("collate_fn received an empty batch")

    # Sort a copy (longest caption first) so the caller's list is not reordered.
    ordered = sorted(data, key = lambda data_pair: len(data_pair[1]), reverse=True)
    images, captions = zip(*ordered)

    # Merge images (from tuple of 3D Tensor to 4D Tensor)
    images = torch.stack(images, 0)

    # Merge captions (from tuple of 1D tensor to 2D tensor), zero-padded on the right
    lengths = [len(caption) for caption in captions]
    targets = torch.zeros(len(captions), max(lengths), dtype=torch.long)

    for row, (caption, end) in enumerate(zip(captions, lengths)):
        targets[row, :end] = caption[:end]

    return images, targets, lengths

In [7]:
#Defining the transformations
# Pipeline applied to every image: resize to a fixed crop_size x crop_size square
# (a resize, not a crop), convert to a tensor, then normalize each channel with
# the mean/std values below.
transform_imgs = transforms.Compose([transforms.Resize((crop_size , crop_size)) ,
                               transforms.ToTensor(),
                               #For the image normalization expected by the pretrained ResNet see : https://pytorch.org/docs/stable/torchvision/models.html
                               transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])])

In [8]:
#Lets create our dataloader and dataset
from BuildData import CreateDataset
data_dir = './data/data_train/'
custom_dataset = CreateDataset(data_dir = data_dir , vocab = vocab , transform = transform_imgs)


Created a dataset of 1400 items


In [11]:
dataloader = DataLoader(dataset = custom_dataset , 
                       batch_size = batch_size , shuffle = shuffle , 
                       collate_fn = collate_fn)

# Defining the models


In [9]:
import Models


In [13]:
E = Models.EncoderNet(embed_size)
print(E)

EncoderNet(
  (resnet): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace)
        (downsample): Sequential(
          (0): Conv2d(64, 256, ker

In [14]:
D = Models.DecoderNet(embed_size , hidden_size , vocab_size , num_layers)
print(D)

DecoderNet(
  (embed): Embedding(19, 256)
  (lstm): LSTM(256, 512, batch_first=True)
  (linear): Linear(in_features=512, out_features=19, bias=True)
)


In [15]:
#Defining the optimizers and the loss

# Cross-entropy between the decoder output and the target token ids.
criterion = nn.CrossEntropyLoss()

# Only the decoder plus the encoder's `linear` and `BatchNorm` submodules are
# handed to Adam; the encoder's `resnet` submodule is not in this list, so this
# optimizer never updates it.
params = list(D.parameters()) + list(E.linear.parameters()) + list(E.BatchNorm.parameters())
optimizer = optim.Adam(params , lr = learning_rate)


In [16]:
#Checking if cuda is available

if torch.cuda.is_available():
    print("GPU is avaiable , Moving models to GPU")
    # Move the encoder and the decoder onto the GPU.
    E.cuda()
    D.cuda()
else:
    # No GPU found: the models are left where they were created.
    print("Using CPU")

GPU is avaiable , Moving models to GPU


In [18]:
#Lets define the training loop
import time
model_path = "./models/"
batch_count = len(dataloader)

# Use the same device selection as the cell that placed E and D (GPU when
# available, otherwise CPU). Calling .cuda() unconditionally would crash on a
# CPU-only machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

start = time.time()
for epoch in range(epochs):
    for i , (images , captions , lengths) in enumerate(dataloader):

        # Plain tensors are enough here; the deprecated Variable wrapper is not needed.
        images = images.to(device)
        captions = captions.to(device)
        # lengths is a list of caption lengths in descending order (see collate_fn)

        # collate_fn zero-pads the shorter captions. Packing the captions with
        # their true lengths drops that padding, so the loss below is computed
        # on real tokens only. [0] selects the packed data tensor.
        targets = nn.utils.rnn.pack_padded_sequence(input = captions, lengths = lengths, batch_first = True)[0]

        #Clearing out gradient buffers from the previous step
        E.zero_grad()
        D.zero_grad()

        # Forward pass: image features -> decoder output over the vocabulary.
        # NOTE(review): assumes D returns one row per packed target token - confirm in Models.
        features = E(images)
        output = D(features , captions , lengths)
        loss = criterion(output , targets)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Log the loss of the first batch every `log_step` epochs
        if epoch % log_step == 0 and i == 0:
            loss_value = loss.item()
            print("Epoch : {} || Loss : {} || Perplexity : {}".format(epoch , loss_value
                                                                      , np.exp(loss_value)))

        #Uncomment this to use checkpointing
        #(note: state_dict must be *called*; saving the bound method is a bug)
        #if (epoch + 1) % save_after_epochs == 0 and i == 0:

            #print("Saving Models")
            #torch.save(E.state_dict() , os.path.join(model_path , 'encoder-{}'.format(epoch + 1)))
            #torch.save(D.state_dict() , os.path.join(model_path , 'decoder-{}'.format(epoch + 1)))
print("Done Training!")
print("Time : {}".format(time.time() - start))


Epoch : 0 || Loss : 2.93107533454895 || Perplexity : 18.747779796443425
Epoch : 5 || Loss : 0.18356953561306 || Perplexity : 1.2014985094424606
Epoch : 10 || Loss : 0.1269185096025467 || Perplexity : 1.135324495967303
Epoch : 15 || Loss : 0.11341875046491623 || Perplexity : 1.1201008775195693
Epoch : 20 || Loss : 0.12676969170570374 || Perplexity : 1.1351555519348386
Epoch : 25 || Loss : 0.11192330718040466 || Perplexity : 1.1184270820293527
Epoch : 30 || Loss : 0.12203798443078995 || Perplexity : 1.129797015661832
Epoch : 35 || Loss : 0.11023492366075516 || Perplexity : 1.1165403413952222
Epoch : 40 || Loss : 0.12880489230155945 || Perplexity : 1.1374681737168917
Epoch : 45 || Loss : 0.11486397683620453 || Perplexity : 1.1217208471757312
Epoch : 50 || Loss : 0.08669478446245193 || Perplexity : 1.0905637719086705
Epoch : 55 || Loss : 0.09103292226791382 || Perplexity : 1.0953050645916684
Epoch : 60 || Loss : 0.09048262983560562 || Perplexity : 1.094702492314243
Epoch : 65 || Loss : 0.1

In [19]:
# Persist the final encoder/decoder weights.
# torch.save does not create missing directories, so make sure model_path exists first.
os.makedirs(model_path , exist_ok = True)
torch.save(E.state_dict() , os.path.join(model_path , 'encoder-100.pkl'))
torch.save(D.state_dict() , os.path.join(model_path , 'decoder-100.pkl'))

# Testing the models

In [10]:
from nltk.translate.bleu_score import corpus_bleu , SmoothingFunction

In [11]:
def transform_IdxToWords (inp, vocabulary = None):
    """Convert a sequence of token ids into a list of DSL tokens.

    The ids are mapped through ``idx2word`` until ``'<END>'`` is reached. The
    first token (the start marker) is always dropped; the trailing token is
    dropped only when it really is ``'<END>'``, so a sequence that was cut off
    before producing ``'<END>'`` does not lose its last real token. A comma
    token is glued onto the token before it (``'a ,'`` becomes ``'a,'``).

    Args:
        inp: iterable of integer token ids (e.g. a 1D numpy array).
        vocabulary: object exposing an ``idx2word`` mapping. Defaults to the
            notebook-level ``vocab``.

    Returns:
        List of token strings. When nothing remains after stripping the
        markers the result is ``['']`` (what ``''.split(' ')`` yields).
    """
    idx2word = (vocab if vocabulary is None else vocabulary).idx2word

    tokens = []
    for idx in inp:
        # int() lets numpy integers and 0-d tensors be used as lookup keys.
        word = idx2word[int(idx)]
        tokens.append(word)

        if word == '<END>':
            break

    # Drop the end marker only if the sequence actually terminated with it.
    if tokens and tokens[-1] == '<END>':
        tokens = tokens[:-1]

    # Drop the leading start marker.
    output = ' '.join(tokens[1:])

    # Attach commas to the preceding token.
    output = output.replace(' ,', ',')

    return output.split(' ')

In [12]:
#First using validation files to check our BLEU Scores
#let's create dataset for validation data
# NOTE(review): the comments and variable names say "validation", but the path
# below points at ./data/data_test/ (the split step also produced
# ./data/data_val). Confirm which split is intended for this BLEU evaluation.
import BuildData
data_val = "./data/data_test/"
# Same vocabulary and image transform as the training dataset.
val_data = BuildData.CreateDataset(data_dir = data_val , vocab = vocab , transform = transform_imgs)



Created a dataset of 175 items


In [13]:
# Evaluation loader: no shuffling, so samples are always visited in dataset order
# and results are reproducible between runs.
val_dataloader = DataLoader(dataset = val_data , batch_size = batch_size , shuffle = False ,
                            collate_fn = collate_fn)




In [14]:
#Loading the models trained for 100 epochs
encoder_path = "./models/encoder-100.pkl"
decoder_path = "./models/decoder-100.pkl"

# Fresh model instances with the same sizes that were used for training
val_encoder = Models.EncoderNet(embed_size)
val_decoder = Models.DecoderNet(embed_size , hidden_size , len(vocab) , num_layers)

#loading the trained weights
# map_location="cpu" makes the checkpoints loadable on machines without a GPU
# even when they were saved from CUDA tensors; the models are moved to the GPU
# afterwards if one is available.
val_encoder.load_state_dict(torch.load(encoder_path , map_location = "cpu"))
val_decoder.load_state_dict(torch.load(decoder_path , map_location = "cpu"))

In [15]:
#Moving the models to GPU

if torch.cuda.is_available():
    
    val_encoder.cuda()
    val_decoder.cuda()
    print("Moving to GPU")

Moving to GPU


In [16]:
#Setting the models to eval mode
val_encoder.eval()
val_decoder.eval()

# Same device selection as the cell that placed the models (GPU if available,
# otherwise CPU), so this cell also works on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# predicted: sampled token ids per image; actual: ground-truth caption ids
predicted , actual = [] , []
data_count = len(val_dataloader.dataset)

# Inference only: disable autograd so no graph is built and memory use stays low.
with torch.no_grad():
    for i in range(data_count):

        # Read samples one by one straight from the dataset (no batching)
        images , captions = val_dataloader.dataset[i]

        # Add a leading batch dimension of size 1
        images = images.unsqueeze(0).to(device)

        features = val_encoder(images)

        sample_ids = val_decoder.sample(features)
        sample_ids = sample_ids.cpu().numpy()

        predicted.append(sample_ids)
        actual.append(captions.cpu().numpy())

# Token ids -> DSL tokens. Each reference is wrapped in its own list because
# corpus_bleu expects a list of references per hypothesis.
predicted_labels = [transform_IdxToWords(ids) for ids in predicted]
actual_labels = [[transform_IdxToWords(ids)] for ids in actual]




In [17]:
predicted_labels[0]

['btn-inactive', 'btn-green', 'header', 'small-title']

In [None]:
# Corpus-level BLEU of the predicted token lists against the references.
# actual_labels holds a one-element list of references per sample and
# predicted_labels holds one hypothesis per sample; method4 of NLTK's
# SmoothingFunction is used as the smoothing function.
# NOTE(review): this cell has no execution count or recorded output, so the
# score itself is not captured in the notebook.
cc = SmoothingFunction()
bleu = corpus_bleu(actual_labels , predicted_labels ,smoothing_function = cc.method4)
print("Score : {}".format(bleu))

In [30]:
import Compiler

In [32]:
dsl_path = 'Assets/web-dsl-mapping.json'
compiler = Compiler.Compiler(dsl_path)


In [34]:
compiled_website = compiler.compile(predicted_labels[12] , './output/page.html')

In [36]:
print(compiled_website)

<html>
  <header>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap-theme.min.css" integrity="sha384-rHyoN1iRsVXV4nD0JutlnGaslCJuC7uwjduW9SVrLvRYooPp2bWYgmgJQIXwl/Sp" crossorigin="anonymous">
<style>
.header{margin:20px 0}nav ul.nav-pills li{background-color:#333;border-radius:4px;margin-right:10px}.col-lg-3{width:24%;margin-right:1.333333%}.col-lg-6{width:49%;margin-right:2%}.col-lg-12,.col-lg-3,.col-lg-6{margin-bottom:20px;border-radius:6px;background-color:#f5f5f5;padding:20px}.row .col-lg-3:last-child,.row .col-lg-6:last-child{margin-right:0}footer{padding:20px 0;text-align:center;border-top:1px solid #bbb}
</style>
    <title>Scaffol