<a href="https://colab.research.google.com/github/DShergilashvili/DShergilashvili/blob/main/image2code_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install keras.utils

Collecting keras.utils
  Downloading keras-utils-1.0.13.tar.gz (2.4 kB)
Building wheels for collected packages: keras.utils
  Building wheel for keras.utils (setup.py) ... [?25l[?25hdone
  Created wheel for keras.utils: filename=keras_utils-1.0.13-py3-none-any.whl size=2656 sha256=29defd1e3add17f93fec3944c5ddd53af624d0eddfe58cd1a4721b338eb67648
  Stored in directory: /root/.cache/pip/wheels/d0/dd/3b/493952a5240d486a83805d65360dedadbadeae71d25e2c877f
Successfully built keras.utils
Installing collected packages: keras.utils
Successfully installed keras.utils-1.0.13


In [None]:
import torch.utils.data as data
import cv2
import sys
from os import listdir
from os.path import join
import numpy as np
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical


def resize_img(png_file_path):
        img_rgb = cv2.imread(png_file_path)
        img_grey = cv2.cvtColor(img_rgb, cv2.COLOR_BGR2GRAY)
        img_adapted = cv2.adaptiveThreshold(img_grey, 255, cv2.ADAPTIVE_THRESH_MEAN_C,cv2.THRESH_BINARY, 101, 9)
        img_stacked = np.repeat(img_adapted[...,None],3,axis=2)
        resized = cv2.resize(img_stacked, (224,224), interpolation=cv2.INTER_AREA)
        bg_img = 255 * np.ones(shape=(224,224,3))
        bg_img[0:224, 0:224,:] = resized
        bg_img /= 255
        bg_img = np.rollaxis(bg_img, 2, 0)  
        return bg_img
    
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

class Dataset():
    def __init__(self, data_dir, input_transform=None, target_transform=None):
        self.data_dir = data_dir
        self.image_filenames = []
        self.texts = []
        all_filenames = listdir(data_dir)
        all_filenames.sort()
        for filename in (all_filenames):
            if filename[-3:] == "png":
                self.image_filenames.append(filename)
            else:
                text = '<START> ' + load_doc(self.data_dir+filename) + ' <END>'
                text = ' '.join(text.split())
                text = text.replace(',', ' ,')
                self.texts.append(text)
        self.input_transform = input_transform
        self.target_transform = target_transform
        
        # Initialize the function to create the vocabulary 
        tokenizer = Tokenizer(filters='', split=" ", lower=False)
        # Create the vocabulary 
        tokenizer.fit_on_texts([load_doc('vocabulary.vocab')])
        self.tokenizer = tokenizer
        # Add one spot for the empty word in the vocabulary 
        self.vocab_size = len(tokenizer.word_index) + 1
        # Map the input sentences into the vocabulary indexes
        self.train_sequences = tokenizer.texts_to_sequences(self.texts)
        # The longest set of boostrap tokens
        self.max_sequence = max(len(s) for s in self.train_sequences)
        # Specify how many tokens to have in each input sentence
        self.max_length = 48
        
        X, y, image_data_filenames = list(), list(), list()
        for img_no, seq in enumerate(self.train_sequences):
            in_seq, out_seq = seq[:-1], seq[1:]
            out_seq = to_categorical(out_seq, num_classes=self.vocab_size)
            image_data_filenames.append(self.image_filenames[img_no])
            X.append(in_seq)
            y.append(out_seq)
                
        self.X = X
        self.y = y
        self.image_data_filenames = image_data_filenames
        self.images = list()
        for image_name in self.image_data_filenames:
            image = resize_img(self.data_dir+image_name)
            self.images.append(image)


In [None]:
dir_name = '/content/data/'
batch_size = 32
my_dateset = Dataset(dir_name)


# Model

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence
from torch.autograd import Variable


class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(EncoderCNN, self).__init__()
        resnet = models.resnet34(pretrained=True)
        modules = list(resnet.children())[:-1]      # delete the last fc layer.
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)
        self.init_weights()
        
    def init_weights(self):
        """Initialize the weights."""
        self.linear.weight.data.normal_(0.0, 0.02)
        self.linear.bias.data.fill_(0)
        
    def forward(self, images):
        """Extract the image feature vectors."""
        features = self.resnet(images)
        features = Variable(features.data)
        features = features.view(features.size(0), -1)
        if images.shape[0] < 2:
            features = self.linear(features)
            return features
        features = self.bn(self.linear(features))
        return features # Bxembed_size

In [None]:
class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        """Set the hyper-parameters and build the layers."""
        super(DecoderRNN, self).__init__()
        self.n_layers = num_layers
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.GRU(embed_size*2, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.init_weights()
    
    def init_weights(self):
        """Initialize weights."""
        self.embed.weight.data.uniform_(-0.1, 0.1)
        self.linear.weight.data.uniform_(-0.1, 0.1)
        self.linear.bias.data.fill_(0)
        
    def forward(self, features, captions, hidden):
        """Decode image feature vectors and generates captions."""
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1).repeat(1,embeddings.shape[1],1), embeddings), 2)
        #packed = pack_padded_sequence(embeddings, 48, batch_first=True) 
        output, hidden = self.lstm(embeddings, hidden)
        outputs = self.linear(output)
        return outputs, hidden
    def init_hidden(self):
        return Variable(torch.zeros(self.n_layers, 1, self.hidden_size))

In [None]:
embed_size = 50
hidden_size = 256
num_layers = 3
num_epochs = 30

encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, my_dateset.vocab_size, num_layers)
criterion = nn.MSELoss()
params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
optimizer = torch.optim.Adam(params,lr=0.001)

Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to /root/.cache/torch/hub/checkpoints/resnet34-b627a593.pth


  0%|          | 0.00/83.3M [00:00<?, ?B/s]

In [None]:
for epoch in range(num_epochs):
    for i_batch in range(len(my_dateset.X)):
        hidden = decoder.init_hidden()
        images = Variable(torch.FloatTensor([my_dateset.images[i_batch]]))
        input_seqs = Variable(torch.LongTensor(my_dateset.X[i_batch])).view(1,-1)
        target_seq = Variable(torch.FloatTensor(my_dateset.y[i_batch]))
        encoder.zero_grad()
        decoder.zero_grad()
        features = encoder(images)
        outputs, hidden = decoder(features, input_seqs, hidden)
        loss = 0
        for di in range(target_seq.shape[0]):
            loss += criterion(outputs.squeeze(0)[di], target_seq[di])
        
        loss.backward()
        optimizer.step()

        
    print('Epoch [%d/%d], Loss: %.4f'
       %(epoch+1, num_epochs,
       loss.data)) 
    torch.save(encoder,'encoder_resnet34_'+str(loss.data)+'.pt')
    torch.save(decoder,'decoder_resnet34_'+str(loss.data)+'.pt')

  after removing the cwd from sys.path.


Epoch [1/30], Loss: 0.2655
Epoch [2/30], Loss: 0.2487
Epoch [3/30], Loss: 0.2334
Epoch [4/30], Loss: 0.2378
Epoch [5/30], Loss: 0.1772
Epoch [6/30], Loss: 0.1565
Epoch [7/30], Loss: 0.1492


KeyboardInterrupt: ignored

In [67]:
encoder = torch.load('/content/model_weights/encoder_resnet34_tensor(0.2655).pt')
decoder = torch.load('/content/model_weights/decoder_resnet34_tensor(0.2655).pt')

In [None]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [None]:
def load_val_images(data_dir):
    image_filenames =[]
    images = []
    all_filenames = listdir(data_dir)
    all_filenames.sort()
    for filename in (all_filenames):
        if filename[-3:] == "png":
            image_filenames.append(filename)
    for name in image_filenames:
        image = resize_img(data_dir+name)
        images.append(image)
    return images

In [None]:
decoded_words = []
star_text = '<START> '
hidden = decoder.init_hidden()
image = load_val_images('val/')[0]
image = Variable(torch.FloatTensor([image]))
predicted = '<START> '
for di in range(9999):
    sequence = my_dateset.tokenizer.texts_to_sequences([star_text])[0]
    decoder_input = Variable(torch.LongTensor(sequence)).view(1,-1)
    features = encoder(image)
    #print(decoder_input)
    outputs,hidden = decoder(features, decoder_input,hidden)
    topv, topi = outputs.data.topk(1)
    ni = topi[0][0][0]
    word = word_for_id(ni,my_dateset.tokenizer)
    if word is None:
            continue
    predicted += word + ' '
    star_text = word
    print(predicted)
    if word == '<END>':
            break

<START> header 
<START> header { 
<START> header { btn-inactive 
<START> header { btn-inactive , 
<START> header { btn-inactive , btn-inactive 
<START> header { btn-inactive , btn-inactive , 
<START> header { btn-inactive , btn-inactive , btn-inactive 
<START> header { btn-inactive , btn-inactive , btn-inactive , 
<START> header { btn-inactive , btn-inactive , btn-inactive , btn-inactive 
<START> header { btn-inactive , btn-inactive , btn-inactive , btn-inactive } 
<START> header { btn-inactive , btn-inactive , btn-inactive , btn-inactive } row 
<START> header { btn-inactive , btn-inactive , btn-inactive , btn-inactive } row { 
<START> header { btn-inactive , btn-inactive , btn-inactive , btn-inactive } row { single 
<START> header { btn-inactive , btn-inactive , btn-inactive , btn-inactive } row { single { 
<START> header { btn-inactive , btn-inactive , btn-inactive , btn-inactive } row { single { small-title 
<START> header { btn-inactive , btn-inactive , btn-inactive , btn-inactive 

KeyboardInterrupt: ignored

In [None]:
from nltk.translate.bleu_score import sentence_bleu

original_gui = load_doc('val/2BC033FD-F097-463B-98A8-C1C9CE50B478.gui')
original_gui = ' '.join(original_gui.split())
original_gui = original_gui.replace(',', ' ,')
original_gui = original_gui.split()

# Predicted images don't have color so we normalize all buttons to btn-orange or btn-active
btns_to_replace = ['btn-green', 'btn-red']
normalized_original_gui = ['btn-orange' if token in btns_to_replace else token for token in original_gui]
normalized_original_gui = ['btn-active' if token == 'btn-inactive' else token for token in normalized_original_gui]

generated_gui = predicted.split()

normalized_generated_gui = ['btn-orange' if token in btns_to_replace else token for token in generated_gui]
normalized_generated_gui = ['btn-active' if token == 'btn-inactive' else token for token in normalized_generated_gui]

#BLEU score 
print(sentence_bleu([normalized_original_gui],normalized_generated_gui))

0.10014730173627111


In [None]:
import importlib.util

def module_from_file(module_name, file_path):
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

In [62]:
from inference3 import Compiler

In [63]:
compiler = Compiler.Compiler('default')
compiled_website = compiler.compile(predicted.split())

In [64]:
#generated HTML
print(compiled_website)

<html>
  <header>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap-theme.min.css" integrity="sha384-rHyoN1iRsVXV4nD0JutlnGaslCJuC7uwjduW9SVrLvRYooPp2bWYgmgJQIXwl/Sp" crossorigin="anonymous">
<style>
.header{margin:20px 0}nav ul.nav-pills li{background-color:#333;border-radius:4px;margin-right:10px}.col-lg-3{width:24%;margin-right:1.333333%}.col-lg-6{width:49%;margin-right:2%}.col-lg-12,.col-lg-3,.col-lg-6{margin-bottom:20px;border-radius:6px;background-color:#f5f5f5;padding:20px}.row .col-lg-3:last-child,.row .col-lg-6:last-child{margin-right:0}footer{padding:20px 0;text-align:center;border-top:1px solid #bbb}
</style>
    <title>Scaffol