In [5]:
! wget http://tc11.cvc.uab.es/index.php?com=upload&action=file_down&section=dataset&section_id=106&file=112

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.feature_extraction.text import CountVectorizer
# https://pytorch.org/docs/stable/generated/torch.nn.GRU.html
"""
The encoder is a stack of bidirectional GRUs while the decoder combines a
unidirectional GRU and an attention mechanism into the recurrent sequence
generator.

We use cross-entropy (CE) as the criterion.
"""
criterion = nn.CrossEntropyLoss()
# encoder = nn.GRU(,, 4, True, False, 0, True)
# m, n = 256, 256
# nPrime, D = 500, 500
# optimizer = optim.Adadelta(params, lr=1.0, rho=0.95, eps=1e-06)
#             optim.Adam

In [3]:
def pad_right_to_length(tensor, after_length, pad_value):
    n=tensor.shape[0]
    if after_length>n:
        tensor=F.pad(tensor,(0,after_length-n))
        for i in range(after_length-n):
            tensor[n+i]=pad_value
    elif out_len<truth_len:
        raise ValueError("after_length is smaller than tensor length. Cannot shrink !")
    return tensor

def list_to_tensor_normalized(tensor_list, pad_value):
    lengths=[tensor.shape[0] for tensor in tensor_list]
    big_tensor=torch.nn.utils.pad_sequence(tensor_list,batch_first=True)
    for i in range(len(tensor_list)):
        big_tensor[i,lenghts[i]:]=pad_value
    return big_tensor

In [10]:
class EquationNet(nn.Module):
    def __init__(self, embedding_dim, vocab_size):
        super(EquationNet,self).__init__()
        self.vocab_size = vocab_size
        #encoder
        self.conv1=nn.Conv2d(1,64,kernel_size=(3,3),stride=(1,1),padding=(1,1))
        self.pool1=nn.MaxPool2d((2,2),stride=(2,2))
        self.conv2=nn.Conv2d(64,128,kernel_size=(3,3),stride=(1,1),padding=(1,1))
        self.pool2=self.pool1=nn.MaxPool2d((2,2),stride=(2,2))
        self.conv3=nn.Conv2d(128,256,kernel_size=(3,3),stride=(1,1),padding=(1,1))
        self.bn3=nn.BatchNorm2d(256)
        self.conv4=nn.Conv2d(256,256,kernel_size=(3,3),stride=(1,1),padding=(1,1))
        self.pool4=self.pool1=nn.MaxPool2d((2,1),stride=(2,1))
        self.conv5=nn.Conv2d(256,512,kernel_size=(3,3),stride=(1,1),padding=(1,1))
        self.bn5=nn.BatchNorm2d(512)
        self.pool5=self.pool1=nn.MaxPool2d((2,1),stride=(2,1))
        self.conv6=nn.Conv2d(512,512,kernel_size=(3,3),stride=(1,1),padding=(1,1))
        self.bn6=nn.BatchNorm2d(512)
        #embeddings
        self.embed=nn.Linear(512,embedding_dim)
        #decoder
        self.blstm=nn.LSTM(input_size=embedding_dim,hidden_size=vocab_size,num_layers=2,batch_first=True,dropout=0.1,bidirectional=True)
    
    def forward(self,inputs):
        #encoding
        #tensor has size (batch_size, color_chans, height, width)
        print("(batch_size, color_chans, height, width)", inputs.shape)
        x=self.conv1(inputs)
        x=self.pool1(x)
        x=self.conv2(x)
        x=self.pool2(x)
        x=self.conv3(x)
        x=self.bn3(x)
        x=self.conv4(x)
        x=self.pool4(x)
        x=self.conv5(x)
        x=self.bn5(x)
        x=self.pool5(x)
        x=self.conv6(x)
        x=self.bn6(x)
        #tensor has size (batch_size, features, new_height, new_width)
        print("(batch_size, features, new_height, new_width)", x.shape)
        #converting 2D to 1D
        x=torch.flatten(x,start_dim=2,end_dim=3)
        #tensor has size (batch_size, features, new_height*new_width)
        print("(batch_size, features, new_height*new_width)", x.shape)
        #transposing to work on features
        x=torch.transpose(x,1,2)
        #embedding
        #tensor has size (batch_size, new_height*new_width, features)
        print("(batch_size, new_height*new_width, features)", x.shape)
        x=self.embed(x)
        #decoding
        #tensor has size (batch_size, new_height*new_width, embedding_dim)
        print("(batch_size, new_height*new_width, embedding_dim)", x.shape)
        x=self.blstm(x)[0]
        #tensor has size (batch_size, new_height*new_width, 2*vocab_size)
        print("(batch_size, new_height*new_width, 2*vocab_size)", x.shape)
        x1=x[:,:,:self.vocab_size]
        x2=x[:,:,self.vocab_size:2*self.vocab_size]
        x=x1+x2
        #tensor has size (batch_size, new_height*new_width, vocab_size)
        x=F.gumbel_softmax(x,hard=True,dim=2)
        return x

In [11]:
def train_net(model, data_loader, loss, num_epochs, pad_value, verbose=True):
    losses=[]
    for epoch in range(num_epochs):
        for sample_id, (sample_data, sample_truth) in enumerate(data_loader):
            output=model(sample_data)
            modified_truth=pad_right_to_length(sample_truth, output.shape[0], pad_value)
            modified_truth=adapt_truth(sample_truth, output)
            optimizer.zero_grad()
            loss_value=loss(output, modified_truth)
            losses.append(loss_value)
            loss.backward()
            optimizer.step()
            if verbose and ((i+1)%len(data_loader))==0:
                print("Epoch[{}/{}], batch[{}/{}], loss : {}".format(epoch+1,num_epochs, batch_id, len(data_loader), loss_value))
    return losses

In [14]:
import glob
import cv2

img_paths = glob.glob("dataset/processed_images/*")
print(img_paths[0])
img = cv2.imread(img_paths[0], cv2.IMREAD_GRAYSCALE)
img_tensor = torch.Tensor(img).unsqueeze(0).unsqueeze(0)

print(img_tensor.shape)

net = EquationNet(300,581)
output = net(img_tensor)

print(output.shape)

dataset/processed_images/6695228021.png
torch.Size([1, 1, 173, 680])
(batch_size, color_chans, height, width) torch.Size([1, 1, 173, 680])
(batch_size, features, new_height, new_width) torch.Size([1, 512, 10, 340])
(batch_size, features, new_height*new_width) torch.Size([1, 512, 3400])
(batch_size, new_height*new_width, features) torch.Size([1, 3400, 512])
(batch_size, new_height*new_width, embedding_dim) torch.Size([1, 3400, 300])
(batch_size, new_height*new_width, 2*vocab_size) torch.Size([1, 3400, 1162])
torch.Size([1, 3400, 581])
