In [5]:
# The URL must be quoted: an unquoted '&' ends the shell command and runs it in the
# background, so wget would only receive the part of the URL before the first '&'.
! wget "http://tc11.cvc.uab.es/index.php?com=upload&action=file_down&section=dataset&section_id=106&file=112"

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.feature_extraction.text import CountVectorizer
import os
import numpy as np
from torch.utils.data import Dataset, DataLoader
from cv2 import imread, IMREAD_GRAYSCALE
from torch.nn.utils.rnn import pad_sequence
import glob

# Reference documentation: https://pytorch.org/docs/stable/generated/torch.nn.GRU.html
# NOTE(review): the description below mentions GRUs and an attention mechanism, whereas
# the EquationNet class defined in this notebook uses a CNN followed by a bidirectional
# LSTM -- confirm which architecture is intended.
"""
The encoder is a stack of bidirectional GRUs while the decoder combines a
unidirectional GRU and an attention mechanism into the recurrent sequence
generator.

We use cross-entropy (CE) as the criterion.
"""

# Cross-entropy loss averaged over every element of the batch.
LOSS_REDUCTION = 'mean'
criterion = nn.CrossEntropyLoss(reduction=LOSS_REDUCTION)

In [23]:
# Location of the formula list (one formula per line), relative to the working directory.
FORMULAS_PATH = 'dataset/formulas.lst'

# The context manager closes the file even if reading fails.
with open(FORMULAS_PATH) as f:
    lines = f.read().splitlines()

# One list of word indexes per formula, each wrapped in the start/end words.
sentences = []

startWord = 'start'
endWord = 'end'
padWord = 'pad'

wordsToIndexes = {}
indexesToWords = {}
wordsCount = {}

# Register the special words first so that they get indexes 0, 1 and 2.
for word in [startWord, endWord, padWord]:
    index = len(wordsToIndexes)
    wordsToIndexes[word] = index
    indexesToWords[index] = word

for line in lines:
    # Words are whitespace separated; the '\,' token is dropped from every formula.
    words = [word for word in line.split() if word != '\\,']
    for word in words:
        if word not in wordsToIndexes:
            index = len(wordsToIndexes)
            wordsToIndexes[word] = index
            indexesToWords[index] = word
        # .get() instead of "+= 1": a formula word spelled exactly like a special word
        # ('start', 'end', 'pad') is already in wordsToIndexes but has no counter yet,
        # which used to raise a KeyError.
        # NOTE(review): such a word also shares its index with the special word.
        wordsCount[word] = wordsCount.get(word, 0) + 1

    sentences.append([wordsToIndexes[word] for word in [startWord] + words + [endWord]])

vocab_size = len(indexesToWords)

# Pad every formula with the pad index up to the length of the longest one, giving a
# single (number of formulas, max length) tensor of word indexes.
tensor_sentences = [torch.LongTensor(sentence) for sentence in sentences]
big_truth_tensor = pad_sequence(tensor_sentences, batch_first=True, padding_value=wordsToIndexes[padWord])

In [None]:
class LatexDatasetTrain(Dataset) :
    """Training split: pairs each listed formula picture with its ground-truth row.

    The split is described by ``filter_train.lst`` inside ``root_dir``. Each
    non-empty line holds two whitespace-separated fields: a picture file name and
    the row of ``big_truth_tensor`` that is its ground truth.

    Attributes:
        root_dir: dataset root directory.
        id_to_picname: dict mapping a sample id (0..len-1) to its picture file name.
        truth_tensor: ``truth_tensor[id]`` is the ground truth of ``id_to_picname[id]``.
    """

    FILTER_FILE = "filter_train.lst"
    IMAGE_DIR = "formula_images_processed_padded_optimized"

    def __init__(self, root_dir, big_truth_tensor):
        """Read the train filter and select the matching rows of ``big_truth_tensor``.

        Args:
            root_dir: directory containing the filter file and the picture folder.
            big_truth_tensor: tensor whose first dimension indexes all formulas.
        """
        self.root_dir = root_dir
        self.id_to_picname = {}
        indexes = []
        with open(os.path.join(self.root_dir, self.FILTER_FILE)) as train:
            for entry in train.read().splitlines():
                if not entry.strip():
                    # Ignore blank lines (e.g. a trailing newline at the end of the file).
                    continue
                picname, val = entry.split()
                self.id_to_picname[len(indexes)] = picname
                indexes.append(int(val))
        # The index tensor is passed positionally after the dimension: the previous
        # "dim=0, <positional>" form was a SyntaxError.
        self.truth_tensor = torch.index_select(big_truth_tensor, 0, torch.LongTensor(indexes))

    def __len__(self):
        """Return the number of samples listed in the filter file."""
        return len(self.id_to_picname)

    def __getitem__(self, idx) :
        """Return ``(img_tensor, truth)`` for sample ``idx``.

        ``img_tensor`` is the picture read in grayscale with a leading dimension
        of size 1 added; ``truth`` is the matching row of ``truth_tensor``.

        Raises:
            FileNotFoundError: if the picture could not be read.
        """
        truth = self.truth_tensor[idx]
        img_path = os.path.join(self.root_dir, self.IMAGE_DIR, self.id_to_picname[idx])
        img = imread(img_path, IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread reports an unreadable or missing file by returning None.
            raise FileNotFoundError("Could not read image: " + img_path)
        img_tensor = torch.Tensor(img).unsqueeze(0)
        return (img_tensor, truth)

In [None]:
class LatexDatasetTest(Dataset) :
    """Test split: pairs each listed formula picture with its ground-truth row.

    The split is described by ``filter_test.lst`` inside ``root_dir``. Each
    non-empty line holds two whitespace-separated fields: a picture file name and
    the row of ``big_truth_tensor`` that is its ground truth.

    Attributes:
        root_dir: dataset root directory.
        id_to_picname: dict mapping a sample id (0..len-1) to its picture file name.
        truth_tensor: ``truth_tensor[id]`` is the ground truth of ``id_to_picname[id]``.
    """

    FILTER_FILE = "filter_test.lst"
    IMAGE_DIR = "formula_images_processed_padded_optimized"

    def __init__(self, root_dir, big_truth_tensor):
        """Read the test filter and select the matching rows of ``big_truth_tensor``.

        Args:
            root_dir: directory containing the filter file and the picture folder.
            big_truth_tensor: tensor whose first dimension indexes all formulas.
        """
        self.root_dir = root_dir
        self.id_to_picname = {}
        indexes = []
        with open(os.path.join(self.root_dir, self.FILTER_FILE)) as test:
            for entry in test.read().splitlines():
                if not entry.strip():
                    # Ignore blank lines (e.g. a trailing newline at the end of the file).
                    continue
                picname, val = entry.split()
                self.id_to_picname[len(indexes)] = picname
                indexes.append(int(val))
        # The index tensor is passed positionally after the dimension: the previous
        # "dim=0, <positional>" form was a SyntaxError.
        self.truth_tensor = torch.index_select(big_truth_tensor, 0, torch.LongTensor(indexes))

    def __len__(self):
        """Return the number of samples listed in the filter file."""
        return len(self.id_to_picname)

    def __getitem__(self, idx) :
        """Return ``(img_tensor, truth)`` for sample ``idx``.

        ``img_tensor`` is the picture read in grayscale with a leading dimension
        of size 1 added; ``truth`` is the matching row of ``truth_tensor``.

        Raises:
            FileNotFoundError: if the picture could not be read.
        """
        truth = self.truth_tensor[idx]
        img_path = os.path.join(self.root_dir, self.IMAGE_DIR, self.id_to_picname[idx])
        img = imread(img_path, IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread reports an unreadable or missing file by returning None.
            raise FileNotFoundError("Could not read image: " + img_path)
        img_tensor = torch.Tensor(img).unsqueeze(0)
        return (img_tensor, truth)

In [None]:
def my_collate(list_of_couple):
    """Turn a list of tensor pairs into a pair of stacked tensors.

    Given ``[(a0, b0), (a1, b1), ...]`` the result is
    ``(torch.stack([a0, a1, ...]), torch.stack([b0, b1, ...]))``.
    """
    # zip(*...) regroups the pairs by position: all first items, then all second items.
    first_items, second_items = (list(column) for column in zip(*list_of_couple))
    stacked_first = torch.stack(first_items)
    stacked_second = torch.stack(second_items)
    return (stacked_first, stacked_second)
    
# The ground truths were already padded to a common length when big_truth_tensor was
# built, so they can be stacked directly when a batch is assembled.
train_dataset = LatexDatasetTrain("dataset/", big_truth_tensor)


def _collate_to_device(batch):
    """Stack a list of (image, truth) samples and move both tensors to ``device``.

    ``device`` is looked up when a batch is built, so it only has to be defined
    before the dataloader is iterated. The previous lambda called the never-imported
    ``default_collate`` and then ``.to(device)`` on the list it returns.
    """
    return tuple(tensor.to(device) for tensor in my_collate(batch))


dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=_collate_to_device)

In [10]:
class EquationNet(nn.Module):
    """Convolutional encoder followed by a bidirectional LSTM over image positions.

    The input is a batch of single-channel images ``(batch_size, 1, height, width)``.
    Six 3x3 convolutions and four max-poolings (two 2x2, then two (2,1)) produce a
    512-channel feature map. Its positions are flattened into a sequence, projected to
    ``embedding_dim`` and fed to a 2-layer bidirectional LSTM with hidden size
    ``vocab_size``. The two directions are summed and a hard Gumbel-softmax yields one
    one-hot vector of size ``vocab_size`` per position.

    Args:
        embedding_dim: size of the linear projection applied to each position.
        vocab_size: number of output classes (also the LSTM hidden size).
        debugging: when True, tensor shapes are printed during ``forward``.

    NOTE(review): there is no activation function between the convolutions -- confirm
    whether that is intended.
    NOTE(review): the output is a sampled one-hot (``F.gumbel_softmax(hard=True)``), so
    it is stochastic in both train and eval mode.
    """

    def __init__(self, embedding_dim, vocab_size,debugging=False):
        super(EquationNet,self).__init__()
        self.debug=debugging
        self.vocab_size = vocab_size
        #encoder
        self.conv1=nn.Conv2d(1,64,kernel_size=(3,3),stride=(1,1),padding=(1,1))
        self.pool1=nn.MaxPool2d((2,2),stride=(2,2))
        self.conv2=nn.Conv2d(64,128,kernel_size=(3,3),stride=(1,1),padding=(1,1))
        # Each pooling layer is assigned to its own attribute only. The previous
        # "self.poolN=self.pool1=..." chains overwrote pool1, which ended up as a
        # (2,1) pooling instead of (2,2).
        self.pool2=nn.MaxPool2d((2,2),stride=(2,2))
        self.conv3=nn.Conv2d(128,256,kernel_size=(3,3),stride=(1,1),padding=(1,1))
        self.bn3=nn.BatchNorm2d(256)
        self.conv4=nn.Conv2d(256,256,kernel_size=(3,3),stride=(1,1),padding=(1,1))
        self.pool4=nn.MaxPool2d((2,1),stride=(2,1))
        self.conv5=nn.Conv2d(256,512,kernel_size=(3,3),stride=(1,1),padding=(1,1))
        self.bn5=nn.BatchNorm2d(512)
        self.pool5=nn.MaxPool2d((2,1),stride=(2,1))
        self.conv6=nn.Conv2d(512,512,kernel_size=(3,3),stride=(1,1),padding=(1,1))
        self.bn6=nn.BatchNorm2d(512)
        #embeddings
        self.embed=nn.Linear(512,embedding_dim)
        #decoder
        self.blstm=nn.LSTM(input_size=embedding_dim,hidden_size=vocab_size,num_layers=2,batch_first=True,dropout=0.1,bidirectional=True)

    def _trace(self, label, tensor):
        """Print ``label`` and the shape of ``tensor`` when debugging is enabled."""
        if self.debug:
            print(label, tensor.shape)

    def forward(self,inputs):
        """Map images to one one-hot vector per position of the pooled feature map.

        Args:
            inputs: tensor of size (batch_size, color_chans, height, width).

        Returns:
            Tensor of size (batch_size, new_height*new_width, vocab_size).
        """
        #encoding
        self._trace("(batch_size, color_chans, height, width)", inputs)
        x=self.conv1(inputs)
        x=self.pool1(x)
        x=self.conv2(x)
        x=self.pool2(x)
        x=self.conv3(x)
        x=self.bn3(x)
        x=self.conv4(x)
        x=self.pool4(x)
        x=self.conv5(x)
        x=self.bn5(x)
        x=self.pool5(x)
        x=self.conv6(x)
        x=self.bn6(x)
        self._trace("(batch_size, features, new_height, new_width)", x)
        #converting 2D to 1D
        x=torch.flatten(x,start_dim=2,end_dim=3)
        self._trace("(batch_size, features, new_height*new_width)", x)
        #transposing to work on features
        x=torch.transpose(x,1,2)
        self._trace("(batch_size, new_height*new_width, features)", x)
        #embedding
        x=self.embed(x)
        self._trace("(batch_size, new_height*new_width, embedding_dim)", x)
        #decoding: keep only the output sequence of the LSTM, not its final states
        x=self.blstm(x)[0]
        self._trace("(batch_size, new_height*new_width, 2*vocab_size)", x)
        #summing the forward and backward halves of the bidirectional output
        x1=x[:,:,:self.vocab_size]
        x2=x[:,:,self.vocab_size:2*self.vocab_size]
        x=x1+x2
        #tensor has size (batch_size, new_height*new_width, vocab_size)
        x=F.gumbel_softmax(x,hard=True,dim=2)
        return x

In [11]:
def train_net(model, optimizer, data_loader, loss, num_epochs, pad_value, verbose=True):
    """Train ``model`` on ``data_loader`` and return the loss of every batch.

    Args:
        model: module mapping a batch of inputs to a tensor of size
            (batch_size, sequence_length, number_of_classes).
        optimizer: optimizer built on the parameters of ``model``.
        data_loader: iterable of ``(batch_data, batch_truth)`` pairs, where
            ``batch_truth`` holds class indexes with size (batch_size, truth_length).
        loss: criterion called as ``loss(scores, targets)`` with the class dimension
            of ``scores`` in second position, as ``nn.CrossEntropyLoss`` expects.
        num_epochs: number of passes over ``data_loader``.
        pad_value: class index used to extend the truth to the model's sequence length.
        verbose: when True, print the loss every 10 batches.

    Returns:
        List of per-batch loss values as Python floats, in training order.

    Raises:
        RuntimeError: if a truth sequence is longer than the model's output sequence.
    """
    # Run every batch on the device that holds the model parameters (if it has any).
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None
    model.train()
    losses=[]
    for epoch in range(num_epochs):
        for batch_id, (batch_data, batch_truth) in enumerate(data_loader):
            if model_device is not None:
                batch_data=batch_data.to(model_device)
                batch_truth=batch_truth.to(model_device)
            output=model(batch_data)
            length_diff=output.shape[1]-batch_truth.shape[1]
            if length_diff<0:
                raise RuntimeError("Model cannot express truth, it is limited")
            # (0, length_diff) pads only the end of the last (sequence) dimension.
            modified_truth=F.pad(batch_truth, (0,length_diff), value=pad_value)
            optimizer.zero_grad()
            # Move the class dimension to position 1: (batch, classes, sequence).
            loss_value=loss(output.transpose(1,2), modified_truth)
            # Backpropagate through the computed loss value, not the criterion object.
            loss_value.backward()
            optimizer.step()
            # Store a plain float so the autograd graph of each batch can be freed.
            losses.append(loss_value.item())
            if verbose and ((batch_id+1)%10)==0:
                print("Epoch[{}/{}], batch[{}/{}], loss : {}".format(epoch+1,num_epochs, batch_id+1, len(data_loader), losses[-1]))
    return losses

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device -> " + str(torch.cuda.get_device_name(device) if torch.cuda.is_available() else "CPU") + "\n")

# Hyper-parameters of this run.
EMBEDDING_DIM = 300
LEARNING_RATE = 5e-5
NUM_EPOCHS = 1

# The number of output classes is the vocabulary size computed from the formulas, so
# every ground-truth index is a valid class (it was previously hard-coded to 581).
net = EquationNet(EMBEDDING_DIM, vocab_size, debugging=True)
net.to(device)
# The optimizer needs the parameters of the model, not the module itself.
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

losses = train_net(net, optimizer, dataloader, criterion, NUM_EPOCHS, wordsToIndexes[padWord])