# Non-Competitive Part - Encoder Decoder

In [2]:
import numpy as np
import pandas as pd

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

1. Model

In [4]:
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=5)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=5)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=5)
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv5 = nn.Conv2d(256, 512, kernel_size=5)
        self.pool5 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.avgpool = nn.AvgPool2d(kernel_size=3)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.pool2(x)
        x = F.relu(self.conv3(x))
        x = self.pool3(x)
        x = F.relu(self.conv4(x))
        x = self.pool4(x)
        x = F.relu(self.conv5(x))
        x = self.pool5(x)
        x = self.avgpool(x)
        return x

In [5]:
class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size, embedding_dim):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim*2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    # def forward(self, input, hidden, cell):
    #     input = input.unsqueeze(0)
    #     embedded = self.embedding(input)
    #     output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
    #     output = self.softmax(self.out(output[0]))
    #     return output, hidden, cell

    def forward(self, context, input, hidden):
        context = context.squeeze().unsqueeze(1)
        embedded = self.embedding(input)

        # print("Context", context.shape)
        # print("Embed", embedded.shape)

        embedded = torch.cat((context, embedded), dim=2)
        output, hidden = self.lstm(embedded, hidden)
        
        # print("Out", output.shape)
        # output = output.squeeze(1)
        output = self.out(output)
        # print("Outnew", output.shape)
        # print(hidden)
        # output = self.softmax(output)

        return output, hidden

In [6]:
a = torch.tensor([12,1,14,213,41,121])
torch.argmax(a)

tensor(3)

In [7]:
a.view(-1)

tensor([ 12,   1,  14, 213,  41, 121])

In [8]:
class EncDec(nn.Module):
    def __init__(self, vocab, word_to_index, index_to_word, hidden_size=512, embedding_dim=512):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder(len(vocab), hidden_size, embedding_dim)
        self.out_size = len(vocab)

        self.vocab = vocab
        self.word_to_index = word_to_index
        self.index_to_word = index_to_word

    def forward(self, images, str_input, hidden):
        context_img = self.encoder.forward(images)
        batch_size, _, _, _ = context_img.size()

        context_img = context_img.view(batch_size, -1)
        output, _ = self.decoder.forward(context_img, str_input, hidden)
        
        return output

2. Generating vocabulary

In [9]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [10]:
df = pd.read_csv('./col_774_A4_2023/SyntheticData/train.csv')

# added Start and End tokens
vocab = set()
for formula in df['formula']:
    tokens = formula.split()
    vocab.update(tokens)

In [11]:
word_to_index = {'<PAD>':0, '<SOF>':1, '<EOF>':2}
word_to_index.update({word: i+3 for i, word in enumerate(vocab)})

vocab.update({'<SOF>', '<EOF>', '<PAD>'})

embed_dim = 512
embed_matrix = nn.Embedding(len(vocab), embed_dim)

In [12]:
embed_matrix(torch.tensor(word_to_index['<SOF>'])).shape

torch.Size([512])

In [13]:
trial = [word_to_index[word] for word in df['formula'][1].split()]

embed_vec = embed_matrix(torch.tensor(trial))
embed_vec.shape

torch.Size([66, 512])

In [14]:
index_to_word = {word_to_index[w]:w for w in word_to_index}
index_to_word

{0: '<PAD>',
 1: '<SOF>',
 2: '<EOF>',
 3: 'P',
 4: '\\slash',
 5: '\\sim',
 6: '2',
 7: '3.2',
 8: '\\ddagger',
 9: '\\bullet',
 10: '\\coprod',
 11: '\\vdash',
 12: '\\vector',
 13: '\\mathrm',
 14: '\\thicklines',
 15: ')',
 16: '\\mid',
 17: '\\partial',
 18: '\\makebox',
 19: '\\symbol',
 20: '\\Biggl',
 21: 'N',
 22: '\\cup',
 23: '\\S',
 24: '\\tabcolsep',
 25: '0.1',
 26: '^',
 27: '\\underline',
 28: '?',
 29: '\\overline',
 30: '\\arraycolsep',
 31: '\\textnormal',
 32: '\\int',
 33: '\\ddot',
 34: '\\',
 35: '\\subset',
 36: '\\#',
 37: '\\fill',
 38: '\\Longleftrightarrow',
 39: '\\Bigm',
 40: 'R',
 41: '\\otimes',
 42: '\\tt',
 43: '\\stackrel',
 44: 'K',
 45: '\\longrightarrow',
 46: '\\fbox',
 47: '\\overwithdelims',
 48: '\\mp',
 49: '\\hline',
 50: 'm',
 51: '\\bigtriangledown',
 52: '\\ll',
 53: '\\label',
 54: '\\everymath',
 55: '\\scriptstyle',
 56: '\\prime',
 57: '\\textcircled',
 58: 'n',
 59: '\\ddag',
 60: '\\AA',
 61: '\\phi',
 62: '\\tau',
 63: '0.9',
 64: '

3. Data Loading

In [15]:
from PIL import Image
import torchvision.transforms as transforms

In [16]:
# transform image array
tf_resize_normalize = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(0, 1)
])

In [17]:
img = df.iloc[0,0]
imgaa = Image.open(f'./col_774_A4_2023/SyntheticData/images/{img}')
aa = tf_resize_normalize(imgaa)
type(imgaa)

PIL.PngImagePlugin.PngImageFile

-- useless --

In [17]:
def formulate(f, w2i=word_to_index):
    f = f.split()
    f = ['<SOF>']+f+['<EOF>']
    f_idx = [w2i[word] for word in f]
    f_tensor = torch.as_tensor(f_idx)

    return f_tensor

def add_formula_padded(df):
    # TODO: doesn't work some some nasty reason
    # formula = df['formula'].map(lambda x: formulate(x))
    
    formula = []
    for f in df['formula']:
        formula.append(formulate(f))

    fk = pad_sequence(formula)
    fk = fk.T.tolist()
    fk = list(map(lambda x: torch.tensor(x), fk))
    # for i in range(len(fk)):
    #     fk[i] = torch.tensor(fk[i])

    # df[col] = fk
    return fk

In [18]:
formulas_padded = add_formula_padded(df)

In [19]:
max_len_formula = formulas_padded[0].shape
max_len_formula

torch.Size([629])

In [20]:
# aa = add_formula_padded(df)
# aa[0].shape

In [21]:
# df

-- useless --

In [18]:
class EquationDataset(Dataset):
    def __init__(self, csv_file, word2idx=word_to_index, transform=tf_resize_normalize):
        self.data = pd.read_csv(csv_file)
        # self.formulas_padded = add_formula_padded(self.data)
        self.transform = transform
        self.word2idx = word2idx

    def __len__(self):
        return len(self.data)

    # def __getitem__(self, idx):
    #     formula = self.formulas[idx]
    #     tokens = formula.split()
    #     indices = [self.word_to_index[word] for word in tokens]
    #     return torch.tensor(indices)

    def __getitem__(self, idx, directory='SyntheticData'):
        img_name = self.data.iloc[idx, 0]
        image = Image.open(f"./col_774_A4_2023/{directory}/images/{img_name}")
        
        formula = self.data.iloc[idx, 1].split()
        formula = ['<SOF>'] + formula + ['<EOF>']
        formula_tensor = torch.tensor([self.word2idx[word] for word in formula])
        
        pad = nn.ConstantPad1d((0, 629-formula_tensor.shape[0]), self.word2idx['<PAD>'])
        formula_tensor = pad(formula_tensor).T

        if self.transform:
            image_tensor = self.transform(image)

        return image_tensor, formula_tensor

In [19]:
formula = df.iloc[1, 1].split()
formula = ['<SOF>'] + formula + ['<EOF>']
formula_tensor = torch.tensor([word_to_index[word] for word in formula])

formula_tensor

tensor([  1, 239, 163,  26, 322, 288, 506,  15,  27, 322, 322,  50,  96,  96,
         96, 265, 414, 322,  27, 322, 322,  50,  96,  96,  96,  26, 322, 533,
         96, 470, 456, 482, 213, 410,  26, 322, 288, 430, 430,  15,  27, 322,
        322,  50,  96,  96,  96, 265, 414, 322,  27, 322, 322,  50,  96,  96,
         96,  26, 322, 533,  96, 470, 456,  77, 187, 213, 239,   2])

In [20]:
print(index_to_word[386], index_to_word[541])

\Large \theta


In [21]:
train_data = EquationDataset("./col_774_A4_2023/SyntheticData/train.csv")
test_data = EquationDataset("./col_774_A4_2023/SyntheticData/test.csv")
val_data = EquationDataset("./col_774_A4_2023/SyntheticData/val.csv")

In [22]:
# PROB: using collate fn on tuple dataset
# used lambda fn to collate on first tensor of tuple

# def custom_padseq(batch):
#     print([i[1].shape for i in batch])
    # print(batch[0][1].shape)

    # transposed = list(zip(*batch))
    # padded_tensor = pad_sequence(transposed[1], batch_first=True, padding_value=0)
    # return transposed[0] + [padded_tensor]

trial_loader = DataLoader(train_data, batch_size=32, shuffle=True)

4. Training

In [23]:
# expected Tensor as element 0 in argument 0, but got tuple

for k, (i,j) in enumerate(trial_loader):
    print(type(i))
    print("\nballe\n")
    print(j.shape)
    
    if k==1:
        break

  formula_tensor = pad(formula_tensor).T


<class 'torch.Tensor'>

balle

torch.Size([32, 629])
<class 'torch.Tensor'>

balle

torch.Size([32, 629])


In [24]:
model = EncDec(vocab, word_to_index, index_to_word)

In [53]:
def train(model: EncDec, num_epochs=10, lr=0.01, batch_size=50, criterion=nn.CrossEntropyLoss(), save=True, device="cuda"):

    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

    for epoch in range(num_epochs):

        if save:
            torch.save(model, "./mkc.pth")
            # torch.save(model.encoder.state_dict(), "./encoder.pth")
            # torch.save(model.decoder.state_dict(), "./decoder.pth")

        for i, (img_seq, target_seq) in enumerate(train_loader):
            
            optimizer.zero_grad()
            # encoder.zero_grad()
            # decoder.zero_grad()

            img_seq = img_seq.float()
            img_seq = img_seq.to(device)
            target_seq = target_seq.to(device)

            context = model.encoder.forward(img_seq)

            # hidden = torch.zeros(1, batch_size, hidden_size).to(device)
            # cell = torch.zeros(1, batch_size, hidden_size).to(device)
            hidden = None

            loss = 0
            #embedding where?
            output_seq = torch.tensor([[model.word_to_index['<SOF>']]] * batch_size).to(device)
            # print(output_seq.shape)
            output_thru = 0

            # print(f'target-seq {target_seq.shape}')
            for target_tok in target_seq.split(1, dim=1):
                # output, hidden, cell = decoder(context, output_seq)
                print(context.shape, output_seq.shape)
                output, hidden = model.decoder.forward(context, output_seq, hidden)

                # print(f'target-tok {target_tok.shape}')
                # print(f'outpt {output.shape}')

                # 50% teacher enforcing
                if torch.rand(1, 1) < 0.5:
                    output_seq = target_tok
                    # print('1,', output_seq.shape)
                else:
                    output_seq = output.argmax(2)
                    # print('2,',output_seq.shape)

                output = output.squeeze()
                output = output.unsqueeze(2)
                # print(f'outpt after sqiunsq {output.shape}')

                if type(output_thru) == int:
                    output_thru = output
                else:
                    output_thru = torch.cat((output_thru, output), dim = 2)


            # for t in range(target_seq.shape[1]):

            #     # output, hidden, cell = decoder(context, output_seq)
            #     output, hidden = model.decoder.forward(context, output_seq, hidden)

            #     # print(output.shape)
            #     # print(target_seq[:,t].shape)
            #     # target_seq_prob = torch.tensor()

            #     # 50% teacher enforcing
            #     if torch.rand(1,1) < 0.5:
            #         output_seq = target_seq[:, t].unsqueeze(1)
            #         # print('1,', output_seq.shape)
            #     else: 
            #         output_seq = output.argmax(2)
            #         # print('2,',output_seq.shape)

            #     output = output.squeeze()
            #     output = output.unsqueeze(2)
            #     if type(output_thru) == int:
            #         output_thru = output
            #     else:
            #         output_thru = torch.cat((output_thru, output), dim = 2)


            # print(target_seq.shape)
            # print(output_thru.shape)
            loss += criterion(output_thru, target_seq)
            # print(loss)

            if (i+1) % 10 == 0:
                # 75000/32 = 2343.75
                print(f'Epoch {epoch+1}/{num_epochs}, Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

                if (i+1)%1000 == 0 and save:
                    torch.save(model, "./mkc.pth")

            loss.backward()
            optimizer.step()

        print(f'Epoch: [{epoch + 1}/{num_epochs}], Loss: {loss.item()}')

In [54]:
modelly = EncDec(vocab, word_to_index, index_to_word)
train(modelly)

torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([50, 1])
torch.Size([50, 512, 1, 1]) torch.Size([

KeyboardInterrupt: 

In [26]:
train(model, num_epochs=2, lr=1e-4)

Epoch 1/2, Step [10/1500], Loss: 2.7699
Epoch 1/2, Step [20/1500], Loss: 0.9682
Epoch 1/2, Step [30/1500], Loss: 0.8340
Epoch 1/2, Step [40/1500], Loss: 0.7838
Epoch 1/2, Step [50/1500], Loss: 0.6893
Epoch 1/2, Step [60/1500], Loss: 0.6919


KeyboardInterrupt: 

In [None]:
train(model, num_epochs=2, device="cpu")

In [None]:
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(list(encoder.parameters()) +
#                        list(decoder.parameters()), lr=0.01)

# for epoch in range(10):
#     for i, (input_images, target_formulas) in enumerate(data_loader):
#         optimizer.zero_grad()

#         context_vectors = encoder(input_images)
#         # Initialize the hidden state for the decoder
#         hidden = torch.zeros(1, 32, 512)
#         # Initialize the cell state for the decoder
#         cell = torch.zeros(1, 32, 512)
#         # Initialize the input sequence with the <sos> token
#         output_seq = torch.tensor([[output_vocab.stoi['<sos>']] * batch_size])

#         use_teacher_forcing = True if torch.random.random() < teacher_forcing_ratio else False

#         loss = 0
#         for t in range(1, target_formulas.shape[1]):
#             output, hidden, cell = decoder(
#                 output_seq, hidden, cell, context_vectors)
#             if use_teacher_forcing:
#                 input_seq = target_formulas[:, t].unsqueeze(0)
#             else:
#                 input_seq = output.argmax(1)
#             loss += criterion(output, target_formulas[:, t])

#         loss.backward()
#         optimizer.step()

#         if i % print_every == 0:
#             print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
#                   .format(epoch+1, num_epochs, i, len(data_loader), loss.item()))

5. Prediction

In [64]:
class TestDataset(Dataset):
    def __init__(self, csv_file, transform=tf_resize_normalize):
        self.data = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx, directory='SyntheticData'):
        img_name = self.data.iloc[idx, 0]
        image = Image.open(f"./col_774_A4_2023/{directory}/images/{img_name}")

        if self.transform:
            image_tensor = self.transform(image)

        return image_tensor
    

test_data = TestDataset("./col_774_A4_2023/SyntheticData/test.csv")
val_data = TestDataset("./col_774_A4_2023/SyntheticData/val.csv")

In [66]:
def predict(model: EncDec, dir_folder='SyntheticData', dir_data='test', device='cuda', batch_size=100):
    # loader = pd.read_csv(f'./col_774_A4_2023/{dir_folder}/{dir_data}.csv')
    
    if dir_data == 'test':
        loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)
    elif dir_data == 'train':
        loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    else:
        loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)

    final_latex = []
    for i, (images) in enumerate(loader):
        
        images = images.to(device)
        context_vectors = model.encoder.forward(images)
        hidden = None

        predicted_tokens = []
        # output_seq = torch.tensor([[model.word_to_index['<SOF>']]] * batch_size).to(device)
        input_token = torch.tensor([[model.word_to_index['<SOF>']]]*batch_size).to(device)

        for i in range(model.out_size):
            # print(context_vectors.shape)
            # print(input_token.shape)
            output, hidden = model.decoder.forward(context_vectors, input_token, hidden)
            
            predicted_token = output.argmax(dim=2)
            predicted_tokens.append(predicted_token)

            input_token = predicted_token
            
        for j in range(len(predicted_tokens)):
            predicted_latex = []
            for i in range(batch_size):
                #    print(predicted_tokens[j].shape)
                #    print(predicted_tokens[j][0,i].item())
                predicted_latex.append(model.index_to_word[predicted_tokens[j][i, 0].item()])

            final_latex.append(predicted_latex)

    return final_latex

In [1]:
test_csv = pd.read_csv('./col_774_A4_2023/SyntheticData/test.csv')
test_csv

NameError: name 'pd' is not defined

In [67]:
test_predict = predict(model)

OutOfMemoryError: CUDA out of memory. Tried to allocate 592.00 MiB (GPU 0; 8.00 GiB total capacity; 12.33 GiB already allocated; 0 bytes free; 14.36 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF