# Prepare data label for CNN model

In [5]:
# Setup 

DATA_PATH = 'data/'
IMAGES_PATH = 'data/formula_images/'

FORMULAS = DATA_PATH + 'im2latex_formulas.lst'

In [8]:
# Import

import cPickle as pickle
import nltk

from tokenizer import Tokenizer

In [7]:
# build formula list

with open(FORMULAS) as formulas_file:
    formula_list = []
    for line in formulas_file:
        formula_list.append(line)

In [18]:
# tokenize formula list

formula_tokens_list = []

for formula in formula_list:
    tokens_dict = {}
    tokenizer = Tokenizer(formula)
    for token in tokenizer.tokenize():
        # formation of token: (line_number, type, token_part, token_full)        
        # command token process
        if token[1] == 'command':
            if '\\'+token[2] not in tokens_dict:
                tokens_dict['\\'+token[2]] = 1
            else:
                tokens_dict['\\'+token[2]] += 1
        # text token process
        elif token[1] == 'text':
            for c in token[3]:
                if c not in tokens_dict:
                    tokens_dict[c] = 1
                else:
                    tokens_dict[c] += 1
    formula_tokens_list.append(tokens_dict)

In [32]:
# build token dictionary

tokens_count = {}
tokens_dict = {}

# count token frequency
for formula_tokens in formula_tokens_list:
    for token, value in formula_tokens.iteritems():
        if token not in tokens_count:
            tokens_count[token] = value
        else:
            tokens_count[token] += 1

#build token dictionary according to token frequency
index = 2 # 0 for '<eos>' 1 for 'UNK'
tokens_sorted = sorted(tokens_count, key = tokens_count.get, reverse = True)
for token in tokens_sorted:
    tokens_dict[token] = index
    index += 1

In [34]:
# build input file

with open(dataPath + 'cnn_token_dictionary.pkl', 'wb') as f:
    pickle.dump(tokens_dict, f)

# Some test

In [31]:
tokens_dict['=']

3

In [35]:
tokens_count

{'\t': 1528,
 '\n': 100091,
 '\r': 373,
 ' ': 82084,
 '!': 1614,
 '"': 31,
 '#': 1,
 '&': 4614,
 "'": 7585,
 '(': 74678,
 ')': 74609,
 '*': 3681,
 '+': 48950,
 ',': 55800,
 '-': 63134,
 '.': 45130,
 '/': 8474,
 '0': 36091,
 '1': 63404,
 '2': 65964,
 '3': 21635,
 '4': 20800,
 '5': 7968,
 '6': 7225,
 '7': 3639,
 '8': 6007,
 '9': 3048,
 ':': 9585,
 ';': 3912,
 '<': 2601,
 '=': 90023,
 '>': 2750,
 '?': 14,
 '@': 37,
 'A': 15152,
 'B': 8864,
 'C': 7393,
 'D': 9819,
 'E': 6945,
 'F': 9162,
 'G': 7398,
 'H': 7865,
 'I': 5950,
 'J': 4823,
 'K': 4851,
 'L': 9518,
 'M': 9333,
 'N': 9431,
 'O': 2656,
 'P': 6933,
 'Q': 4721,
 'R': 9792,
 'S': 11256,
 'T': 10186,
 'U': 3789,
 'V': 7032,
 'W': 4406,
 'X': 4586,
 'Y': 1987,
 'Z': 4202,
 '\\': 85,
 '\\\t': 2,
 '\\\r': 6,
 '\\ ': 9756,
 '\\!': 1626,
 '\\"': 1,
 '\\#': 58,
 '\\$': 3,
 '\\%': 4,
 '\\&': 8,
 "\\'": 5,
 '\\(': 4,
 '\\)': 4,
 '\\*': 15,
 '\\,': 20387,
 '\\-': 14,
 '\\/': 23,
 '\\:': 987,
 '\\;': 9980,
 '\\>': 246,
 '\\AA': 4,
 '\\Big': 863,

In [27]:
tokens_sorted

['\n',
 '=',
 '_',
 ' ',
 '^',
 '(',
 ')',
 '2',
 '1',
 '-',
 '\\label',
 ',',
 '+',
 '.',
 'i',
 'e',
 '\\frac',
 '0',
 'a',
 'd',
 'r',
 'n',
 '\\left',
 '\\right',
 't',
 'c',
 'm',
 'x',
 '3',
 '4',
 '\\,',
 's',
 'p',
 'g',
 'l',
 'q',
 '\\mu',
 '\\pi',
 'o',
 'A',
 'f',
 'k',
 '\\partial',
 '\\int',
 'y',
 'b',
 '\\alpha',
 '\\over',
 'S',
 'u',
 '\\cal',
 'j',
 'T',
 '\\;',
 '\\delta',
 '\\sum',
 'D',
 'R',
 '\\ ',
 'h',
 ':',
 '\\phi',
 'L',
 'N',
 'M',
 '\\sqrt',
 'F',
 '\\nu',
 'B',
 '|',
 '\\lambda',
 '/',
 'z',
 '5',
 '\\beta',
 'H',
 '\\rm',
 '\\bar',
 "'",
 'G',
 'C',
 '6',
 'V',
 'v',
 'E',
 'P',
 '\\gamma',
 '\\sigma',
 '\\{',
 '\\theta',
 '\\\\',
 '\\tilde',
 '\\epsilon',
 '8',
 'I',
 '\\begin',
 '\\end',
 '\\}',
 '\\rho',
 '\\psi',
 '\\quad',
 '\\infty',
 '\\omega',
 '\\hat',
 '\\tau',
 'K',
 'J',
 '\\eta',
 'Q',
 '&',
 'X',
 'w',
 'W',
 '\\Gamma',
 'Z',
 '\\rangle',
 '\\Phi',
 '\\equiv',
 '\\Delta',
 ';',
 '\\Lambda',
 '\\qquad',
 '\\xi',
 'U',
 '*',
 '7',
 '\\bf',
 

In [19]:
formula_tokens_list[1]

{'\n': 1,
 ' ': 11,
 '(': 2,
 ')': 2,
 '+': 4,
 '-': 3,
 '.': 1,
 '1': 5,
 '2': 12,
 ':': 1,
 '=': 1,
 '\\,': 1,
 '\\alpha': 2,
 '\\label': 1,
 '\\lbrace': 1,
 '\\over': 5,
 '\\rbrace': 1,
 '\\theta': 4,
 '\\varphi': 1,
 '^': 12,
 'c': 2,
 'd': 5,
 'e': 1,
 'i': 1,
 'n': 1,
 'o': 2,
 'p': 1,
 'q': 3,
 'r': 5,
 's': 6,
 't': 1}