# im2latex(S): Tokenizer

&copy; Copyright 2017 Sumeet S Singh

    This file is part of im2latex solution by Sumeet S Singh.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the Affero GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    Affero GNU General Public License for more details.

    You should have received a copy of the Affero GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [1]:
import pandas as pd
import os
import re
import codecs
from IPython.display import display
from six.moves import cPickle as pickle
import string
from PIL import Image

In [2]:
# pandas display limits used when rendering tables in this notebook.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 160)
# Dataset root directory and its 'formula_images' sub-folder.
data_dir = '../data/generated2'
image_folder = os.path.join(data_dir, 'formula_images')

#### Character Set Chart for Reference
##### ASCII Control Characters

                        CTRL   (^D means to hold the CTRL key and hit d)
    Oct  Dec Char  Hex  Key     Comments
    \000   0  NUL  \x00  ^@ \0 (Null byte)
    \001   1  SOH  \x01  ^A    (Start of heading)
    \002   2  STX  \x02  ^B    (Start of text)
    \003   3  ETX  \x03  ^C    (End of text) (see: UNIX keyboard CTRL)
    \004   4  EOT  \x04  ^D    (End of transmission) (see: UNIX keyboard CTRL)
    \005   5  ENQ  \x05  ^E    (Enquiry)
    \006   6  ACK  \x06  ^F    (Acknowledge)
    \007   7  BEL  \x07  ^G    (Ring terminal bell)
    \010   8   BS  \x08  ^H \b (Backspace)  (\b matches backspace inside [] only)
                                            (see: UNIX keyboard CTRL)
    \011   9   HT  \x09  ^I \t (Horizontal tab)
    \012  10   LF  \x0A  ^J \n (Line feed)  (Default UNIX NL) (see End of Line below)
    \013  11   VT  \x0B  ^K    (Vertical tab)
    \014  12   FF  \x0C  ^L \f (Form feed)
    \015  13   CR  \x0D  ^M \r (Carriage return)  (see: End of Line below)
    \016  14   SO  \x0E  ^N    (Shift out)
    \017  15   SI  \x0F  ^O    (Shift in)
    \020  16  DLE  \x10  ^P    (Data link escape)
    \021  17  DC1  \x11  ^Q    (Device control 1) (XON) (Default UNIX START char.)
    \022  18  DC2  \x12  ^R    (Device control 2)
    \023  19  DC3  \x13  ^S    (Device control 3) (XOFF)  (Default UNIX STOP char.)
    \024  20  DC4  \x14  ^T    (Device control 4)
    \025  21  NAK  \x15  ^U    (Negative acknowledge)  (see: UNIX keyboard CTRL)
    \026  22  SYN  \x16  ^V    (Synchronous idle)
    \027  23  ETB  \x17  ^W    (End of transmission block)
    \030  24  CAN  \x18  ^X    (Cancel)
    \031  25  EM   \x19  ^Y    (End of medium)
    \032  26  SUB  \x1A  ^Z    (Substitute character)
    \033  27  ESC  \x1B  ^[    (Escape)
    \034  28  FS   \x1C  ^\    (File separator, Information separator four)
    \035  29  GS   \x1D  ^]    (Group separator, Information separator three)
    \036  30  RS   \x1E  ^^    (Record separator, Information separator two)
    \037  31  US   \x1F  ^_    (Unit separator, Information separator one)
    \177 127  DEL  \x7F  ^?    (Delete)  (see: UNIX keyboard CTRL)
    
    string.printable = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'
    string.whitespace = '\t\n\x0b\x0c\r '

In [3]:
def loadImageList(filepath):
    """Load a whitespace-delimited image list file into a DataFrame.

    The file is read without a header row. Its columns are named ``id``,
    ``name`` and ``type``; only ``id`` (int) and ``name`` (str) are kept.

    Args:
        filepath: Path of the list file to read.

    Returns:
        pandas.DataFrame with the columns ``id`` and ``name``.
    """
    # sep=r'\s+' is the documented replacement for delim_whitespace=True,
    # which newer pandas releases deprecate.
    return pd.read_table(filepath,
                         header=None,
                         names=['id', 'name', 'type'],
                         sep=r'\s+',
                         usecols=['id', 'name'],
                         dtype={'id': int, 'name': str, 'type': str})

def getImageDetails(data_dir):
    """Return the pixel dimensions of the images listed in ``im2latex.lst``.

    If ``image_details.csv`` in ``data_dir`` can be read, it is returned
    as-is. Otherwise each name in the second column of ``im2latex.lst`` gets
    a ``.png`` suffix and is opened from the module-level ``image_folder``.
    Images that cannot be opened are skipped. The resulting table is written
    to ``image_details.csv`` before being returned.

    Args:
        data_dir: Directory holding ``im2latex.lst`` and the CSV cache.

    Returns:
        pandas.DataFrame with the columns ``filename``, ``width``, ``height``.
    """
    details_path = os.path.join(data_dir, 'image_details.csv')
    try:
        return pd.read_csv(details_path, index_col=0)
    except Exception:
        # Best effort: any failure to load the cache means "rebuild it".
        # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
        pass
    widths = []
    heights = []
    filenames = []
    imageList = loadImageList(os.path.join(data_dir, 'im2latex.lst'))
    # NOTE(review): images are read from the global image_folder, not from a
    # path derived from the data_dir argument - confirm this is intended.
    for name in imageList.iloc[:, 1]:
        try:
            image_name = name + '.png'
            # The context manager closes the file handle after reading the size.
            with Image.open(os.path.join(image_folder, image_name)) as im:
                width, height = im.size
        except Exception:
            # Unreadable or missing images are deliberately left out.
            continue
        widths.append(width)
        heights.append(height)
        filenames.append(image_name)
    print(len(widths), len(filenames), len(heights))
    dff = pd.DataFrame({'filename':filenames, 'width':widths, 'height':heights})
    dff.to_csv(details_path)
    return dff


def getDatasetDetails(data_dir):
    """Return the formula/image table extended with image sizes and formula lengths.

    If ``df_image_details.pkl`` in ``data_dir`` can be read, it is returned
    as-is. Otherwise ``im2latex_map.pkl`` is loaded and every file named in
    its ``image`` column is opened from the module-level ``image_folder``.
    The columns ``width``, ``height`` and ``formula_len`` (``len`` of the
    ``latex`` value) are added, and the result is pickled to
    ``df_image_details.pkl``.

    Args:
        data_dir: Directory holding ``im2latex_map.pkl`` and the pickle cache.

    Returns:
        pandas.DataFrame with the added ``width``, ``height`` and
        ``formula_len`` columns.

    Raises:
        Whatever ``Image.open`` raises for an image that cannot be opened;
        unlike the cache load, this step is not best-effort.
    """
    cache_path = os.path.join(data_dir, 'df_image_details.pkl')
    try:
        return pd.read_pickle(cache_path)
    except Exception:
        # Best effort: any failure to load the cache means "rebuild it".
        # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
        pass
    widths = []
    heights = []
    formula_lens = []
    datasetDF = pd.read_pickle(os.path.join(data_dir, 'im2latex_map.pkl'))
    for image_name, latex in zip(datasetDF['image'], datasetDF['latex']):
        # The context manager closes the file handle after reading the size.
        with Image.open(os.path.join(image_folder, image_name)) as im:
            width, height = im.size
        widths.append(width)
        heights.append(height)
        formula_lens.append(len(latex))
    print(len(widths), len(heights))
    datasetDF = datasetDF.assign(width=widths, height=heights, formula_len=formula_lens)
    datasetDF.to_pickle(cache_path)
    return datasetDF

# Load (or build and cache) the per-formula image details for data_dir.
df_image_details = getDatasetDetails(data_dir)

In [6]:
def get_df_clean(data_dir_, df_image_details_):
    """Return the formula table with normalized LaTeX and bad rows removed.

    If ``df_clean.pkl`` in ``data_dir_`` can be read, it is returned as-is.
    Otherwise the ``latex`` column of ``df_image_details_`` is processed:

    * every formula must be pure ASCII,
    * runs of whitespace are collapsed to a single space,
    * whitespace and then percent signs are stripped from both ends.

    The result is stored in ``latex_ascii`` and its length in
    ``latex_ascii_len``. Rows whose *original* ``latex`` contains a character
    outside ``string.printable``, or any percent sign, are dropped. The
    surviving rows are pickled to ``df_clean.pkl``.

    Args:
        data_dir_: Directory used for the ``df_clean.pkl`` cache.
        df_image_details_: DataFrame with a ``latex`` column.

    Returns:
        pandas.DataFrame of the retained rows with the two added columns.

    Raises:
        UnicodeError: If a formula contains non-ASCII characters.
    """
    clean_path = os.path.join(data_dir_, 'df_clean.pkl')
    try:
        return pd.read_pickle(clean_path)
    except Exception as e:
        print(e)

    # re.escape keeps ']', '\\', '^' and '-' from being read as class syntax.
    nonprintable_re = '[^' + re.escape(string.printable) + ']'
    text_types = (bytes, type(u''))

    def to_ascii(value):
        """Return ``value`` as a native str, raising if it is not pure ASCII."""
        if not isinstance(value, text_types):
            # Leave missing values (e.g. NaN) untouched, as the .str accessor does.
            return value
        if isinstance(value, bytes):
            value = value.decode('ascii')
        else:
            value.encode('ascii')  # validation only
        return str(value)

    df = df_image_details_
    # regex=True is required on pandas >= 2.0, where str.replace treats the
    # pattern as a literal string by default.
    cleaned = (df.latex.map(to_ascii)
               .str.replace(r'\s+', ' ', regex=True)
               .str.strip()
               .str.strip('%'))
    df = df.assign(latex_ascii=cleaned, latex_ascii_len=cleaned.str.len())
    # Discard strings with non-printable characters.
    bad1 = df.latex.str.contains(nonprintable_re)
    print('nonprintables #: %d' % bad1.sum())
    # Discard strings with percent signs (because textogif ignores everything
    # after the % sign).
    # NOTE(review): this tests the raw latex, so a formula whose only percent
    # signs were at the ends (and got stripped above) is dropped too - confirm
    # whether the check should run on latex_ascii instead.
    bad2 = df.latex.str.contains(r'%')
    print('percents #: %d' % bad2.sum())
    good = ~(bad1 | bad2)
    print('good #: %d' % good.sum())
    df = df[good]
    df.to_pickle(clean_path)
    return df

In [7]:
# Load (or build and cache) the cleaned formula table, then show its dimensions.
df_clean = get_df_clean(data_dir, df_image_details)
display(df_clean.shape)

NameError: name 'df_image_details' is not defined

In [6]:
# Inspect the cleaned formula table.
df_clean

Unnamed: 0,image,formula_name,latex,formula_len,height,width,latex_ascii,latex_ascii_len
0,450f7c1496143fd_basic.png,450f7c1496143fd_basic,\int_{-\epsilon}^\infty dl\: {\rm e}^{-l\zeta}\t\int_{-\epsilon}^\infty dl' {\rm e}^{-l'\zeta}\t...,178,78,738,\int_{-\epsilon}^\infty dl\: {\rm e}^{-l\zeta} \int_{-\epsilon}^\infty dl' {\rm e}^{-l'\zeta} ll...,177
1,868d5037af9e4b4_basic.png,868d5037af9e4b4_basic,ds^{2} = (1 - {qcos\theta\over r})^{2\over 1 + \alpha^{2}}\lbrace dr^2+r^2d\theta^2+r^2sin^2\the...,199,94,962,ds^{2} = (1 - {qcos\theta\over r})^{2\over 1 + \alpha^{2}}\lbrace dr^2+r^2d\theta^2+r^2sin^2\the...,198
2,af0b6c3ee18804a_basic.png,af0b6c3ee18804a_basic,\widetilde\gamma_{\rm hopf}\simeq\sum_{n>0}\widetilde{G}_n{(-a)^n\over2^{2n-1}}\label{H4},89,87,291,\widetilde\gamma_{\rm hopf}\simeq\sum_{n>0}\widetilde{G}_n{(-a)^n\over2^{2n-1}}\label{H4},89
3,dda45eca6d32fa3_basic.png,dda45eca6d32fa3_basic,"({\cal L}_a g)_{ij} = 0, \ \ \ \ ({\cal L}_a H)_{ijk} = 0 ,",59,35,405,"({\cal L}_a g)_{ij} = 0, \ \ \ \ ({\cal L}_a H)_{ijk} = 0 ,",59
4,67eb249ed1c20d2_basic.png,67eb249ed1c20d2_basic,S_{stat} = 2\pi \sqrt{N_5^{(1)} N_5^{(2)} N_5^{(3)}} \left(\sqrt{n} +\sqrt{\bar{n}}\right)\label...,100,60,521,S_{stat} = 2\pi \sqrt{N_5^{(1)} N_5^{(2)} N_5^{(3)}} \left(\sqrt{n} +\sqrt{\bar{n}}\right)\label...,100
5,89ef1bacdfcca24_basic.png,89ef1bacdfcca24_basic,"\hat N_3 = \sum\sp f_{j=1}a_j\sp {\dagger} a_j \,. \label{c5}",61,102,206,"\hat N_3 = \sum\sp f_{j=1}a_j\sp {\dagger} a_j \,. \label{c5}",61
6,fd81bdf1ebcf996_basic.png,fd81bdf1ebcf996_basic,+ \int\!\!d^D\!z_1 d^D\!z_2 d^D\!z_3 \left. \frac{\delta^2 W}{\delta j(x) \delta j(z_1)} \...,323,81,1077,"+ \int\!\!d^D\!z_1 d^D\!z_2 d^D\!z_3 \left. \frac{\delta^2 W}{\delta j(x) \delta j(z_1)} \, \fra...",273
7,6135540f1af5ff3_basic.png,6135540f1af5ff3_basic,"\,^{*}d\,^{*}H=\kappa \,^{*}d\phi = J_B . \label{bfm19}",56,30,282,"\,^{*}d\,^{*}H=\kappa \,^{*}d\phi = J_B . \label{bfm19}",55
8,92656e604bed774_basic.png,92656e604bed774_basic,{\phi''\over A} +{1\over A}\left( -{1\over 2}{A'\over A}+2{B'\over B}+{2\over r}\right)\phi'-{2 ...,159,80,817,{\phi''\over A} +{1\over A}\left( -{1\over 2}{A'\over A}+2{B'\over B}+{2\over r}\right)\phi'-{2 ...,159
9,892756cc4445c69_basic.png,892756cc4445c69_basic,\label{maxw}\partial_{\mu} (F^{\mu\nu}-ej^{\mu}x^{\nu})=0 .,59,35,296,\label{maxw}\partial_{\mu} (F^{\mu\nu}-ej^{\mu}x^{\nu})=0 .,59


In [4]:
class TokenDict(object):
    """Tallies how many times each token has been seen."""

    def __init__(self):
        # Maps token -> number of occurrences recorded so far.
        self._tokens = {}

    def account(self, token_list):
        """Record one occurrence for every token in ``token_list``."""
        for tok in token_list:
            self._count(tok)

    def _count(self, token):
        """Increment the tally for ``token``; always returns 1."""
        self._tokens[token] = self._tokens.get(token, 0) + 1
        return 1

    @property
    def dict(self):
        """The underlying token -> count mapping (the live object, not a copy)."""
        return self._tokens

    @property
    def tokens(self):
        """All recorded tokens as a sorted list."""
        return sorted(self._tokens)
            
def get_vocabulary(df_, data_dir_):
    """Tokenize the cleaned formulas and build the token vocabulary.

    If both ``df_vocab.pkl`` and ``df_tokenized.pkl`` in ``data_dir_`` can be
    read, they are returned as-is. Otherwise ``df_.latex_ascii`` is split into
    tokens, token frequencies are counted, ids are assigned, and both tables
    are pickled to ``data_dir_``.

    Ids start at 2 in sorted token order (0 and 1 are reserved). The id of the
    space token is then swapped with the largest id, so space ends up with the
    last id.

    Args:
        df_: DataFrame with a ``latex_ascii`` column of strings.
        data_dir_: Directory used for the two pickle caches.

    Returns:
        Tuple ``(df_vocab, df_tokenized)``. ``df_vocab`` is indexed by token
        with the columns ``id`` and ``freq``. ``df_tokenized`` is ``df_`` plus
        a ``latex_tokenized`` column holding the token list of each formula.

    Raises:
        KeyError: If no formula contains a space token.
    """
    from collections import Counter

    vocab_path = os.path.join(data_dir_, 'df_vocab.pkl')
    tokenized_path = os.path.join(data_dir_, 'df_tokenized.pkl')
    try:
        df_vocab = pd.read_pickle(vocab_path)
        df_tokenized = pd.read_pickle(tokenized_path)
        return df_vocab, df_tokenized
    except Exception as e:
        print(e)

    ## Split latex into tokens. Isolate latex commands first - i.e.
    ## (optionally even number of backslashes) followed by one backslash followed by letters.
    ## Everything else is a one-character token in itself.
    # NOTE(review): the lookbehinds only test that the 6, 4 or 2 characters
    # immediately before the command are backslashes, so a command preceded by
    # a longer odd run of backslashes also matches - confirm this is acceptable.
    LATEX_RE = re.compile(r"(?:(?<=\\\\\\\\\\\\)\\[a-zA-Z]+)|(?:(?<=\\\\\\\\)\\[a-zA-Z]+)|(?:(?<=\\\\)\\[a-zA-Z]+)|(?:(?<!\\)\\[a-zA-Z]+)|.")
    sr_token = df_.latex_ascii.str.findall(LATEX_RE)
    df_tokenized = df_.assign(latex_tokenized=sr_token)

    ## Aggregate the tokens. An explicit loop is used because Series.agg with a
    ## lambda may receive either each element or the whole Series, depending on
    ## the pandas version.
    counts = Counter()
    for token_list in sr_token:
        counts.update(token_list)

    ## Sort and assign token-ids. Start with 2. RESERVE 0 as a 'NULL' token, 1 as BEGIN-SEQUENCE token
    tokens = sorted(counts)
    freqs = [counts[t] for t in tokens]
    df_vocab = pd.DataFrame({'id': list(range(2, len(tokens) + 2)), 'freq': freqs},
                            index=tokens, columns=['id', 'freq'])

    ## Now ensure that space is the last ID. This is required by the CTC decoder.
    max_id = df_vocab.id.max()
    max_idx = df_vocab[df_vocab.id == max_id].index[0]
    space_id = df_vocab.loc[' ', 'id']
    # Write through a single .loc[row, column] call. The chained form
    # df.loc[row].id = value assigns to a temporary row object and is not
    # guaranteed to update df_vocab.
    df_vocab.loc[' ', 'id'] = max_id
    df_vocab.loc[max_idx, 'id'] = space_id
    print('swapped ids %d and %d' % (max_id, space_id))
    print(df_vocab.loc[' '])
    print(df_vocab.loc[max_idx])

    ## Persist to disk
    df_vocab.to_pickle(vocab_path)
    df_tokenized.to_pickle(tokenized_path)

    return df_vocab, df_tokenized

In [9]:
# Try the cached pickles directly first: df_clean is only needed on the
# fallback path, when the vocabulary has to be rebuilt.
try:
    df_vocab = pd.read_pickle(os.path.join(data_dir,'df_vocab.pkl'))
    df_tokenized = pd.read_pickle(os.path.join(data_dir,'df_tokenized.pkl'))
except Exception:
    # Any load failure triggers a rebuild; unlike a bare except, this lets
    # KeyboardInterrupt/SystemExit through.
    df_vocab, df_tokenized = get_vocabulary(df_clean, data_dir)

In [10]:
# Convert the vocabulary table to a plain dict (indexed by column name, e.g.
# dict_vocab['id']) and pickle it, unless the pickle file already exists.
dict_vocab = df_vocab.to_dict()
if not os.path.exists(os.path.join(data_dir, 'dict_vocab.pkl')):
    with open(os.path.join(data_dir, 'dict_vocab.pkl'), 'wb') as f:
        pickle.dump(dict_vocab, f, pickle.HIGHEST_PROTOCOL)

In [20]:
def reverse_dict(d):
    """Return a new dict that maps each value of ``d`` back to its key.

    If several keys share a value, the key visited last wins.
    """
    return {d[key]: key for key in d.keys()}
# Build and pickle the id -> token mapping, unless the pickle file already exists.
if not os.path.exists(os.path.join(data_dir, 'dict_id2word.pkl')):
    dict_id2word = reverse_dict(dict_vocab['id'])
    # Ids 0 and 1 are not in the vocabulary table; give them explicit tokens.
    dict_id2word[0] = '\\eos'
    dict_id2word[1] = '\\bos'
    # Open the file only after the mapping is complete, so a failure above
    # cannot leave an empty pickle that the existence check would then accept.
    with open(os.path.join(data_dir, 'dict_id2word.pkl'), 'wb') as f:
        pickle.dump(dict_id2word, f, pickle.HIGHEST_PROTOCOL)

In [16]:
# Show the vocabulary row whose id is 525.
df_vocab[df_vocab.id==525]

Unnamed: 0,id,freq
_,525,358442


In [10]:
# Show the vocabulary row whose id is 18.
df_vocab[df_vocab.id==18]

Unnamed: 0,id,freq
1,18,157366


In [11]:
# Show the vocabulary row whose id is 553.
df_vocab[df_vocab.id==553]

Unnamed: 0,id,freq
{,553,711181


In [13]:
# Show the vocabulary row whose id is 555.
df_vocab[df_vocab.id==555]

Unnamed: 0,id,freq
},555,710399


In [24]:
# Report the id assigned to the space token, then list the vocabulary from
# most to least frequent token.
display('NULL=0, BEGIN=1, space=%d'%(df_vocab.loc[' '].id,))
display(df_vocab.sort_values(by='freq', ascending=False))

'NULL=0, BEGIN=1, space=556'

Unnamed: 0,id,freq
,556,863152
{,553,711181
},555,710399
_,525,358442
^,524,286513
),10,204723
(,9,204696
2,19,201454
",",13,164199
1,18,157366


In [25]:
#df_vocab[df_vocab.index.str.contains(r'\\')]

In [26]:
# Count the cleaned formulas whose raw latex contains a long run of
# consecutive backslashes.
print df_clean.latex_ascii[df_clean.latex.str.contains(r'\\\\\\\\\\\\\\\\\\\\')].count()

0


In [27]:
def get_word2id(df_tokenized_, df_vocab_, data_dir_):
    """Return the tokenized table with each token list translated to token ids.

    If ``df_word2id.pkl`` in ``data_dir_`` can be read, it is returned as-is.
    Otherwise every token in ``df_tokenized_.latex_tokenized`` is looked up in
    ``df_vocab_.id``, and the result is pickled to ``df_word2id.pkl``.

    Args:
        df_tokenized_: DataFrame with a ``latex_tokenized`` column of token lists.
        df_vocab_: DataFrame indexed by token, with an ``id`` column.
        data_dir_: Directory used for the ``df_word2id.pkl`` cache.

    Returns:
        pandas.DataFrame equal to ``df_tokenized_`` plus the columns
        ``word2id`` (list of ids per row) and ``word2id_len`` (its length).

    Raises:
        KeyError: If a token is missing from ``df_vocab_``.
    """
    # Read and write the cache at the same location. Previously the read used
    # the global data_dir while the write used the data_dir_ argument.
    cache_path = os.path.join(data_dir_, 'df_word2id.pkl')
    try:
        return pd.read_pickle(cache_path)
    except Exception as e:
        print(e)
    word2id = df_vocab_.id.to_dict()
    # Build real lists: on Python 3, map() would store lazy iterators that
    # have no length and cannot be pickled.
    sr_word2id = df_tokenized_.latex_tokenized.apply(
        lambda tokens: [word2id[t] for t in tokens])
    df_ = df_tokenized_.assign(word2id=sr_word2id, word2id_len=sr_word2id.str.len())
    df_.to_pickle(cache_path)
    return df_

In [28]:
# Load (or build and cache) the table that carries the token-id sequences.
df_word2id = get_word2id(df_tokenized, df_vocab, data_dir)

In [29]:
# Dimensions (rows, columns) of the token-id table.
df_word2id.shape

(99600, 11)

In [39]:
# Literal list of integers, presumably token ids - TODO confirm. It is not
# used by the expression below.
s = [35,524,553,42,525,18,42,525,19,42,525,20,42,525,21,555,525,19,30,39,524,553,42,525,19,42,525,20,42,525,18,42,525,21,555]
# Check the container type behind the word2id column.
type(df_word2id.word2id.values)

numpy.ndarray

### Temp

In [21]:
# Load the id -> token dictionary from its pickle in data_dir.
dict_id2word = pd.read_pickle(os.path.join(data_dir, 'dict_id2word.pkl'))

In [22]:
# Inspect the id -> token dictionary.
dict_id2word

{0: '\\eos',
 1: '\\bos',
 2: '~',
 3: '!',
 4: '"',
 5: '#',
 6: '$',
 7: '&',
 8: "'",
 9: '(',
 10: ')',
 11: '*',
 12: '+',
 13: ',',
 14: '-',
 15: '.',
 16: '/',
 17: '0',
 18: '1',
 19: '2',
 20: '3',
 21: '4',
 22: '5',
 23: '6',
 24: '7',
 25: '8',
 26: '9',
 27: ':',
 28: ';',
 29: '<',
 30: '=',
 31: '>',
 32: '?',
 33: '@',
 34: 'A',
 35: 'B',
 36: 'C',
 37: 'D',
 38: 'E',
 39: 'F',
 40: 'G',
 41: 'H',
 42: 'I',
 43: 'J',
 44: 'K',
 45: 'L',
 46: 'M',
 47: 'N',
 48: 'O',
 49: 'P',
 50: 'Q',
 51: 'R',
 52: 'S',
 53: 'T',
 54: 'U',
 55: 'V',
 56: 'W',
 57: 'X',
 58: 'Y',
 59: 'Z',
 60: '[',
 61: '\\',
 62: '\\AA',
 63: '\\Big',
 64: '\\Bigg',
 65: '\\Biggl',
 66: '\\Biggm',
 67: '\\Biggr',
 68: '\\Bigl',
 69: '\\Bigm',
 70: '\\Bigr',
 71: '\\C',
 72: '\\Delta',
 73: '\\Downarrow',
 74: '\\Flows',
 75: '\\Gamma',
 76: '\\Huge',
 77: '\\Im',
 78: '\\J',
 79: '\\L',
 80: '\\LARGE',
 81: '\\Lambda',
 82: '\\Large',
 83: '\\Leftrightarrow',
 84: '\\Longleftarrow',
 85: '\\Longleft