# im2latex(S): Tokenizer

&copy; Copyright 2017 Sumeet S Singh

    This file is part of im2latex solution by Sumeet S Singh.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the Affero GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    Affero GNU General Public License for more details.

    You should have received a copy of the Affero GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [1]:
import pandas as pd
import os
import re
import codecs
from IPython.display import display
from six.moves import cPickle as pickle
import string
from PIL import Image

In [2]:
# pandas display limits used when DataFrames are rendered in this notebook.
pd.options.display.max_rows = 999
pd.options.display.max_columns = 20
pd.options.display.max_colwidth = 100
pd.options.display.width = 160
# Dataset root; `data_dir` and `data_folder` are two names for the same path.
data_dir = data_folder = '../data/dataset3'
# Folder holding the formula images; `image_folder` and `image_dir` are aliases.
image_folder = image_dir = os.path.join(data_dir,'formula_images')
# Sub-folder of data_dir that the functions below join into their pickle paths.
notebook_dir = 'step2'

#### Character Set Chart for Reference
##### ASCII Control Characters

                        CTRL   (^D means to hold the CTRL key and hit d)
    Oct  Dec Char  Hex  Key     Comments
    \000   0  NUL  \x00  ^@ \0 (Null byte)
    \001   1  SOH  \x01  ^A    (Start of heading)
    \002   2  STX  \x02  ^B    (Start of text)
    \003   3  ETX  \x03  ^C    (End of text) (see: UNIX keyboard CTRL)
    \004   4  EOT  \x04  ^D    (End of transmission) (see: UNIX keyboard CTRL)
    \005   5  ENQ  \x05  ^E    (Enquiry)
    \006   6  ACK  \x06  ^F    (Acknowledge)
    \007   7  BEL  \x07  ^G    (Ring terminal bell)
    \010   8   BS  \x08  ^H \b (Backspace)  (\b matches backspace inside [] only)
                                            (see: UNIX keyboard CTRL)
    \011   9   HT  \x09  ^I \t (Horizontal tab)
    \012  10   LF  \x0A  ^J \n (Line feed)  (Default UNIX NL) (see End of Line below)
    \013  11   VT  \x0B  ^K    (Vertical tab)
    \014  12   FF  \x0C  ^L \f (Form feed)
    \015  13   CR  \x0D  ^M \r (Carriage return)  (see: End of Line below)
    \016  14   SO  \x0E  ^N    (Shift out)
    \017  15   SI  \x0F  ^O    (Shift in)
    \020  16  DLE  \x10  ^P    (Data link escape)
    \021  17  DC1  \x11  ^Q    (Device control 1) (XON) (Default UNIX START char.)
    \022  18  DC2  \x12  ^R    (Device control 2)
    \023  19  DC3  \x13  ^S    (Device control 3) (XOFF)  (Default UNIX STOP char.)
    \024  20  DC4  \x14  ^T    (Device control 4)
    \025  21  NAK  \x15  ^U    (Negative acknowledge)  (see: UNIX keyboard CTRL)
    \026  22  SYN  \x16  ^V    (Synchronous idle)
    \027  23  ETB  \x17  ^W    (End of transmission block)
    \030  24  CAN  \x18  ^X    (Cancel)
    \031  25  EM   \x19  ^Y    (End of medium)
    \032  26  SUB  \x1A  ^Z    (Substitute character)
    \033  27  ESC  \x1B  ^[    (Escape)
    \034  28  FS   \x1C  ^\    (File separator, Information separator four)
    \035  29  GS   \x1D  ^]    (Group separator, Information separator three)
    \036  30  RS   \x1E  ^^    (Record separator, Information separator two)
    \037  31  US   \x1F  ^_    (Unit separator, Information separator one)
    \177 127  DEL  \x7F  ^?    (Delete)  (see: UNIX keyboard CTRL)
    
    string.printable = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'
    string.whitespace = '\t\n\x0b\x0c\r '

In [3]:
def makeDatasetDetails(data_dir, overwrite=False):
    """Augment the dataset map with image sizes and formula lengths, then pickle it.

    Reads 'im2latex_dataset_map.df.pkl' from `data_dir`, opens every image named
    in its `image` column to record width and height, records `len(row.latex)`
    as `formula_len`, and writes the result to
    <data_dir>/<notebook_dir>/df2_dataset_details.pkl.

    Args:
        data_dir: Dataset root folder.
        overwrite: When False (default), refuse to replace an existing pickle.

    Returns:
        The DataFrame with the added `width`, `height` and `formula_len` columns.

    Raises:
        Exception: If the output pickle already exists and `overwrite` is False.
    """
    pickle_path = os.path.join(data_dir, notebook_dir, 'df2_dataset_details.pkl')
    if (not overwrite) and os.path.exists(pickle_path):
        raise Exception('File %s already exists'%pickle_path)

    widths = []
    heights = []
    formula_lens = []
    datasetDF = pd.read_pickle(os.path.join(data_dir,'im2latex_dataset_map.df.pkl'))
    for _, row in datasetDF.iterrows():
        # NOTE(review): images are read from the module-level `image_folder`,
        # not from a path derived from the `data_dir` argument - confirm intended.
        image_path = os.path.join(image_folder, row.image)
        # Only the size is needed; the context manager releases the file handle
        # right away instead of leaving one open per image.
        with Image.open(image_path) as im:
            width, height = im.size
        widths.append(width)
        heights.append(height)
        formula_lens.append(len(row.latex))
    print(len(widths), len(heights))
    datasetDF = datasetDF.assign(width=widths, height=heights, formula_len=formula_lens)
    out_dir = os.path.join(data_dir, notebook_dir)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    datasetDF.to_pickle(pickle_path)
    return datasetDF
    
def getDatasetDetails(data_dir):
    """Load and return the pickled dataset-details DataFrame.

    The pickle is read from <data_dir>/<notebook_dir>/df2_dataset_details.pkl.
    """
    details_path = os.path.join(data_dir, notebook_dir, 'df2_dataset_details.pkl')
    return pd.read_pickle(details_path)

In [4]:
# df2_image_details = makeDatasetDetails(data_dir)
df2_image_details = getDatasetDetails(data_dir)

In [5]:
def get_df_clean(data_dir_, df_image_details_):
    """Load and return the pickled cleaned DataFrame.

    The pickle is read from <data_dir_>/<notebook_dir>/df3_clean.pkl.

    Args:
        data_dir_: Dataset root folder.
        df_image_details_: Unused; kept so the signature mirrors make_df_clean.
    """
    return pd.read_pickle(os.path.join(data_dir_, notebook_dir, 'df3_clean.pkl'))
    
def make_df_clean(data_dir_, df_image_details_, overwrite=False):
    """Clean the LaTeX strings, drop unusable rows and pickle the result.

    Adds two columns to `df_image_details_`:
      * `latex_ascii`: `latex` round-tripped through ASCII, with whitespace runs
        collapsed to one space, then stripped of surrounding whitespace and
        surrounding percent signs.
      * `latex_ascii_len`: length of `latex_ascii`.
    Rows whose original `latex` contains a character outside `string.printable`
    or contains a percent sign are then discarded, and the remaining frame is
    written to <data_dir_>/<notebook_dir>/df3_clean.pkl.

    Args:
        data_dir_: Dataset root folder.
        df_image_details_: DataFrame with a `latex` column.
        overwrite: When False (default), refuse to replace an existing pickle.

    Returns:
        The filtered DataFrame.

    Raises:
        Exception: If the output pickle already exists and `overwrite` is False.
    """
    # Negated character class over string.printable; the leading '\\' covers the
    # backslash itself, and the backslash inside string.printable escapes the ']'.
    NONPRINTABLE_CHARS_RE = r'[^\\' + string.printable + r']'
    PERCENTS_RE = r'%'
    pickle_path = os.path.join(data_dir_, notebook_dir, 'df3_clean.pkl')
    if (not overwrite) and os.path.exists(pickle_path):
        raise Exception('File %s already exists'%pickle_path)

    df = df_image_details_
    # Make sure everything's ascii
    # Coalesce whitespace to a single space
    # Strip whitespace from the sides
    # Strip percent signs from the sides
    cleaned = df.latex.str.decode('ascii').str.encode('ascii').str.replace(r"\s+", ' ').str.strip().str.strip('%')
    df = df.assign(latex_ascii=cleaned, latex_ascii_len=cleaned.str.len())
    # Discard strings with non-printable characters
    bad1 = df.latex.str.contains(NONPRINTABLE_CHARS_RE)
    print('nonprintables #:  %d' % bad1.sum())
    # Discard strings with embedded percent signs (because textogif ignores everything after the % sign)
    # NOTE(review): this tests the raw `latex` column, so rows whose only percent
    # signs are at the sides (already stripped in `latex_ascii`) are discarded
    # too - confirm that is intended.
    bad2 = df.latex.str.contains(PERCENTS_RE)
    print('percents #:  %d' % bad2.sum())
    good = ~(bad1 | bad2)
    print('good #:  %d' % good.sum())
    df = df[good]
    df.to_pickle(pickle_path)
    return df

In [7]:
df3_clean = get_df_clean(data_dir, df2_image_details)
display(df3_clean.shape)

(100700, 8)

In [8]:
df3_clean

Unnamed: 0,image,formula_name,latex,formula_len,height,width,latex_ascii,latex_ascii_len
0,23be72ded29e9b4_basic.png,23be72ded29e9b4_basic,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331,78,738,\int _ { - \epsilon } ^ { \infty } d l \: \mathrm { e } ^ { - l \zeta } \int _ { - \epsilon } ^ ...,331
1,8c904c5d9c7bd63_basic.png,8c904c5d9c7bd63_basic,d s ^ { 2 } = ( 1 - { \frac { q c o s \theta } { r } } ) ^ { \frac { 2 } { 1 + \alpha ^ { 2 } } ...,326,94,962,d s ^ { 2 } = ( 1 - { \frac { q c o s \theta } { r } } ) ^ { \frac { 2 } { 1 + \alpha ^ { 2 } } ...,326
2,1f111caa1be3516_basic.png,1f111caa1be3516_basic,\widetilde \gamma _ { \mathrm { h o p f } } \simeq \sum _ { n > 0 } \widetilde { G } _ { n } { \...,142,87,291,\widetilde \gamma _ { \mathrm { h o p f } } \simeq \sum _ { n > 0 } \widetilde { G } _ { n } { \...,142
3,76287b1b483a0d2_basic.png,76287b1b483a0d2_basic,"( { \cal L } _ { a } g ) _ { i j } = 0 , \ \ \ \ ( { \cal L } _ { a } H ) _ { i j k } = 0 ,",91,35,405,"( { \cal L } _ { a } g ) _ { i j } = 0 , \ \ \ \ ( { \cal L } _ { a } H ) _ { i j k } = 0 ,",91
4,7483d06c1e14dcb_basic.png,7483d06c1e14dcb_basic,S _ { s t a t } = 2 \pi \sqrt { N _ { 5 } ^ { ( 1 ) } N _ { 5 } ^ { ( 2 ) } N _ { 5 } ^ { ( 3 ) ...,149,60,521,S _ { s t a t } = 2 \pi \sqrt { N _ { 5 } ^ { ( 1 ) } N _ { 5 } ^ { ( 2 ) } N _ { 5 } ^ { ( 3 ) ...,149
5,8135cab7e6b93ae_basic.png,8135cab7e6b93ae_basic,"\hat { N } _ { 3 } = \sum \sp f _ { j = 1 } a _ { j } \sp { \dagger } a _ { j } \, .",84,102,206,"\hat { N } _ { 3 } = \sum \sp f _ { j = 1 } a _ { j } \sp { \dagger } a _ { j } \, .",84
6,67808bc314fa181_basic.png,67808bc314fa181_basic,+ \int \! \! d ^ { D } \! z _ { 1 } d ^ { D } \! z _ { 2 } d ^ { D } \! z _ { 3 } \left. \frac {...,428,81,1077,+ \int \! \! d ^ { D } \! z _ { 1 } d ^ { D } \! z _ { 2 } d ^ { D } \! z _ { 3 } \left. \frac {...,428
7,2278ffa4860ed9b_basic.png,2278ffa4860ed9b_basic,"\, ^ { * } d \, ^ { * } H = \kappa \, ^ { * } d \phi = J _ { B } .",66,31,282,"\, ^ { * } d \, ^ { * } H = \kappa \, ^ { * } d \phi = J _ { B } .",66
8,1d7098797009973_basic.png,1d7098797009973_basic,{ \frac { \phi ^ { \prime \prime } } { A } } + { \frac { 1 } { A } } \left( - { \frac { 1 } { 2 ...,316,80,817,{ \frac { \phi ^ { \prime \prime } } { A } } + { \frac { 1 } { A } } \left( - { \frac { 1 } { 2 ...,316
9,f4f265238eb9724_basic.png,f4f265238eb9724_basic,\partial _ { \mu } ( F ^ { \mu \nu } - e j ^ { \mu } x ^ { \nu } ) = 0 .,72,35,296,\partial _ { \mu } ( F ^ { \mu \nu } - e j ^ { \mu } x ^ { \nu } ) = 0 .,72


In [16]:
df_clean = df3_clean[['image', 'formula_name', 'height', 'width', 'latex_ascii', 'latex_ascii_len']]

In [20]:
class TokenDict(object):
    """Running frequency table of tokens."""

    def __init__(self):
        # Maps token -> number of times it has been counted.
        self._tokens = {}

    def account(self, token_list):
        """Count every token in `token_list`, one occurrence each."""
        for tok in token_list:
            self._count(tok)

    def _count(self, token):
        """Add one occurrence of `token`; always returns 1."""
        self._tokens[token] = self._tokens.get(token, 0) + 1
        return 1

    @property
    def dict(self):
        """The underlying token -> count mapping (not a copy)."""
        return self._tokens

    @property
    def tokens(self):
        """All counted tokens as a sorted list."""
        return sorted(self._tokens)
            
def make_vocabulary(df_, data_dir_, overwrite=False):
    """Tokenize `df_.latex_ascii`, build the vocabulary table and pickle both.

    Each formula is split into tokens: a backslash followed by letters is one
    token when that backslash is preceded by no backslash or by exactly the
    two, four or six backslashes the lookbehinds test for; every other
    character is a one-character token. Tokens are sorted and given ids
    starting at 2, after which the id of the space token is swapped with the
    largest id.

    Args:
        df_: DataFrame with a `latex_ascii` column.
        data_dir_: Dataset root; pickles go to <data_dir_>/<notebook_dir>/.
        overwrite: When False (default), refuse to replace existing pickles.

    Returns:
        (df_vocab, df_tokenized): `df_vocab` is indexed by token with columns
        `id` and `freq`; `df_tokenized` is `df_` plus a `latex_tokenized` column.

    Raises:
        Exception: If either output pickle exists and `overwrite` is False.
        KeyError: If no space token occurs in the data.
    """
    pickle_path_vocab = os.path.join(data_dir_, notebook_dir, 'df4_vocab.pkl')
    pickle_path_tokenized = os.path.join(data_dir_, notebook_dir, 'df4_tokenized.pkl')
    if (not overwrite) and (os.path.exists(pickle_path_vocab) or os.path.exists(pickle_path_tokenized)):
        raise Exception('Files already exist')

    ## Split latex into tokens. Isolate latex commands first - i.e.
    ## (optionally even number of backslashes) followed by one backslash followed by letters.
    ## Everything else is a one-character token in itself.
    LATEX_RE = re.compile(r"(?:(?<=\\\\\\\\\\\\)\\[a-zA-Z]+)|(?:(?<=\\\\\\\\)\\[a-zA-Z]+)|(?:(?<=\\\\)\\[a-zA-Z]+)|(?:(?<!\\)\\[a-zA-Z]+)|.")
    sr_token = df_.latex_ascii.str.findall(LATEX_RE)
    df_tokenized = df_.assign(latex_tokenized=sr_token)

    ## Aggregate the tokens. A plain loop is used because the counting is a
    ## side effect on `vocab`; nothing is returned to aggregate.
    vocab = TokenDict()
    for token_list in sr_token:
        vocab.account(token_list)

    ## Sort and save
    tokens = vocab.tokens
    count = [vocab.dict[t] for t in tokens]
    ## Assign token-ids. Start with 2. RESERVE 0 as a 'NULL' token, 1 as BEGIN-SEQUENCE token
    df_vocab = pd.DataFrame({'id': list(range(2, len(tokens)+2)), 'freq': count},
                            index=tokens, columns=['id', 'freq'])

    ## Now ensure that space is the last ID.
    ## This is required by the CTC decoder if we wanted to use space as blank-token for CTC
    max_id = df_vocab.id.max()
    max_idx = df_vocab[df_vocab.id == max_id].index[0]
    space_id = df_vocab.loc[' ', 'id']
    # Assign through a single .loc[row, col] call: the chained form
    # `df.loc[row].id = v` may write to a temporary copy and leave df unchanged.
    df_vocab.loc[' ', 'id'] = max_id
    df_vocab.loc[max_idx, 'id'] = space_id
    print('swapped ids %d and %d'%(max_id, space_id))
    display(df_vocab.loc[' '])
    display(df_vocab.loc[max_idx])

    df_vocab.to_pickle(pickle_path_vocab)
    df_tokenized.to_pickle(pickle_path_tokenized)
    return df_vocab, df_tokenized

def get_vocabulary(df_, data_dir_):
    """Load the vocabulary and tokenized DataFrames pickled by make_vocabulary.

    Reads 'df4_vocab.pkl' and 'df4_tokenized.pkl' from
    <data_dir_>/<notebook_dir>/ - the file names make_vocabulary writes.

    Args:
        df_: Unused; kept so the signature mirrors make_vocabulary.
        data_dir_: Dataset root folder.

    Returns:
        (df_vocab, df_tokenized) as unpickled.
    """
    df_vocab = pd.read_pickle(os.path.join(data_dir_, notebook_dir, 'df4_vocab.pkl'))
    df_tokenized = pd.read_pickle(os.path.join(data_dir_, notebook_dir, 'df4_tokenized.pkl'))
    return df_vocab, df_tokenized

In [21]:
# Load the cached vocabulary if possible; otherwise build it.
try:
    df_vocab, df_tokenized = get_vocabulary(df_clean, data_dir)
except Exception:
    # `except Exception` (not a bare `except`) so that Ctrl-C / SystemExit
    # still interrupt the cell instead of triggering a rebuild.
    df_vocab, df_tokenized = make_vocabulary(df_clean, data_dir)

swapped ids 449 and 2


id          449
freq    6491267
Name:  , dtype: int64

id          2
freq    30529
Name: ~, dtype: int64

In [25]:
df_vocab.sort_values(by='freq', ascending=False)

Unnamed: 0,id,freq
,449,6491267
{,446,1135182
},448,1134337
_,418,356353
^,417,303405
(,8,204056
),9,203870
2,18,191219
\,60,165742
",",12,162299


In [None]:
# Plain-dict form of the vocabulary table; the 'id' entry is used below to
# build the id -> token map.
dict_vocab = df_vocab.to_dict()
# Pickle it once; an existing file is left untouched.
if not os.path.exists(os.path.join(data_dir, 'dict_vocab.pkl')):
    with open(os.path.join(data_dir, 'dict_vocab.pkl'), 'wb') as f:
        pickle.dump(dict_vocab, f, pickle.HIGHEST_PROTOCOL)

In [None]:
def reverse_dict(d):
    """Return a new dict mapping each value of `d` to its key.

    When several keys share a value, the key visited last wins.
    """
    return {d[key]: key for key in d.keys()}
# Build the id -> token map before touching the file system, so that a failure
# here cannot leave behind an empty pickle that later runs would skip over, and
# so that `dict_id2word` is defined even when the pickle already exists.
dict_id2word = reverse_dict(dict_vocab['id'])
# Ids 0 and 1 are not assigned to any vocabulary token; name them explicitly.
dict_id2word[0] = '\\eos'
dict_id2word[1] = '\\bos'
if not os.path.exists(os.path.join(data_dir, 'dict_id2word.pkl')):
    with open(os.path.join(data_dir, 'dict_id2word.pkl'), 'wb') as f:
        pickle.dump(dict_id2word, f, pickle.HIGHEST_PROTOCOL)

In [None]:
df_vocab[df_vocab.id==525]

In [None]:
df_vocab[df_vocab.id==18]

In [None]:
df_vocab[df_vocab.id==553]

In [None]:
df_vocab[df_vocab.id==555]

In [None]:
display('NULL=0, BEGIN=1, space=%d'%(df_vocab.loc[' '].id,))
display(df_vocab.sort_values(by='freq', ascending=False))

In [None]:
#df_vocab[df_vocab.index.str.contains(r'\\')]

In [None]:
# Count formulas containing a run of ten literal backslashes. `df_clean` keeps
# only `latex_ascii` (there is no `latex` column), so filter on that column.
print(df_clean.latex_ascii[df_clean.latex_ascii.str.contains(r'\\\\\\\\\\\\\\\\\\\\')].count())

In [None]:
def get_word2id(df_tokenized_, df_vocab_, data_dir_):
    """Return the tokenized DataFrame with token-id sequences, cached on disk.

    If <data_dir_>/df_word2id.pkl can be read it is returned as-is. Otherwise
    the reason is printed, each list in `latex_tokenized` is mapped to its list
    of ids via `df_vocab_.id`, and the result is pickled to that same path.

    Args:
        df_tokenized_: DataFrame with a `latex_tokenized` column of token lists.
        df_vocab_: DataFrame indexed by token with an `id` column.
        data_dir_: Folder holding (or receiving) 'df_word2id.pkl'.

    Returns:
        DataFrame with added `word2id` (list of ids) and `word2id_len` columns.

    Raises:
        KeyError: If a token is missing from `df_vocab_`.
    """
    # Read and write the same path, both derived from the `data_dir_` argument.
    pickle_path = os.path.join(data_dir_, 'df_word2id.pkl')
    try:
        return pd.read_pickle(pickle_path)
    except Exception as e:
        # Best-effort cache: any failure to load falls through to a rebuild.
        print(e)

    word2id = df_vocab_.id.to_dict()
    # Build real lists: on Python 3 `map` is a lazy iterator, which has no
    # len() for `.str.len()` and cannot be pickled.
    sr_word2id = df_tokenized_.latex_tokenized.apply(lambda l: [word2id[t] for t in l])
    df_ = df_tokenized_.assign(word2id=sr_word2id, word2id_len=sr_word2id.str.len())
    df_.to_pickle(pickle_path)
    return df_

In [None]:
df_word2id = get_word2id(df_tokenized, df_vocab, data_dir)

### Temp

In [None]:
df_vocab = pd.read_pickle(os.path.join(data_dir,'df_vocab.pkl'))

In [None]:
df_vocab

In [None]:
df_vocab.id.loc[':']

In [None]:
df_word2id = pd.read_pickle(os.path.join(data_dir, 'df_word2id.pkl'))
df_word2id

In [None]:
dict_id2word = pd.read_pickle(os.path.join(data_dir, 'dict_id2word.pkl'))

In [None]:
dict_id2word