In [1]:
import numpy as np
import pandas as pd
import os
import re
from tqdm import tqdm
pd.set_option('display.max_rows', 1000)

In [2]:
# Sorted, de-duplicated entries of the 'csv' column in the non-linear matching table.
mono_nonlinear = np.unique(
    pd.read_csv('matching_table/mono_match_nonlinear.csv')['csv'].values
)

In [3]:
# Sorted, de-duplicated entries of the 'csv' column in the linear matching table.
mono_linear = np.unique(
    pd.read_csv('matching_table/mono_match.csv')['csv'].values
)

In [4]:
# Sorted union (no duplicates) of the names found in the two matching tables.
all_mono = np.union1d(mono_linear, mono_nonlinear)

In [5]:
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.trainers import BpeTrainer, WordLevelTrainer, \
                                WordPieceTrainer, UnigramTrainer

In [6]:
def prepare_tokenizer_trainer(alg, unk_token="[UNK]", spl_tokens=None):
    """
    Prepares the tokenizer and trainer with unknown & special tokens.

    Parameters
    ----------
    alg : str
        'BPE', 'UNI' or 'WPC' select the BPE, Unigram or WordPiece model;
        any other value falls back to the WordLevel model.
    unk_token : str, optional
        Token used for out-of-vocabulary input. Previously read from an
        undefined global, which raised NameError on a fresh kernel.
    spl_tokens : list of str, optional
        Special tokens handed to the trainer. When None, a BERT-style set
        starting with ``unk_token`` is used.

    Returns
    -------
    (tokenizer, trainer)
        An untrained ``Tokenizer`` with a ``Whitespace`` pre-tokenizer and
        the trainer matching the chosen model.
    """
    # Imported locally because the notebook's import cell does not bring in
    # Whitespace; without this the function fails with NameError.
    from tokenizers.pre_tokenizers import Whitespace

    if spl_tokens is None:
        spl_tokens = [unk_token, "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

    if alg == 'BPE':
        tokenizer = Tokenizer(BPE(unk_token=unk_token))
        trainer = BpeTrainer(special_tokens=spl_tokens)
    elif alg == 'UNI':
        # Unigram takes its unknown token on the trainer, not on the model.
        tokenizer = Tokenizer(Unigram())
        trainer = UnigramTrainer(unk_token=unk_token, special_tokens=spl_tokens)
    elif alg == 'WPC':
        tokenizer = Tokenizer(WordPiece(unk_token=unk_token))
        trainer = WordPieceTrainer(special_tokens=spl_tokens)
    else:
        tokenizer = Tokenizer(WordLevel(unk_token=unk_token))
        trainer = WordLevelTrainer(special_tokens=spl_tokens)

    tokenizer.pre_tokenizer = Whitespace()
    return tokenizer, trainer

In [7]:
def train_tokenizer(files, alg='WLV', save_path="./tokenizer-trained.json"):
    """
    Takes the files and trains the tokenizer.

    Parameters
    ----------
    files : list of str
        Paths of the text files passed to ``tokenizer.train``.
    alg : str, optional
        Algorithm key forwarded to ``prepare_tokenizer_trainer``.
    save_path : str, optional
        Where the trained tokenizer is written as JSON. Defaults to the
        location that used to be hard-coded.

    Returns
    -------
    Tokenizer
        The tokenizer re-loaded from ``save_path`` after training.
    """
    tokenizer, trainer = prepare_tokenizer_trainer(alg)
    tokenizer.train(files, trainer)  # training the tokenizer
    tokenizer.save(save_path)
    # Return the instance loaded back from disk, as the original did.
    return Tokenizer.from_file(save_path)

In [9]:
# Disabled helper: when uncommented, writes every name in `all_mono` to its own
# file 'matching_table/mono_list/<name>.txt' (the name followed by a newline).
# Mode 'x' makes open() fail if the file already exists.
# for i in all_mono:
    
#     with open(os.path.join('matching_table/mono_list/') + i + '.txt', 'x') as f:
#         f.write(i)
#         f.write('\n')


In [10]:
from pathlib import Path

# Directory searched recursively for the .txt training files.
mono_list_dir = Path('matching_table/mono_list/')

# Sorted so the file order handed to the tokenizer trainer does not depend on
# filesystem enumeration order. `paths` is now only ever a list of strings
# (the earlier string assignment was dead and immediately overwritten).
paths = sorted(str(x) for x in mono_list_dir.glob('**/*.txt'))

In [23]:
from tokenizers import BertWordPieceTokenizer

# Normalisation switches: text cleaning on; Chinese-character handling,
# accent stripping and lowercasing all off.
normalizer_options = dict(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False,
)
tokenizer = BertWordPieceTokenizer(**normalizer_options)

# Training settings for the files collected in `paths`.
# NOTE(review): the vocabulary displayed further down has ids up to 66, so
# vocab_size=32 does not appear to act as a hard cap here; confirm intent.
training_options = dict(
    vocab_size=32,
    min_frequency=1,
    limit_alphabet=32,
    wordpieces_prefix='##',
)
tokenizer.train(files=paths, **training_options)






In [24]:
# Show the trained vocabulary as a token -> id mapping.
tokenizer.get_vocab()

{'o': 34,
 'h': 31,
 '##h': 58,
 'I': 18,
 '##T': 64,
 '-': 5,
 'G': 16,
 '##H': 55,
 '##o': 57,
 '[UNK]': 1,
 'D': 13,
 'M': 20,
 'P': 23,
 '4': 9,
 'A': 10,
 '##b': 59,
 '##3': 43,
 'B': 11,
 '##O': 50,
 'R': 24,
 'l': 32,
 '[MASK]': 4,
 '##a': 45,
 '##M': 65,
 '##n': 54,
 '##D': 60,
 'U': 27,
 '1': 6,
 '[SEP]': 3,
 '##4': 51,
 '##B': 63,
 '##F': 62,
 '##2': 56,
 '##p': 49,
 '##1': 42,
 '##P': 39,
 '##l': 46,
 '##A': 40,
 '3': 8,
 'a': 28,
 '##u': 47,
 '[PAD]': 0,
 'u': 36,
 '##N': 41,
 'O': 22,
 'H': 17,
 '##c': 48,
 '##R': 61,
 'S': 25,
 '2': 7,
 'F': 15,
 'L': 19,
 'b': 29,
 '##L': 44,
 '##C': 38,
 'T': 26,
 'C': 12,
 '##I': 52,
 '[CLS]': 2,
 '##E': 66,
 'p': 35,
 '##S': 53,
 '##U': 37,
 'E': 14,
 'n': 33,
 'N': 21,
 'c': 30}

In [20]:
# Display the combined unique names gathered from both matching tables.
all_mono

array(['A-D-FUCP', 'A-D-FUCPNAC', 'A-D-GALF', 'A-D-GALNAC', 'A-D-GALP',
       'A-D-GALPA', 'A-D-GALPA2SO3', 'A-D-GALPAN', 'A-D-GALPASO3',
       'A-D-GALPNAC', 'A-D-GLC', 'A-D-GLCP', 'A-D-GLCP1OME', 'A-D-GLCPA',
       'A-D-GLCPN', 'A-D-GLCPN1PO4', 'A-D-GLCPNAC', 'A-D-KDOP',
       'A-D-MANP', 'A-D-MANP1SET', 'A-D-MANP2AC', 'A-D-MANP2AC1SET',
       'A-D-MANP3AC', 'A-D-MANP4AC', 'A-D-MANP6AC', 'A-D-QUIP4N',
       'A-D-RHAP', 'A-D-RHAP4NAC', 'A-L-ARAF', 'A-L-ARAP', 'A-L-FUCP',
       'A-L-FUCP2SO3', 'A-L-FUCP2SO33SO34SO3', 'A-L-FUCP2SO34SO3',
       'A-L-FUCP3SO34SO3', 'A-L-FUCPNAC', 'A-L-GALP', 'A-L-GULPA',
       'A-L-RHAP', 'B-D-FUCPNAC', 'B-D-GALF', 'B-D-GALF2AC',
       'B-D-GALFOAC', 'B-D-GALP', 'B-D-GALP1OME', 'B-D-GALPA',
       'B-D-GALPNAC', 'B-D-GLC1OME', 'B-D-GLCP', 'B-D-GLCP1OME',
       'B-D-GLCP2AC', 'B-D-GLCPA', 'B-D-GLCPAN', 'B-D-GLCPN',
       'B-D-GLCPN4PO4', 'B-D-GLCPNAC', 'B-D-GLCPNAC6SO3', 'B-D-KDOP',
       'B-D-MANP', 'B-D-MANP2AC', 'B-D-MANPA1NAC3NAC', 'B-D-MA