In [None]:
import re, collections

def get_vocab(filename):
    """Build the initial BPE vocabulary from a UTF-8 text file.

    Every whitespace-separated word is spelled out as space-separated
    characters followed by the end-of-word marker ``</w>``; for example
    ``low`` becomes ``'l o w </w>'``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``defaultdict(int)`` mapping each spelled-out word to the number
        of times the word occurs in the file.
    """
    counts = collections.defaultdict(int)
    with open(filename, 'r', encoding='utf-8') as corpus:
        for raw_line in corpus:
            # split() without arguments already ignores leading/trailing
            # whitespace, including the newline.
            for token in raw_line.split():
                spelled = ' '.join(token) + ' </w>'
                counts[spelled] += 1
    return counts

def get_stats(vocab):
    """Count how often each adjacent symbol pair occurs in the vocabulary.

    Args:
        vocab: Mapping from a space-separated symbol sequence to its
            frequency.

    Returns:
        A ``defaultdict(int)`` keyed by ``(left, right)`` symbol tuples.
        Each occurrence of a pair inside a word adds that word's frequency
        to the pair's count.
    """
    pair_counts = collections.defaultdict(int)
    for entry, count in vocab.items():
        parts = entry.split()
        # Walk the sequence pairing every symbol with its right neighbour.
        for left, right in zip(parts, parts[1:]):
            pair_counts[left, right] += count
    return pair_counts

def merge_vocab(pair, v_in):
    """Fuse every occurrence of a symbol pair into a single symbol.

    Args:
        pair: Two-element sequence ``(left, right)`` of symbols to merge.
        v_in: Mapping from a space-separated symbol sequence to its
            frequency.

    Returns:
        A new dict with the same frequencies, where each whole-symbol
        occurrence of ``left right`` in a key is replaced by ``leftright``.
        If several input keys collapse to the same output key, their
        frequencies are added together.
    """
    merged_symbol = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The lookarounds require whitespace (or the string edge) on both sides,
    # so only complete symbols match, never the tail or head of a longer one.
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')

    def _replacement(_match):
        # A callable replacement is inserted verbatim. A plain string would
        # be treated as a template, so a backslash in a symbol would be read
        # as an escape sequence or group reference.
        return merged_symbol

    v_out = {}
    for word, freq in v_in.items():
        w_out = p.sub(_replacement, word)
        # Accumulate rather than overwrite so no counts are lost when two
        # different keys become identical after the merge.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out

def get_tokens(vocab):
    """Tally the total frequency of every individual symbol.

    Args:
        vocab: Mapping from a space-separated symbol sequence to its
            frequency.

    Returns:
        A ``defaultdict(int)`` mapping each symbol to the sum of the
        frequencies of the words it appears in, counted once per occurrence.
    """
    token_counts = collections.defaultdict(int)
    for entry, count in vocab.items():
        for symbol in entry.split():
            token_counts[symbol] += count
    return token_counts

# Toy vocabulary kept for quick experiments:
# vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}

# Get free book from Gutenberg
# wget http://www.gutenberg.org/cache/epub/16457/pg16457.txt
# The vocabulary below is built from `wlst` instead: each row is read as
# [count, word], the word is spelled out character by character and
# terminated with the </w> end-of-word marker.
vocab = {' '.join(x[1]) + ' </w>': int(x[0]) for x in wlst}

print('==========')
print('Tokens Before BPE')
tokens = get_tokens(vocab)
print(f'Tokens: {tokens}')
print(f'Number of tokens: {len(tokens)}')
print('==========')

# Upper bound on merge operations; the loop stops earlier once no adjacent
# symbol pairs remain.
num_merges = 1000
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        break
    # Merge the most frequent adjacent pair into a single symbol.
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print(f'Iter: {i}')
    print(f'Best pair: {best}')
    tokens = get_tokens(vocab)
    print(f'Tokens: {tokens}')
    print(f'Number of tokens: {len(tokens)}')
    print('==========')

In [1]:
# Load the word list: each line becomes a list of its whitespace-separated
# fields.
with open(r'D:\dstore\nlp\w2v\fwords', 'rt') as f:
    wlst = [record.split() for record in f]

In [13]:
# Scratch check: splitting on ' ' leaves a string without spaces as a
# one-element list.
'testing'.split(' ')

['testing']

In [18]:
# Wrap the vocabulary's word -> frequency mapping in a Counter.
v = collections.Counter(vocab)

In [21]:
# List (token, frequency) pairs ordered from most to least frequent.
collections.Counter(tokens).most_common()

[('the</w>', 155902073),
 ('of</w>', 72665387),
 ('in</w>', 67521160),
 ('and</w>', 65793940),
 ('a</w>', 52577563),
 ('to</w>', 46454359),
 ('was</w>', 27131257),
 ('ed</w>', 26574922),
 ('on</w>', 24658698),
 ('re', 22220826),
 ('t', 22060537),
 ('d', 22007342),
 ('s</w>', 21513073),
 ('s', 21154237),
 ('is</w>', 20824936),
 ('in', 19916958),
 ('as</w>', 19810614),
 ('c', 19433426),
 ('f', 19126817),
 ('ing</w>', 18443146),
 ('for</w>', 18415519),
 ('b', 18410330),
 ('es</w>', 18005377),
 ('e', 17725402),
 ('g', 17029905),
 ('p', 16496489),
 ('t</w>', 16329383),
 ('by</w>', 16315860),
 ('with</w>', 16036654),
 ('m', 16002361),
 ('y</w>', 15802237),
 ('i', 15778721),
 ('st', 15601464),
 ('at</w>', 15041681),
 ('er</w>', 15033889),
 ('al</w>', 14770495),
 ('he</w>', 14186619),
 ('u', 13361391),
 ('h', 13259253),
 ('e</w>', 13212898),
 ('that</w>', 13180852),
 ('n', 13067046),
 ('a', 12903474),
 ('an</w>', 12893643),
 ('it</w>', 12613560),
 ('ation</w>', 12400112),
 ('en', 12393450),
 (