In [1]:
"""
Create a subword dictionary to break words down for embedding training
"""
import numpy as np
import pickle
from collections import Counter


In [2]:
#Load Assets

LETTER_MIN_COUNT = 256          #a letter must occur in at least this many distinct words to stay in the alphabet
FULL_WORD_MIN_COUNT = 1000000   #words occurring more often than this are kept whole instead of being split

#Very frequent inflected forms that are still split into subwords
SPLIT_ANYWAY = {'_u s e d_', '_u n i t e d_', '_i n c l u d i n g_', '_r e l e a s e d_', '_c a l l e d_',
                '_f o l l o w i n g_', '_b a s e d_', '_p l a y e d_', '_l o c a t e d_'}

#Each line of wcounts is read as "<count> <word>"
#NOTE(review): no minimum-occurrence filter is applied to words here; presumably wcounts is
#already thresholded - TODO confirm
wlst = Counter()
with open(r'D:\dstore\nlp\w2v\wcounts', 'rt', encoding='utf8') as f:
    for line in f:
        fields = line.split()
        if not fields:
            #skip blank lines instead of failing on the missing fields
            continue
        wlst[fields[1]] = int(fields[0])

#Count letters in all words (every distinct word contributes once, whatever its frequency)
lcount = Counter()
for word in wlst:
    lcount.update(word)

#Filter out letters w/ < 256 occurrences
lcount = {letter: n for letter, n in lcount.items() if n >= LETTER_MIN_COUNT}
alpha = [*lcount, '_', "'", '[UNK]']

#Re-key every word as space-separated letters wrapped in '_' boundary markers ('cat' -> '_c a t_')
wlst = Counter({f"_{' '.join(word)}_": n for word, n in wlst.items()})

#Every word with its final letter removed ('_c a t_' -> '_c a_')
stubs = {f'{word[:-3]}_' for word in wlst}

#Words frequent enough to be kept whole, written without spaces ('_t h e_' -> '_the_')
full_words = [''.join(word.split()) for word, n in wlst.most_common()
              if n > FULL_WORD_MIN_COUNT and word not in SPLIT_ANYWAY]
full_words.extend(['_i_', "_i'm_", '_a_'])

#Drop the whole words from the list that still has to be segmented
kept_whole = set(full_words)   #set lookup instead of scanning the list once per word
wlst = Counter({word: n for word, n in wlst.items() if ''.join(word.split()) not in kept_whole})


In [3]:
#Precombine common suffix subwords first to precondition byte encoding

def _read_affixes(path):
    """Return the set of stripped, non-blank lines of the text file at `path`."""
    with open(path, 'rt', encoding='utf8') as f:
        stripped = (line.strip() for line in f)
        #blank lines are skipped: an empty affix has no first/last character to classify
        return {affix for affix in stripped if affix}


def _spaced(affix):
    """Write `affix` the way words are keyed in wlst.

    Letters are separated by single spaces and every '_' boundary marker is
    glued to the letter next to it, e.g. 'ing_' -> 'i n g_' and '_un' -> '_u n'.
    """
    return ' '.join(affix).replace(' _', '_').replace('_ ', '_')


def _merge_affixes(words, stems, affixes, inflections=()):
    """Fuse known affixes inside the keys of `words` into single subword tokens.

    words       -- Counter keyed by space-separated words; modified in place
    stems       -- set of extra strings accepted as what is left after stripping a suffix
    affixes     -- affixes to try, in order; a leading '_' marks a prefix, a trailing '_' a suffix
    inflections -- groups of suffixes; within a group only the first one that applies is used

    A prefix is accepted for a word when the word without it is itself a key of
    `words`; a suffix when the remainder is a key of `words` or a member of
    `stems`; an inflection only when the remainder is a key of `words`.
    """
    #The spaced forms do not depend on the word, so build them once
    encoded = [_spaced(affix) for affix in affixes]
    encoded_groups = [[_spaced(suffix) for suffix in group] for group in inflections]

    #Phase 1: decide the merges for every word against the unmodified word list
    mods = []
    for word in words:
        found = []
        for esubw in encoded:
            if esubw not in word:
                continue
            if esubw[0] == '_':
                rep = word.replace(f'{esubw} ', '_')
                if rep != word and rep in words:
                    found.append(esubw)
            elif esubw[-1] == '_':
                rep = word.replace(f' {esubw}', '_')
                if rep != word and (rep in words or rep in stems):
                    found.append(esubw)
        for group in encoded_groups:
            for esubw in group:
                rep = word.replace(f' {esubw}', '_')
                if esubw in word and rep != word and rep in words and esubw not in found:
                    found.append(esubw)
                    break
        if found:
            #Keep only the affixes that are not contained in a longer accepted affix
            kept = [x for x in found if not any(x != y and x in y for y in found)]
            mods.append((word, kept))

    #Phase 2: rewrite the keys, carrying each count over to the merged spelling
    for word, kept in mods:
        new_word = word
        for esubw in kept:
            new_word = new_word.replace(esubw, ''.join(esubw.split()))
        if new_word != word:
            words[new_word] += words[word]
            words.pop(word)


isubwords = [('s_', 'es_'), ('r_', 'er_', 'or_'), ('d_', 'ed_')]

#Longest affixes first; ties are broken alphabetically so that repeated runs agree
primes = _read_affixes(r'D:\dstore\nlp\w2v\prefs2') | _read_affixes(r'D:\dstore\nlp\w2v\suffs2')
primes = sorted(primes, key=lambda affix: (-len(affix), affix))
_merge_affixes(wlst, stubs, primes, isubwords)

subwords = _read_affixes(r'D:\dstore\nlp\w2v\prefs') | _read_affixes(r'D:\dstore\nlp\w2v\suffs')
subwords = sorted(subwords, key=lambda affix: (-len(affix), affix))
_merge_affixes(wlst, stubs, subwords)

subwords.extend(primes)

#Remove fully merged words from word list
for word in [w for w in wlst if ' ' not in w]:
    full_words.append(word)
    wlst.pop(word)

subwords = set(subwords)

del lcount, isubwords


In [4]:

#Counts pairs of subwords
def _count_pairs(words):
    """Return a Counter of adjacent subword pairs, weighted by the count of the word they occur in.

    Keys have the form 'left right'.
    """
    pairs = Counter()
    for word, freq in words.items():
        tokens = word.split()
        for left, right in zip(tokens, tokens[1:]):
            pairs[f'{left} {right}'] += freq
    return pairs


def _merge_pair(tokens, left, right):
    """Return a copy of `tokens` with each occurrence of the adjacent pair (left, right) fused.

    Occurrences are taken left to right and never overlap; each one is replaced
    by the single token left + right.
    """
    merged = []
    i = 0
    while i < len(tokens):
        if i + 1 < len(tokens) and tokens[i] == left and tokens[i + 1] == right:
            merged.append(left + right)
            i += 2
        else:
            merged.append(tokens[i])
            i += 1
    return merged


MAX_SUBWORDS = 20000     #stop once the subword vocabulary has reached this size
MERGES_PER_ROUND = 16    #pairs merged between two recounts

candidate_frags = _count_pairs(wlst)

#Byte encoding algorithm
#Each round takes the 16 most common pairs of the last count, one after the other.
#Every word containing the pair has it merged into a single subword, then pairs are recounted.
#The loop needs BOTH room left in the vocabulary AND words left to segment: with `or` it could
#not stop at the vocabulary target and ran into an empty candidate list.
while len(subwords) < MAX_SUBWORDS and len(wlst) > 0:
    for _round in range(MERGES_PER_ROUND):
        if not candidate_frags:
            #every pair of the last count has been used up; recount before going on
            break
        sub = candidate_frags.most_common(1)[0][0]
        left, right = sub.split()
        matches = []
        for word in wlst:
            if sub not in word:
                continue
            tokens = word.split()
            merged = _merge_pair(tokens, left, right)
            #a substring hit that does not line up with token boundaries changes nothing
            if len(merged) != len(tokens):
                matches.append((word, ' '.join(merged)))
        #wlst cannot be re-keyed while it is being iterated, so apply the merges afterwards
        for old, new in matches:
            wlst[new] += wlst[old]
            wlst.pop(old)
        candidate_frags.pop(sub)
        sub = sub.replace(' ', '')
        if sub[0] == '_' and sub[-1] == '_':
            #NOTE(review): the completed word is appended again by the clean-up below, so it
            #ends up in full_words twice - confirm whether duplicates matter downstream
            full_words.append(sub)
        else:
            subwords.add(sub)

    #Remove fully combined words from list
    for x in list(wlst):
        if ' ' not in x:
            wlst.pop(x)
            full_words.append(x)

    #Count pairs
    candidate_frags = _count_pairs(wlst)


In [7]:
#Show the ten most frequent remaining pairs and words, each followed by the size of its counter
for counter in (candidate_frags, wlst):
    for entry in counter.most_common()[:10]:
        print(entry)
    print(len(counter), '\n')

#Sizes of the subword vocabulary and of the whole-word list
print(len(subwords))
print(len(full_words))

('_eisted df', 2355)
('_fluctu ating_', 2354)
('_iso lates_', 2354)
('_pedi at', 2354)
('_guan aj', 2354)
('aj u', 2354)
('u ato_', 2354)
('_aneurys m_', 2354)
('_facsim ile_', 2354)
('_solom ons_', 2354)
140958 

('_fluctu ating_', 2354)
('_iso lates_', 2354)
('_guan aj u ato_', 2354)
('_aneurys m_', 2354)
('_facsim ile_', 2354)
('_solom ons_', 2354)
('_lor en_', 2353)
('_fro ze_', 2353)
('_roman ization_', 2353)
('_south gate_', 2353)
119910 

20002
61576


In [13]:
#Replace subwords with the collection saved earlier as subw-3
#NOTE: pickle.load executes whatever the file encodes - only open files you wrote yourself
subw_path = 'D:/dstore/nlp/w2v/subw-3'
with open(subw_path, 'rb') as src:
    subwords = pickle.load(src)

In [40]:

#Tag appended to the names of the files written below
version = 'x5'

#Save the remaining word list ...
wlst_path = f'D:/dstore/nlp/w2v/wlst-{version}'
with open(wlst_path, 'wb') as out:
    pickle.dump(wlst, out)

#... and the subword vocabulary
subw_path = f'D:/dstore/nlp/w2v/subw-{version}'
with open(subw_path, 'wb') as out:
    pickle.dump(subwords, out)


" with open(f'D:/dstore/nlp/w2v/subw-{version}', 'wb') as f:\n    pickle.dump(subwords, f) "

In [2]:
#Reload the saved state; pickle.load executes whatever the files encode, so only open your own files
version = 'x5'
wlst_path = f'D:/dstore/nlp/w2v/wlst-{version}'
with open(wlst_path, 'rb') as src:
    wlst = pickle.load(src)

#Disabled: load the subword vocabulary saved for this version
#with open(f'D:/dstore/nlp/w2v/subw-{version}', 'rb') as f:
#    subwords = pickle.load(f)

#subwords is taken from pwlst instead
#NOTE(review): the last cell of this notebook writes pwlst as a list of (subword, score) pairs -
#confirm that this is the shape expected wherever subwords is used afterwards
pwlst_path = 'D:/dstore/nlp/w2v/pwlst'
with open(pwlst_path, 'rb') as src:
    subwords = pickle.load(src)

cwlst_path = 'D:/dstore/nlp/w2v/cwlst'
with open(cwlst_path, 'rb') as src:
    cwords = pickle.load(src)


In [107]:
MAX_NEIGHBOUR_DISTANCE = 4097   #furthest rank offset inspected when collecting neighbours
NEIGHBOUR_TARGET = 8            #stop collecting once this many neighbour counts are gathered
MAX_HIGHER_RANKED = 3           #at most this many neighbours are taken from higher-ranked entries

#Count, for every subword, the number of entries of twords that contain it
#NOTE(review): twords is not defined in any cell visible here (cwords is loaded but never used) -
#confirm where it comes from before relying on Restart & Run All
subcounter = Counter()
for x in subwords:
    for y in twords:
        if x in y:
            subcounter[x] += 1

#Keep subwords shorter than 8 characters that are longer than 2 or carry a '_' boundary marker
tcounter = Counter({sub: n for sub, n in subcounter.most_common()
                    if (len(sub) > 2 or '_' in sub) and len(sub) < 8})
test = tcounter.most_common()
tlen = len(test)

#Subtract from each subword the count of the first lower-ranked subword that contains it
for i, x in enumerate(test):
    for j in range(1, tlen-i):
        if x[0] in test[i+j][0]:
            tcounter[x[0]] -= tcounter[test[i+j][0]]
            break

test = tcounter.most_common()
tlen = len(test)

#Subtract from each subword a typical count of its same-length neighbours in the ranking:
#the average of the mean and the median of the collected neighbour counts
for i, x in enumerate(test):
    avg_group = []
    ulim = 0
    for j in range(1, MAX_NEIGHBOUR_DISTANCE + 1):
        if len(avg_group) >= NEIGHBOUR_TARGET:
            break
        can_go_down = i + j < tlen
        can_go_up = i - j >= 0 and ulim < MAX_HIGHER_RANKED
        if not can_go_down and not can_go_up:
            #neither direction can contribute any more, so stop instead of
            #counting up to the distance limit
            break
        if can_go_down and len(test[i+j][0]) == len(x[0]):
            avg_group.append(test[i+j][1])
        if can_go_up and len(test[i-j][0]) == len(x[0]):
            avg_group.append(test[i-j][1])
            ulim += 1
    if len(avg_group) > 0:
        tcounter[x[0]] -= int((np.mean(avg_group) + np.median(avg_group)) / 2)


In [109]:
#Keep the (subword, score) pairs whose adjusted score is still positive, best first
test = [(sub, score) for sub, score in tcounter.most_common() if score > 0]
#Discard the top-ranked pair
#NOTE(review): the reason for dropping it is not visible in this notebook
test.pop(0)

pwlst_path = 'D:/dstore/nlp/w2v/pwlst'
with open(pwlst_path, 'wb') as out:
    pickle.dump(test, out)