In [2]:
"""
Create subword dictionary to breakdown words for embedding training
"""

import pickle
from collections import Counter
from tqdm import tqdm

import numpy as np
from matplotlib import pyplot as plt


In [65]:
#Load Assets
with open(r'D:\dstore\nlp\w2v\wcounts16', 'rt', encoding='utf8') as f:
    wlst = [x.strip().split() for x in f.readlines()]

with open(r'D:\dstore\nlp\w2v\prefs', 'rt', encoding='utf8') as f:
    subw = {x.strip() for x in f.readlines()}

with open(r'D:\dstore\nlp\w2v\suffs', 'rt', encoding='utf8') as f:
    for x in f.readlines():
        subw.add(x.strip())

for x in {'when_', 'been_', 'have_', 'two_', 'they_', 'who_', 'not_', 'new_', 'but_', 'her_', 'she_', 'after_', 'one_', 'their_', 'or_', 'has_', 'first_', 'had_', 'be_', 'also_', 'this_', 'are_', 'which_', 'were_', 'an_', 'it_', 'his_', 'from_', 'at_', 'that_', 'he_', 'by_', 'with_', 'as_', 'for_', 'on_', 'is_', 'was_', 'to_', 'and_', 'in_', 'of_', 'the_'}:
    subw.add(x)

#Filter out words w/ < 256 occurences 
wlst = Counter({x[1]: int(x[0]) for x in wlst if int(x[0]) > 256})

#Count letters in all words
lcount = Counter()
for x in wlst:
    for y in x:
        lcount[y] += 1

#Filter out letters w/ < 256 occurences
lcount = Counter({x[0]: x[1] for x in lcount.items() if x[1] >= 256})
alpha = [*lcount, '_', "'", '[UNK]']
wlst = {' '.join(list(x[0])): x[1] for x in wlst.items()}

#Remove all words with unknown letters
temp = []
for x in wlst:
    for y in x.split():
        if y not in alpha:
            temp.append(x)
            break
for x in temp:
    wlst.pop(x)

#Remove all words that do not have a vowel
vowels = ['a', 'e', 'i', 'o', 'u', 'y', 'á', 'é', 'ö']

wlst = Counter({f'_{x[0]}_': x[1] for x in wlst.items() if len(x[0]) > 1 and any({y for y in vowels if y in x[0]})})
wlst = Counter({''.join(x[0].split()): x[1] for x in wlst.items()})

ngrams = Counter()
for n in range(2, 9):
    for w in wlst:
        for i in range(0, len(w) - n + 1):
            ngrams[w[i:i+n]] += 1


wlst = Counter()
for x in ngrams.most_common():
    if x[1] >= 8:
        if len(x[0]) > 1:
            wlst[x[0]] += x[1]

def tree_split(input: list[tuple[str, int]], max_depth: int, depth:int=0):
    tree = []
    while input:
        trunk = input.pop(0)
        branches = []
        for x in input[::-1]:
            if trunk[0] in x[0]:
                branches.append(x)
                input.remove(x)
        tree.append((trunk, branches[::-1]))
    depth += 1
    if max_depth != depth:
        for i, split in enumerate(tree):
            tree[i] = (split[0], tree_split(split[1], max_depth, depth))
        tree = tree
    return tree



In [3]:

commons = wlst.most_common()

for i, x in tqdm(enumerate(commons)):
    for y in commons[i+1:]:
        if x[0] in y[0]:
            wlst[x[0]] -= wlst[y[0]]
            break


54821it [01:41, 541.88it/s] 


In [66]:
affixes = Counter({x[0]: x[1] for x in wlst.most_common() if '_' in x[0] and x[1] > 16 and len(x[0]) > 2})

In [None]:
[x for x in affixes.most_common() if 's_' in x[0]]

In [67]:
asplit = tree_split(affixes.most_common(), 6)

In [13]:
np.mean([x[0][1] for x in asplit])

4236.660714285715

In [68]:
asplit

[(('ed_', 5127),
  [(('ted_', 1271),
    [(('ated_', 533),
      [(('rated_', 100), [(('erated_', 35), []), (('orated_', 18), [])]),
       (('lated_', 91), [(('ulated_', 44), [])]),
       (('nated_', 61), [(('inated_', 36), [(('minated_', 18), [])])]),
       (('cated_', 51), [(('icated_', 30), [])]),
       (('iated_', 46), []),
       (('tated_', 36), [(('itated_', 19), [])]),
       (('gated_', 30), []),
       (('eated_', 25), []),
       (('dated_', 18), []),
       (('uated_', 17), []),
       (('mated_', 17), [])]),
     (('nted_', 121),
      [(('ented_', 56), [(('mented_', 24), [])]),
       (('inted_', 26), []),
       (('unted_', 20), []),
       (('anted_', 17), [])]),
     (('sted_', 116),
      [(('ested_', 38), []), (('isted_', 23), []), (('asted_', 22), [])]),
     (('cted_', 112), [(('ected_', 60), []), (('acted_', 24), [])]),
     (('rted_', 68), [(('orted_', 28), []), (('erted_', 20), [])]),
     (('ited_', 66), []),
     (('tted_', 57), [(('itted_', 23), [])]),
  

In [56]:
pre = []
suf = []

for y in [x[0] for x in asplit]:
    if y[0][0] == '_':
        pre.append((y[0], y[1] / lcount[y[0].replace('_', '')]))
    else:
        suf.append((y[0], y[1] / lcount[y[0].replace('_', '')]))


In [57]:
suf

[('s_', 0.32148905575970604),
 ('e_', 0.15124313995600686),
 ('n_', 0.184460592509782),
 ('a_', 0.11057150669948045),
 ('d_', 0.26809184481393505),
 ('r_', 0.1090950432014552),
 ('y_', 0.4768531088786315),
 ('g_', 0.2471687786805877),
 ('t_', 0.10782437429160706),
 ('l_', 0.09612861575626315),
 ('i_', 0.06190810967288367),
 ('o_', 0.07527788292783646),
 ('m_', 0.09136845019727279),
 ('h_', 0.09567094648335417),
 ('k_', 0.13970639881997612),
 ('c_', 0.055857163594135575),
 ('u_', 0.04553221429289376),
 ('p_', 0.040223214285714286),
 ('f_', 0.05682417003337432),
 ('z_', 0.13584667228306654),
 ('w_', 0.05964170468454434),
 ('x_', 0.20748576078112285),
 ('v_', 0.04694527736131934),
 ('b_', 0.02344641560327835),
 ('é_', 0.23108108108108108),
 ('j_', 0.03794266441821248),
 ('q_', 0.046357615894039736),
 ('á_', 0.1151685393258427)]

In [58]:
pre

[('_s', 0.19317782393353572),
 ('_c', 0.2876179416461025),
 ('_m', 0.29033709420640963),
 ('_p', 0.3505357142857143),
 ('_a', 0.08579710144927537),
 ('_b', 0.40247951032264756),
 ('_r', 0.09098074882522358),
 ('_t', 0.11070439807502927),
 ('_d', 0.1879968329374505),
 ('_h', 0.1876680680225732),
 ('_g', 0.18932345387950997),
 ('_l', 0.0927757762552323),
 ('_e', 0.047548159175239406),
 ('_f', 0.3726506235728087),
 ('_k', 0.2592540563320924),
 ('_i', 0.050563114290834404),
 ('_n', 0.04931370722315384),
 ('_w', 0.33626349171024816),
 ('_o', 0.04587219105956846),
 ('_v', 0.21270614692653672),
 ('_u', 0.0676449894461755),
 ('_j', 0.4654300168634064),
 ('_y', 0.055050230790116755),
 ('_z', 0.14806234203875315),
 ('_q', 0.32691149909692957),
 ('_x', 0.06509357200976404),
 ('_é', 0.07162162162162163),
 ('_á', 0.0702247191011236)]