In [1]:

import numpy as np
import matplotlib.pyplot as plt
import os
from PIL import Image
from pickle import load, dump
from collections import Counter
from tqdm import tqdm
from btk import rdx_sort

# The matplotlib style name/path is read from the 'style' environment variable;
# os.environ[...] raises KeyError when that variable is not set.
plt.style.use(f"{os.environ['style']}")

# Every line of the word file is split on whitespace and read as "<count> <word>".
# Words are wrapped in '_' so a leading/trailing underscore marks a word boundary.
# Because a dict comprehension is used, a word listed twice keeps only its last count.
# NOTE(review): a line with fewer than two fields would raise IndexError here.
with open(r'D:\dstore\nlp\w2v\fwords', 'rt') as f:
    full_words = Counter({f'_{x[1]}_': int(x[0]) for x in [x.strip().split() for x in f.readlines()]})

# Drop keys shorter than 5 characters; the two boundary markers count, so this
# removes words of fewer than 3 letters. The key list is materialised first so
# the Counter is not mutated while it is being iterated.
for x in [x for x in full_words if len(x) < 5]:
    full_words.pop(x)
# Keys containing an apostrophe: add their count to the part before the first
# apostrophe (if that form is already present) and then remove the key.
for x in [x for x in full_words if "'" in x]:
    out = x.split("'")
    if f'{out[0]}_' in full_words:
        full_words[f'{out[0]}_'] += full_words[x]
    full_words.pop(x)

# Lookup tables of letters and affixes. In every affix string a leading '_'
# marks the start of a word (prefix) and a trailing '_' marks the end (suffix).
# gsub() reads 'spafx', 'bvwls', 'bdbl', 'fvwls', 'avwls' and 'dvwls';
# 'alpha', 'fdbl' and 'cvwls' are not referenced in the visible cells.
ldct = {
    'alpha': {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'},
    '2afx': {"_ab", "_ac", "_ad", "_af", "_ag", "_al", "_an", "_ap", "_ar", "_as", "_at", "_be", "_bi", "_by", "_co", "_de", "_di", "_ec", "_ef", "_el", "_em", "_en", "_ep", "_er", "_es", "_eu", "_ex", "_hi", "_ig", "_il", "_im", "_in", "_ir", "_ly", "_my", "_no", "_ob", "_oc", "_of", "_on", "_op", "_re", "_to", "_un", "_up", "ae_", "al_", "ar_", "by_", "cy_", "ea_", "ed_", "ee_", "el_", "en_", "er_", "es_", "et_", "ex_", "ia_", "ic_", "ie_", "in_", "is_", "la_", "ly_", "ol_", "or_", "ry_", "sa_", "sy_", "th_", "ty_", "um_", "up_", "yl_"},
    '1afx': {'s_', 'd_', 'r_', 'n_', 't_', 'y_', '_a', 'a_', '_o', 'o_', 'i_', '_e'},
    'fdbl': {'b', 'c', 'd', 'f', 'g', 'l', 'm', 'n', 'p', 'r', 's', 't'},
    'bdbl': {'b', 'd', 'g', 'm', 'l', 'n', 'p', 'r', 't'},
    'avwls': {'a', 'e', 'i', 'o', 'u', 'y'},
    'bvwls': {'a', 'e', 'i', 'o', 'u'},
    'cvwls': {'a', 'e', 'o', 'i', 'y'},
    'dvwls': {'a', 'e', 'o', 'u'},
    'fvwls': {'a', 'o', 'i', 'u'},
    'spafx': {'ity_', 'logy_', 'try_', 'sy_', 'cy_', 's_', 'y_'}
}
# Longer prefixes and suffixes, same boundary-marker convention as above.
tpos = {"_anti", "_arc", "_auto", "_bio", "_carbo", "_chroma", "_com", "_con", "_contra", "_counter", "_dis", "_fore", "_hyper", "_hypo", "_inter", "_iso", "_kin", "_lat", "_max", "_meta", "_micro", "_mis", "_mono", "_multi", "_neuro", "_non", "_ortho", "_over", "_para", "_photo", "_poly", "_pre", "_pro", "_pseudo", "_semi", "_sin", "_snow", "_sub", "_sum", "_sup", "_super", "_sym", "_trans", "_under", "_uni", "_var", "_vert", "_wolf", "able_", "ally_", "ate_", "fish_", "form_", "graph_", "ing_", "ish_", "ism_", "ist_", "ity_", "ive_", "ize_", "less_", "logy_", "man_", "ment_", "meter_", "ness_", "ory_", "ship_", "tone_", "try_"}
# Union of the two-letter, one-letter and long affix sets; the pruning cell
# further down never removes a fragment that is a member of this set.
lockg = {*ldct['2afx'], *ldct['1afx'], *tpos}

def get_nested(subject, stage, src):
    """Collect the entries of ``src`` that contain ``subject`` as a substring.

    Only entries strictly shorter than three times the length of ``subject``
    are kept, and the bare form of ``subject`` (boundary underscores stripped)
    is left out of the result. ``stage`` is accepted but not used here.

    Returns:
        set: the matching entries.
    """
    max_len = 3 * len(subject)
    bare = subject.strip('_')
    found = set()
    for entry in src:
        if subject in entry and len(entry) < max_len:
            found.add(entry)
    # discard() is a no-op when the bare form was never collected.
    found.discard(bare)
    return found

def extract_afx(subject, group, stage, src) -> list:
    """Split every word of ``group`` around ``subject`` and keep the leftover pieces.

    Each word is wrapped in '_' boundary markers before splitting, so pieces
    taken from the start or end of a word carry a marker. A piece is kept when
    it is longer than two characters or is listed in the module-level
    ``singles`` collection (``singles`` is not defined in the visible cells and
    must exist before a short piece is encountered).

    Args:
        subject: substring to split on.
        group: iterable of words that contain ``subject``.
        stage: only stage 1 is implemented; any other value yields an empty list.
        src: unused.

    Returns:
        list: kept pieces, in encounter order (duplicates preserved).
    """
    # Bind the result up front: previously it was only bound inside the
    # stage-1 branch, so other stages failed with UnboundLocalError on return.
    hold = []
    if stage != 1:
        return hold
    for word in group:
        for piece in f'_{word}_'.split(subject):
            if len(piece) > 2 or piece in singles:
                hold.append(piece)
    return hold

def afx_count(inp: Counter, stage: int):
    """Count the affix pieces left over when each entry is removed from the entries nesting it.

    For every key of ``inp`` the nesting entries are found with get_nested();
    keys without any are reported separately. The pieces returned by
    extract_afx() for the remaining keys are tallied.

    Args:
        inp: entries to analyse (iterated by key).
        stage: 1 searches ``inp`` itself; any other value searches the set of
            keys with their boundary underscores stripped.

    Returns:
        tuple: (Counter of pieces seen more than twice, list of keys that had
        no nesting entries).
    """
    src = inp if stage == 1 else {key.strip('_') for key in inp}

    nested_by_subject = {}
    nons = []
    for subject in tqdm(inp):
        nested = get_nested(subject, stage, src)
        if nested:
            nested_by_subject[subject] = nested
        else:
            nons.append(subject)

    tally = Counter()
    for subject, nested in nested_by_subject.items():
        pieces = extract_afx(subject, nested, stage, inp)
        if pieces:
            tally.update(pieces)

    frequent = Counter({piece: n for piece, n in tally.most_common() if n > 2})
    return (frequent, nons)

def search(term, corpus, exc=None):
    """Return the sorted, de-duplicated items of ``corpus`` that contain ``term``.

    Args:
        term: substring every returned item must contain.
        corpus: iterable of strings to scan.
        exc: optional exclusion. A single string, or an iterable of strings;
            items containing any of them are dropped. A falsy value disables
            exclusion.

    Returns:
        list: matching items in sorted order.
    """
    if not exc:
        banned = ()
    elif isinstance(exc, str):
        banned = (exc,)
    else:
        banned = exc

    hits = set()
    for item in corpus:
        if term in item and not any(part in item for part in banned):
            hits.add(item)
    return sorted(hits)

def gsub(target: str, afx: str, best=True, amode=0, guard=True, dbg=False):
    """Strip ``afx`` from ``target`` and return the best-attested root form.

    A list of candidate roots is generated (the plain removal plus spelling
    variants: restored final 'e'/'a'/'y', undoubled consonants, and a number of
    affix-specific rewrites) and the candidates are then looked up in the
    module-level ``full_words`` and/or ``tf2`` tables.

    Args:
        target: word or fragment containing the affix ('_' marks a word edge).
        afx: affix to remove; a leading '_' means prefix, otherwise suffix.
        best: if True return only the highest-scoring candidate, else the
            whole list of (candidate, score) pairs sorted by ascending score.
        amode: lookup mode.
            0 - look candidates up in ``full_words`` (count must exceed 4);
                requires at least 4 characters to remain after removal.
            1 - look candidates up in ``tf2``.
            any other value - move the boundary marker to the opposite side of
                each candidate and look that up in ``tf2`` (count > 8); failing
                that, look the candidate with an extra marker up in
                ``full_words`` (count > 256, scored by log2 of the count).
            Modes other than 0 require at least 2 characters to remain.
        guard: unused.
        dbg: print the candidate list while it is being built.

    Returns:
        The chosen candidate string (``best=True``), the scored list
        (``best=False``), or None when the length check fails, a spelling rule
        rejects the split, or no candidate is found.

    NOTE(review): ``rep[-1]`` / ``drep[-2]`` are indexed without a length
    check; presumably the early length guards make short remainders
    impossible in practice - confirm for targets that contain ``afx`` twice.
    """
    #Remove the affix from a word following english rules, returning the proper root
    if amode == 0:
        if len(target) - len(afx) < 4: return
    else:
        if len(target) - len(afx) < 2: return
    # rep is the bare remainder; the first candidate keeps a boundary marker
    # where the affix used to be.
    rep = target.replace(afx, '')
    candidates = [target.replace(afx, '_')]
    if afx[0] == '_':
        pre = True
    else:
        pre = False
    # Spelling variants are generated for suffixes, and for prefixes only in
    # fragment modes 1 and 2.
    if not pre or amode in (1, 2):
        if afx in ldct['spafx']:

            if afx == 'logy_':
                candidates.append(f'{rep}a_')
                candidates.append(f'{rep}l_')
                candidates.append(f'{rep[:-1]}_')
                candidates.append(f'{rep[:-2]}_')
            elif afx == 'ity_':
                if rep.endswith('abil'):
                    candidates.append(rep.replace('abil', 'able_'))
                if rep.endswith('ibil'):
                    candidates.append(rep.replace('ibil', 'ible_'))
            elif afx == 'try_':
                candidates.append(f'{rep}t_')
            elif afx == 'cy_':
                candidates.append(f'{rep}t_')
                if rep[-1] == 'a':
                    candidates.append(f'{rep}te_')
            elif afx == 's_':
                # Reject the split when the remainder ends in 's', 'i' or 'u'.
                if rep[-1] in ['s', 'i', 'u']: return
            elif afx == 'y_':
                # Reject the split when the remainder ends in a vowel.
                if rep[-1] in ldct['bvwls']: return

        if dbg: print(candidates)
        # Affixes that begin with a vowel.
        if afx[0] in ldct['bvwls']:
            # Reject when the remainder ends in the affix's own first letter.
            if afx[0] == rep[-1]:
                return
            dreps = [rep]
            # Doubled final consonant: also try the remainder with one copy removed.
            if len(rep) > 4 and rep[-1] == rep[-2] and rep[-1] in ldct['bdbl']:
                dreps.append(rep[:-1])
                candidates.append(f'{rep[:-1]}_')
            elif rep[-1] in ldct['fvwls']:
                # Remainder ends in a/o/i/u: try dropping or replacing that vowel.
                candidates.append(f'{rep[:-1]}_')
                candidates.append(f'{rep[:-1]}e_')
                if rep[-1] == 'i':
                    candidates.append(f'{rep[:-1]}y_')
            for drep in dreps:
                # Generic restorations of a dropped final letter.
                candidates.append(f'{drep}e_')
                candidates.append(f'{drep}a_')
                candidates.append(f'{drep}y_')
                if afx[0] == 'e':
                    if drep[-1] == 'v':
                        candidates.append(f'{drep[:-1]}f_')
                    if drep[-1] == 'm':
                        candidates.append(f'{drep[:-1]}_')
                elif afx[0] == 'i':
                    if drep[-1] == 't':
                        candidates.append(f'{drep[:-2]}e_')
                        candidates.append(f'{drep[:-2]}_')
                        if drep[-2] == 'i':
                            candidates.append(f'{drep[:-1]}sh_')
                        elif drep.endswith('ipt'):
                            candidates.append(f'{drep[:-2]}be_')
                        elif drep.endswith('orpt'):
                            candidates.append(f'{drep[:-2]}b_')
                    elif drep[-1] == 's':
                        candidates.append(f'{drep[:-1]}e_')
                        if drep[-2] in ldct['avwls']:
                            candidates.append(f'{drep[:-1]}de_')
                            candidates.append(f'{drep[:-1]}re_')
                        elif drep[-2] == 's' and len(drep) > 2:
                            if drep[-3] in ldct['dvwls']:
                                candidates.append(f'{drep[:-2]}de_')
                            elif drep[-3] == 'i':
                                candidates.append(f'{drep[:-2]}t_')
                        elif drep[-2] == 'r':
                            candidates.append(f'{drep[:-1]}t_')
                        elif drep[-2] == 'n':
                            candidates.append(f'{drep[:-1]}d_')
                elif afx[0] == 'a':
                    if drep.endswith('ti'):
                        candidates.append(f'{drep[:-2]}ce_')

    if dbg: print(candidates)
    # Score the candidates; every branch sorts ascending so the best is last.
    if amode == 0:
        out = sorted([(x, full_words[x]) for x in candidates if (x in full_words and full_words[x] > 4)], key=lambda x: x[1])
    elif amode == 1:
        out = sorted([(x, tf2[x]) for x in candidates if x in tf2], key=lambda x: x[1])
    else:
        out = []
        if pre:
            for x in candidates:
                # Drop the first character and append a marker; fall back to
                # the candidate with an extra trailing marker in full_words.
                mafx = f'{x[1:]}_'
                full = f'{x}_'
                if mafx in tf2 and tf2[mafx] > 8:
                    out.append((mafx, tf2[mafx]))
                elif full in full_words and full_words[full] > 256:
                    out.append((full, np.log2(full_words[full])))
        else:
            for x in candidates:
                # Mirror image of the prefix case: drop the last character and
                # prepend a marker.
                mafx = f'_{x[:-1]}'
                full = f'_{x}'
                if mafx in tf2 and tf2[mafx] > 8:
                    out.append((mafx, tf2[mafx]))
                elif full in full_words and full_words[full] > 256:
                    out.append((full, np.log2(full_words[full])))
        out = sorted(out, key=lambda x: x[1])
    # Falls through (returns None) when nothing matched.
    if out:
        if best: return out[-1][0]
        else: return out

def target_removal(afx, exc1=None, exc2=None, exe=False, dbg=False):
    """Find fragments in ``tf2`` that contain ``afx`` and try to reduce them with gsub().

    A reduction only succeeds when the reduced form is itself a key of ``tf2``
    (gsub is called with ``amode=1``).

    Args:
        afx: affix to strip.
        exc1: fragment(s) skipped when they are exactly equal to a key; a
            single string or an iterable of strings.
        exc2: substring(s); keys containing any of them are skipped. A single
            string or an iterable of strings.
        exe: if True, merge each reducible fragment's count into its reduced
            form, delete the fragment from ``tf2`` and return None.
        dbg: print the list of candidate fragments.

    Returns:
        list of (fragment, reduced_form) pairs when ``exe`` is False.
    """
    # Keys skipped on exact equality: always the affix itself, plus exc1.
    if not exc1:
        skip_exact = (afx,)
    elif isinstance(exc1, str):
        skip_exact = (afx, exc1)
    else:
        skip_exact = (afx, *exc1)

    # Substrings that disqualify a key.
    if not exc2:
        skip_sub = ()
    elif isinstance(exc2, str):
        skip_sub = (exc2,)
    else:
        skip_sub = tuple(exc2)

    targets = [
        key for key in tf2
        if afx in key
        and key not in skip_exact
        and not any(part in key for part in skip_sub)
    ]
    if dbg:
        print(targets)

    rem = []
    for key in targets:
        root = gsub(key, afx, amode=1)
        if root:
            rem.append((key, root))

    if not exe:
        return rem
    for key, root in rem:
        tf2[root] += tf2[key]
        tf2.pop(key)

def pulld(afx, len_lim=False):
    """Return the child nodes of ``afx``: the nearest longer fragments in ``tf2``.

    A fragment containing ``afx`` is a child when no intermediate fragment
    (longer than ``afx``, obtained by trimming the fragment one character at a
    time from the side opposite its boundary marker) is itself present among
    the fragments containing ``afx``.

    Args:
        afx: affix whose children are wanted.
        len_lim: if True keep only children exactly one character longer
            than ``afx``.

    Returns:
        list: child fragments in ``tf2`` iteration order; ``afx`` itself is
        never included.
    """
    aln = len(afx)
    sub_set = [x for x in tf2 if afx in x]
    # Set for O(1) membership: the list was previously searched once per
    # trimmed form of every fragment, which made the scan quadratic.
    known = set(sub_set)
    out = []
    for x in sub_set:
        # Trim lengths that leave strictly more than len(afx) characters.
        cuts = range(1, len(x) - aln)
        if x[0] == '_':
            # Prefix fragment: the marker is on the left, so trim from the right.
            shorter = (x[:-i] for i in cuts)
        else:
            # Suffix fragment: trim from the left.
            shorter = (x[i:] for i in cuts)
        if not any(s in known for s in shorter):
            out.append(x)
    if len_lim:
        return [x for x in out if len(x) == aln + 1 and x != afx]
    return [x for x in out if x != afx]

def pullu(afx):
    """Return the parent node of ``afx``: its longest proper sub-fragment found in ``tf2``.

    Prefix fragments (leading '_') are shortened from the right, all others
    from the left, one character at a time. Returns None when no shortened
    form is a key of ``tf2``.
    """
    is_prefix = afx[0] == '_'
    for cut in range(1, len(afx)):
        parent = afx[:-cut] if is_prefix else afx[cut:]
        if parent in tf2:
            return parent
    return None

def chain(afx):
    """Return the longest affix chain in ``tf2`` that contains ``afx``.

    The longest fragment that either contains ``afx`` or is contained in it is
    located first; the result is every ``tf2`` fragment that is a substring of
    that longest fragment, ordered from longest to shortest.
    """
    related = [frag for frag in tf2 if afx in frag or frag in afx]
    # Stable ascending sort: among equally long fragments the last one wins.
    related.sort(key=len)
    longest = related[-1]

    members = [frag for frag in tf2 if frag in longest]
    members.sort(key=len, reverse=True)
    return members

def pulld_relent(afx, depth=1):
    """Summarise the relative-entropy matrices of the descendants of ``afx``.

    Descends ``depth`` levels through pulld(..., True) (children exactly one
    character longer), averages the ``drntp`` matrices of the fragments reached
    at the final level, and returns the row means of that average followed by
    its column means. If no fragment is reached, an array of six zeros is
    returned.
    """
    frontier = [afx]
    remaining = depth
    while remaining > 0:
        next_level = []
        # Walk the current level from its last element to its first.
        for node in reversed(frontier):
            children = pulld(node, True)
            if children:
                next_level.extend(children)
        frontier = next_level
        remaining -= 1

    if not frontier:
        return np.array([0]*6)

    mean_mat = np.mean([drntp[node] for node in frontier], axis=0)
    row_means = [np.mean(row) for row in mean_mat]
    col_means = [np.mean(col) for col in mean_mat.T]
    return np.array(row_means + col_means)

def remean(rearr):
    """Return the row means of ``rearr`` followed by its column means as one 1-D array."""
    by_row = [np.mean(row) for row in rearr]
    by_col = [np.mean(col) for col in rearr.T]
    return np.array(by_row + by_col)

def relent_peaks(target, bridge_coeff=1, dbg=False):
    """Score every fragment in the chain of ``target`` by how strongly its relative entropy peaks.

    Args:
        target: affix whose chain (see chain()) is analysed; a leading '_'
            selects the prefix baselines (frw/dfrw), otherwise the suffix
            baselines (brw/dbrw) are used.
        bridge_coeff: threshold on the mean difference to a neighbour above
            which the neighbour values are adjusted using the next fragment
            out on each side.
        dbg: print intermediate differences and the per-fragment matrices.

    Returns:
        list of (fragment, score) tuples, one per fragment of chain(target)
        and in the same order; score is the mean of remean() of the
        fragment's peak matrix.
    """
    #Takes an affix and all affixes within its tree and compares their relative entropies.
    #Returns affixes with significant peaks in relative entropies
    # erw is a one-element list; copying it keeps the module-level list from growing.
    scores = erw.copy()
    words = chain(target)

    # scores = [erw entry, weighted rntp of each chain member..., side baseline]
    for x in words: scores.append(rntp[x] * wgts)
    if target[0] == '_': scores.append(frw)
    else: scores.append(brw)
    # hold = two zero matrices, then for each chain member the sum of its
    # differences to both neighbours in scores, then two side delta baselines.
    hold = zdre.copy()
    for i in range(1, len(scores)-1): hold.append((scores[i+1]-scores[i])+(scores[i-1]-scores[i]))
    if target[0] == '_': hold.extend([dfrw, dfrw])
    else: hold.extend([dbrw, dbrw])

    out = []
    # hold[i] corresponds to words[i-2]; the padding added above guarantees
    # that i-2 and i+2 are valid indices for every chain member.
    for i in range(2, len(hold)-2):
        # Copies are taken because the masked assignments below modify in place.
        u1, d1, md = hold[i-1].copy(), hold[i+1].copy(), hold[i]
        if dbg: print(words[i-2], (md-u1).mean(), (md-d1).mean())
        if (md-u1).mean() > bridge_coeff or (md-d1).mean() > bridge_coeff:
            u2, d2 = hold[i-2].copy(), hold[i+2].copy()
            # On each side, zero the cells of the second neighbour that exceed
            # the (unmodified) first neighbour, then zero the cells of the
            # first neighbour that exceed the second, and add the two.
            u2[u2 > u1] *= 0
            u1[u1 > hold[i-2]] *= 0
            d2[d2 > d1] *= 0
            d1[d1 > hold[i+2]] *= 0
            u1 = u1 + u2
            d1 = d1 + d2
        out.append((md-u1)+(md-d1))

    if dbg:
        for i, x in enumerate(out): print(words[i], '\n', remean(x).mean(), '\n', x)
    return [(words[i], remean(x).mean()) for i, x in enumerate(out)]

# NOTE(review): pickle.load can execute arbitrary code from the file being
# read; only load artefacts that this project produced itself.
with open(r'D:\dstore\tmp\4', 'rb') as f:
    dsts = load(f)
# tf2 is used below as a mapping of fragment -> count; rntp and drntp are
# indexed by fragment and combined with the 3x3 wgts matrix.
# assumes rntp/drntp values are 3x3 numpy arrays - TODO confirm
with open(r'D:\dstore\tmp\5', 'rb') as f:
    tf2, rntp, drntp = load(f)

# Element-wise weights applied to every entropy matrix (symmetric 3x3).
wgts = np.array([[1, 1, 1.25], [1, 1.25, 1.5], [1.25, 1.5, 1.75]])
# Per-fragment 6-vector: row means of drntp*wgts followed by row means of
# drntp.T*wgts. Not referenced by the other visible cells.
dsc = {x[0]: np.array([*[np.mean(y) for y in x[1]*wgts], *[np.mean(y) for y in x[1].T*wgts]]) for x in drntp.items()}
# Weighted baselines taken from dsts; erw is wrapped in a list because
# relent_peaks() uses a copy of it as the start of its score list.
frw, brw, erw = dsts['fr']*wgts, dsts['br']*wgts, [dsts['lr']*wgts]
# Delta baselines: mean difference between rntp and the side baseline over all
# two-character prefix fragments ('_x') and suffix fragments ('x_').
dfrw, dbrw = [], []
for x in [x for x in tf2 if len(x) == 2 and x[0] == '_']: dfrw.append(rntp[x] - frw)
for x in [x for x in tf2 if len(x) == 2 and x[-1] == '_']: dbrw.append(rntp[x] - brw)
dfrw, dbrw = np.mean(dfrw, axis=0)*wgts, np.mean(dbrw, axis=0)*wgts
# Two 3x3 integer zero matrices; relent_peaks() uses them as leading padding.
zdre = [np.array([[0]*3]*3), np.array([[0]*3]*3)]


In [10]:
# For every split point of the word, sum the counts of all corpus words that
# contain the leading part (tally) and the trailing part (count), expressed
# relative to the count of the word itself.
w = 'transferring'
w_count = full_words[f'_{w}_']
print(w_count)
# The normaliser used to be the literal 19617, which is this word's count in
# the current corpus; looking it up keeps the ratios right for any word.
# Fall back to 1 (raw sums) when the word is absent, to avoid dividing by zero.
norm = w_count or 1
for i in range(len(w)+1):
    head, tail = w[:i], w[i:]
    tally = 0
    count = 0
    for y, n in full_words.items():
        if head in y:
            tally += n
        if tail in y:
            count += n
    print(head, tally/norm, count/norm)



19617
 72520.06479074272 1.0
t 32784.12723658052 1.0
tr 1571.1557322730284 1.0
tra 555.9628893306825 1.0
tran 109.53774787174389 1.0
trans 94.27109140031605 4.128918794922771
transf 26.666513738084316 4.68659835856655
transfe 18.73594331447214 16.879033491359536
transfer 18.701534383442933 368.02217464444107
transferr 10.850384870265586 2941.6623846663606
transferri 1.020849263393995 3590.2202681347812
transferrin 1.020849263393995 7973.263292042616
transferring 1.0 72520.06479074272


In [2]:

def tumbler(oafx):
    """Tally the fragments that appear around ``oafx`` inside other ``tf2`` fragments.

    Every ``tf2`` key containing the bare affix (underscores stripped) is split
    around it. The parts before and after the affix are kept when they are
    themselves known fragments (directly, or after reduction with gsub() for
    affixes of four or more characters), subject to per-branch length and
    count thresholds.

    Args:
        oafx: affix including its boundary marker; a leading '_' selects the
            prefix branch, anything else the suffix branch.

    Returns:
        tuple: (totals, cnt). ``totals`` holds the lengths of the three
        collected lists, the number of prefix-type and suffix-type keys, the
        sum of all tallies and the number of tallied entries shaped like a
        whole word (marker on both ends). ``cnt`` is the Counter of tallies:
        whole-word entries add int(full_words[x] ** (1/e)), others add tf2[x].

    NOTE(review): ``gsub(..., amode=True)`` in the prefix branch behaves like
    ``amode=1`` because ``True == 1``; confirm that is the intended mode.
    NOTE(review): ``Counter.total()`` requires Python 3.10 or newer.
    """

    if oafx[0] == '_': pre = True
    else: pre = False
    cnt = Counter()
    afx = oafx.strip('_')
    full_group = [x for x in tf2 if afx in x]
    # Each entry is the list of parts around the bare affix: x[0] precedes the
    # first occurrence, x[1] follows it.
    prefixes = [x.split(afx) for x in full_group if x[0] == '_']
    suffixes = [x.split(afx) for x in full_group if x[-1] == '_']
    if pre:
        pre_pf = [x[0] for x in prefixes if x[0] and x[0] != '_' and len(x[0]) > 2 and x[0] in tf2 and tf2[x[0]] > 4]
        suf_sf = [x[1] for x in suffixes if x[1] and x[1] != '_' and len(x[1]) > 4 and x[1] in tf2 and tf2[x[1]] > 4]
        if len(oafx) < 4:
            # Short affix: take the trailing part as a prefix fragment directly.
            pre_sf = [f'_{x[1]}' for x in prefixes if len(x[1]) > 3 and f'_{x[1]}' in tf2]
        else:
            pre_sf = [y for y in [gsub(f'_{x[1]}', oafx, amode=True) for x in prefixes if x[1] and len(x[1]) > 3] if y and y in tf2]
        for x in [*pre_pf, *pre_sf, *suf_sf]:
            if x[0] == '_' and x[-1] == '_':
                cnt[x] += int(full_words[x] ** (1/np.e))
            else:
                cnt[x] += tf2[x]
        totals = (len(pre_pf), len(pre_sf), len(suf_sf), len(prefixes), len(suffixes), cnt.total(), len([x for x in cnt if x[0] == '_' and x[-1] == '_']))
        return totals, cnt
    else:
        if len(oafx) < 4:
            pre_pf = [x[0] for x in prefixes if x[0] and x[0] != '_' and len(x[0]) > 4 and x[0] in tf2]
            suf_pf = [f'{x[0]}_' for x in suffixes if len(x[0]) > 2 and f'{x[0]}_' in tf2]
        else:
            # Longer affix: let gsub() reduce the leading part instead.
            pre_pf = [y for y in [gsub(x[0], oafx, amode=2) for x in prefixes if x[0] and x[0] != '_' and len(x[0]) > 4] if y]
            suf_pf = [y for y in [gsub(f'_{x[0]}', oafx, amode=1) for x in suffixes if x[0] and len(x[0]) > 2] if y]
        suf_sf = [x[1] for x in suffixes if x[1] and x[1] != '_' and len(x[1]) > 3 and x[1] in tf2 and tf2[x[1]] > 4]
        for x in [*pre_pf, *suf_pf, *suf_sf]:
            if x[0] == '_' and x[-1] == '_':
                cnt[x] += int(full_words[x] ** (1/np.e))
            else:
                cnt[x] += tf2[x]
        totals = (len(pre_pf), len(suf_pf), len(suf_sf), len(prefixes), len(suffixes), cnt.total(), len([x for x in cnt if x[0] == '_' and x[-1] == '_']))
        return totals, cnt

def grade(afx, grade=False, dbg=False):
    """Measure how often removing ``afx`` from a word leaves a known word or fragment.

    Args:
        afx: affix to evaluate.
        grade: if True return a single combined score instead of the raw
            statistics.
        dbg: print the matched (form, count) pairs.

    Returns:
        With ``grade=False``: (number of words containing the affix, number
        whose affix-less form is in ``full_words``, number whose affix-less
        fragment is in ``tf2``, word statistic, fragment statistic), where each
        statistic is mean + median/2 of the matched counts, rounded to 4
        places (1 when nothing matched).
        With ``grade=True``: the rounded product of the two partial scores.
    """
    matches = search(afx, full_words)

    reps, rep_frags = [], []
    for word in matches:
        as_word = word.replace(afx, '_')
        if as_word in full_words:
            reps.append(as_word)
        as_frag = word.replace(afx, '')
        if as_frag in tf2:
            rep_frags.append(as_frag)

    vals = [full_words[form] for form in reps]
    frag_vals = [tf2[form] for form in rep_frags]
    o1, o2, o3 = len(matches), len(reps), len(rep_frags)
    o4 = np.mean(vals) + np.median(vals)/2 if o2 > 0 else 1
    o5 = np.mean(frag_vals) + np.median(frag_vals)/2 if o3 > 0 else 1

    if dbg:
        for pair in zip(reps, vals):
            print(pair)
        for pair in zip(rep_frags, frag_vals):
            print(pair)

    if not grade:
        return o1, o2, o3, round(o4, 4), round(o5, 4)

    p1 = np.log2(o2/o1*o4) ** 2 if (o1 > 0 and o2 > 0) else 1
    p2 = o3/o1*o5 if (o3 > 0 and o4 > 0) else 1
    return round(p1 * p2, 4)


In [None]:

# --- Stage 1: length cap -------------------------------------------------
# Drop fragments longer than 9 characters (boundary marker included).
for x in [x for x in tf2 if len(x) > 9]:
    tf2.pop(x)

# --- Stage 2: fold inflected fragments into their roots -------------------
# target_removal(..., exe=True) merges each reducible fragment's count into
# its reduced form and deletes the fragment. exc1 lists fragments skipped on
# exact match, exc2 substrings that disqualify a fragment.
for x in ('less_', 'ness_'): target_removal(x, exe=True)
target_removal('es_', exc1=('es_', 's_'), exc2=('is_', 'us_', 'ss_', 'series_', 'species_'), exe=True)
target_removal('s_', exc1=('es_', 's_'), exc2=('is_', 'us_', 'ss_', 'series_', 'species_'), exe=True)
target_removal('er_', exc2=('meter_', 'over_', 'under_', 'master_'), exe=True)
# A parenthesised single string is just a string, so it is written as one.
target_removal('or_', exc2='oor_', exe=True)
target_removal('ed_', exc2='eed_', exe=True)
# 'ous_' was written 'ous' (no boundary marker). Without the marker gsub()
# replaces the affix with '_' next to the fragment's own marker, so every
# candidate carries a doubled marker and the intended merge cannot happen.
for x in ('en_', 'ly_', 'ion_', 'ous_', 'ing_', 'ity_', 'ize_', 'ise_', 'ive_', 'ist_', 'ism_', 'ory_', 'est_', 'ment_', 'ant_', 'ary_', 'ate_', 'ic_', 'al_'):
    target_removal(x, exe=True)
target_removal('y_', exc2=('ity_', 'ry_', 'ly_', 'ory_', 'ary_'), exe=True)

# --- Stage 3: frequency / whitelist pruning -------------------------------
# Drop fragments shorter than 5 characters whose count is below 8 ...
for x in [x[0] for x in tf2.most_common() if len(x[0]) < 5 and x[1] < 8]:
    tf2.pop(x)
# ... and fragments shorter than 4 characters that are not in lockg.
for x in [x for x in tf2 if len(x) < 4 and x not in lockg]:
    tf2.pop(x)

# --- Stage 4: relative-entropy pruning ------------------------------------
# Leaves of the fragment tree: fragments for which pulld() finds no children.
ends = []
for x in tf2:
    if not pulld(x):
        ends.append(x)
# Collect, for every fragment, the peak score it receives in each leaf chain
# it belongs to.
scores = {x: [] for x in tf2}
for x in ends:
    for y in relent_peaks(x):
        scores[y[0]].append(y[1])
# Per fragment: (mean score, median score, number of scores above 0.5, number of scores).
# Fragments with no scores get nan for the mean (numpy emits a RuntimeWarning);
# nan < -1.5 is False, so the and-chain below stops before the division and
# such fragments are kept.
t = {x[0]: (np.mean(x[1]), np.median(x[1]), len([y for y in x[1] if y > 0.5]), len(x[1])) for x in scores.items()}
# Remove fragments with a low mean score and few high scores, unless whitelisted.
for x in [x for x in t.items() if x[1][0] < -1.5 and x[1][2] / x[1][3] < 0.2 and x[0] not in lockg]:
    tf2.pop(x[0])


In [10]:
# sf2: all prefix fragments (leading '_') followed by all suffix fragments
# (trailing '_'), each group ordered by btk.rdx_sort.
# NOTE(review): rdx_sort is an external helper; its exact ordering is not visible here.
sf2 = [*rdx_sort([x for x in tf2 if x[0] == '_']), *rdx_sort([x for x in tf2 if x[-1] == '_'])]
# fwords: every word key, in the reverse of rdx_sort order.
fwords = rdx_sort(list(full_words))[::-1]
# rf2: sorted, de-duplicated fragment bodies (markers stripped), taken from
# fragments longer than 2 characters.
rf2 = sorted(list({x.strip('_') for x in sf2 if len(x) > 2}))

In [11]:
# Occurrence image: one row per word in fwords, one column per fragment body in
# rf2; a cell is 255 where the fragment occurs in the word and 0 otherwise.
scanmg = np.array(
    [[255 if frag in word else 0 for frag in rf2] for word in fwords],
    dtype=np.uint8,
)

In [None]:
# Render the word-by-fragment occurrence matrix as a greyscale image.
Image.fromarray(scanmg)

In [6]:
# Spot check: which fragment body sits at column 18385 of the image.
# NOTE(review): the index is specific to the current contents of rf2.
rf2[18385]

'on'

In [None]:
# Display the full ordered word list (row order of scanmg).
# NOTE(review): this prints every word; consider slicing before sharing the notebook.
fwords

In [91]:
# Collect the fragments for which grade() found nothing: both statistics
# (indices 3 and 4 of its result) equal 1, the value grade() uses when no
# affix-less word or fragment matched.
rms = []
for x in sf2:
    out = grade(x)
    if out[3] == 1 and out[4] == 1:
        rms.append(x)

In [174]:
def double_rep(afx):
    """Print, for each word containing ``afx``, the ``tf2`` fragments that contain its remainder.

    The affix is removed from every matching word; remainders of more than
    three characters are looked up as substrings of the ``tf2`` fragments and
    the matching fragments are printed. A final line prints the affix, the
    total number of fragment matches and the sum of their ``tf2`` counts.
    Nothing is returned.
    """
    hits = 0
    weight = 0
    for word in search(afx, full_words):
        stem = word.replace(afx, '')
        if len(stem) <= 3:
            continue
        sub_group = [frag for frag in tf2 if stem in frag]
        print(sub_group)
        hits += len(sub_group)
        for frag in sub_group:
            weight += tf2[frag]
    print(afx, hits, weight)

In [None]:

# Plot the entropy summaries along the affix chain of `afx`.
afx = 'ive_'
words = chain(afx)
fig, ax = plt.subplots(figsize=(16, 10))

# One row per chain member: the 6 remean() values of its weighted rntp matrix
# followed by the 6 values of its weighted drntp matrix (12 series in total).
yvars = [[*remean((rntp[w]*wgts)), *remean((drntp[w]*wgts))] for w in words]
# Final point: the prefix- or suffix-side baseline, labelled '_' on the x axis.
if afx[0] == '_': yvars.append([*remean((frw)), *remean((dfrw))])
else: yvars.append([*remean((brw)), *remean((dbrw))])
words.append('_')
plt.xticks(range(len(yvars)), words)
ax.plot(range(len(yvars)), yvars)
# Labels follow the column order of yvars: rntp summaries first, then the
# drntp ('d'-prefixed) summaries.
ax.legend(['bidir', 'dir', 'xdir', 'win3', 'win2', 'win1', 'dbidir', 'ddir', 'dxdir', 'dwin3', 'dwin2', 'dwin1'])
