In [28]:
"""
Problem:
    Algorithm used for extracting subwords did not feel satisfactory.

Goal:
    Reduce the amount of words in the dictionary by decomposing english into the components (affixes) that give meaning to a word
    Target affixes that are most frequently reused and carry meaning consistently
    Final product is a dictionary that will be able to break down any word into its components while maintaining information integrity

Terms
Affix: 
    A word or fragment of a word that carries meaning.
        - Words can be composed entirely of affixes, or out of affixes and a root, or exist only as a root
        - An affix must carry consistent meaning
        - Denoted as a prefix by an underscore on the left or as a suffix by an underscore on the right: '_pre' | 'ing_'
Parsing: 
        finding all cases of an affix, counting them and extracting from their root word appropriately, then returning the word to the word list
    OR: finding a subset of words and identifying and counting all affixes before removing the word from the word list

Subgoals:
    Create a list of affixes that compose the english language from a list of english words
    Retain as much information as possible while transitioning the word list to the affix list
    Nested affixes in the affix list should be separated
    The final affix list should be able to compose most words in english
    Words should not be broken down into affixes if it destroys the meaning of the word
    Words should remain as whole words regardless of length if it cannot be broken down
    The algorithm should accomplish its goal with very little manual intervention

Procedures:
    1) Isolate commonly used words that are affixes to manually parse. Words for orientation are very common (s_ ing_ ed_ est_ er_ | _up _down _over _near _side _under)
    2) Create rules for affix extraction so that the remaining word will be in the most commonly found state (extract ing_ from _writing_ should yield _write_ not _writ_)
    3) Manually parse all contractions (words with ' (I'll / it's)) and remove all remaining words with apostrophes
    4) Create fragments of words by sliding windows of size 2..9 over a word (observing a word fragment) and adding the word count to the tally for that fragment
    5) Parses occurring more than 3000k times are moved to the end dictionary
    6) Parses occurring more than 100k times with 4 or fewer letters are moved to the end dictionary
    7) Parses with 3 or fewer letters are moved to the end dictionary
    8) Create a subset of single character affixes that were manually identified, but keep them in the affix list
    9) Remove affixes that have no vowels (other than the single letter affixes) as they do not carry meaning
    10) Filter out non-affix fragments (no _ indicator) that are length 2 or less
    11) Filter out non-affix fragments that do not have an affixed version (mip never occurs at the beginning or end of a word in the vocab and is not likely to be an affix)
    12) Filter out non-affix fragments when the sum of their affixed versions occurs far more frequently
    13) Apply affix counting algorithm on the affix list to amplify signal of true affixes
    14) Iterate through fragments, filtering for fragments that only appear in 2 or less words, and remove from the affix list
    15) SVD:
            Create a matrix where each column represents an affix and each row represents a word.
            Each element will be binary. 1 indicating that a affix is present in a word, 0 indicating not present

Problems:
    Identifying the appropriate cutoff point for an affix series 's_', 'es_', 'ities_' all carry meaning and occur uniquely but 'ties_' does not
        Cannot cut off all branches after finding a fragment that carries no information
    Counts are not reliable in all cases. Affixes that carry meaning could occur infrequently but consistently.
    False extractions occur 't_' is an affix for _burnt_ but not for _beat_. How to discriminate?
    False extractions can stop the proper extraction from occurring. Extracting s_ from _viruses_ leaves _viruse_ which means es_ cannot be extracted
    Nested ruled affixes. 'ities_' is two affixes. 'ies_' and 'ity_'
    Uncertainty. is '_a' or '_ab' the affix for '_abbreviation_'
    Count uncertainty. 'ng_' occurs more than 'ing_' but 'ing_' extracts leaving a coherent root word

Tasks
Eliminate nested affixes
Ruled replacement
Affix Tree

"""

import numpy as np
import matplotlib.pyplot as plt
import os
from pickle import load, dump
from collections import Counter
from tqdm import tqdm
from typing import Iterable, Container, Protocol
from btk import cprint, fzip, rdx_sort, hprint, lrsort, rrsort

# Apply the matplotlib style named by the `style` environment variable. Fall back to
# matplotlib's built-in 'default' style so the cell still runs on a fresh kernel where
# the variable has not been exported (the bare os.environ[...] lookup raised KeyError).
plt.style.use(os.environ.get('style', 'default'))

def setup():
    """
    Build an AffixAnalyzer from a saved state and apply the hand-curated word lists to it.

    Reads the frequency word file, constructs the analyzer with load_id 3, then edits
    ``a.wlst`` in place from three plain-text lists (one entry per line):
        nroots -> returned as ``roots``
        rmls   -> words removed from ``a.wlst``
        als    -> words added to ``a.wlst`` with a count of 100 (already-present ones are printed)

    Returns:
        tuple: (analyzer, roots, nfx) where ``nfx`` is the second object unpickled from ``awork4``.
    """
    with open(r'D:\dstore\nlp\w2v\fwords', 'rt') as f:
        rows = [line.strip().split() for line in f.readlines()]
        a = AffixAnalyzer(rows, 3)

    # Copy the cleared-tally counts of these two-letter words back into the working word list.
    for short in 'id ax ox ab op ex by on to in'.split():
        key = f'_{short}_'
        a.wlst[key] = a.cleared[key]

    # Disabled experiments kept for reference:
    #   for x in [x for x in a.cleared.most_common() if len(x[0]) > 3 and x[1] < 20500000 and x[1] > 100000]: a.wlst[x[0]] = x[1]
    #   with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\manu{int(2)}', 'rb') as f:
    #       wrd_q, a.wlst, a.roots, a.afxscore, a.wparts, a.failed_brk = load(f)

    # NOTE: pickle executes arbitrary code on load; only open files produced by this project.
    with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\awork4', 'rb') as f:
        _, nfx = load(f)  # the pickled roots are superseded by the nroots text file below

    with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\nroots', 'rt') as f:
        roots = [line.strip() for line in f.readlines()]

    with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\rmls', 'rt') as f:
        #sorted([(x, a.full_scores[f'_{x}_']) for x in rmls if f'_{x}_' in a.full_scores], key=lambda x: x[1])[::-1]
        for word in [line.strip() for line in f.readlines()]:
            key = f'_{word}_'
            if key in a.wlst:
                a.wlst.pop(key)

    with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\als', 'rt') as f:
        for key in [f'_{line.strip()}_' for line in f.readlines()]:
            if key in a.wlst:
                print(key, a.wlst[key])
            else:
                a.wlst[key] = 100

    return a, roots, nfx

def merger(results):
    """
    Merge several (affix_chain, word_chain) results position by position.

    Each result is a pair whose second element is one item longer than the part of the
    first element that is read. For every position ``c`` the distinct items found there
    (in order of first appearance, longest chains first) are collected; a position with
    one distinct item stores it directly, a position with several stores a tuple.

    Args:
        results: Iterable of pairs ``(x0, x1)``; only ``x0[c]`` for ``c < len(x1) - 1`` and
            ``x1[c]`` are read.

    Returns:
        tuple[list, list]: (merged first elements, merged second elements), both reversed.
        An empty input yields ``([], [])`` (previously raised IndexError).
    """
    ordered = sorted(((x[0], x[1]) for x in results), key=lambda x: len(x[1]), reverse=True)
    if not ordered:
        return ([], [])

    r1o, r2o = [], []
    # The longest second element (sorted first) bounds the number of positions to visit.
    for c in range(len(ordered[0][1])):
        r1, r2 = [], []
        for first, second in ordered:
            if c < len(second) and second[c] not in r2:
                r2.append(second[c])
            if c < len(second) - 1 and first[c] not in r1:
                r1.append(first[c])
        if r2:
            r2o.append(tuple(r2) if len(r2) > 1 else r2[0])
        if r1:
            r1o.append(tuple(r1) if len(r1) > 1 else r1[0])
    return (r1o[::-1], r2o[::-1])

def packer(results, idx, cid=None):
    """
    Map every non-final word of the matching results to ``[root, remaining_affixes]``.

    ``cid`` is an optional selector string consumed two characters at a time:
        '1<n>'  -> descend into ``results[n]``; any remaining characters are then treated
                   as a '2' selector
        '2<n>'  -> drop the first ``n`` entries of ``idx``; if the new first entry is a tuple
                   and the next character is '0' or '1', that character picks the tuple member
    Any other leading character ends selection.

    Args:
        results: Sequence of ``(affixes, words)`` pairs (or a sequence of such sequences
            when a '1' selector is used).
        idx: Sequence whose first entry (after selection) is the root word.
        cid (str, optional): Selector string described above.

    Returns:
        dict | bool: ``{word: [root, [affixes...]]}``, or False when the selector is out of
        range, ``idx`` is empty, or the selected root is still a tuple.
    """
    while cid:
        if cid[0] == '1':
            if len(cid) < 2 or int(cid[1]) >= len(results): return False
            results = results[int(cid[1])]
            cid = cid[2:]
            if cid: cid = f'2{cid}'
        elif cid[0] == '2':
            # '>=' (was '>'): slicing at len(idx) leaves nothing to use as a root and the
            # idx[0] lookups below would raise IndexError.
            if len(cid) < 2 or int(cid[1]) >= len(idx): return False
            # list(...) guarantees a private, mutable copy even when idx is a tuple.
            idx = list(idx[int(cid[1]):])
            cid = cid[2:]
            if cid and isinstance(idx[0], tuple) and cid[0] in ('0', '1'):
                idx[0] = idx[0][int(cid[0])]
                cid = ''
        else: cid = ''

    if len(idx) == 0: return False
    if isinstance(idx[0], tuple):
        print('roots must be a word not a tuple of words')
        return False

    root, out = idx[0], {}
    # Keep only results that contain the root, trimmed to the depth described by idx.
    trimmed = [(x[0][:len(idx)-1], x[1][:len(idx)]) for x in results if root in x[1]]
    for affixes, words in trimmed:
        for j, word in enumerate(words[:-1]):
            if word not in out:
                out[word] = [root, list(affixes[j:])]
    return out

def usk_rep(wlst, ukdct=None):
    """
    Fold British spellings into their American counterparts inside a word-count mapping.

    For every word the rules below are tried in order; the first rule whose rewritten
    spelling already exists in ``wlst`` wins: the count is added to that spelling and the
    original word is removed. Stopping at the first merge fixes a crash in the previous
    version, where a word matching two rules was popped twice (KeyError). A rule whose
    replacement leaves the word unchanged is skipped, so a word is never added to itself
    and then deleted.

    Args:
        wlst: Mutable mapping word -> count (modified in place).
        ukdct (optional): Pre-loaded ``(ouw, aew, rew, oew)`` trigger collections. When
            omitted they are unpickled from the project's ``ukdct`` file.
            NOTE: pickle executes arbitrary code on load; only open trusted files.

    Returns:
        The same ``wlst`` object.
    """
    if ukdct is None:
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\ukdct', 'rb') as f:
            ukdct = load(f)
    ouw, _aew, rew, oew = ukdct  # the 'ae' -> 'e' rule (aew) is intentionally disabled

    # (old, new, triggers): with triggers the rule fires when any trigger is a substring of
    # the word; with None it fires when `old` itself is a substring.
    rules = (
        ('ou', 'o', ouw),
        ('re', 'er', rew),
        ('oe', 'e', oew),
        ('ise', 'ize', None),
        ('isa', 'iza', None),
        ('isi', 'izi', None),
        ('logue', 'log', None),
        ('logu', 'log', None),
    )
    for x in list(wlst):  # snapshot: wlst shrinks while we iterate
        for old, new, triggers in rules:
            fires = (old in x) if triggers is None else any(y in x for y in triggers)
            if not fires:
                continue
            target = x.replace(old, new)
            if target != x and target in wlst:
                wlst[target] += wlst[x]
                wlst.pop(x)
                break
    return wlst

def edge_scan(words, side, depth=0, thresholds=None, merge=True):
    """
    Count the fragments found at one edge of every word and return the frequent ones.

    Args:
        words: Sequence of strings. The first entry is skipped when counting (as in the
            original implementation — presumably a header row; TODO confirm with callers).
        side (str): 'r' to scan word endings, 'l' to scan word beginnings.
        depth (int, optional): Longest fragment length to count (>= 2). 0 (default) derives
            it from the average word length. Previously the default was rejected by the
            validation, making the auto-depth code unreachable.
        thresholds (Container[int], optional): Minimum counts per fragment length, starting
            at length 2; missing lengths default to 3. When omitted every length uses 2.
            ``True`` is rejected by validation (the former ``thresholds == True`` branch was
            unreachable and referenced an undefined name, so it has been removed).
        merge (bool, optional): Collapse vowel-extended variants of a consonant-edged
            fragment into one marked entry ('_ng' for side 'r', 'st_' for side 'l') and
            drop nested fragments with (nearly) the same count as the fragment containing
            them... see inline comments.

    Returns:
        list[tuple[str, int]]: (fragment, count) pairs, most common first.

    Raises:
        ValueError: On an invalid side, depth or thresholds argument.
    """
    if side not in ('r', 'l'): raise ValueError('Invalid Side')
    if not isinstance(depth, int) or (depth != 0 and depth < 2): raise ValueError("Invalid Depth")
    if thresholds:
        if not isinstance(thresholds, Container) or any(not isinstance(y, int) for y in thresholds): raise ValueError("Invalid Thresholds")
    if len(words) == 0: return []  # nothing to scan; also avoids averaging an empty list
    if not depth:
        depth = int(np.average([len(x) for x in words]))
        depth += (2 if depth > 4 else 1)
    else: depth += 1  # depth becomes the exclusive upper bound of range() below

    vowels = ('a', 'e', 'i', 'o', 'u')
    ecnt = Counter()
    for w in words[1:]:
        for i in range(2, depth):
            if i > len(w): break  # a window longer than the word would recount the whole word
            tgt = (w[-i:] if side == 'r' else w[:i])
            if ' ' in tgt: break
            else: ecnt[tgt] += 1
    floor = 3 if thresholds else 2
    for frag in [f for f, n in ecnt.items() if n < floor]: ecnt.pop(frag)

    if merge:
        # Pass 1: a consonant-edged fragment whose one-letter vowel extensions jointly account
        # for (almost) all of its occurrences absorbs those extensions.
        fltr, vmerge = [], []
        for frag, cnt in ecnt.most_common():
            if (frag[0] if side == 'r' else frag[-1]) in vowels: continue
            matches = [y for y in ecnt if frag in y and len(y) == len(frag)+1 and (y[0] if side == 'r' else y[-1]) in vowels]
            if len(matches) > 1:
                limit = cnt*0.85 if cnt >= 50 else (cnt-5 if cnt > 12 else cnt-3)
                if sum(ecnt[y] for y in matches) > limit:
                    fltr.extend(matches)
                    vmerge.append((frag, cnt))
        for frag in fltr: ecnt.pop(frag, None)

        # Pass 2: mark longer fragments whose count is (nearly) as high as that of a
        # fragment they contain; they are excluded from the result.
        fltr = []
        for frag, cnt in ecnt.most_common():
            matches = [y for y in ecnt if frag != y and frag in y and y not in fltr]
            for y in matches:
                if cnt == ecnt[y]:
                    fltr.append(y)
                    break
                if ecnt[y] > (cnt*0.9 if cnt >= 50 else (cnt-3 if cnt > 12 else cnt-2)):
                    fltr.append(y)

    if thresholds and isinstance(thresholds, Container):
        thresholds = {i+2: x for i, x in enumerate(thresholds)}
        for i in range(len(thresholds)+2, depth+1):
            thresholds[i] = 3
    else: thresholds = {i: 2 for i in range(2, depth+1)}

    if merge:
        for frag, cnt in vmerge:
            if frag in ecnt:
                ecnt.pop(frag)
                ecnt[(f'_{frag}' if side == 'r' else f'{frag}_')] = cnt
        return [x for x in ecnt.most_common() if x[0] not in fltr and x[1] >= thresholds[len(x[0])]]
    else: return [x for x in ecnt.most_common() if x[1] >= thresholds[len(x[0])]]



class AffixAnalyzer:

    def __init__(self, words, load_id: int=0):
        """
        Set up the lookup tables and either restore a saved state or build one from `words`.

        Args:
            words: Sequence of (count, word) pairs; read only when `load_id` is falsy.
                It is traversed in reverse order and each word is wrapped as '_word_'.
            load_id (int, optional): When non-zero, state is restored through self.load(load_id)
                instead of being computed from `words`. Defaults to 0.
        """
        # Letter / affix lookup sets used by the extraction rules.
        # NOTE(review): the meaning of 'fdbl', 'bdbl' and 'readdf' is not visible in this
        # chunk — presumably consonant-doubling rule sets; confirm against their users.
        self.ldct = {
            'alpha': {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'},
            '0afx': {'s_', 'd_', 'r_', 'n_', 't_', 'y_'},
            '1afx': {'s_', 'd_', 'r_', 'n_', 't_', 'y_', '_a', 'a_', '_o', 'o_', 'i_', '_e'},
            '2afx': {
                    "_de", "_di", "_bi", "_co", "_en", "_in", "_re", "_un", 
                    "_ab", "_ad", "_be", "_ec", "_em", "_ex", "_im", "_ob", 
                    "er_", "es_", "ed_", "ic_", "al_", "ry_", "ly_", 
                    "ar_", "cy_", "ee_", "en_", "ia_", "ie_", "or_", "um_",
                    "_up", "up_", "_on", "_by"
            },
            'fdbl': {'b', 'c', 'd', 'f', 'g', 'l', 'm', 'n', 'p', 'r', 's', 't'},
            'bdbl': {'b', 'd', 'g', 'm', 'n', 'p', 'r', 't'},
            'vwl1': {'a', 'e', 'i', 'o', 'u', 'y'},
            'vwl2': {'a', 'e', 'i', 'o', 'u'},
            'vwl3': {'a', 'e', 'o', 'i', 'y'},
            'vwl4': {'a', 'e', 'o', 'u'},
            'vwl5': {'a', 'o', 'i', 'u'},
            'vwl6': {'i', 'u'},
            'vwl7': {'e', 'o', 'a'},
            'readdf': {'_ill', '_app', '_aff', '_irr', '_att', '_agg', '_opp', '_ass', '_all', '_ann', '_eff', '_acc'},
            'bridges': {'emat', 'isat', 'izat', 'ibil', 'ula', 'at', 'an', 'ar', 'ti', 'a', 'e', 'i', 'o', 'u'}
        }
        # Hand-verified affixes ('_x' = prefix, 'x_' = suffix).
        self.verif = [
            '_electr', '_econom', '_neuro', '_hydro', '_chrom', '_onto', '_onco', '_kine', '_lys', '_eco', '_bio', '_geo',
            '_super', '_supra', '_under', '_trans', '_ortho', '_intra', '_inter', '_vert', '_over', '_fore', '_sup', '_sub', '_out', '_off', '_mid',
            '_multi', '_micro', '_hyper', '_hypo', '_semi', '_poly', '_mono', '_uni', '_iso', '_lat', '_dia',
            '_counter', '_pseudo', '_para', '_meta', '_auto', '_anti', '_pro', '_pre', '_non', '_mis', '_epi', '_sym', '_con', '_com', '_ant', '_ana', '_dis',
            'cide_', 'ment_', 'logy_', 'cian_', 'less_', 'ness_', 'ance_', 'ence_', 'able_', 'ible_', 'ular_',
            'tion_', 'sion_', 'ing_', 'ism_', 'ish_', 'ist_', 'ise_', 'ize_', 'ive_', 'ium_', 'ian_', 'ile_', 
            'ate_', 'ant_', 'ent_', 'est_', 'eum_', 'ean_', 'eur_', 'our_', 'ous_', 'oid_', 'sis_', 'ful_', 
            '_opp', '_irr', '_ill', '_eff', '_att', '_ass', '_app', '_all', '_agg', '_aff', '_acc', '_ad', 
            '_ab', '_an', '_ob', '_ec', '_en', '_ex', '_em', '_in', '_im',
            'es_', 'er_', 'or_', 'ed_', 'ic_', 'al_', 'fy_', 'ty_', 'cy_', 'ly_', 'ry_', 'ia_', 'ie_', 'um_', 'en_', 'ar_', 'ee_',
            '_up', '_on', '_by', '_be', 'up_', '_un', '_re', '_di', '_de', '_co', '_bi', '_e', 'd_', 'r_', 's_', 'y_'
        ]
        # Hand-picked root words, wrapped in underscores like every entry of the word list.
        self.roots = [
            '_lymph_', '_metre_', '_meter_', '_metry_', '_graph_', '_photo_', '_sume_', '_cede_', '_ceed_', '_ecto_', '_tone_', '_fish_', '_form_', 
            '_ship_', '_man_', '_men_', '_var_', '_max_', '_min_', '_lyr_', '_gress_', '_cess_', '_fess_', '_press_'
        ]
        # Verified affixes longer than 3 characters (marker underscore included).
        self.cterms = {x for x in self.verif if len(x) > 3}
        # Union of every pre-verified affix: the 1- and 2-letter tables plus self.verif.
        self.averif = {*self.ldct['2afx'], *self.ldct['1afx'], *self.verif}
        self.cleared, self.failed_brk, self.final = Counter(), Counter(), Counter()
        self.afxscore, self.wparts = dict(), []
        self.dbg = False
        if load_id:
            # self.load is defined outside this chunk; it is expected to provide
            # self.full_scores, which the next two lines read.
            self.load(load_id)
            self.bare = {x.strip('_') for x in self.full_scores}
            self.default_search = self.full_scores
        else:
            # Build everything from the raw (count, word) pairs. The calls below depend on
            # each other's side effects and must stay in this order.
            self.wlst = Counter({f'_{x[1]}_': int(x[0]) for x in words[::-1]})
            self.full_scores = self.wlst.copy()
            self.default_search = self.full_scores
            self.bare = {x.strip('_') for x in self.full_scores}
            self.pre_clean()
            self.create_afx('w', 11, 2)
            self.post_clean('w')
            self.prep_entropy_calc()

    def create_afx(self, method='w', vmax: int=0, vmin: int=0, rmax: int=7) -> None:
        """
        Create list of affixes via 1 of 2 methods.
        w: Window method moves a window of various sizes over all words and counts occurances of affixes
        r: Remainder method for all words, find nested words and removes the inner word from all containing words and counts the remaining affixes

        w ex: '_retracting_' -> ret, etr, tra, rac, act, cti, tin, ing
        r ex: 'firm' | 'reaffirmed', 'confirming' -> reaf, ed, con, ing

        Args:
            vmax (int, optional): Maximum window size for w method || Maximum inner word length for r method. Defaults to 10 for w | 12 for r.
            vmin (int, optional): Minimum window size for w method || Minimum inner word length for r method. Defaults to 2 for w | 4 for r.
            rmin (int, optional): Minimum word length for outer words. r method only. Defaults to 7.
        """
        self.afx = Counter()
        if method == 'w':
            if not vmax: vmax = 10
            if not vmin: vmin = 2

            for word in self.wlst:
                wln = min(len(word), vmax)
                for n in range(vmin, wln-1):
                    for pos in range(wln-n+1):
                        self.afx[word[pos:pos+n]] += self.wlst[word]

        elif method == 'r':
            if not vmax: vmax = 12
            if not vmin: vmin = 4

            smalls = {x for x in self.wlst if len(x) > vmin and len(x) < vmax}
            bigs = {x for x in self.wlst if len(x) > rmax}
            for x in tqdm(smalls):
                x = x.strip('_')
                group = [y for y in bigs if x in y]
                for y in group:
                    out = y.split(x)
                    if len(out) > 2:
                        out.append(f'_{out[1]}')
                        out.append(f'{out[1]}_')
                        out.pop(1)
                    for z in out:
                        if z not in ('_', ''):
                            self.afx[z] += 1
        print('Frags Created', len(self.afx))

    def pre_clean(self) -> None:
        #Remove ' words from word list
        for x in ["_ain't_", "_can't_", "_won't_", "_shan't_"]:
            self.final["n't_"] += self.wlst[x]
        for x in ["_i'm_", "_can't_"]:
            self.wlst[f'{x[:-3]}_'] += self.wlst[x]
        for x in ["_ma'am_", "_ain't_", "_i'm_"]:
            self.final[x] += self.wlst[x]
        for x in ["_ain't_", "_can't_", "_won't_", "_shan't_", "_ma'am_", "_i'm_", "_van't_"]:
            self.wlst.pop(x)
        for x in [z for z in self.wlst if "'" in z if any(y in z for y in ["'s_", "'ll_", "'ve_", "n't_", "'re_", "'d_"])]:
            for efx in ["'s_", "'ll_", "'ve_", "n't_", "'re_", "'d_"]:
                if efx in x:
                    self.final[efx] += self.wlst[x]
                    self.wlst[x.replace(efx, '_')] += self.wlst[x]
                    self.wlst.pop(x)
        for x in [z for z in self.wlst if "'" in z]:
            self.wlst.pop(x)

    def post_clean(self, regime: str, cdist: int=0, cmin: int=0) -> None:
        """
        Clean affix list, remove noise and words that will be tokenized as wholes.

        Works in place on self.wlst, self.cleared and self.afx, then runs a fixed series of
        self.target_removal calls (target_removal is defined outside this chunk).

        Args:
            regime (str): 'w' or 'c'; selects the defaults below. Any other value leaves
                cdist and cmin as passed (0 by default).
            cdist (int, optional): Distance to search from a sorted affix list for nested affix cleaning. Defaults to 2048 for 'w' | 512 for 'c'.
            cmin (int, optional): Minimum occurrences to keep an affix in the list. Defaults to 32 for 'w' | 8 for 'c'.
        """
        if regime == 'w':
            if not cdist: cdist = 2048
            if not cmin: cmin = 32
        elif regime == 'c':
            if not cdist: cdist = 512
            if not cmin: cmin = 8

        # Move very frequent words to self.cleared. most_common() is sorted by count, so
        # the scan stops at the first word below 100k.
        for x in self.wlst.most_common():
            if x[1] > 3000000: self.cleared[x[0]] = x[1] #Words with more than 3M occurrences
            elif x[1] > 100000 and len(x[0]) < 6: self.cleared[x[0]] = x[1] #Words of 3 letters or less (len counts the 2 underscores) with more than 100k occurrences
            elif x[1] < 100000: break
        # NOTE(review): a word of 2 letters or less that was already stored above gets its
        # count added a second time by the '+=' below — looks like double counting; confirm.
        for z in [y for y in self.wlst if len(y) < 5]: #Words 2 letters or less
            if self.wlst[z] > cmin: self.cleared[z] += self.wlst[z] 
            else: self.wlst.pop(z)
        for z in self.cleared: #Remove cleared words
            if z in self.wlst: self.wlst.pop(z)

        #Scan from most common to least, if a nested affix is found within cdist items, subtract that items value from the current affix
        afidx = [x[0] for x in self.afx.most_common() if '_' in x[0]]
        for i, x in enumerate(afidx):
            group = [y for y in afidx[i+1:i+1+cdist] if x in y]
            if group: self.afx[x] -= self.afx[group[0]]
        # Keep only affixes whose (adjusted) count is above cmin.
        self.afx = Counter({x[0]: x[1] for x in self.afx.most_common() if x[1] > cmin})

        for z in [x for x in self.afx if '_' not in x and len(x) < 3]:
            self.afx.pop(z) #Unattached affixes < 3 chars
        for z in [x for x in self.afx if not any(y in x for y in self.ldct['vwl1']) and x not in self.ldct['1afx'] and x not in self.ldct['2afx']]:
            self.afx.pop(z) #Affixes with no vowels (y counts as a vowel here)
        for z in [x for x in self.afx if len(x) < 4 and '_' in x and x not in self.averif]:
            self.afx.pop(z) #Attached affixes of 2 letters or less that arent in pre verified list
        # NOTE(review): this drops unattached fragments that DO have an attached variant (or
        # exist as a whole word); step 11 of the notes at the top of the file describes the
        # opposite filter — confirm which is intended.
        for z in {x[0] for x in [(x, f'_{x}', f'{x}_', f'_{x}_') for x in self.afx if '_' not in x] if (x[1] in self.afx or x[2] in self.afx or x[3] in self.cleared or x[3] in self.final or x[3] in self.wlst)}:
            self.afx.pop(z) #Unattached affixes that have an attached variant
        # Drop affixes found in 3 words or fewer: the for/else pops only when the inner
        # loop finishes without reaching a 4th match.
        for z in tqdm([x for x in self.afx]):
            c = 0
            for y in self.wlst:
                if z in y: c += 1
                if c > 3: break
            else:  self.afx.pop(z)

        # NOTE(review): ('oor_') and ('eed_') below are plain strings, not 1-tuples, and 'ous'
        # has no underscore unlike its neighbours — confirm target_removal handles both as intended.
        for x in ('less_', 'ness_'): self.target_removal(x, exe=True)
        self.target_removal('es_', exc1=('es_', 's_'), exc2=('is_', 'us_', 'ss_', 'series_', 'species_'), exe=True)
        self.target_removal('s_', exc1=('es_', 's_'), exc2=('is_', 'us_', 'ss_', 'series_', 'species_'), exe=True)
        self.target_removal('er_', exc2=('meter_', 'over_', 'under_', 'master_'), exe=True)
        self.target_removal('or_', exc2=('oor_'), exe=True)
        self.target_removal('ed_', exc2=('eed_'), exe=True)
        for x in ('en_', 'ly_', 'ion_', 'ous', 'ing_', 'ity_', 'ize_', 'ise_', 'ive_', 'ist_', 'ism_', 'ory_', 'est_', 'ment_', 'ant_', 'ary_', 'ate_', 'ic_', 'al_'): self.target_removal(x, exe=True)
        self.target_removal('y_', exc2=('ity_', 'ry_', 'ly_', 'ory_', 'ary_'), exe=True)

    def _prep_afx(self) -> None:
        with open(r'D:\dstore\nlp\w2v\directions', 'rb') as f:
            directions = pickle.load(f)
        self.afx['_a'] += self.wlst["_around_"]
        self.afx["_round"] += self.wlst["_around_"]
        self.afx["_o"] += self.wlst["_over_"]
        self.afx["_ver"] += self.wlst["_over_"]
        for x in directions[0]:
            self.afx[x[:-1]] += self.wlst[x]
        for x in directions[2]:
            self.afx[x[:-1]] += self.wlst[x]
            self.afx[x[1:]] += self.wlst[x]
        for x in directions[1]:
            self.afx[x[1:]] += self.wlst[x]
        for y in directions:
            for x in y:
                if len(x) < 5:
                    self.final[x] += self.wlst[x]
                    self.wlst.pop(x)
                else:
                    self.cleared[x] += self.wlst[x]
                    self.wlst.pop(x)

    def prep_entropy_calc(self, over_length: int=7, pull_cutoff: int=2, wgts: list[list[int]]=[[1, 1, 1.25], [1, 1.25, 1.5], [1.25, 1.5, 1.75]]) -> None:
        """
        Goal: 
            To filter out fragments of affixes from whole affixes
        Hypothesis: 
            The distribution of letters adjacent to an affix will help determine whether an affix is whole or not.
            Partial affixes will have a much more concentrated (lower entropy) neighbour distribution in at least one of the
            measurements because the letter that completes the affix will dominate the distribution.
            Limiting the sampling window size will amplify the effect.
        Example: 
            ng_ is a partial affix of ing_.
            When sampling letters to the left of ng_, the letter 'i' dominates the distribution.
            When the window size is set to 1, the distribution of letters will be almost entirely 'i', giving a very high
            relative entropy value against the baseline distribution.
            In contrast, the distribution for ing_ will be much closer to the general distribution of the whole data set.
        Notes:
            Relies on self.surrounds, self.kld, self.pulld and self.pullu, which are defined outside this chunk.
            Presumably kld is a KL-divergence (relative entropy) and pulld / pullu return the longer / shorter
            affixes of a chain — TODO confirm against their definitions.
        Args:
            over_length (int, optional): Maximum length for standard affixes. Affixes longer than this value will have their character distributions separated. Should be slightly over half the length of the average word. Defaults to 7.
            pull_cutoff (int, optional): Maximum difference in affix length when searching for parent / child affix nodes. Defaults to 2.
            wgts (list[list[int]], optional): 3x3 Weight matrix for scaling direction and window size of relative entropy calculations. Defaults to [[1, 1, 1.25], [1, 1.25, 1.5], [1.25, 1.5, 1.75]]. Never mutated (it is copied into a numpy array).
        Returns:
            None. The re_arr, dsts, rntp and drntp dictionaries are stored on the instance.
        """
        dsts, pd, sd, nd = {}, Counter(), Counter(), Counter()
        for x in self.wlst: # Get letter distributions for: letters in front half of words, letters in back half of words, all letters
            x = x.strip('_')
            for l in x: nd[l] += 1
            i = round((len(x)+0.1) / 2) # half the word length; the +0.1 makes odd lengths round up
            for l in x[:i]: pd[l] += 1
            for l in x[-i:]: sd[l] += 1
        # Normalise the three tallies into relative frequencies.
        pd = Counter({x[0]: x[1] / pd.total() for x in pd.most_common()})
        sd = Counter({x[0]: x[1] / sd.total() for x in sd.most_common()})
        nd = Counter({x[0]: x[1] / nd.total() for x in nd.most_common()})

        rntp = {} # Calculate the relative entropy of letters adjacent to an affix
        for x in tqdm(self.afx): 
            hold = []
            if '_' not in x:
                # Unattached fragment: compare against the all-letter distribution.
                fd = nd
                for i in range(1, 4): #Define sampling window size
                    o1, o2 = self.surrounds(x, i)
                    hold.append([self.kld(self.surrounds(x, i, merge=True), fd), self.kld(o1, fd), self.kld(o2, fd)])
            else:
                if x[0] == '_': pre = True
                else: pre = False
                # Long prefixes and normal-length suffixes use the back-half distribution;
                # normal-length prefixes and long suffixes use the front-half distribution.
                if (pre and len(x) > over_length) or (not pre and len(x) <= over_length): fd = sd
                else: fd = pd
                for i in range(1, 4):
                    o1, o2 = self.surrounds(x, i) # Define direction of window here
                    if pre: side = o2 
                    else: side = o1
                    hold.append([self.kld(self.surrounds(x, i, merge=True), fd), self.kld(side, fd), self.kld(self.surrounds(x, i, exact=True), fd)])
            # 3x3 array: rows = the three measurements, columns = window size 3, 2, 1.
            rntp[x] = np.array(hold[::-1]).T

        re_arr, hold = {}, [] # Get letter distributions for the first and last: 1, 2, 3 letters
        for i in range(1, 4):
            frel, brel = Counter(), Counter()
            for x in self.wlst:
                if len(x) >= 3+i:
                    x = x.strip('_')
                    for l in x[-i:]: brel[l] += 1
                    for l in x[:i]: frel[l] += 1
            hold.append([self.kld(frel, pd), self.kld(brel, sd)])
            dsts[f'pd{i}'] = Counter({x[0]: x[1] / frel.total() for x in frel.most_common()})
            dsts[f'sd{i}'] = Counter({x[0]: x[1] / brel.total() for x in brel.most_common()})
        # Repeat each 3-value row three times so the baselines have the same 3x3 shape as rntp entries.
        hold = [[x]*3 for x in np.array(hold[::-1]).T]
        #These will be used for calculating the derivatives of root/leaf affixes
        re_arr['pd3'], re_arr['sd3'] = np.array(hold[0]), np.array(hold[1])
        # Mean relative entropies of the affixes longer than over_length (prefixes, then suffixes).
        re_arr['lpd3'], re_arr['lsd3'] = np.array([rntp[x] for x in self.afx if len(x) > over_length and x[0] == '_']).mean(axis=0), np.array([rntp[x] for x in self.afx if len(x) > over_length and x[-1] == '_']).mean(axis=0)

        drntp = {}
        for x in tqdm(self.afx): # Derivative of relative entropy values along a sequential chain of affixes
            if '_' in x: #Chains can only be formed with positional affixes
                above, below = self.pulld(x, pull_cutoff), self.pullu(x)
                if above: above = np.array([rntp[y] for y in above]).mean(axis=0)
                else: #If nothing is found within pull_cutoff, retry without a cutoff, then fall back to the averaged relative entropies of affixes longer than over_length
                    above = self.pulld(x)
                    if above: above = np.array([rntp[y] for y in above]).mean(axis=0)
                    elif x[0] == '_': above = re_arr['lpd3']
                    elif x[-1] == '_': above = re_arr['lsd3']
                if below: below = rntp[below]
                elif x[0] == '_': below = re_arr['pd3']
                else: below = re_arr['sd3']
                middle = rntp[x]
                # Second difference along the chain: (above - middle) - (middle - below).
                drntp[x] = (above-middle) - (middle-below)

        #Find the average derivatives for edge cases
        wgts = np.array(wgts)
        re_arr['lpd3x'], re_arr['lsd3x'] = re_arr['lpd3']*wgts, re_arr['lsd3']*wgts
        re_arr['pd3x'], re_arr['sd3x'] = re_arr['pd3']*wgts, re_arr['sd3']*wgts
        re_arr['dlpd3'], re_arr['dlsd3'] = np.mean([(rntp[x] * wgts) - re_arr['lpd3x'] for x in self.afx if x[0] == '_' and len(x) > over_length], axis=0), np.mean([(rntp[x] * wgts) - re_arr['lsd3x'] for x in self.afx if x[-1] == '_' and len(x) > over_length], axis=0)
        re_arr['dpd3'], re_arr['dsd3'] = np.mean([(rntp[x] * wgts) - re_arr['pd3x'] for x in self.afx if x[0] == '_' and len(x) <= over_length], axis=0), np.mean([(rntp[x] * wgts) - re_arr['sd3x'] for x in self.afx if x[-1] == '_' and len(x) <= over_length], axis=0)
        dsts['pd'], dsts['sd'], dsts['nd'] = pd, sd, nd
        re_arr['wgts'], re_arr['null'] = wgts, [np.array([[0]*3]*3), np.array([[0]*3]*3)]
        self.re_arr, self.dsts, self.rntp, self.drntp = re_arr, dsts, rntp, drntp

    def get_compounds(self):
        combos = {}
        for x in self.wlst:
            front, back = [], []
            for y in self.wlst:
                if y != x:
                    f, b = y[1:], y[:-1]
                    if f in x and (len(x) - x.index(f) - len(f)) == 0:
                        front.append(f)
                    if b in x and x.index(b) == 0:
                        back.append(b)
            veri = []
            if front and back:
                for m in front:
                    for n in back:
                        if len(m) + len(n) == len(x):
                            veri.append((m, n))
            if veri:
                combos[x] = tuple(veri)
        return combos

    def search(self, term: str, corpus: Container=None, exc: str|tuple[str]=None, pos: bool=False, sfil=False, svar='i'):
        #Returns all items that contain the input affix
        if not corpus: corpus = self.default_search
        if svar == 's': res = [x.strip() for x in lrsort([x for x in corpus if x.startswith(term)])]
        elif svar == 'e': res = [x.strip() for x in rrsort([x for x in corpus if x.endswith(term)])]
        else: res = sorted([x for x in corpus if term in x])

        if exc: res = [x for x in res if all(y not in x for y in ((exc,) if isinstance(exc, str) else exc))]
        if sfil: res = [x for x in res if x not in {f'_{term.strip("_")}', f'{term.strip("_")}_', f'_{term.strip("_")}_', term}]
        if pos: res = [x for x in res if '_' in x]
        return res
    
    def is_sub(self, orig, rep):
        """
        Heuristic check on whether the words containing ``rep`` break into verified pieces.

        Both inputs have their underscores stripped. Every search hit (other than the bare word itself)
        is split around the term. If both terms have more than 3 hits and more than one split of
        ``orig`` also appears among the splits of ``rep`` the answer is False. Otherwise the distinct
        pieces of the ``rep`` splits that appear in the verified list are counted and compared against
        the number of hits.

        Returns:
            bool: True when more than 7 pieces are verified, or at least half as many pieces as hits,
            or at least 15% with more than 2 pieces.
        """
        orig, rep = orig.strip('_'), rep.strip('_')
        rep_parts = [w.split(rep) for w in self.search(rep) if w != f'_{rep}_']
        orig_parts = [w.split(orig) for w in self.search(orig) if w != f'_{orig}_']
        if len(rep_parts) > 3 and len(orig_parts) > 3:
            shared = sum(1 for parts in orig_parts if parts in rep_parts)
            if shared > 1: return False

        known = {frag for parts in rep_parts for frag in parts if frag != '_' and frag in self.verif}
        if not (rep_parts and known): return False
        ratio = len(known) / len(rep_parts)
        return len(known) > 7 or ratio >= 0.5 or (ratio >= 0.15 and len(known) > 2)

    def gsub(self, target: str, afx: str, amode: int=0, best: bool=True, fltr: bool=True) -> str:
        """
        Remove the affix from a word following english rules, returning the proper root
        Will not work if the remaining word/affix is too short (2 chars for word, 1 char for affix)

        Args:
            target (str): Target word to remove affix from
            afx (str): Affix to remove from target word
            amode (int, optional): Verification list. 0 verifies against words list. 1 verifies against affix list. 2 verifies against both with affix and co-affix. Defaults to 0.
            best (bool, optional): Whether to return the best option or all options. Defaults to True.
            fltr (bool, optional): Verify and rank the candidates. When False the unverified candidate list is returned. Defaults to True.

        Returns:
            (str): Target with affix removed in its neutral form (best=True)
            (list): Ascending (candidate, score) pairs when best=False, or the raw candidates when fltr=False
            (None): When the target is too short, a rule rejects the removal, or no candidate is verified
        """
        if not amode and len(target) - len(afx) < 3: return
        elif amode and len(target) - len(afx) < 2: return
        #'tion_' has to be listed here, otherwise its rule below can never be reached
        spafx = {'sion_', 'tion_', 'ian_', 'es_', 'cy_', 's_', 'y_'}
        rep = target.replace(afx, '')
        candidates = [target.replace(afx, '_')]
        pre = afx[0] == '_'

        if afx in spafx: #Specific Affix Substitution Rules
            if afx == 'sion_':
                if rep.endswith('is'): candidates.append(f'{rep[:-1]}t_')
                elif rep[-1] == 'n': candidates.append(f'{rep[:-1]}d_')
                elif rep[-1] == 'r': candidates.append(f'{rep}t_')
                elif rep[-1] in self.ldct['vwl2']:
                    candidates.append(f'{rep}de_')
                    candidates.append(f'{rep}re_')
            elif afx == 'tion_':
                #solu|tion -> solve
                if rep.endswith('lu'):
                    candidates.append(f'{rep[:-1]}ve_')
            elif afx == 'ian_':
                if rep.endswith('ar'):
                    candidates.append(f'{rep[:-2]}_')
            elif afx == 'cy_':
                candidates.append(f'{rep}t_')
                candidates.append(f'{rep}ce_')
                if rep[-1] == 'a': candidates.append(f'{rep}te_')
            elif afx == 'es_':
                if rep[-1] == 'v':
                    candidates.append(f'{rep[:-1]}f_')
                    candidates.append(f'{rep[:-1]}fe_')
                elif rep.endswith('ic'):
                    candidates.append(f'{rep[:-2]}ex_')
            elif afx == 's_':
                if rep[-1] in ['s', 'i', 'u']: return
            elif afx == 'y_':
                if rep[-1] in self.ldct['vwl2']: return

        if pre:
            #Single letter prefixes are only accepted when the plain remainder is a scored word
            if len(afx) == 2:
                if f'_{rep}' in self.full_scores: return f'_{rep}'
                else: return

            if afx[-1] not in self.ldct['vwl2']: candidates.append(f'_{afx[-1]}{rep}')
            else:
                #Doubled leading letter after a vowel-final prefix
                if len(rep) > 4 and rep[0] == rep[1] and rep[0] in self.ldct['fdbl']:
                    candidates.append(f'_{rep[1:]}')

        else:
            if len(afx) == 2:
                if f'{rep}_' in self.full_scores: return f'{rep}_'
            elif len(afx) > 2: candidates.append(f'{rep}e_')

            if afx[0] not in self.ldct['vwl2']:
                candidates.append(f'{rep}{afx[0]}_')
                candidates.append(f'{rep}{afx[0]}e_')
                #v cfx_
                if rep[-1] in self.ldct['vwl2']:
                    candidates.append(f'{rep[:-1]}_')
                    candidates.append(f'{rep[:-1]}e_')
                    if rep[-1] == 'i': candidates.append(f'{rep[:-1]}y_')
                #c cfx_ adds nothing further
            else:
                #v vfx_
                if rep[-1] in self.ldct['vwl2']:
                    candidates.append(f'{rep[:-1]}_')
                    candidates.append(f'{rep[:-1]}e_')
                    if rep[-1] == 'i': candidates.append(f'{rep[:-1]}y_')
                #c vfx_
                else:
                    if len(rep) > 4 and rep[-1] == rep[-2] and rep[-1] in self.ldct['bdbl']:
                        candidates.append(f'{rep[:-1]}_')

        if target in candidates: candidates.remove(target)
        if candidates and fltr:
            candidates = set(candidates)
            #Stays empty for an unknown amode instead of leaving the name unbound
            out = []
            if amode == 0: out = sorted([(x, 50000) if x in self.roots else (x, self.full_scores[x]) for x in candidates if x in self.full_scores], key=lambda x: np.log(x[1] * (len(x[0])-1)))
            elif amode == 1: out = sorted([(x, self.afx[x]) for x in candidates if x in self.afx], key=lambda x: np.log(x[1] * (len(x[0])-1)))
            elif amode == 2:
                if pre:
                    for x in candidates:
                        mafx, full = f'{x[1:]}_', f'{x}_'
                        if mafx in self.afx and self.afx[mafx] > 8: out.append((mafx, self.afx[mafx]))
                        elif full in self.wlst and self.wlst[full] > 256: out.append((full, np.log2(self.wlst[full])))
                else:
                    for x in candidates:
                        mafx, full = f'_{x[:-1]}', f'_{x}'
                        if mafx in self.afx and self.afx[mafx] > 8: out.append((mafx, self.afx[mafx]))
                        elif full in self.wlst and self.wlst[full] > 256: out.append((full, np.log2(self.wlst[full])))
                out = sorted(out, key=lambda x: np.log(x[1] * (len(x[0])-1)))
            if out:
                if best: return out[-1][0]
                else: return out
        else: return candidates

    def target_removal(self, afx: str, exc1: str|tuple[str]=None, exc2: str|tuple[str]=None, exe: bool=False) -> list[str] | None:
        """
        Find affixes that contain the input affix and runs the gsub method on them. If the 'exe' paremeter is set to true, found affixes will be removed.

        Args:
            afx (str): Affix to search/remove.
            exc1 (str | tuple[str], optional): Exact affixes to exclude from removal. Defaults to None.
            exc2 (str | tuple[str], optional): Affixes to filter for affixes to exclude from removal. Defaults to None.
            exe (bool, optional): Remove found affixes. Defaults to False.

        Returns:
            list (str): List of substitutions for target affix
        """
        if exc1 and exc2:
            if isinstance(exc1, str) and isinstance(exc2, str): targets = [x for x in self.afx if afx in x and x not in (afx, exc1) and exc2 not in x]
            elif isinstance(exc1, str): targets = [x for x in self.afx if afx in x and x not in (afx, exc1) and all(y not in x for y in exc2)]
            elif isinstance(exc2, str): targets = [x for x in self.afx if afx in x and x not in (afx, *exc1) and exc2 not in x]
            else: targets = [x for x in self.afx if afx in x and x not in (afx, *exc1) and all(y not in x for y in exc2)]
        elif exc1:
            if isinstance(exc1, str): targets = [x for x in self.afx if afx in x and x not in (afx, exc1)]
            else: targets = [x for x in self.afx if afx in x and x not in (afx, *exc1)]
        elif exc2:
            if isinstance(exc2, str): targets = [x for x in self.afx if afx in x and x != afx and exc2 not in x]
            else: targets = [x for x in self.afx if afx in x and x != afx and all(y not in x for y in exc2)]
        else: targets = [x for x in self.afx if afx in x and x != afx]

        if exe:
            for x in targets:
                tmp = self.gsub(x, afx, amode=1)
                if tmp: 
                    self.afx[tmp] += self.afx[x]
                    self.afx.pop(x)
        else: return [res for x in targets if (res := self.gsub(x, afx, amode=1))]

    def pulld(self, afx: str, len_lim: int=0) -> list[str]:
        """
        Finds all child nodes of the input affix

        A child is an affix that contains the input and has no other containing affix sitting between
        the two. For prefixes the shorter forms are made by trimming the end, for suffixes by trimming
        the start, and for unmarked fragments by growing the input match left or right.

        Args:
            afx (str): Input affix
            len_lim (int, optional): Maximum difference in length between input and output nodes. 0 permits any difference. Defaults to 0.

        Returns:
            set[str] when len_lim is 0, otherwise list[str]: All child nodes
        """
        pool = [a for a in self.afx if afx in a and a != afx]
        if '_' not in afx: pool = [a for a in pool if '_' not in a]
        base_len, children = len(afx), set()
        for cand in pool:
            if '_' not in cand:
                start = cand.index(afx)
                stop = start + base_len
                reach = max(len(cand) - stop, start)
                blocked = False
                for k in range(1, reach + 1):
                    grown_left, grown_right = cand[start-k:stop], cand[start:stop+k]
                    if (grown_left != cand and grown_left in pool) or (grown_right != cand and grown_right in pool):
                        blocked = True
                        break
            elif cand[0] == '_':
                blocked = any(cand[:-k] in pool for k in range(1, len(cand) - base_len))
            else:
                blocked = any(cand[k:] in pool for k in range(1, len(cand) - base_len))
            if not blocked: children.add(cand)

        if not len_lim: return children
        return [c for c in children if len(c) <= base_len + len_lim]

    def pullu(self, afx: str) -> str:
        """
        Return the parent node of the input affix

        The affix is shortened one character at a time until a form that exists in the affix list is
        found. Prefixes ('_pre') are trimmed from the end, suffixes ('suf_') from the start. Unmarked
        fragments try the end-trimmed form first and the start-trimmed form second at every length.

        Args:
            afx (str): Input affix

        Returns:
            str | None: The longest shorter form present in the affix list, or None when there is none
        """
        if '_' not in afx:
            for i in range(1, len(afx)):
                #Return the slice that was actually matched (previously afx[:-1] was returned for every i)
                if afx[:-i] in self.afx: return afx[:-i]
                if afx[i:] in self.afx: return afx[i:]
        elif afx[0] == '_':
            for i in range(1, len(afx)):
                if afx[:-i] in self.afx: return afx[:-i]
        else:
            for i in range(1, len(afx)):
                if afx[i:] in self.afx: return afx[i:]

    def chain(self, afx: str) -> list[str]:
        """
        Return all nodes along the longest possible path that contains this affix node

        The longest affix that contains the input (or is contained in it) anchors the chain. Every
        affix found inside that anchor is returned, longest first, restricted to affixes with an
        underscore when the input has one and to affixes without one otherwise.

        Returns:
            list[str] | None: Chain members, or None when no affix is related to the input
        """
        related = [a for a in self.afx if afx in a or a in afx]
        if not related: return
        related.sort(key=len)
        anchor = related[-1]
        positional = '_' in afx
        members = [a for a in self.afx if a in anchor and ('_' in a) == positional]
        members.sort(key=len, reverse=True)
        return members

    def surrounds(self, afx: str, window: int=3, merge: bool=False, exact: bool=False) -> tuple[Counter] | Counter:
        """
        Counts the letters adjacent to the input affix in all words from the word list.
        By default the input affix will have its positional indicator _ removed.

        Args:
            afx (str): Target affix.
            window (int, optional): Distance from input affix to count. Defaults to 3.
            merge (bool, optional): Combine left and right side counts before returning. Defaults to False.
            exact (bool, optional): Counted words must respect affixes positional indicator. Defaults to False.

        Returns:
            tuple[Counter] | Counter: Counts of letters adjacent to input affix.
        """
        left_cnt, right_cnt = Counter(), Counter()
        if not exact:
            afx = afx.strip('_')
            targets = [x.strip('_').split(afx) for x in self.wlst if afx in x]
        else:
            targets = [x.split(afx) for x in self.wlst if afx in x]
            targets = [(x[0].strip('_'), x[1].strip('_')) for x in targets]

        if not exact or (exact and afx[-1] == '_'):
            for x in targets:
                idx = min(len(x[0]), window)
                if idx: left_cnt[x[0][-idx:]] += 1
        if not exact or (exact and afx[0] == '_'):
            for x in targets:
                idx = min(len(x[1]), window)
                if idx: right_cnt[x[1][:idx]] += 1
        if merge or exact:
            for x in left_cnt: right_cnt[x] += left_cnt[x]
            return right_cnt
        else: return Counter({x[0]: x[1] for x in left_cnt.items() if len(x[0]) == window}), Counter({x[0]: x[1] for x in right_cnt.items() if len(x[0]) == window})

    def remean(self, rearr: np.ndarray) -> np.ndarray:
        """Return one flat array holding the mean of every row of the input followed by the mean of every column."""
        row_means = [np.mean(row) for row in rearr]
        col_means = [np.mean(col) for col in rearr.T]
        return np.array(row_means + col_means)

    def kld(self, P: Counter, Q: Counter=None, pfloor: int=0) -> float:
        """
        Kullback Leibler Divergence Calculation

        Args:
            P (Counter): Counts of letters
            Q (Counter, optional): Letter counts or distribution to compare against P. Defaults to letter distribution of entire the word list.
            base_value (int, optional): Base count of all letters. Higher values reduces effect of 0s. Defaults to 3.

        Returns:
            float: Relative Entropy of the two counts / distributions.
        """
        if not pfloor:
            if P.total() < 156: pfloor = 1
            elif P.total() < 312: pfloor = 2
            else: pfloor = 3
        if not Q: Q = self.dsts['nd']

        pcnt = Counter({x: pfloor for x in self.ldct['alpha']})
        for x in P:
            for y in x: pcnt[y] += P[x]  
        psum = sum(x for x in pcnt.values())
        if Q.total() > 1.5:
            qcnt = Counter({x: pfloor for x in self.ldct['alpha']})
            for x in Q:
                for y in x: qcnt[y] += Q[x]
            for x in self.ldct['alpha']:
                if pcnt[x] == pfloor and qcnt[x] == pfloor:
                    pcnt.pop(x)
                    qcnt.pop(x)
            qsum = sum(x for x in qcnt.values())
            return sum([(pcnt[x] / psum) * np.log2((pcnt[x] / psum) / (qcnt[x] / qsum)) for x in pcnt])
        else: return sum([(pcnt[x] / psum) * np.log2((pcnt[x] / psum) / Q[x]) for x in pcnt])

    def relent_peaks(self, afx: str, bridge_coeff: float=1.0, over_length: int=7) -> list[str, float]:
        """
        Find affixes with significant variations from their parent/child nodes relative entropy.
        Affix chain will be the longest chain that contains the input affix.
        Specific chains can be targetted by inputting the longest affix of a chain.
        Affixes returned indicate a target of interest for removal.

        Args:
            afx (str): Target affix. Must carry its positional underscore ('_pre' or 'suf_').
            bridge_coeff (float, optional): Change in relative entropy to be considered significant. Defaults to 1.0.
            over_length (int, optional): Maximum length for standard affixes. Affixes longer than this value will have their character distributions separated. Should be slightly over half the length of the average word. Defaults to 7.

        Returns:
            list[tuple[str, float]]: One (affix, score) pair per scored position, paired with the chain members in chain order
        """
        #Two baseline arrays pad the front of the score list; the long or short variant is picked by affix length
        #NOTE(review): an affix without an underscore at either end leaves 'scores' unassigned and fails below
        if afx[0] == '_':
            if len(afx) > over_length: scores = [self.re_arr['lpd3'].copy(), self.re_arr['lpd3'].copy()]
            else: scores = [self.re_arr['pd3'].copy(), self.re_arr['pd3'].copy()]
        elif afx[-1] == '_':
            if len(afx) > over_length: scores = [self.re_arr['lsd3'].copy(), self.re_arr['lsd3'].copy()]
            else: scores = [self.re_arr['sd3'].copy(), self.re_arr['sd3'].copy()]

        #One weighted relative entropy array per chain member, then a single trailing baseline
        words = self.chain(afx)
        for x in words: scores.append(self.rntp[x] * self.re_arr['wgts'])
        if afx[0] == '_': scores.append(self.re_arr['pd3'])
        else: scores.append(self.re_arr['sd3'])
        #For every interior position: difference to the next entry plus difference to the previous entry
        hold = [*self.re_arr['null'].copy()]
        for i in range(1, len(scores)-1): hold.append((scores[i+1]-scores[i])+(scores[i-1]-scores[i]))

        #Two trailing pad arrays so the i+1 / i+2 look-ahead below stays in range
        if afx[0] == '_':
            if len(afx) > over_length: hold.extend([self.re_arr['dlpd3'], self.re_arr['dlpd3']])
            else: hold.extend([self.re_arr['dpd3'], self.re_arr['dpd3']])
        elif afx[-1] == '_':
            #NOTE(review): both branches use 'dsd3' while the prefix side switches to 'dlpd3' for long affixes; possibly a long-suffix key was intended here - confirm
            if len(afx) > over_length: hold.extend([self.re_arr['dsd3'], self.re_arr['dsd3']])
            else: hold.extend([self.re_arr['dsd3'], self.re_arr['dsd3']])

        out = []
        for i in range(2, len(hold)-3):
            #u1/d1 are the neighbours before and after position i, md is the position itself
            u1, d1, md = hold[i-1].copy(), hold[i+1].copy(), hold[i]
            if self.dbg: print(words[i-2], (md-u1).mean(), (md-d1).mean())
            if (md-u1).mean() > bridge_coeff or (md-d1).mean() > bridge_coeff:
                #Peak: fold in the second neighbours, keeping per element the smaller of the first and second neighbour
                u2, d2 = hold[i-2].copy(), hold[i+2].copy()
                u2[u2 > u1] *= 0
                u1[u1 > hold[i-2]] *= 0
                d2[d2 > d1] *= 0
                d1[d1 > hold[i+2]] *= 0
                u1 = u1 + u2
                d1 = d1 + d2
            out.append((md-u1)+(md-d1))

        if self.dbg:
            for i, x in enumerate(out): print(words[i], '\n', self.remean(x).mean(), '\n', x)
        return [(words[i], self.remean(x).mean()) for i, x in enumerate(out)]

    def pulld_relent(self, afx: str, depth: int=1) -> np.ndarray:
        """
        Returns the mean relative entropy of all child nodes of the input affix

        Args:
            afx (str): Input affix
            depth (int, optional): Number of generations to descend before averaging. 0 averages the input itself. Defaults to 1.

        Returns:
            np.ndarray: Row means followed by column means of the averaged array, or six zeros when there are no nodes at that depth
        """
        frontier = [afx]
        remaining = depth
        while remaining > 0:
            descendants = []
            #Walk the current generation back to front, gathering the children of every node
            for node in reversed(frontier):
                kids = self.pulld(node, 0)
                if kids: descendants.extend(kids)
            frontier = descendants
            remaining -= 1

        if not frontier: return np.array([0]*6)
        avg = np.mean([self.rntp[n] for n in frontier], axis=0)
        return np.array([*[np.mean(r) for r in avg], *[np.mean(c) for c in avg.T]])

    def graph_relent(self, afx: str):
        """
        Plot the relative entropy summary of every affix in the chain of the input affix.

        Each affix contributes the row means and column means of its entry in self.rntp followed by
        the same for its entry in self.drntp, one line per value across the chain.
        """
        def axis_means(mat):
            #Row means followed by column means
            return [*[np.mean(r) for r in mat], *[np.mean(c) for c in mat.T]]

        words = self.chain(afx)
        _, ax = plt.subplots(figsize=(16, 10))
        yvars = [axis_means(self.rntp[w]) + axis_means(self.drntp[w]) for w in words]
        ticks = range(len(yvars))

        plt.xticks(ticks, words)
        ax.plot(ticks, yvars)
        ax.legend(['arnd', 'dir', 'exct', 'wnd3', 'wnd2', 'wnd1', 'darnd', 'ddir', 'dexct', 'dwnd3', 'dwnd2', 'dwnd1'])
        plt.show()

    def find_sub_chain(self, word, afxl=None, sub_depth=0):
        """
        Strip affixes from a word one after another and report every chain of removals found.

        Args:
            word (str): Word to break down
            afxl (Container, optional): Affixes that may be removed. Defaults to self.verif.
            sub_depth (int, optional): Depth handed to g2sub when a long word has no single removal that verifies. 0 disables that fallback. Defaults to 0.

        Returns:
            tuple: Unique (removed affixes, (word, ..., final remainder)) pairs
        """
        if not afxl: afxl = self.verif
        fout = set()
        #Create list with word and all affixes found in word
        targets = [(word, sorted([x for x in afxl if x in word]), [], [])]
        while targets:
            word, found_afxs, rem, rafxs = targets.pop()
            is_match = False
            while found_afxs:
                #For every affix thats found in a word
                afx = found_afxs.pop()
                #An affix that makes up most of the word is not a removal candidate
                if len(afx) / len(word) > 0.6: continue
                sub = self.gsub(word, afx)
                if sub:
                    is_match = True
                    rem.append(word)
                    rafxs.append(afx)
                    #If there are still affixes left add a new group to targets
                    #Allows continuation if word reaches an early dead end
                    if found_afxs and any(y in sub for y in found_afxs): targets.append((sub, found_afxs.copy(), rem.copy(), rafxs.copy()))
                    found_afxs = sorted([x for x in afxl if x in sub])
                    word = sub
                #Clean duplicate targets
                for x in targets[::-1]:
                    if word == x[0] and found_afxs == x[1]: targets.remove(x)
                if len(word) < 7: break
            #Once no more affixes are found in a word add it to the outputs if atleast 1 matched
            #If word is still long and no match, use double sub
            if is_match: fout.add((tuple(rafxs), (*rem, word)))
            #Guard on sub_depth: the previous guard read 'dsub' before it was ever assigned
            elif len(word) > 7 and sub_depth:
                dsub = self.g2sub(word, afxl, depth=sub_depth)
                if dsub:
                    for ds in dsub:
                        dbl_found = sorted([x for x in afxl if x in ds[0]])
                        rcp, rafc = rem.copy(), rafxs.copy()
                        rcp.extend(ds[1])
                        rafc.extend(ds[2])
                        if dbl_found: targets.append((ds[0], dbl_found, rcp, rafc))
                        else: fout.add((tuple(rafc), (*rcp, ds[0])))
        return tuple(fout)

    def g2sub(self, word, afxl=None, depth=1):
        """
        Two step affix removal: take an unverified removal first, then look for a verified one.

        Every affix found in the word is removed without verification (gsub with fltr=False). Each
        resulting candidate is paired with the affixes found in it and run through the verified gsub.
        Results that share the same root and the same set of steps are reported once.

        Args:
            word (str): Word to break down
            afxl (Container, optional): Affixes that may be removed. Defaults to self.verif.
            depth (int, optional): Number of unverified steps taken before each verified one. Defaults to 1.

        Returns:
            list | None: (root, words passed through, affixes removed) triples, or None when nothing verified
        """
        if not afxl: afxl = self.verif
        frontier = {(word, a, (), ()) for a in afxl if a in word}
        seen, results = set(), []
        while depth > 0 and frontier:
            expanded = set()
            for cur, cur_afx, words_hist, afx_hist in frontier:
                options = self.gsub(cur, cur_afx, fltr=False)
                if not options: continue
                trail_w, trail_a = (*words_hist, cur), (*afx_hist, cur_afx)
                for opt in options:
                    expanded.update((opt, a, trail_w, trail_a) for a in afxl if a in opt)
            frontier = expanded.copy()
            depth -= 1
            for cur, cur_afx, words_hist, afx_hist in frontier:
                root = self.gsub(cur, cur_afx)
                if not root: continue
                #Order independent key so the same steps taken in another order are not repeated
                key = (root, tuple(sorted((cur, cur_afx, *words_hist, *afx_hist))))
                if key not in seen:
                    results.append((root, (*words_hist, cur), (*afx_hist, cur_afx)))
                    seen.add(key)
        if results: return results

    def e2gsub(self, word, it_mx=2, bridges=False):
        """
        Exhaustive plain-replacement breakdown of a word over several passes.

        Every verified affix found in a word is replaced by '_' (no spelling rules), the resulting
        strings are queued and processed the same way on the next pass. Of all strings produced, the
        one with the highest score is returned.

        Args:
            word (str): Word to break down
            it_mx (int, optional): Last pass index; it_mx + 1 passes are run. Defaults to 2.
            bridges (bool, optional): Also try stripping the entries of self.ldct['bridges'] from the start/end of already checked strings. Defaults to False.

        Returns:
            tuple | None: (best string, its 'reps' list without the last entry, its 'afx' list without the last entry),
            plus its 'ub' list when bridges is set. None when no produced string has a score.
        """
        stg = 0
        #bkd: string -> affixes removed ('afx'), strings produced ('reps'), bridges removed ('ub') and two processed flags
        bkd = {word: {'afx': [], 'reps': [], 'ub': [], 'chk': False, 'bchk': False}}
        while stg <= it_mx:
            new = []
            for w in bkd:
                if not bkd[w]['chk']:
                    for nx in [x for x in self.verif if x in w]:
                        #str.replace swaps every occurrence of the affix, not only the first
                        rep = w.replace(nx, '_')
                        bkd[w]['afx'].append(nx)
                        bkd[w]['reps'].append(rep)
                        #The queued copies are snapshots of the lists as they stand after this append
                        if rep not in bkd: new.append((rep, bkd[w]['afx'].copy(), bkd[w]['reps'].copy(), bkd[w]['ub'].copy()))
                    #The loop has no break, so this else always runs once the loop ends
                    else: bkd[w]['chk'] = True
                elif bridges and not bkd[w]['bchk']:
                    b1 = [*[(w.replace(f'_{b}', '_'), f'_{b}') for b in self.ldct['bridges'] if w.startswith(f'_{b}')], 
                        *[(w.replace(f'{b}_', '_'), f'{b}_') for b in self.ldct['bridges'] if w.endswith(f'{b}_')]]
                    #Only keep bridge removals that still leave a verified affix in the string
                    b2 = [b for b in b1 if any(bz in b[0] for bz in self.verif)]
                    for b in b2:
                        #NOTE(review): 'new' holds tuples, so the string b[0] is never found in it; only the bkd test filters here
                        if b[0] not in new and b[0] not in bkd:
                            obr = [q for q in bkd[w]['ub']]
                            new.append((b[0], [z for z in bkd[w]['afx']], [z for z in bkd[w]['reps']], [b[1], *obr]))
                    bkd[w]['bchk'] = True
            #New strings are registered only after the pass so bkd is not resized while it is iterated
            for nw in new:
                if nw[0] not in bkd:
                    bkd[nw[0]] = {'afx': nw[1], 'reps': nw[2], 'ub': nw[3], 'chk': False, 'bchk': False}
            stg += 1
        bkd.pop(word)
        #Roots get a fixed score of 50000, everything else its full_scores value; unscored strings are dropped
        kl = [(k, 50000) if k in self.roots else (k, self.full_scores[k]) for k in bkd.keys() if k in self.full_scores]
        if kl:
            ok = sorted(kl, key=lambda x: x[1])[-1][0]
            if not bridges: return (ok, bkd[ok]['reps'][:-1], bkd[ok]['afx'][:-1])
            else: return (ok, bkd[ok]['reps'][:-1], bkd[ok]['afx'][:-1], bkd[ok]['ub'])

    def assign_search_dict(self, words: Container):
        """
        Set the corpus that search falls back to when it is called without one.

        Args:
            words (Container): Items to search by default. Stored as is, not copied.
        """
        self.default_search = words

    def load(self, id: int=0) -> None:
        """
        Restore the saved state from the data file named after 'id'.

        One object is read through the module-level 'load' and unpacked into nine attributes, in the
        same order the save method writes them.

        Args:
            id (int, optional): Name of the file inside the data directory. Defaults to 0.
        """
        #NOTE(review): absolute, machine specific path; a configurable data directory would make this portable
        #NOTE(review): 'load' is the module-level function, not this method (presumably pickle.load - confirm); only open files you trust
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\{id}', 'rb') as f:
            self.wlst, self.afx, self.cleared, self.final, self.dsts, self.rntp, self.drntp, self.re_arr, self.full_scores = load(f)

    def save(self, id: int=0) -> None:
        """
        Write the current state to the data file named after 'id'.

        Nine attributes are written as a single tuple through the module-level 'dump'; the load
        method unpacks them in the same order.

        Args:
            id (int, optional): Name of the file inside the data directory. An existing file is overwritten. Defaults to 0.
        """
        #NOTE(review): absolute, machine specific path; a configurable data directory would make this portable
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\{id}', 'wb') as f:
            dump((self.wlst, self.afx, self.cleared, self.final, self.dsts, self.rntp, self.drntp, self.re_arr, self.full_scores), f)

class Word:
    """
    A base word (or affix) together with everything that has been grouped under it.

    Attributes:
        w: the word itself
        all: every spelling grouped under this word, most recently added first
        core: words that use this word as a root
        mods: words that use this word to modify another
        aliases: alternate names that are not extensible. nicknames, acronyms, foreign languages
        forms: conjugations and alternate spellings of the same stem
        root: False on creation
        plur, pres, pprg, past: None on creation
    """
    #The class docstring has to come first to be picked up as __doc__
    #__str__ walks __slots__[2:], so the order of the names matters
    __slots__ = ('w', 'all', 'core', 'forms', 'mods', 'aliases', 'root', 'plur', 'pres', 'pprg', 'past')

    def __init__(self, word):
        self.w = word
        self.all = (word,)
        self.core = ()
        self.mods = ()
        self.forms = ()
        self.aliases = ()
        self.root = False
        self.plur = None
        self.pres = None
        self.pprg = None
        self.past = None

    def __len__(self):
        """Number of spellings grouped under this word (at least 1)."""
        return len(self.all)

    def __repr__(self):
        """The bare word when nothing is grouped under it, otherwise the group with the oldest entry first."""
        if len(self.all) == 1: return self.w
        return f'{self.all[::-1]}'

    def __str__(self):
        """Like __repr__, followed by every non-empty attribute after 'all'."""
        if len(self.all) == 1: return self.w
        ostr = f'{self.all[::-1]}'
        for x in self.__slots__[2:]:
            val = getattr(self, x)
            if not val: continue
            #NOTE(review): the indent depth is read from the global lexicon 'wbs', which must exist when this is called
            if x in ('plur', 'pres', 'pprg', 'past'): ostr += f'\n{"    " * (1+wbs.bt[self.w].index(self.w))} {x}: {val}'
            else: ostr += f',  {x}: {val}'
        return ostr

    def __getitem__(self, k):
        """Attribute access by name: word['forms']."""
        return getattr(self, k)

    def __setitem__(self, k, v):
        """Attribute assignment by name: word['plur'] = value."""
        setattr(self, k, v)

    def adg(self, k, v):
        """
        Put v in front of whatever attribute k currently holds.

        Args:
            k (str): Attribute name
            v (str | Word | iterable): A single item or several items. Falsy values are ignored.

        A single str/Word (new or existing) is treated as one item. An empty attribute is replaced:
        by a 1-tuple for a single item, by v itself for several items.
        """
        if not v: return
        attr = getattr(self, k)
        incoming = (v,) if isinstance(v, (str, Word)) else v
        if isinstance(attr, (str, Word)): merged = (*incoming, attr)
        elif not attr: merged = incoming
        else: merged = (*incoming, *attr)
        setattr(self, k, merged)

class Lexicon:
    #('', ''), ('', ''), 
    #Same thirteen consonants as the default doubled-letter rule string used in rcsfx
    dbl = ('b', 'c', 'd', 'f', 'g', 'l', 'm', 'n', 'p', 'r', 's', 't', 'z')
    #Both word lists are read once, when the class body is executed, one stripped entry per line
    #NOTE(review): absolute, machine specific paths; defining the class fails when the files are missing
    with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\conjunctions', 'rt') as f:
        cnj = [x.strip() for x in f.readlines()]
    with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\pronouns', 'rt') as f:
        prn = [x.strip() for x in f.readlines()]
    #NOTE(review): presumably one letter per part of speech tag - confirm against the code that reads it
    pos = 'nvadreci'

    def __len__(self):
        return len(self.bases)

    def __getitem__(self, k):
        if k in self.bases: return self.bases[k]
        elif k in self.bt: return self.bases[self.bt[k][0]]
        else: raise KeyError

    def __setitem__(self, k, v):
        """Register a new base word under key k, built from v, with no tracking group yet. Existing keys are left untouched."""
        if k in self.bases: return
        self.bases[k] = Word(v)
        self.bt[k] = None

    def __repr__(self):
        return self.bases.__repr__()

    def __init__(self, wlst):
        """
        Build the lexicon from a word list and the affix data files.

        Every word starts as its own base entry with no tracking group. The affix file provides the
        affix entries and the prefix list (longest first); 'pre_ex' adds extra exclusions per prefix,
        'afx_groups' merges affixes into their first entry and 'alt_rpls' registers aliases.

        Args:
            wlst: Iterable of words
        """
        self.bt = dict.fromkeys(wlst)
        self.bases = {x: Word(x) for x in wlst}
        self.dbv = ['af', 'ag', 'ap', 'ar', 'as', 'at', 'ef', 'ir', 'il', 'oc', 'of', 'op', 'suc', 'suf', 'sug', 'sup', 'sum', 'syl']
        self.cut = ['absc', 'abst', 'ed', 'eg', 'el', 'em', 'ev', 'er', 'ep', 'emb', 'emp', 'ell', 'imb', 'imm', 'imp', 'illu', 'sys', 'sus']

        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\affixes', 'rt') as f:
            self.mpx = [line.strip() for line in f]
        self.affixes = {x: Word(x) for x in self.mpx}
        #Keep only the prefixes, underscore removed, longest first
        prefixes = [x.strip('_') for x in self.mpx if x[0] == '_']
        prefixes.sort(key=len)
        prefixes.reverse()
        self.mpx = prefixes
        #For every prefix: the longer prefixes that start with it
        self.xmpx = {x: [z for z in self.mpx if z.startswith(x) and z != x] for x in self.mpx}

        #Each line: a prefix followed by further starts that rule it out
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\pre_ex', 'rt') as f:
            for line in f:
                parts = line.split()
                for extra in parts[1:]:
                    self.xmpx[parts[0]].append(extra)

        #Each line: an affix followed by the affixes merged into it
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\afx_groups', 'rt') as f:
            for line in f:
                parts = line.split()
                for member in parts[1:]:
                    self.merge_affix(parts[0], member)

        #Each line: a word followed by its alias
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\alt_rpls', 'rt') as f:
            for line in f:
                parts = line.split()
                self.update(parts[0], parts[1], cat='aliases')


    def update(self, sink: str, src: str, afx='', cat='', remove=True):
        """
        Fold the entry of one word into the entry of another.

        Args:
            sink (str): Word that absorbs the other. When it is not a base key it is resolved through __getitem__ to the word of its group's entry.
            src (str): Word being absorbed. Must be a key of self.bases.
            afx (str | list | tuple, optional): Affix or word (or several) under which src is recorded as 'mods'. Names containing '_' are looked up in self.affixes, others in self.bases. With cat='Irg' it is instead the attribute of sink that receives src.
            cat (str, optional): Attribute of sink that src is filed under. 'aliases' and 'Irg' are handled separately from the rest.
            remove (bool, optional): Drop src from self.bases afterwards. Defaults to True.
        """
        if sink not in self.bases: sink = self.__getitem__(sink).w
        self.bases[sink].adg('all', self.bases[src].all)

        if cat == 'aliases':
            self.bases[sink].adg('aliases', self.bases[src].all)
        elif cat == 'Irg':
            #src is filed under the attribute named by afx and the method ends here: src is neither removed nor tracked
            self.bases[sink].adg(afx, src)
            self.bases[sink].adg('all', src)
            return
        else:
            #Record src as a modifier under every affix / word named in afx
            if afx and afx != 'Irg':
                if isinstance(afx, (list, tuple)):
                    for x in afx:
                        if x == 'Irg': continue
                        if '_' in x:
                            self.affixes[x].adg('mods', src)
                            self.affixes[x].adg('all', src)
                        else:
                            self.bases[x].adg('mods', src)
                            self.bases[x].adg('all', src)
                else:
                    if '_' in afx:
                        self.affixes[afx].adg('mods', src)
                        self.affixes[afx].adg('all', src)
                    else:
                        self.bases[afx].adg('mods', src)
                        self.bases[afx].adg('all', src)

            #Everything already grouped under src moves over to sink
            self.bases[sink].adg('core', self.bases[src].core)
            self.bases[sink].adg('mods', self.bases[src].mods)
            self.bases[sink].adg('forms', self.bases[src].forms)
            self.bases[sink].adg('aliases', self.bases[src].aliases)

            if cat:
                if cat in ('plur', 'pres', 'pprg', 'past'):
                    #NOTE(review): len() of a Word is the length of its 'all' tuple, which starts with one entry, so the '== 0' branches look unreachable - confirm
                    if self.bases[sink][cat]:
                        if len(self.bases[src]) == 0: self.bases[sink].adg(cat, src)
                        else: self.bases[sink].adg(cat, self.bases[src])
                    else:
                        if len(self.bases[src]) == 0: self.bases[sink][cat] = src
                        else: self.bases[sink][cat] = self.bases[src]
                else: self.bases[sink].adg(cat, src)

        if remove and src in self.bases: self.bases.pop(src)
        #Aliases stop being tracked; every other word joins the tracking group of sink
        if cat == 'aliases': self.bt.pop(src)
        else: self.update_track(sink, src)

    def update_track(self, sink, src):
        """
        Join the tracking groups of two words.

        The group of sink (or sink alone) comes first, followed by the members of src's group that
        are not already present (or src alone). Every member is then pointed at the joined group.
        """
        sink_group = self.bt[sink]
        pack = list(sink_group) if sink_group else [sink]
        src_group = self.bt[src]
        if src_group:
            pack += [m for m in src_group if m not in pack]
        else:
            pack.append(src)
        group = tuple(pack)
        for member in pack: self.bt[member] = group

    def rcsfx(self, word, al, r='', t='d', vd='', pref=False) -> bool | str:
        """
        Remove a suffix of known length from a word and return the result if it is a known word.

        Args:
            word (str): Word to reduce
            al (int): Length of the suffix
            r (str, optional): Text appended after the suffix is dropped (modes 'd' and 'dbl'). Defaults to ''.
            t (str, optional): Removal mode. 'd' drops the suffix, 'dbl' also drops one letter of a doubled final letter, 'iy' turns a leading 'i' of the suffix into 'y', 'm' drops 1..al letters until a known word is left. Defaults to 'd'.
            vd (tuple | dict, optional): Validation rule. Tuple (start, text): the word must contain text at that index. Dict {k: letters}: the letter k places before the suffix must be one of letters. Defaults to ''.
            pref (bool, optional): Also try the word with its prefix removed (rcpfx), and try removing a prefix from unknown results. Defaults to False.

        Returns:
            False when the word is too short or fails validation, the reduced word (str) when pref is off,
            a tuple (word, prefix) or (word, prefix, second prefix) when pref is on, None when nothing is found.
        """
        if len(word) < al+2: return False
        if vd:
            if isinstance(vd, tuple):
                #The slice end becomes None when start + len(text) reaches the end of the word
                if word[vd[0]:(vd[0]+len(vd[1]) if vd[0]+len(vd[1]) < 0 else None)] != vd[1]: return False
            elif isinstance(vd, dict):
                for k in vd:
                    if word[-(al+k)] not in vd[k]: return False

        #With pref the word is tried as is and, for words over 5 letters, with its prefix removed
        if len(word) > 5 and pref:
            pword = self.rcpfx(word)
            if pword: words = [(word, ''), pword]
            else: words = [(word, '')]
        else: words = [word]

        rls = []
        for wx in words:
            if pref:
                if isinstance(wx, tuple):
                    wx, pre = wx
                else: pre = ''
            if t == 'd':
                rep = f'{wx[:-al]}{r}'
            elif t == 'dbl':
                if len(wx) < (al+3): continue
                if wx[-(al+1)] != wx[-(al+2)]: continue
                #NOTE(review): the default rule is assigned here but not checked in this pass
                if not vd: vd = {1: 'bcdfglmnprstz'}
                elif isinstance(vd, tuple):
                    if wx[vd[0]:(vd[0]+len(vd[1]) if vd[0]+len(vd[1]) < 0 else None)] != vd[1]: continue
                elif isinstance(vd, dict):
                    #NOTE(review): this 'continue' only advances the inner loop over vd, so the dict rule never rejects a word here
                    for k in vd:
                        if wx[-(al+k)] not in vd[k]: continue
                rep = f'{wx[:-(al+1)]}{r}'
            elif t == 'iy':
                if wx[-al] != 'i': continue
                rep = f'{wx[:-al]}y'
            elif t == 'm':
                #Shortest cut first; the loop's else skips the word when no cut leaves a known word
                for i in range(1, al+1):
                    if wx[:-i] not in self.bases and wx[:-i] not in self.bt: continue
                    rep = wx[:-i]
                    break
                else: continue

            #NOTE(review): 'and' binds tighter than 'or', so the length test only guards the self.bases lookup - confirm that is intended
            if len(rep) > 2 and rep in self.bases or (rep in self.bt and self.bt[rep]): return (rep, pre) if pref else rep
            elif pref and len(rep) > 4: rls.append((rep, pre))

        #Last resort: remove a prefix from the unknown results
        if pref:
            for x in rls:
                rep = self.rcpfx(x[0])
                if rep and len(rep[0]) > 3: return (rep[0], x[1], rep[1])

    def rcpfx(self, word) -> str:
        """Strip a known prefix from ``word`` and return the remaining base, if it is known.

        Candidate prefixes are the entries of ``self.mpx`` that ``word`` starts
        with, that leave more than two characters, for which ``word`` starts
        with none of ``self.xmpx[prefix]``, and - when the prefix is in
        ``self.dbv`` - whose last letter is repeated as the first letter of the
        remainder. Longer prefixes are tried first.

        For prefixes in ``self.cut`` the remainder keeps the prefix's last
        character. For the prefix 'ex' an 's' is additionally tried in front of
        the remainder when the plain remainder is unknown.

        Returns
        -------
        ``(base, '_<prefix>')`` for the first match whose remainder is in
        ``self.bases`` or has a non-empty entry in ``self.bt``; ``None``
        otherwise. (The ``-> str`` annotation is kept for interface stability
        and does not describe the actual return value.)
        """
        mtchs = sorted([
            y for y in self.mpx
            if word.startswith(y)
            and len(word) - len(y) > 2
            and all(not word.startswith(z) for z in self.xmpx[y])
            and (y not in self.dbv or word[len(y)] == y[-1])], key=len)[::-1]
        for x in mtchs:
            if x in self.cut: rep = word[len(x)-1:]
            else: rep = word[len(x):]
            if rep in self.bases or (rep in self.bt and self.bt[rep]): return (rep, f'_{x}')
            elif x == 'ex':
                # `x in ('ex')` would be a substring test on the string 'ex' and
                # also match the prefixes 'e' and 'x'; compare for equality instead.
                rep = f's{rep}'
                if rep in self.bases or (rep in self.bt and self.bt[rep]): return (rep, f'_{x}')
        return None

    def rform(self, word, form):
        """Register ``form`` as a form of the base ``word``.

        When ``form`` is already a base or already tracked in ``self.bt`` the
        work is delegated to ``self.update`` with ``cat='forms'``. Otherwise
        ``form`` is added to the base's 'forms' and 'all' groups and appended
        to the base's tracking group in ``self.bt``.
        """
        if form in self.bases or form in self.bt:
            self.update(word, form, cat='forms')
            return

        for field in ('forms', 'all'):
            self.bases[word].adg(field, form)

        existing = self.bt[word]
        members = [*existing, form] if existing else [word, form]
        for member in members:
            self.bt[member] = tuple(members)

    def merge_affix(self, sink, src):
        """Fold the affix entry ``src`` into ``sink`` and alias ``src`` to it.

        Each of ``src``'s 'all', 'core', 'mods', 'forms' and 'aliases' groups is
        added to the same-named group of ``sink``, ``src`` itself is recorded as
        a form of ``sink``, and ``self.affixes[src]`` is rebound to the ``sink``
        entry.
        """
        for field in ('all', 'core', 'mods', 'forms', 'aliases'):
            self.affixes[sink].adg(field, getattr(self.affixes[src], field))
        self.affixes[sink].adg('forms', src)
        self.affixes[src] = self.affixes[sink]




    def homo_parse(self):
        """Apply the manually curated entries of the 'homos' data file.

        Each non-blank line is split on whitespace and dispatched on its first
        field:

        * ``singular <affix> <base> <form> [<form> ...]`` - every form is
          merged into ``base`` via ``self.update(base, form, affix, 'Irg')``.
        * ``mod <base> <affix> <form> [<affix> <form> ...]`` - every
          affix/form pair is merged via ``self.update(base, form, affix, 'past')``.

        Lines with any other first field are ignored. Blank lines are skipped
        so stray empty lines in the data file do not raise ``IndexError``.
        """
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\homos', 'rt') as f:
            homos = [fields for fields in (line.split() for line in f) if fields]
        for x in homos:
            if x[0] == 'singular':
                self.update(x[2], x[3], x[1], 'Irg')
                for y in x[4:]:
                    self.update(x[2], y, x[1], 'Irg')
            elif x[0] == 'mod':
                for z in range(2, len(x), 2):
                    self.update(x[1], x[z+1], x[z], 'past')

    def irg_parse(self):
        """Apply the irregular replacements listed in the 'irg_rpls' data file.

        Every line holds a base word followed by its irregular forms (for
        example "wind wound"); each form is merged into the base through
        ``self.update(base, form, 'Irg', 'past')``.
        """
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\irg_rpls', 'rt') as f:
            rows = [line.split() for line in f]
        # Flatten to (base, form) pairs; rows with fewer than two fields contribute nothing.
        pairs = [(row[0], form) for row in rows for form in row[1:]]
        for base, form in pairs:
            self.update(base, form, 'Irg', 'past')

    def unq_parse(self, pref=False):
        """Unfinished template for a suffix parser (file names and suffixes are blank).

        Follows the same outline as the other ``*_parse`` methods: load a
        replacement list and an ignore list, apply the replacements when
        ``pref`` is false, then try ``self.rcsfx`` on every candidate base and
        merge matches with ``self.update``.

        NOTE(review): both ``open`` calls point at the data directory itself
        (no file name), so this method cannot run as written.
        """
        # Count of bases before parsing, used for the summary printed at the end.
        sl = len(self.bases)
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\', 'rt') as f:
            rpls = [x.strip().split() for x in f.readlines()]
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\', 'rt') as f:
            igls = [x.strip() for x in f.readlines()]
        if not pref:
            for x in rpls:
                self.update(x[0], x[1], cat='core')
        # Placeholder: an empty string splits to an empty exclusion list.
        fls = ''.split()

        al = 2
        # NOTE(review): `x.endswith('')` is always True, and `(y not in x for y in fls)`
        # is a bare generator object, which is always truthy - neither filters anything.
        for x in [x for x in self.bases if len(x) > 3+al and x.endswith('') and (y not in x for y in fls) and x not in igls]:
            mx = False

            # Both branches are identical placeholders waiting for real suffixes/rules.
            if x.endswith('') and (mx := self.rcsfx(x, al, pref=pref)): pass
            elif x.endswith('') and (mx := self.rcsfx(x, al, pref=pref)): pass

            # NOTE(review): the sibling parsers pass `self.__getitem__(mx).w` (and index
            # `mx[0]` when pref is true); here the looked-up object itself is passed.
            if mx: self.update(self.__getitem__(mx), x, x[-2:], 'core')

        print(f'{sl - len(self.bases)} items combined for plurals\n{len(self.bases)} remaining')

    def pre_parse(self, pref=False):
        """Merge prefixed words in ``self.bases`` into their un-prefixed bases.

        Steps:
          1. Load manual replacements ('pre_rpls': ``base form affix`` per line)
             and the ignore list ('pre_igls': one word per line).
          2. When ``pref`` is false, apply every manual replacement through
             ``self.update(base, form, affix, cat='core')``.
          3. For every base that starts with a prefix from ``self.mpx`` and is
             not on the ignore list, ask ``self.rcpfx`` for the stripped base;
             when one longer than three characters is found, merge the word
             into it with the prefix as affix.

        The ignore list is applied here in the same way the suffix parsers
        apply theirs. Prints how many bases were merged and how many remain.
        """
        sl = len(self.bases)
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\pre_rpls', 'rt') as f:
            rpls = [x.strip().split() for x in f.readlines()]
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\pre_igls', 'rt') as f:
            igls = {x.strip() for x in f.readlines()}
        if not pref:
            for x in rpls:
                self.update(x[0], x[1], x[2], cat='core')

        # Snapshot the candidates first: self.update may change self.bases while we iterate.
        prefixes = tuple(self.mpx)
        candidates = [x for x in self.bases if x not in igls and x.startswith(prefixes)]
        for x in candidates:
            mx = self.rcpfx(x)
            if mx and len(mx[0]) > 3:
                self.update(self.__getitem__(mx[0]).w, x, mx[1], 'core')
        print(f'{sl - len(self.bases)} items combined for prefixes\n{len(self.bases)} remaining')




    def pl_parse(self, pref=False):
        """Merge plural forms in ``self.bases`` into their singular bases.

        Steps:
          1. Load manual replacements ('pl_rpls') and the ignore list ('pl_igls').
          2. When ``pref`` is false, apply the manual replacements with
             category 'plur'.
          3. Regular plurals: for every base longer than three characters that
             ends in 's', contains none of the ``fls`` fragments and is not
             ignored, try the suffix rules in order (ies->y, -s, -es after
             s/h/o/x/z, -es after a doubled letter, ves->f, ices->ix/ex,
             es->is) and merge on the first hit.
          4. Irregular plurals: for bases ending in a/e/i/x try ia->ium
             (a->um), i->us, eaux->eau, a->on, ae->a and merge with affix 'Irg'.

        ``pref`` is forwarded to ``self.rcsfx``; when true its result is a
        tuple whose extra non-empty members are recorded alongside the suffix.
        Prints how many bases were merged and how many remain.
        """
        sl = len(self.bases)
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\pl_rpls', 'rt') as f:
            rpls = [x.strip().split() for x in f.readlines()]
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\pl_igls', 'rt') as f:
            igls = [x.strip() for x in f.readlines()]
        if not pref:
            for x in rpls:
                self.update(x[0], x[1], x[2], 'plur')
        fls = 'series species'.split()

        def record(mx, x, afx):
            # Merge plural `x` into the base found by rcsfx, keeping any prefix info.
            if pref: self.update(self.__getitem__(mx[0]).w, x, (afx, *[z for z in mx[1:] if z]), 'plur')
            else: self.update(self.__getitem__(mx).w, x, afx, 'plur')

        # all(...) is required: a bare generator expression is always truthy and
        # would let every word containing an excluded fragment through.
        for x in [x for x in self.bases if len(x) > 3 and x.endswith('s') and all(y not in x for y in fls) and x not in igls]:
            mx = False

            if x.endswith('ies') and (mx := self.rcsfx(x, 3, t='iy', pref=pref)): afx = 'es_'
            elif x.endswith('s') and x[-2] != 's' and (mx := self.rcsfx(x, 1, pref=pref)): afx = 's_'
            elif x.endswith('es') and (mx := self.rcsfx(x, 2, vd={1: 'shoxz'}, pref=pref)): afx = 'es_'
            elif x.endswith('es') and (mx := self.rcsfx(x, 2, t='dbl', pref=pref)): afx = 'es_'
            elif x.endswith('ves') and (mx := self.rcsfx(x, 3, r='f', pref=pref)): afx = 'ves_'
            elif x.endswith('ices') and ((mx := self.rcsfx(x, 4, r='ix', pref=pref)) or (mx := self.rcsfx(x, 4, r='ex', pref=pref))): afx = 'ices_'
            elif x.endswith('es') and (mx := self.rcsfx(x, 2, r='is', pref=pref)): afx = 'es_'

            if mx: record(mx, x, afx)

        for x in [x for x in self.bases if len(x) > 3 and (x[-1] in 'aeix') and x not in igls]:
            mx = False

            if x.endswith('ia') and (mx := self.rcsfx(x, 1, r='um', pref=pref)): afx = 'Irg'
            elif x.endswith('i') and (mx := self.rcsfx(x, 1, r='us', pref=pref)): afx = 'Irg'
            elif x.endswith('eaux') and (mx := self.rcsfx(x, 1, pref=pref)): afx = 'Irg'
            elif x.endswith('a') and (mx := self.rcsfx(x, 1, r='on', pref=pref)): afx = 'Irg'
            elif x.endswith('ae') and (mx := self.rcsfx(x, 1, pref=pref)): afx = 'Irg'

            if mx: record(mx, x, afx)

        print(f'{sl - len(self.bases)} items combined for plurals\n{len(self.bases)} remaining')

    def prpt_parse(self, pref=False):
        """Merge present-progressive ('-ing') forms into their bases.

        Loads manual replacements ('prpt_rpls') and the ignore list
        ('prpt_igls'), applies the replacements when ``pref`` is false, then
        for every base longer than five characters that ends in 'ing' and is
        not ignored tries, in order: undoubling the final consonant, restoring
        a dropped 'e', plain removal, and removal of 'king' after a 'c'.
        The first hit is merged with affix 'ing_' and category 'pprg'.
        """
        sl = len(self.bases)
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\prpt_rpls', 'rt') as f:
            rpls = [line.strip().split() for line in f.readlines()]
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\prpt_igls', 'rt') as f:
            igls = [line.strip() for line in f.readlines()]
        if not pref:
            for entry in rpls:
                self.update(entry[0], entry[1], entry[2], 'pprg')

        # (suffix length, extra rcsfx options), tried top to bottom.
        attempts = (
            (3, {'t': 'dbl'}),
            (3, {'r': 'e'}),
            (3, {}),
            (4, {'vd': {0: 'k', 1: 'c'}}),
        )

        candidates = [w for w in self.bases if len(w) > 5 and w.endswith('ing') and w not in igls]
        for word in candidates:
            mx = False
            for strip_len, opts in attempts:
                mx = self.rcsfx(word, strip_len, pref=pref, **opts)
                if mx:
                    break
            if not mx:
                continue
            if pref:
                self.update(self.__getitem__(mx[0]).w, word, ('ing_', *[z for z in mx[1:] if z]), 'pprg')
            else:
                self.update(self.__getitem__(mx).w, word, 'ing_', 'pprg')

        print(f'{sl - len(self.bases)} items combined for present progressives\n{len(self.bases)} remaining')

    def pt_parse(self, pref=False):
        """Merge past-tense ('-ed') forms into their bases.

        Loads manual replacements ('pt_rpls') and the ignore list ('pt_igls'),
        applies the replacements when ``pref`` is false, then for every base
        longer than four characters that ends in 'ed' and is not ignored
        tries, in order: ied->y (only for words ending in 'ied'), dropping
        'd', dropping 'ed', undoubling the final consonant, and dropping 'ked'
        after 'c'. The first hit is merged with affix 'ed_' and category 'past'.
        """
        sl = len(self.bases)
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\pt_rpls', 'rt') as f:
            rpls = [line.strip().split() for line in f.readlines()]
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\pt_igls', 'rt') as f:
            igls = [line.strip() for line in f.readlines()]
        if not pref:
            for entry in rpls:
                self.update(entry[0], entry[1], entry[2], 'past')

        # (required word ending or None, suffix length, extra rcsfx options)
        attempts = (
            ('ied', 3, {'t': 'iy'}),
            (None, 1, {}),
            (None, 2, {}),
            (None, 2, {'t': 'dbl'}),
            (None, 3, {'vd': (-4, 'ck')}),
        )
        afx = 'ed_'

        candidates = [w for w in self.bases if len(w) > 4 and w.endswith('ed') and w not in igls]
        for word in candidates:
            mx = False
            for ending, strip_len, opts in attempts:
                if ending is not None and not word.endswith(ending):
                    continue
                mx = self.rcsfx(word, strip_len, pref=pref, **opts)
                if mx:
                    break
            if not mx:
                continue
            if pref:
                self.update(self.__getitem__(mx[0]).w, word, (afx, *[z for z in mx[1:] if z]), 'past')
            else:
                self.update(self.__getitem__(mx).w, word, afx, 'past')
        print(f'{sl - len(self.bases)} items combined for past \n{len(self.bases)} remaining')

    def adjv_parse(self, pref=False):
        """Merge comparative/superlative ('-er'/'-est') and '-y' family words into their bases.

        Loads manual replacements ('adj_rpls') and the ignore list ('adj_igls').
        When ``pref`` is false the replacements are applied and '-est'/'-er'
        pairs are resolved to a shared root. In both modes every remaining base
        ending in 'y' (longer than four characters, not ignored, not ending in
        one of the ``fls`` fragments) is run through an ordered list of suffix
        rules; the first rule for which ``self.rcsfx`` returns a base wins.
        Prints how many bases were merged and how many remain.
        """
        #er/est | y
        sl = len(self.bases)
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\adj_rpls', 'rt') as f:
            rpls = [x.strip().split() for x in f.readlines()]
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\adj_igls', 'rt') as f:
            igls = [x.strip() for x in f.readlines()]
        # Word endings that exclude a '-y' word from the second loop below.
        fls = 'logy berry play copy body away shy fly day'.split()
        if not pref:
            for x in rpls:
                self.update(x[0], x[1], x[2], 'core')

            # Pair every '...est' base with the string made by replacing its final 'st'
            # with 'r' (e.g. 'biggest' -> 'bigger'), kept only when that string is a base too.
            sgrp = [(x, f'{x[:-2]}r') for x in [x for x in self.bases if x.endswith('est')] if f'{x[:-2]}r' in self.bases and x not in igls]
            for x in sgrp:
                # '-iest' words use a four-character suffix, all others three.
                if x[0][-4] == 'i': al = 4
                else: al = 3
                pack = []
                # k == 0 is the '-est' word, k == 1 the '-er' word (suffix one character shorter).
                for k, y in enumerate(x):
                    if (mx := self.rcsfx(y, al-k, pref=pref)): pack.append(mx)
                    if (mx := self.rcsfx(y, al-k, r='e', pref=pref)): pack.append(mx)
                    if (mx := self.rcsfx(y, al-k, t='iy', pref=pref)): pack.append(mx)
                    if (mx := self.rcsfx(y, al-k, t='dbl', pref=pref)): pack.append(mx)

                if pack:
                    # Keep only candidates that were produced more than once.
                    # NOTE(review): the set makes the order of the survivors unspecified;
                    # with more than two survivors only pack[0] and pack[1] are looked at.
                    pack = tuple(set([z for z in pack if pack.count(z) > 1 and z not in igls]))
                    if pack:
                        if len(pack) > 1:
                            # When one survivor ends in 'y' the other is taken as the root
                            # and the 'y' word is merged into it with affix 'y_'.
                            if pack[0][-1] == 'y':
                                rt = pack[1]
                                self.update(rt, pack[0], 'y_', 'core')
                            else:
                                rt = pack[0]
                                self.update(rt, pack[1], 'y_', 'core')
                        else: rt = pack[0]
                        self.update(rt, x[0], ('est_' if x[0].endswith('est') else 'er_'), 'core')
                        self.update(rt, x[1], ('est_' if x[1].endswith('est') else 'er_'), 'core')

        for x in [x for x in self.bases if len(x) > 4 and x.endswith('y') and x not in igls and all(not x.endswith(y) for y in fls)]:
            mx = False
            # Default affix label is the last two characters; the one-character rules override it with 'y_'.
            afx = f'{x[-2:]}_'

            # Ordered rule chain: the first rule whose rcsfx call returns a base wins.
            if x.endswith('ly') and (mx := self.rcsfx(x, 2, vd={1: 'bcdefghklmnprstwxy'}, pref=pref)): pass
            elif x.endswith('ry') and (mx := self.rcsfx(x, 2, vd={1: 'cdeklnt'}, pref=pref)): pass
            elif x.endswith('ty') and (mx := self.rcsfx(x, 2, vd={1: 'elx'}, pref=pref)): pass
            elif x.endswith('bility') and (mx := self.rcsfx(x, 5, r='le', pref=pref)): pass
            elif x.endswith('cy') and (mx := self.rcsfx(x, 2, r='te', vd={1: 'a'}, pref=pref)): pass
            elif x.endswith('cy') and (mx := self.rcsfx(x, 2, r='t', vd={1: 'n'}, pref=pref)): pass
            elif (mx := self.rcsfx(x, 1, vd={1: 'dfghklmnprstwxz'}, pref=pref)): afx = 'y_'
            elif any(x.endswith(y) for y in ('ily', 'ary', 'ory', 'ity', 'ify')) and (mx := self.rcsfx(x, 3, r='e', vd={1: 'bcdgklmnprstvz'}, pref=pref)): pass
            elif (mx := self.rcsfx(x, 1, r='e', vd={1: 'bcdgklmnprstvz'}, pref=pref)): afx = 'y_'
            elif (mx := self.rcsfx(x, 1, t='dbl', vd={1: 'bdglmnpt'}, pref=pref)): afx = 'y_'
            elif (mx := self.rcsfx(x, 2, vd={0: 'r', 1: 'r', 2: 'u'}, pref=pref)): pass
            elif x.endswith('ically') and (mx := self.rcsfx(x, 4, pref=pref)): pass
            elif (x.endswith('arily') or x.endswith('sily')) and (mx := self.rcsfx(x, 3, r='y', pref=pref)): pass
            elif x.endswith('llary') and (mx := self.rcsfx(x, 4, pref=pref)): pass
            elif x.endswith('ary') and (mx := self.rcsfx(x, 3, vd={1: 'bdmnrt'}, pref=pref)): pass
            elif x.endswith('ily') and (mx := self.rcsfx(x, 3, vd={1: 'dhkmpt'}, pref=pref)): pass
            elif x.endswith('ity') and (mx := self.rcsfx(x, 3, vd={1: 'cdelmnrtx'}, pref=pref)): pass
            elif x.endswith('ory') and (mx := self.rcsfx(x, 3, vd={1: 'st'}, pref=pref)): pass
            elif x.endswith('ily') and (mx := self.rcsfx(x, 3, t='dbl', vd={1: 'ndp'}, pref=pref)): pass
            elif x.endswith('ity') and (mx := self.rcsfx(x, 3, t='dbl', vd={1: 'lp'}, pref=pref)): pass
            elif x.endswith('ery') and (mx := self.rcsfx(x, 3, t='dbl', vd={1: 'bgln'}, pref=pref)): pass

            if mx:
                # With pref the rcsfx result is a tuple: mx[0] is the base, the rest are prefix labels.
                if pref: self.update(self.__getitem__(mx[0]).w, x, (afx, *[z for z in mx[1:] if z]), 'core')
                else: self.update(self.__getitem__(mx).w, x, afx, 'core')
        print(f'{sl - len(self.bases)} items combined for adjectives \n{len(self.bases)} remaining')

    def mbr_parse(self, pref=False):
        """Merge membership/agent words ('-ist', '-ian', '-er', '-or', '-ee') into their bases.

        Loads manual replacements ('mbr_rpls') and the ignore list ('mbr_igls')
        and applies the replacements when ``pref`` is false. Two passes follow,
        each trying an ordered chain of ``self.rcsfx`` rules and merging the
        word on the first hit with category 'core':

        1. bases longer than five characters ending in 'ian' or 'ist';
        2. bases longer than four characters ending in 'er', 'ee' or 'or'
           that contain none of the ``fls`` fragments.

        Prints how many bases were merged and how many remain.
        """
        #ist/ian, er/or/ee
        sl = len(self.bases)
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\mbr_rpls', 'rt') as f:
            rpls = [x.strip().split() for x in f.readlines()]
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\mbr_igls', 'rt') as f:
            igls = [x.strip() for x in f.readlines()]
        # Fragments that exclude a word from the second pass when found anywhere in it.
        fls = 'meter water power flower polar'.split()
        if not pref:
            for x in rpls:
                self.update(x[0], x[1], x[2], 'core')

        al = 3
        for x in [x for x in self.bases if len(x) > 5 and (x.endswith('ian') or x.endswith('ist')) and x not in igls]:
            mx = False

            # First rule swaps the final character for 'm' (e.g. '...ist' -> '...ism').
            # NOTE(review): it is not restricted to '-ist' words but always labels 'ist_'.
            if (mx := self.rcsfx(x, 1, r='m', pref=pref)): afx = 'ist_'
            elif (mx := self.rcsfx(x, al, r='y', pref=pref)): afx = f'{x[-3:]}_'
            elif x.endswith('scientist') and (mx := self.rcsfx(x, len('scientist'), r='science', pref=pref)): afx = 'ist_'
            elif x.endswith('tarian') and (mx := self.rcsfx(x, 5, r='y', pref=pref)): afx = 'ian_'
            elif x.endswith('ician') and (mx := self.rcsfx(x, 3, r='s', pref=pref)): afx = 'cian_'
            elif x.endswith('ician') and (mx := self.rcsfx(x, 5, pref=pref)): afx = 'cian_'
            elif x.endswith('ian') and (mx := self.rcsfx(x, al, t='m', pref=pref)): afx = 'ian_'
            elif (mx := self.rcsfx(x, al, pref=pref)): afx = f'{x[-3:]}_'
            elif (mx := self.rcsfx(x, al, r='e', pref=pref)): afx = f'{x[-3:]}_'
            elif (mx := self.rcsfx(x, al, t='dbl', pref=pref)): afx = f'{x[-3:]}_'
            elif (mx := self.rcsfx(x, al, r='ic', pref=pref)): afx = f'{x[-3:]}_'

            if mx:
                # With pref the rcsfx result is a tuple: mx[0] is the base, the rest are prefix labels.
                if pref: self.update(self.__getitem__(mx[0]).w, x, (afx, *[z for z in mx[1:] if z]), 'core')
                else: self.update(self.__getitem__(mx).w, x, afx, 'core')

        for x in [x for x in self.bases if len(x) > 4 and x[-2:] in ('er', 'ee', 'or') and x not in igls and all(y not in x for y in fls)]:
            #and x[-3] in 'stlr'
            # Suffix length depends on the ending: 'ster' -> 4, 'ier' -> 3, otherwise 2.
            if x.endswith('ster'): al = 4
            elif x.endswith('ier'): al = 3
            else: al = 2
            mx = False
            afx = f'{x[-2:]}_'

            # NOTE(review): `al` is already 3 for 'ier' and 4 for 'ster', so the `al+1`
            # and `al+2` rules below strip 4 and 6 characters for those endings. The
            # offsets read as if written for al == 2 - confirm which length is intended.
            if (mx := self.rcsfx(x, al, t='m', pref=pref)): pass
            elif (mx := self.rcsfx(x, al, r='e', pref=pref)): pass
            elif x.endswith('ier') and (mx := self.rcsfx(x, al+1, t='iy', pref=pref)): pass
            elif x.endswith('ier') and (mx := self.rcsfx(x, al+1, r='e', pref=pref)): pass
            elif x.endswith('eer') and (mx := self.rcsfx(x, al+1, pref=pref)): pass
            elif (mx := self.rcsfx(x, al, t='dbl', pref=pref)): pass
            elif x.endswith('ster') and (mx := self.rcsfx(x, al+2, pref=pref)): afx = 'ster_'

            if mx:
                if pref: self.update(self.__getitem__(mx[0]).w, x, (afx, *[z for z in mx[1:] if z]), 'core')
                else: self.update(self.__getitem__(mx).w, x, afx, 'core')

        print(f'{sl - len(self.bases)} items combined for membership \n{len(self.bases)} remaining')

    def sfx1_parse(self, pref=False):
        """Merge '-less', '-ness', '-able', '-ible' words and '-man'/'-woman' compounds.

        When ``pref`` is false a short built-in list of '-able' replacements is
        applied first. Then:

        * every base longer than four characters with one of the four suffixes
          (and not in the built-in ignore list) is tried with plain removal,
          removal plus 'e' (able/ible only), and i->y restoration when the
          character before the suffix is 'i'; hits are merged with the suffix
          as affix ('ness_', 'able' for both able/ible, otherwise the raw last
          four characters);
        * every base longer than five characters ending in woman/women, then
          man/men, is merged into 'woman' / 'man' with the stripped remainder
          recorded as the affix.
        """
        sl = len(self.bases)
        rpls = [('use', 'usable'), ('note', 'notable'), ('ride', 'ridable'), ('erase', 'erasable'), ('tend', 'tenable')]
        igls = 'parable liable capable arable sister'.split()
        if not pref:
            for base, derived in rpls:
                self.update(base, derived, 'able', 'core')

        al = 4
        endings = ('less', 'ness', 'able', 'ible')
        candidates = [w for w in self.bases if len(w) > 4 and w not in igls and w.endswith(endings)]
        for x in candidates:
            mx = self.rcsfx(x, al, pref=pref)
            if not mx and x.endswith(('able', 'ible')):
                mx = self.rcsfx(x, al, r='e', pref=pref)
            if not mx and x[-5] == 'i':
                mx = self.rcsfx(x, al+1, t='iy', pref=pref)
            if not mx:
                continue

            if x.endswith('ness'):
                afx = 'ness_'
            elif x.endswith('ible'):
                afx = 'able'
            else:
                afx = x[-4:]

            if pref:
                self.update(self.__getitem__(mx[0]).w, x, (afx, *[z for z in mx[1:] if z]), 'core')
            else:
                self.update(self.__getitem__(mx).w, x, afx, 'core')

        for singular, plural in (('woman', 'women'), ('man', 'men')):
            al = len(singular)
            # The candidate list is rebuilt per target, after the previous target's merges.
            compounds = [w for w in self.bases if len(w) > 5 and w not in igls and w.endswith((singular, plural))]
            for x in compounds:
                mx = self.rcsfx(x, al, pref=pref)
                if not mx:
                    continue
                if pref:
                    self.update(singular, x, (self.__getitem__(mx[0]).w, *[z for z in mx[1:] if z]), 'core')
                else:
                    self.update(singular, x, self.__getitem__(mx).w, 'core')
        print(f'{sl - len(self.bases)} items combined for sfx1 \n{len(self.bases)} remaining')

    def v2d_parse(self, pref=False):
        """Merge '-ive' words into the bases they derive from.

        Loads manual replacements ('v2d_rpls') and the ignore list
        ('v2d_igls'), applies the replacements when ``pref`` is false, then
        for every base longer than six characters that ends in 'ive', contains
        none of the ``fls`` fragments and is not ignored, tries an ordered list
        of ``self.rcsfx`` rules and merges the word on the first hit with
        affix 'ive_' and category 'core'.

        All lookups go through ``self`` so the method works on any instance,
        not only the notebook-global ``wbs``.
        """
        sl = len(self.bases)
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\v2d_rpls', 'rt') as f:
            rpls = [x.strip().split() for x in f.readlines()]
        with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\v2d_igls', 'rt') as f:
            igls = [x.strip() for x in f.readlines()]
        if not pref:
            for x in rpls:
                self.update(x[0], x[1], x[2:], 'core')
        fls = 'active drive ceptive ceive dive hive'.split()

        al = 3
        # (suffix length, extra rcsfx options), tried top to bottom. Repeated
        # identical attempts were removed; the relative order is unchanged.
        attempts = (
            (al, {}),
            (al+1, {'r': 'de', 'vd': {0: 's', 1: 'aeiou'}}),
            (al+2, {'r': 'e', 'vd': (-5, 'it')}),
            (al+2, {'vd': (-5, 'it')}),
            (al+1, {'r': 'd', 'vd': {0: 'sn'}}),
            (al, {'r': 'e'}),
            (al+2, {'r': 'e', 'vd': (-5, 'at')}),
            (al+2, {'vd': (-5, 'at')}),
            (al+2, {'r': 'y', 'vd': (-5, 'at')}),
        )

        # all(...) is required: a bare generator expression is always truthy and
        # would never exclude a word containing one of the `fls` fragments.
        for x in [x for x in self.bases if len(x) > 3+al and x.endswith('ive') and all(y not in x for y in fls) and x not in igls]:
            mx = False
            for strip_len, opts in attempts:
                mx = self.rcsfx(x, strip_len, pref=pref, **opts)
                if mx: break

            if mx:
                if pref: self.update(self.__getitem__(mx[0]).w, x, ('ive_', *[z for z in mx[1:] if z]), 'core')
                else: self.update(self.__getitem__(mx).w, x, 'ive_', 'core')
        print(f'{sl - len(self.bases)} items combined for v2d\n{len(self.bases)} remaining')



# Load the test words, then build the search structure and the working lexicon.
with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\tests', 'rt') as f:
    tests = [line.strip() for line in f.readlines()]

a, roots, nfx = setup()
a.assign_search_dict(a.bare)
# The lexicon is seeded with the word list stripped of its boundary underscores.
wbs = Lexicon({word.strip('_') for word in usk_rep(a.wlst)})



In [None]:

# Manually curated merges first.
wbs.homo_parse()
wbs.irg_parse()

# Suffix passes run twice in the same order: first on whole words,
# then again allowing a prefix to be stripped before matching.
suffix_passes = (
    wbs.pl_parse,
    wbs.prpt_parse,
    wbs.pt_parse,
    wbs.adjv_parse,
    wbs.mbr_parse,
    wbs.sfx1_parse,
)
for use_prefix in (False, True):
    for run_pass in suffix_passes:
        run_pass(pref=use_prefix)

# Finally merge words that differ from a base only by a prefix.
wbs.pre_parse()


In [None]:
# Inspect remaining bases ending in er/ee/or whose preceding letter is s, t, l or r.
[word for word in wbs.bases if len(word) > 4 and word.endswith(('er', 'ee', 'or')) and word[-3] in 'stlr']

In [33]:
# Checkpoint the current lexicon so later cells can reload it.
with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\base1', 'wb') as base_file:
    dump(wbs, base_file)

In [32]:
# Group roots by shared stem: the shortest remaining entry becomes the head of a
# group, and every other entry that starts with it joins that group.
ppres = sorted(lrsort(roots, trim=True), key=len)[::-1]   # longest first, so pop() yields the shortest
pre = []

while ppres:
    head = ppres.pop()
    extensions = [cand for cand in ppres if cand != head and cand.startswith(head)]
    for cand in extensions:
        ppres.remove(cand)
    pre.append([head, *extensions])

# Step through the groups one at a time with next(afgen).
afgen = (grp for grp in pre)

In [None]:
# Restore the lexicon checkpoint written by the dump cell.
# NOTE(review): if `load` is pickle's, only load files you created yourself.
with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\base1', 'rb') as base_file:
    wbs = load(base_file)

In [34]:
# One whitespace-separated group of roots per line; consumed lazily with next(root_groups).
with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\root groups', 'rt') as groups_file:
    root_groups = (line.strip().split() for line in groups_file.readlines())


In [None]:
# Manual curation scratch cell: fold 'bine' into 'bin', register 'bol' as a form
# of 'ball', and swap 'categor' for 'category' in the current group.
# Requires `group` to be defined already (it is assigned in the cell that calls
# next(root_groups)).
sword = 'bine'
wbs.update_bases('bin', sword, 'root', 'form')
if sword in wbs.bases:
    wbs.bases.pop(sword)

wbs.rform('ball', 'bol')
group.append('category')
for stale in ('categor',):
    group.remove(stale)
    if stale in wbs.bt:
        print(stale)

In [None]:

# Flag every member of the current group as a root. Members that are only
# tracked as a form flag the first entry of their tracking group instead;
# unknown members are added as new Word entries.
for name in group:
    if name in wbs.bases:
        key = name
    elif name in wbs.bt:
        key = wbs.bt[name][0]
    else:
        key = name
        wbs.bases[key] = Word(name)
    wbs.bases[key].root = True


In [None]:
# Pick the central form of the group and keep the remaining members for processing.
center = 'able'
group = [x for x in group if x != center]

for x in group:
    # TODO: the per-member handling was never written; `pass` keeps the cell
    # syntactically valid (a `for` with no body is a SyntaxError).
    pass

In [None]:
# Pull the next root group and list, for each stem, the bases that contain it.
group = next(root_groups)
cprint(group, [1, 3, 5], hlite=True)

# Stems: each root with a trailing vowel removed.
tgroup = []
for root in group:
    tgroup.append(root if root[-1] not in 'aeiou' else root[:-1])

# For each stem, the bases containing it but none of the other stems that are
# at least as long (so a word is listed under its most specific stem).
c1 = []
for stem in tgroup:
    rivals = [other for other in tgroup if other != stem and len(other) >= len(stem)]
    c1.append([word for word in wbs.bases if stem in word and not any(rv in word for rv in rivals)])

cprint(tgroup, [1, 3, 5], hlite=True)
cprint(c1, [1, 3, 5], hlite=group, hl_col=True)

In [268]:
# Snapshot (word, forms, mod) for every base currently flagged as a root.
hold = [(word, wbs[word].forms, wbs[word].mod) for word in wbs.bases if wbs[word].root]

In [None]:
# Snapshot the root entries, then replay them onto `wbs`: re-flag each root and
# re-attach its forms and mods.
hold = [(x, wbs[x].forms, wbs[x].mod) for x in wbs.bases if wbs[x].root]

for x in hold:
    if x[0] in wbs.bases:
        wbs.bases[x[0]].root = True
    elif x[0] in wbs.bt:
        wbs.bases[wbs.bt[x[0]][0]].root = True
    else:
        wbs.add_word(x[0], root=True)
    if x[1]:
        for y in x[1]:
            # Forms may be stored as Word objects or plain strings.
            name = y.w if isinstance(y, Word) else y
            if y in wbs.bt:
                wbs.update_bases(x[0], name, 'root', 'forms')
                # A tracked form is not necessarily still a base; pop with a
                # default so an already-merged entry does not raise KeyError.
                wbs.bases.pop(name, None)
            else:
                wbs.rform(x[0], name)
    if x[2]:
        for y in x[2]:
            wbs.update_bases(x[0], y.w, 'root', 'mod')
            wbs.bases.pop(y.w, None)

In [None]:
# Rule exploration for '-ive' words: for every candidate, record which suffix
# rule (if any) recovers a known base, first on the whole word and then with
# prefix stripping enabled.
sterms = ['ive']
igls = ()
al = 3

# (suffix length, rcsfx options) in priority order; one result column per rule.
rules = (
    (al+1, {'r': 'de', 'vd': {0: 's', 1: 'aeiou'}}),
    (al+2, {'r': 'e', 'vd': (-5, 'it')}),
    (al+2, {'vd': (-5, 'it')}),
    (al+1, {'r': 'd', 'vd': {0: 'sn'}}),
    (al, {}),
    (al, {'r': 'e'}),
    (al+2, {'r': 'e', 'vd': (-5, 'at')}),
    (al+2, {'vd': (-5, 'at')}),
    (al+2, {'r': 'y', 'vd': (-5, 'at')}),
)

c1 = [w for w in wbs.bases if len(w) > 6 and w.endswith(tuple(sterms)) and not any(bad in w for bad in igls)]
c1 = [w.strip() for w in rrsort(c1)]
c0 = []
cgrp = [[] for _ in rules]

for x in c1:
    hit = None
    for extra in ({}, {'pref': True}):
        results = [wbs.rcsfx(x, strip_len, **opts, **extra) for strip_len, opts in rules]
        hit = next(((i, y) for i, y in enumerate(results) if y), None)
        if hit is not None:
            break

    # '-' marks a resolved word, 'X' an unresolved one; only the winning rule's
    # column gets the result, every other column gets the '_' filler.
    c0.append('X' if hit is None else '-')
    for k, column in enumerate(cgrp):
        column.append(hit[1] if hit is not None and hit[0] == k else '_')

print(f'\t\t\t\t\t{c0.count("X")} / {len(c1)}')
cprint([len(c1) - column.count('_') for column in cgrp], [10, 14], col_width=4, halign='r', hlite=True)
cprint(
    [c0, c1, *cgrp],
    [1, 1, 10, 14],
    halign={0: 'r', 1: 'l', 2: 'r'},
    col_width=4,
    hlite=['X', 'ative', 'ade', 'sive'],
    hl_col=True
)


In [7]:
# Candidate '-ess' words, excluding those where the letter before 'ess' is 'l' or 'n'.
al = 5
c1 = [word for word in wbs.bases if len(word) > 6 and word.endswith('ess') and word[-4] not in 'ln']
c1 = [word.strip() for word in rrsort(c1)]

In [None]:
# Rebuild the hit/miss column from the rule columns: a word is 'X' when every
# rule column holds the '_' filler for it, '-' otherwise.
# Relies on `cgrp` having one entry per word of the current `c1`.
c0 = ['-' if sum(column[i] != '_' for column in cgrp) else 'X' for i, _ in enumerate(c1)]

print(f'\t\t\t\t\t{c0.count("X")} / {len(c1)}')
#cprint(['ade rep', '-1e rep', '-1 drop', 'ss-t rep', 'd rep', 't rep', 've rep', 'el rep', 'drop'], [10, 14, 18, 22, 26, 30, 34, 38, 42, 50, 54], col_width=4, halign='r', hlite=True)
cprint([len(c1) - column.count('_') for column in cgrp], [10, 14], col_width=4, halign='r', hlite=True)

cprint(
    [c0, c1, *cgrp],
    [1, 1, 10, 14],
    halign={0: 'r', 1: 'l', 2: 'r'},
    col_width=4,
    hlite=['ess', 'ress', '_'],
    hl_col=True
)


In [None]:
# Rule exploration for '-able' / '-ible' words: for every candidate, record
# which suffix rule (if any) recovers a known base - first on the whole word,
# then again with prefix stripping enabled (pref=True). The second round
# previously repeated the first without pref=True and so could never add a hit.
sterms = ['able', 'ible']
igls = ()
al = 4
c0 = []
c1 = [x for x in wbs.bases if len(x) > 6 and any(x.endswith(y) for y in sterms) and all(y not in x for y in igls)]
c1 = [x.strip() for x in rrsort(c1)]
# Nine columns are kept for layout parity with the other exploration cells;
# only the first three are used by the rules below.
cgrp = [list() for _ in range(9)]

for x in c1:
    hit = None
    for use_pref in (False, True):
        results = [
            wbs.rcsfx(x, al, pref=use_pref),
            x.endswith('able') and wbs.rcsfx(x, al, r='e', pref=use_pref),
            x[-5] == 'i' and wbs.rcsfx(x, al+1, t='iy', pref=use_pref),
        ]
        hit = next(((i, y) for i, y in enumerate(results) if y), None)
        if hit is not None:
            break

    # '-' marks a resolved word, 'X' an unresolved one; only the winning rule's
    # column gets the result, every other column gets the '_' filler.
    c0.append('X' if hit is None else '-')
    for k in range(len(cgrp)):
        cgrp[k].append(hit[1] if hit is not None and hit[0] == k else '_')

print(f'\t\t\t\t\t{len([x for x in c0 if x == "X"])} / {len(c1)}')
cprint([len(c1) - x.count('_') for x in cgrp], [10, 14], col_width=4, halign='r', hlite=True)
cprint(
    [c0, c1, *cgrp],
    [1, 1, 10, 14],
    halign={0: 'r', 1: 'l', 2: 'r'},
    col_width=4,
    hlite=['X', 'ative', 'ade', 'sive'],
    hl_col=True
)



In [None]:
# Group the known prefixes by shared stem (shortest prefix heads each group),
# then show, for the first group, which words rcpfx can reduce.
ppres = sorted(wbs.mpx, key=len)[::-1]   # longest first, so pop() yields the shortest
pre = []

while ppres:
    head = ppres.pop()
    extensions = [cand for cand in ppres if cand != head and cand.startswith(head)]
    for cand in extensions:
        ppres.remove(cand)
    pre.append([head, *extensions])

afgen = (grp for grp in pre)

group = next(afgen)
hlls = ['X', *group]
columns, positions, labels, aldct = [], [], [], {}
pix, aix = 1, 0

# One word list per prefix: words starting with it that contain none of the
# group's other prefixes of equal or greater length. For a single-prefix group
# this is simply every word starting with that prefix.
pack = []
for prefix in group:
    rivals = [other for other in group if other != prefix and len(other) >= len(prefix)]
    matches = [word for word in a.bare if word.startswith(prefix) and not any(rv in word for rv in rivals)]
    pack.append([word.strip() for word in lrsort(matches)])

for i, p in enumerate(pack):
    found = [wbs.rcpfx(word) for word in p]
    reps = [res if res else '_' for res in found]
    tally = ['-' if res else 'X' for res in found]

    # Three display columns per prefix: hit marker, word, and reduction result.
    columns.extend([tally, p, reps])
    positions.extend([pix, pix+1, pix+10])
    aldct.update({aix: 'l', aix+1: 'l', aix+2: 'r'})
    pix += 10
    aix += 3
    labels.append(f'{hlls[1+i]}   {tally.count("-")}/{len(p)}')

cprint(labels, positions[::3], halign='l', col_width=4, hlite=True)
cprint(columns, positions, halign=aldct, col_width=4, hlite=hlls)

# Words that start with all but the last letter of `pf` yet not with `pf` itself.
pf = 'irr'
lrsort([word for word in a.bare if word.startswith(pf[:-1]) and not word.startswith(pf)])

In [47]:
# Every base wrapped in underscores, so that prefix ('_pre') and suffix ('ing_')
# affix strings can be matched by plain substring search.
bls = [f'_{z}_' for z in wbs.bases]
# For each affix, the wrapped bases that contain it.
als = {x: tuple([y for y in bls if x in y]) for x in nfx}
# Raw occurrence count per affix.
fcnt = Counter({x[0]: len(x[1]) for x in als.items()})
# For each affix, the longer affixes that contain it (only affixes that have any).
dupe = {x: [y for y in nfx if x in y and x != y] for x in nfx if len([y for y in nfx if x in y and x != y]) > 0}
# Subtract the counts of containing affixes from the shorter affix.
# NOTE(review): fcnt is modified in place, so fcnt[y] may already have been
# reduced when it is subtracted here - the result depends on iteration order.
for x in dupe.items():
    for y in x[1]:
        fcnt[x[0]] -= fcnt[y]

# Suffixes and prefixes, each passed through its sort helper and stripped.
sx = [x.strip() for x in rrsort([x for x in nfx if x.endswith('_')])]
px = [x.strip() for x in lrsort([x for x in nfx if x.startswith('_')])]
# Rebind als: drop, for affixes that have containing affixes, the words that
# match one of those longer affixes. Must run after fcnt was built from the old als.
als = {x[0]: (tuple([y for y in x[1] if all(z not in y for z in dupe[x[0]])]) if x[0] in dupe else x[1]) for x in als.items()}
# For every root longer than two characters, the other roots containing it, longest first.
rgroups = {x: sorted([y for y in roots if x in y and x != y], key=lambda x: len(x))[::-1] for x in roots if len(x) > 2}
# Roots contained in no other root.
solos = [x[0] for x in rgroups.items() if len(x[1]) == 0]
# Groups with more than three containing roots, and groups with one to three;
# the root itself is appended to its group.
lgroups = [(x[0], [*x[1], x[0]]) for x in rgroups.items() if len(x[1]) > 3]
sgroups = [(x[0], [*x[1], x[0]]) for x in rgroups.items() if 0 < len(x[1]) < 4]

In [None]:
# Sort prefixes, suffixes and roots with rdx_sort and save them.
# Entries are padded to nine characters for sorting (prefixes and roots on the
# right, suffixes on the left) and stripped again afterwards.
prefix_keys = list({affix.ljust(9) for affix in nfx if affix.startswith('_')})
suffix_keys = list({affix.rjust(9) for affix in nfx if affix.endswith('_')})
pfx = [key.strip() for key in rdx_sort(prefix_keys)]
sfx = [key.strip() for key in rdx_sort(suffix_keys, mcd=True)]
pfx.extend(sfx)
nfx = pfx.copy()

root_keys = list({root.ljust(9) for root in roots})
roots = [key.strip() for key in rdx_sort(root_keys)]

with open(f'C:\\Users\\BBA\\Coding\\NLP\\Embeddings\\data\\v0\\awork4', 'wb') as out_file:
    dump((roots, nfx), out_file)