In [1]:
import os, sys
# Locate the project root: the prefix of the current working directory that ends
# with the first occurrence of 'pygents'. It is added to sys.path so the pygents
# package can be imported, and made the working directory so that relative
# paths such as 'data/models/...' used further down resolve.
cwd = os.getcwd()
_marker = 'pygents'
_marker_pos = cwd.find(_marker)
if _marker_pos >= 0:
    project_path = cwd[:_marker_pos + len(_marker)]
else:
    # str.find returns -1 when the marker is absent; slicing with that value
    # would silently produce cwd[:6]. Keep the current directory instead.
    print("warning: 'pygents' not found in", cwd, "- using the current directory as project path")
    project_path = cwd
if project_path not in sys.path:
    sys.path.append(project_path)
os.chdir(project_path)

#https://stackoverflow.com/questions/41827983/right-way-to-calculate-the-cosine-similarity-of-two-word-frequency-dictionaries
#https://realpython.com/python-counter/

#from scipy.spatial.distance import cosine
#from sklearn.metrics.pairwise import cosine_similarity

import gc
from copy import deepcopy as dcp
import pandas as pd

# Force re-import: drop any cached pygents submodules from sys.modules so the
# `from pygents... import` statements that follow execute the modules again.
for _stale in ('pygents.util', 'pygents.text', 'pygents.plot', 'pygents.token'):
    sys.modules.pop(_stale, None)
del _stale

from pygents.token import *
from pygents.text import *
from pygents.util import *
from pygents.plot import plot_bars, plot_dict, matrix_plot


In [14]:
import math

def cosine_dic_parts(dic1,dic2):
    """
    Compute the three ingredients of a cosine similarity between two sparse
    vectors given as key -> weight dicts.

    Returns a tuple (dot, norm1, norm2) where
      dot   - sum over keys of dic1 of dic1[key] * dic2[key] (keys absent from
              dic2 contribute 0.0),
      norm1 - sum of squared weights of dic1,
      norm2 - sum of squared weights of dic2.
    The norms are NOT square-rooted here; callers combine them as needed.
    """
    dot = 0
    norm1 = 0
    for key in dic1:
        weight = dic1[key]
        dot += weight * dic2.get(key, 0.0)
        norm1 += weight * weight
    norm2 = 0
    for weight in dic2.values():
        norm2 += weight * weight
    return dot, norm1, norm2

def cosine_dic(dic1,dic2):
    """
    Cosine similarity of two sparse vectors given as key -> weight dicts.

    Returns 0.0 when the dot product is zero (which includes empty inputs),
    so the division is only performed when there is some overlap.
    """
    dot, norm1, norm2 = cosine_dic_parts(dic1, dic2)
    if dot == 0:
        return 0.0
    return dot / math.sqrt(norm1 * norm2)

# Sample sparse vectors (key -> weight); also reused by the cosine_dic2 checks.
x0 = dict(c=3, a=1, b=2)
x1 = dict(a=1, b=2, c=3)
x2 = dict(a=1, b=2, c=0)
x3 = dict(a=1, b=2, d=3)
x4 = dict(a=1, e=2, d=3)
x5 = dict(a=999, e=2, d=3)
x6 = dict(a=0.1, e=2, d=3)
x7 = dict(f=1, e=2, d=3)

# Regression checks for cosine_dic: (left, right, expected repr of the result).
for _left, _right, _expected in (
    (x0, x1, "1.0"),
    (x0, x2, "0.5976143046671968"),
    (x1, x2, "0.5976143046671968"),
    (x1, x3, "0.35714285714285715"),
    (x1, x4, "0.07142857142857142"),
    (x1, x5, "0.26725950125174264"),
    (x1, x6, "0.007409643851431125"),
    (x1, x7, "0.0"),
):
    assert str(cosine_dic(_left, _right)) == _expected
del _left, _right, _expected

# Cosine similarity of two vectors a and b that live in a two-segment
# ("bi-segment") vector space: each vector is given as a pair of dicts,
# one dict per segment.
def cosine_dic2(dica1,dicb1,dica2,dicb2):
    """
    dica1, dicb1 - first-segment dicts of vectors a and b
    dica2, dicb2 - second-segment dicts of vectors a and b

    The dot products and squared norms of both segments are added up before
    the final division, i.e. the two segments are treated as one concatenated
    vector. Returns 0.0 when both per-segment dot products are zero.
    """
    dot1, norm_a1, norm_b1 = cosine_dic_parts(dica1, dicb1)
    dot2, norm_a2, norm_b2 = cosine_dic_parts(dica2, dicb2)
    if dot1 == 0 and dot2 == 0:
        return 0.0
    return (dot1 + dot2) / math.sqrt((norm_a1 + norm_a2) * (norm_b1 + norm_b2))
# Regression checks for cosine_dic2: (arguments, expected repr of the result).
# With an empty second segment the result must match cosine_dic.
for _args, _expected in (
    ((x0, x1, {}, {}), "1.0"),
    ((x0, x2, {}, {}), "0.5976143046671968"),
    ((x1, x2, {}, {}), "0.5976143046671968"),
    ((x1, x3, {}, {}), "0.35714285714285715"),
    ((x1, x4, {}, {}), "0.07142857142857142"),
    ((x1, x5, {}, {}), "0.26725950125174264"),
    ((x1, x6, {}, {}), "0.007409643851431125"),
    ((x1, x7, {}, {}), "0.0"),
    ((x0, x1, x0, x1), "1.0"),
    ((x1, x7, x1, x7), "0.0"),
):
    assert str(cosine_dic2(*_args)) == _expected
del _args, _expected

def compute_similiarities(model,arity=1,debug=False):
    """
    Compute pairwise similarities between all keys of model[0] whose length
    equals `arity`.

    model - indexable with [0], [1], [2]; model[0] is iterated for the keys,
            model[1][key] and model[2][key] are the two per-segment dicts
            handed to cosine_dic2. A key missing from model[1] or model[2] is
            treated as an empty dict (same convention as model_to_dict)
            instead of raising KeyError.
    arity - only keys with len(key) == arity take part.
    debug - when true, print each key after its pairs have been processed.

    Returns a list of (x, y, similarity) tuples with x <= y, one per unordered
    pair, in the order the pairs are met when walking model[0].
    Assumes iterating model[0] yields each key once (e.g. it is a dict).
    """
    # Resolve both segment dicts once per key instead of once per pair.
    items = [
        (key,
         model[1][key] if key in model[1] else {},
         model[2][key] if key in model[2] else {})
        for key in model[0]
        if len(key) == arity
    ]
    lst = []
    for i, (a, a1, a2) in enumerate(items):
        # Pairing each key only with the keys after it visits every unordered
        # pair exactly once, so no set of already-seen pairs is needed.
        for b, b1, b2 in items[i + 1:]:
            s = cosine_dic2(a1, b1, a2, b2)
            lst.append((a, b, s) if a <= b else (b, a, s))
        if debug:
            print(a)
    return lst


def compute_similiarities_from_dict(dic,debug=False):
    """
    Compute pairwise similarities between all keys of `dic`.

    dic   - mapping key -> pair whose elements [0] and [1] are the two
            per-segment dicts handed to cosine_dic2 (the layout produced by
            model_to_dict).
    debug - when true, print each key after its pairs have been processed.

    Returns a list of (x, y, similarity) tuples with x <= y, one per unordered
    pair of keys, in the order the pairs are met when walking `dic`.
    """
    keys = list(dic)
    lst = []
    for i, a in enumerate(keys):
        a1 = dic[a][0]
        a2 = dic[a][1]
        # Pairing each key only with the keys after it visits every unordered
        # pair exactly once, so no O(n^2) set of already-seen pairs is needed.
        for j in range(i + 1, len(keys)):
            b = keys[j]
            s = cosine_dic2(a1, dic[b][0], a2, dic[b][1])
            lst.append((a, b, s) if a <= b else (b, a, s))
        if debug:
            print(a)
    return lst

def model_to_dict(model,arity=1,debug=False):
    """
    Build a flat dict key -> (model[1][key], model[2][key]) for every key of
    model[0] whose length equals `arity`.

    A key missing from model[1] or model[2] gets a fresh empty dict in that
    position. The inner dicts that do exist are referenced, not copied.
    `debug` is accepted but not used.
    """
    return {
        key: (
            model[1][key] if key in model[1] else {},
            model[2][key] if key in model[2] else {},
        )
        for key in model[0]
        if len(key) == arity
    }

def dict_merge(a,b):
    """
    Return a new dict holding the key-wise sum of `a` and `b`.

    The result starts as a deep copy of `a` (so `a` is left untouched); every
    key of `b` is then added to the existing value or inserted as is.
    """
    merged = dcp(a)
    for key, value in b.items():
        merged[key] = merged[key] + value if key in merged else value
    return merged
assert str(dict_merge({'a':0.2,'b':0.1},{'c':0.2,'b':0.1})) == "{'a': 0.2, 'b': 0.2, 'c': 0.2}"

def join_letters(a,b):
    """Return the characters of `a` and `b` combined into one sorted string."""
    letters = [*a, *b]
    letters.sort()
    return "".join(letters)
assert str(join_letters("1.2","zba")) == ".12abz"
    
def do_cluster(model,debug = False,max_merges = 102):
    """
    Greedy agglomerative clustering of the single-character keys of `model`.

    Starting from model_to_dict(model), the most similar pair of clusters
    (per compute_similiarities_from_dict) is merged repeatedly: the merged
    cluster is keyed by join_letters of both names and its two segment dicts
    are the dict_merge of the originals. The loop stops when fewer than two
    clusters remain or after `max_merges` merges.

    model      - passed to model_to_dict; not modified here.
    debug      - when true, print the initial cluster count, one line per
                 merge (step, clusters, pairs, left + right => similarity) and
                 the final cluster count.
    max_merges - upper bound on the number of merges; None means no bound.
                 The default of 102 reproduces the former hard-coded
                 `n > 100` cut-off.

    Returns None: the function has no return statement, so the clustering is
    only observable through the debug output.
    """
    clusters = model_to_dict(model)
    if debug:
        print(len(clusters))
    n = 0
    while max_merges is None or n < max_merges:
        simlst = compute_similiarities_from_dict(clusters)
        if not simlst:
            break # root
        # Single pass instead of sorting the whole list. Scanning in reverse
        # makes max() return the LAST of several equally similar pairs, which
        # is the pair a stable ascending sort would have left at the end.
        left, right, score = max(reversed(simlst), key=lambda tup: tup[2])
        if debug:
            print(n,len(clusters),len(simlst),left,'+',right,'=>',score)
        clusters[ join_letters(left,right) ] = (
            dict_merge(clusters[left][0],clusters[right][0]),
            dict_merge(clusters[left][1],clusters[right][1]),
        )
        del clusters[left]
        del clusters[right]
        n += 1
    if debug:
        print(len(clusters))
    

    

In [3]:
# Build a FreedomTokenizer for the 'brown_nolines_chars_7a' model path (chars mode,
# max_n=7) and print the clustering trace for its model.
# NOTE(review): presumably the constructor loads a previously saved model by name - confirm.
base = FreedomTokenizer(name='data/models/brown_nolines_chars_7a',max_n=7,mode='chars',debug=False)
do_cluster(base.model,debug=True)


68
0 68 2278 , + ; => 0.9994755444235053
1 67 2211 ,; + . => 0.9991546049221308
2 66 2145 ! + ? => 0.9959599780890476
3 65 2080 [ + { => 0.9877099439908715
4 64 2016 > + } => 0.9868727646622297
5 63 1953 ( + [{ => 0.9849595859613685
6 62 1891 ,.; + : => 0.9820916806352772
7 61 1830 2 + 3 => 0.9758629641063381
8 60 1770 ([{ + < => 0.9718134389427836
9 59 1711 >} + ] => 0.9717276422077935
10 58 1653 ) + >]} => 0.9651380673786327
11 57 1596 (<[{ + ~ => 0.9619265058839013
12 56 1540 # + + => 0.9589497629016187
13 55 1485 & + ,.:; => 0.9549541223121583
14 54 1431 4 + 7 => 0.9545051585990892
15 53 1378 &,.:; + )>]} => 0.9531771519616812
16 52 1326 #+ + @ => 0.950085582077181
17 51 1275 23 + 47 => 0.9444027022724498
18 50 1225 5 + 6 => 0.9400552781903716
19 49 1176 !? + &),.:;>]} => 0.932471578548309
20 48 1128 #+@ + _ => 0.9192246817696368
21 47 1081 b + p => 0.914831467000714
22 46 1035 $ + (<[{~ => 0.9012754833110997
23 45 990 !&),.:;>?]} + y => 0.8955247255616491
24 44 946 bp + w => 0.888

In [4]:
# Apply model_compress_with_loss with threshold 0.0001, then print the clustering
# trace again (the recorded run starts from 64 symbols instead of 68).
# NOTE(review): model_compress_with_loss comes from the pygents star imports and
# appears to modify base.model in place - confirm.
model_compress_with_loss(base.model,0.0001)
do_cluster(base.model,debug=True)


64
0 64 2016 , + ; => 0.9994755482324188
1 63 1953 ,; + . => 0.9991541566507661
2 62 1891 ! + ? => 0.9959599780890476
3 61 1830 [ + { => 0.9877099439908715
4 60 1770 > + } => 0.9868727646622297
5 59 1711 ( + [{ => 0.9849595859613685
6 58 1653 ,.; + : => 0.9820893944359783
7 57 1596 2 + 3 => 0.9758629641063381
8 56 1540 ([{ + < => 0.9718134389427836
9 55 1485 >} + ] => 0.9717276422077935
10 54 1431 ) + >]} => 0.9651380673786327
11 53 1378 (<[{ + ~ => 0.9619265058839013
12 52 1326 # + + => 0.9589497629016187
13 51 1275 & + ,.:; => 0.9549524401317208
14 50 1225 4 + 7 => 0.9545051585990892
15 49 1176 &,.:; + )>]} => 0.9531714890687416
16 48 1128 #+ + @ => 0.950085582077181
17 47 1081 23 + 47 => 0.9444027022724498
18 46 1035 5 + 6 => 0.9400552781903716
19 45 990 !? + &),.:;>]} => 0.9324717587913735
20 44 946 #+@ + _ => 0.9192246817696368
21 43 903 b + p => 0.9148296693120033
22 42 861 $ + (<[{~ => 0.9012754833110997
23 41 820 !&),.:;>?]} + y => 0.8955212902858396
24 40 780 bp + w => 0.88882

In [5]:
# Drop the reference to the tokenizer and run a garbage-collection pass before the
# next model is created; n receives gc.collect()'s return value and is not read in
# the cells shown here.
del base
n = gc.collect()


In [6]:
# Build a FreedomTokenizer for the 'gutenberg_brown_social_media_chars_7a' model path
# (chars mode, max_n=7).
base = FreedomTokenizer(name='data/models/gutenberg_brown_social_media_chars_7a',max_n=7,mode='chars',debug=False)
# Clustering the uncompressed model is left disabled: the recorded run below started
# from 3201 symbols, merged emoji and other rare glyphs at similarity 1.0 first, and
# was interrupted by hand (KeyboardInterrupt).
#do_cluster(base.model,debug=True) # does ugly glyphs...


3201
0 3201 5121600 🉐 + 🧽 => 1.0
1 3200 5118400 🈺 + 🉐🧽 => 1.0
2 3199 5115201 🈺🉐🧽 + 🧅 => 1.0
3 3198 5112003 🈺🉐🧅🧽 + 🚱 => 1.0
4 3197 5108806 🈺🉐🚱🧅🧽 + 🚠 => 1.0
5 3196 5105610 ⏪ + 🈺🉐🚠🚱🧅🧽 => 1.0
6 3195 5102415 ⊲ + ⏪🈺🉐🚠🚱🧅🧽 => 1.0


KeyboardInterrupt: 

In [7]:
# Apply model_compress_with_loss with threshold 0.01, then print the clustering trace
# (the recorded run starts from 42 symbols instead of 3201).
# NOTE(review): appears to modify base.model in place - confirm.
model_compress_with_loss(base.model,0.01)
do_cluster(base.model,debug=True)


42
0 42 861 1 + 2 => 0.9662188957988025
1 41 820 4 + 5 => 0.9628337781011256
2 40 780 3 + 45 => 0.9341870279415111
3 39 741 12 + 345 => 0.900721007322586
4 38 703 d + g => 0.8895109907195066
5 37 666 b + w => 0.8861604640428483
6 36 630 bw + m => 0.873403494475321
7 35 595 bmw + f => 0.8720058000550613
8 34 561 bfmw + j => 0.8632221169013085
9 33 528 a + i => 0.8416465655778387
10 32 496 v + z => 0.8271750162242487
11 31 465 k + vz => 0.8231682220071532
12 30 435 dg + s => 0.8216948043156402
13 29 406 kvz + r => 0.8215272257151841
14 28 378 dgs + y => 0.8120614386007088
15 27 351 c + t => 0.8102559961302707
16 26 325 bfjmw + ct => 0.8184648490123563
17 25 300 , + . => 0.8034399670310892
18 24 276 bcfjmtw + p => 0.7927808267991239
19 23 253 krvz + l => 0.7912461313652792
20 22 231 ai + o => 0.7752204373490754
21 21 210 - + dgsy => 0.7696451106379618
22 20 190 -dgsy + bcfjmptw => 0.7595650210787618
23 19 171 -bcdfgjmpstwy + klrvz => 0.7926437115453245
24 18 153 aio + u => 0.7317107727978

In [8]:
# Drop the reference to the tokenizer and run a garbage-collection pass before the
# next model is created; n receives gc.collect()'s return value and is not read in
# the cells shown here.
del base
n = gc.collect()


In [9]:
# Build a FreedomTokenizer for the 'rusage_test_chars_7a' model path (chars mode,
# max_n=7) and print the clustering trace for its model.
base = FreedomTokenizer(name='data/models/rusage_test_chars_7a',max_n=7,mode='chars',debug=False)
do_cluster(base.model,debug=True)


127
0 127 8001 \ + ± => 1.0
1 126 7875 , + ; => 0.9950984923474814
2 125 7750 « + „ => 0.9908329253065113
3 124 7626 . + … => 0.9897031544482772
4 123 7503 ( + «„ => 0.9882463826596951
5 122 7381 .… + : => 0.9839560363153973
6 121 7260 6 + 7 => 0.9824225377186522
7 120 7140 ¶ + † => 0.9733285267845753
8 119 7021 5 + 67 => 0.9679180675615426
9 118 6903 ! + ? => 0.9671685765022833
10 117 6786 \± + і => 0.9495921438262921
11 116 6670 д + н => 0.9486751709688159
12 115 6555 ,; + © => 0.9451947993287309
13 114 6441 " + “ => 0.9407945007700845
14 113 6328 !? + .:… => 0.9350424829622549
15 112 6216 # + { => 0.9327189736179531
16 111 6105 567 + 8 => 0.9318798716163456
17 110 5995 дн + ф => 0.9228859349477891
18 109 5886 в + днф => 0.9201195120084983
19 108 5778 вднф + м => 0.9241418240582734
20 107 5671 и + у => 0.9186284680108879
21 106 5565 ) + » => 0.9160170748383359
22 105 5460 ,;© + } => 0.9150782325908327
23 104 5356 а + иу => 0.9138478666520997
24 103 5253 аиу + е => 0.9190489943311715


In [10]:
# Drop the reference to the tokenizer and run a garbage-collection pass before the
# next model is created; n receives gc.collect()'s return value and is not read in
# the cells shown here.
del base
n = gc.collect()


In [11]:
# Build a FreedomTokenizer for the 'rusage_chars_7a' model path (chars mode,
# max_n=7) and print the clustering trace for its model.
base = FreedomTokenizer(name='data/models/rusage_chars_7a',max_n=7,mode='chars',debug=False)
do_cluster(base.model,debug=True)


149
0 149 11026 « + „ => 0.9967050058385775
1 148 10878 , + ; => 0.9960720820729173
2 147 10731 ѕ + ґ => 0.993097636176704
3 146 10585 6 + 7 => 0.9906112520092044
4 145 10440 ( + «„ => 0.9898797779024109
5 144 10296 . + … => 0.9895293077352101
6 143 10153 («„ + “ => 0.9886979300419079
7 142 10011 .… + : => 0.9881337841967766
8 141 9870 5 + 67 => 0.9837673587818624
9 140 9730 + + і => 0.980919334232677
10 139 9591 > + › => 0.9777782797011448
11 138 9453 » + ” => 0.9773398961064365
12 137 9316 ! + ? => 0.9711712806410331
13 136 9180 < + ‹ => 0.9663916021596136
14 135 9045 ѕґ + љ => 0.9652814923729348
15 134 8911 § + • => 0.9606415593375218
16 133 8778 # + $ => 0.960418079731181
17 132 8646 ,; + © => 0.9567749939296843
18 131 8515 * + · => 0.956105016201736
19 130 8385 ѓ + ‚ => 0.9509203148193569
20 129 8256 д + н => 0.9471402527348005
21 128 8128 8 + 9 => 0.945267196166392
22 127 8001 *· + +і => 0.9423501996214542
23 126 7875 ¤ + µ => 0.9366426113987674
24 125 7750 !? + .:… => 0.93480216

In [12]:
# Apply model_compress_with_loss with threshold 0.0001, then print the clustering
# trace again (the recorded run starts from 93 symbols instead of 149).
# NOTE(review): appears to modify base.model in place - confirm.
model_compress_with_loss(base.model,0.0001)
do_cluster(base.model,debug=True)


93
0 93 4278 , + ; => 0.9960720901169786
1 92 4186 6 + 7 => 0.9906112520092044
2 91 4095 ( + « => 0.9898510958283735
3 90 4005 . + … => 0.9895286516736947
4 89 3916 (« + “ => 0.9886904394215923
5 88 3828 .… + : => 0.9881330656287035
6 87 3741 5 + 67 => 0.9837673587818624
7 86 3655 » + ” => 0.9773398381197401
8 85 3570 ! + ? => 0.9711711561352966
9 84 3486 ,; + © => 0.9567741486671737
10 83 3403 д + н => 0.9471392327281735
11 82 3321 8 + 9 => 0.9452668887326737
12 81 3240 !? + .:… => 0.9348009628792953
13 80 3160 ) + »” => 0.9292657979094886
14 79 3081 в + дн => 0.9188736493577047
15 78 3003 вдн + ф => 0.9317834917860469
16 77 2926 вднф + м => 0.9230132610826455
17 76 2850 и + у => 0.91802245323981
18 75 2775 а + иу => 0.9133038569580643
19 74 2701 аиу + е => 0.9192362685748857
20 73 2628 аеиу + о => 0.9129834184958411
21 72 2556 б + вдмнф => 0.8894132606723331
22 71 2485 бвдмнф + к => 0.8781349379776531
23 70 2415 !.:?… + ,;© => 0.8673020868253464
24 69 2346 ж + ш => 0.8603463926769098

In [13]:
# Drop the reference to the tokenizer and run a garbage-collection pass;
# n receives gc.collect()'s return value and is not read in the cells shown here.
del base
n = gc.collect()


In [None]:
#TODO trees
#TODO replace cosine similarity with other one, accounting for "distribution shape"

