In [5]:
import random
from pysuffixarray.core import SuffixArray
from bayes_opt import BayesianOptimization

alphabet = [' ', 'a', 'ą', 'b', 'c', 'ć', 'd', 'e', 'ę', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'ł', 'm', 'n',
            'ń', 'o', 'ó', 'p', 'q', 'r', 's', 'ś', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ż', 'ź']

In [6]:
class RMQ:
    def __init__(self, n):
        self.sz = 1
        self.inf = (1 << 31) - 1
        while self.sz <= n: self.sz = self.sz << 1
        self.dat = [self.inf] * (2 * self.sz - 1)

    def update(self, idx, x):
        idx += self.sz - 1
        self.dat[idx] = x
        while idx > 0:
            idx = (idx - 1) >> 1
            self.dat[idx] = min(self.dat[idx * 2 + 1], self.dat[idx * 2 + 2])

    def query(self, a, b):
        return self.query_help(a, b, 0, 0, self.sz)

    def query_help(self, a, b, k, l, r):
        if r <= a or b <= l:
            return 9999999
        elif a <= l and r <= b:
            return self.dat[k]
        else:
            return min(self.query_help(a, b, 2 * k + 1, l, (l + r)>>1),
                        self.query_help(a, b, 2 * k + 2, (l + r) >> 1, r))

In [7]:
def load_text(file='.\P4b-data\sentences_for_task1.txt'):
    complete_text = ''
    with open(file, 'r', encoding='utf8') as base_vectors_lines:
        for line in base_vectors_lines:
            line = line.strip().lower()
            complete_text += ' ' + line

    complete_text = complete_text + ' '
    return complete_text

text = load_text()
sabase = SuffixArray(text)
SA = sabase.suffix_array()
LCP = sabase.longest_common_prefix()

In [128]:
precomp_length=18
precomp_adj=6

def precompute_counts2(ln):
    word_counts = {}
    template = {}
    for a in alphabet:
        template[a] = 0
    adj_table = {}
    for i in range(len(SA)):
        for j in range(1, ln):
            wrd = text[SA[i]:SA[i]+j]
            if wrd not in word_counts:
                word_counts[wrd] = [0, i]
                if j <= precomp_adj:
                    adj_table[wrd] = [[],[]]
                    for q in range(3):
                        adj_table[wrd][0].append(template.copy())
                        adj_table[wrd][1].append(template.copy())
            word_counts[wrd][0] += 1
            if j <= precomp_adj:
                pre_b = False
                suf_b = False
                for q in range(3):
                    if SA[i]-q-1 > 0:
                        if not pre_b:
                            adj_table[wrd][0][q][text[SA[i]-q-1]] += 1
                            if text[SA[i]-q-1] == ' ':
                                pre_b = True
                        else:
                            adj_table[wrd][0][q][' '] += 1
                    if SA[i]+j+q < len(text):
                        if not suf_b:
                            adj_table[wrd][1][q][text[SA[i]+j+q]] += 1
                            if text[SA[i]+j+q] == ' ':
                                suf_b = True
                        else:
                            adj_table[wrd][1][q][' '] += 1

    return word_counts, adj_table


def precompute_counts(ln):
    word_counts = {}
    template = {}
    for a in alphabet:
        template[a] = 0
    adj_table = {}
    for i in range(len(SA)):
        for j in range(1, ln):
            wrd = text[SA[i]:SA[i]+j]
            if wrd not in word_counts:
                word_counts[wrd] = [0, i]
                if j <= precomp_adj:
                    adj_table[wrd] = [template.copy(), template.copy()]
            word_counts[wrd][0] += 1
            if j <= precomp_adj:
                if SA[i]-1 > 0:
                    adj_table[wrd][0][text[SA[i]-1]] += 1
                if SA[i]+j < len(text):
                    adj_table[wrd][1][text[SA[i]+j]] += 1

    return word_counts, adj_table

In [129]:
word_counts = 0
adj_table = 0

In [130]:
word_counts, adj_table = precompute_counts(precomp_length)

In [10]:
rmq = RMQ(len(SA))

for i in range(len(LCP)):
    rmq.update(i, LCP[i])

In [11]:
print(LCP[0:5])
print(text[SA[2]:SA[2]+10])
print(text[SA[3]:SA[3]+10])
rmq.query(2, 1)

[0, 1, 3, 3, 2]
 aalbumteż
 aanzaczął


9999999

In [12]:
# count the average amount of each letter
def count_letters():
    letters = {}
    for a in alphabet:
        letters[a] = 0
    for l in text:
        letters[l] += 1
    sm = 0
    for a in letters:
        sm += letters[a]
    return letters, sm

letters, letters_sum = count_letters()

In [13]:
def check_prefix(pre, w, init=0):
    ln = len(pre)
    lw = len(w)
    for i in range(init, ln, 1):
        if lw <= i:
            return i-1, True
        if w[i] != pre[i]:
            return i, w[i] < pre[i]
    return ln, True

def find_highest(w, li, ri):
    while li < ri:
        preli = li
        preri = ri
        i = (li+ri)//2
        lw = len(w)
        ans = rmq.query(li+1, i+1)
        if ans < lw:
            ri = i
        else:
            li = i
        if preli == li and preri == ri:
            return li
    return li

def find_lowest(w, li, ri):
    while li < ri:
        preli = li
        preri = ri
        i = (li+ri)//2
        lw = len(w)
        ans = rmq.query(i+1, ri+1)
        if ans < lw:
            li = i
        else:
            ri = i
        if preli == li and preri == ri:
            return ri
    return ri

def count_words(w):
    if len(w) <= precomp_length:
        if w in word_counts:
            return word_counts[w]
        return 0, 0

    li = 0
    ri = len(SA)-1
    lw = len(w)
    i = -1
    ans = 0
    while li < ri:
        preli = li
        preri = ri
        prei = i
        i = (li+ri)//2
        if prei < i:
            init = rmq.query(prei+1, i+1)
        else:
            init = rmq.query(i+1, prei+1)
        init = min(init, ans)
        ans, direction = check_prefix(w, text[SA[i]:], init)
        if ans < lw:
            if direction:
                li = i
            else:
                ri = i
            if preli == li and preri == ri:
                return 0, 0
        else:
            h = find_highest(w, i, ri)
            l = find_lowest(w, li, i)
            return h - l + 1, l


In [110]:
def find_split(w, min_split=1, a=1, b=1):
    abc = check_adj(w)
    adj00 = check_alpha(abc[0])
    adj01 = check_alpha(abc[1])
    bc = max(adj00, adj01)
    if bc == 0:
        bc = 0.0001
    hs = bc
    #bc, _ = count_words(w)
    #hs = bc
    si = 0

    for i in range(1, len(w)):
        w1 = w[:i]
        w2 = w[i:]
        if w2 not in potential:
            #bw1, _ = count_words(w1)
            #bw2, _ = count_words(w2)
            adj1 = check_alpha(check_adj(w1)[1])
            adj2 = check_alpha(check_adj(w2)[0])

            if max(adj1, adj2) < hs:
                hs = max(adj1, adj2)
                si = i
        #if min(bw1, bw2) > hs:
        #    hs = min(bw1, bw2)
        #    si = i

    if hs > bc * min_split:
        return True, si, hs/bc
    else:
        return False, si, hs/bc

In [15]:
c, p = count_words("ztabu")
print(c, p)
print(text[SA[p-1]:SA[p-1]+10])
print(text[SA[p]:SA[p]+10])
print(text[SA[p+c-1]:SA[p+c-1]+10])
print(text[SA[p+c]:SA[p+c]+10])


12 3151872
ztabowychl
ztabuantyk
ztabuznale
ztabwehrma


In [126]:
def check_adj2(w):
    if len(w) <= precomp_adj:
        return adj_table[w]
    bs, sp = count_words(w)
    prefixes = []
    suffixes = []
    for j in range(3):
        prefixes.append({})
        suffixes.append({})
    for a in alphabet:
        for j in range(3):
            prefixes[j][a] = 0
            suffixes[j][a] = 0
    for i in range(sp, sp+bs):
        pre_b = False
        suf_b = False
        for j in range(3):
            if SA[i] - j - 1 > 0:
                if not pre_b:
                    prefixes[j][text[SA[i]-j-1]] += 1
                    if text[SA[i]-j-1] == ' ':
                        pre_b = True
                else:
                    prefixes[j][' '] += 1
            if SA[i] + len(w) + j < len(text):
                if not suf_b:
                    suffixes[j][text[SA[i]+len(w)+j]] += 1
                    if text[SA[i]+len(w)+j] == ' ':
                        suf_b = True
                else:
                    suffixes[j][' '] += 1

    return prefixes, suffixes

def check_adj(w):
    if len(w) <= precomp_adj:
        return adj_table[w]
    bs, sp = count_words(w)
    prefixes = {}
    suffixes = {}
    if bs > 1000000:
        for a in alphabet:
            pre, _ = count_words(a + w)
            suf, _ = count_words(w + a)
            prefixes[a] = pre
            suffixes[a] = suf
    else:
        for a in alphabet:
            prefixes[a] = 0
            suffixes[a] = 0
        for i in range(sp, sp+bs):
            if SA[i] - 1 > 0:
                prefixes[text[SA[i]-1]] += 1
            if SA[i] + len(w) < len(text):
                suffixes[text[SA[i]+len(w)]] += 1

    return prefixes, suffixes

In [119]:
alpha_a = 1
alpha_b = 1
alpha_c = 1

In [144]:
def check_alpha2(alpha):
    aa = alpha_a
    ab = alpha_b
    ac = alpha_c
    score = []
    for neigh in alpha:
        sm = 0
        mx = 0
        for l in neigh:
            sm += neigh[l]
            if neigh[l] > mx:
                mx = neigh[l]

        lscore = 0.0
        if sm == 0:
            sm = 1
        for a in alphabet:
            d1 = (letters[a])/letters_sum
            d2 = (neigh[a])/sm
            lscore += ((d2-d1)*(d2-d1))

        if neigh[' '] == 0 and mx > 15:
            lscore += 0.1

        if neigh[' ']/sm > 0.45:
            lscore = 0.05

        score.append(lscore)

    return float(score[0])*aa + float(score[1])*ab + float(score[2])*ac

def check_alpha(alpha):
    sm = 0
    mx = 0
    for l in alpha:
        sm += alpha[l]
        if alpha[l] > mx:
            mx = alpha[l]

    score = 0
    if sm == 0:
        sm = 1
    for a in alphabet:
        d1 = (letters[a])/letters_sum
        d2 = (alpha[a])/sm
        score += (d2-d1)*(d2-d1)
    if alpha[' '] == 0 and mx > 15:
        score += 0.5

    return score

In [18]:
def find_phrases(w):
    c, p = count_words(w)
    phrases = []
    for i in range(p, p+c, 1):
        lf = SA[i]
        while text[lf] != ' ':
            lf -= 1
        rt = SA[i]
        while text[rt] != ' ':
            rt += 1
        phrases.append((text[lf+1:rt], SA[i]-lf))
    return phrases

In [19]:
def split_phrase(ph, p, l):
    w1 = ph[:p-1]
    w2 = ph[p+l-1:]
    bs1 = 2
    bs2 = 2

    for i in range(5, min(12,len(w2))):
        wrd = w2[:i]
        ww1 = check_adj(wrd)
        ans11 = check_alpha(ww1[0])
        _, si, _ = find_split(wrd)
        if si > 0:
            ww21 = wrd[:si]
            ans11 = min(ans11, check_alpha(check_adj(ww21)[0]))
        #print(wrd, ans11)
        if ans11 < bs2:
            bs2 = ans11

    for i in range(5, min(12, len(w1))):
        wrd = w1[-i:]
        ww1 = check_adj(wrd)
        ans12 = check_alpha(ww1[1])
        _, si, _ = find_split(wrd)
        if si > 0:
            ww21 = wrd[:si]
            ans12 = min(ans12, check_alpha(check_adj(ww21)[0]))
        #print(wrd, ans12)
        if ans12 < bs1:
            bs1 = ans12

    return bs1, bs2


In [20]:
#czasach 0.031123297961619303 0.020721112185516068
#eliczne 0.6118221203851923 0.0252523649090908
#niem 0.023549158377852437 999
#demo 0.03331824185837327 0.14103016019263753
#kowych 0.6639164598939549 999
#wzięte 0.018950865005565084 0.12897087396779425
def check_phrases(w, mx=50):
    phs = find_phrases(w)
    phs = random.sample(phs, min(mx,len(phs)))
    mn1 = 0
    mn2 = 0
    for ph in phs:
        sc1, sc2 = split_phrase(ph[0], ph[1], len(w))
        if sc1 == 2:
            sc1 = 0.05
        if sc2 == 2:
            sc2 = 0.05
        mn1 += sc1
        mn2 += sc2

    if len(phs) == 0:
        return 1, 1
    return mn1/len(phs), mn2/len(phs)

def check_phrases0(w, mx=50):
    phs = find_phrases(w)
    phs = random.sample(phs, min(mx,len(phs)))
    mn1 = 2
    mn2 = 2
    for ph in phs:
        sc1, sc2 = split_phrase(ph[0], ph[1], len(w))
        if sc1 < mn1:
            mn1 = sc1
        if sc2 < mn2:
            mn2 = sc2
    if len(w) > 4:
        if mn1 == 2:
            mn1 = 0.05
        if mn2 == 2:
            mn2 = 0.05
    return mn1, mn2

In [21]:
def find_suffixes(breakpoint = 0.04, breakpoint2 = 100, breakpoint3 = 0.001):
    suffixes = {}
    for word in word_counts:
        if len(word) <= 4 and count_words(word)[0] > breakpoint2:
            adjs = check_adj(word)
            sm1 = 0
            for a in alphabet:
                sm1 += adjs[0][a]
            if adjs[0][' ']/sm1 <= breakpoint3:
                sm2 = 0
                for a in alphabet:
                    sm2 += adjs[1][a]
                if adjs[1][' ']/sm2 > breakpoint*len(word):
                    suffixes[word] = adjs[1][' ']/sm2
    return suffixes

potential = find_suffixes()

In [22]:
def resolve_word(w, debug=False):
    if len(w) <= 3:
        ans, s, _ = find_split(w, 50)
    else:
        ans, s, _ = find_split(w)
    if ans:
        if not debug:
            return False
        print(w[:s], w[s:])
        ans1 = check_alpha(check_adj(w[:s])[0])
        ans2 = check_alpha(check_adj(w[s:])[1])
        return False, ans, ans1, ans2
    else:
        ww = check_adj(w)
        ans1 = check_alpha(ww[0])
        ans2 = check_alpha(ww[1])

    if not debug:
        if ans1 and ans2:
            return True
        return False
    else:
        if ans1 and ans2:
            return True, ans, ww, ans1, ans2
        return False, ans, ww, ans1, ans2

In [143]:
def resolve_pair(w1, w2, a=0.15, b=3.5, c=3, d=0.2, e=0.2):
    _, _, h1 = find_split(w1)
    _, _, h2 = find_split(w2)

    ww1 = check_adj(w1)
    ans11 = check_alpha(ww1[0])
    ans12 = check_alpha(ww1[1])
    ww2 = check_adj(w2)
    ans21 = check_alpha(ww2[0])
    ans22 = check_alpha(ww2[1])
    ans1 = max(ans11, ans12)
    ans2 = max(ans21, ans22)

    if not (h1 == 1 and h2 == 1):
        if h2 == 1:
            return -1, h1, h2, ans1, ans2
        if h1 == 1:
            return 1, h1, h2, ans1, ans2

    sp11, sp12 = check_phrases(w1)
    sp21, sp22 = check_phrases(w2)

    if not ((sp11 > sp21 * b or sp12 > sp22 * b) and (sp11 * b < sp21 or sp12 * b < sp22)):
        if sp11 > sp21 * b or sp12 > sp22 * b:
            return -1, h1, h2, ans1, ans2, sp11, sp12, sp21, sp22
        if sp11 * b < sp21 or sp12 * b < sp22:
            return 1, h1, h2, ans1, ans2, sp11, sp12, sp21, sp22

    if h1/h2 < a:
        return -1, h1, h2, ans1, ans2, sp11, sp12, sp21, sp22
    if h2/h1 < a:
         return 1, h1, h2, ans1, ans2, sp11, sp12, sp21, sp22

    if ans1 > ans2 * c:
        return -1, h1, h2, ans1, ans2, sp11, sp12, sp21, sp22
    if ans1 * c < ans2:
        return 1, h1, h2, ans1, ans2, sp11, sp12, sp21, sp22

    if  sp11 + sp12 - d*h1 + e*ans1 > sp21 + sp22 - d*h2 + e*ans2:
        return -1, h1, h2, ans1, ans2, sp11, sp12, sp21, sp22
    else:
        if sp11 + sp12 - d*h1 + e*ans1 < sp21 + sp22 - d*h2 + e*ans2:
            return 1, h1, h2, ans1, ans2, sp11, sp12, sp21, sp22
        else:
            return 0, h1, h2, ans1, ans2, sp11, sp12, sp21, sp22

In [24]:
def load_tests(file='.\P4b-data\\test_for_task1.txt'):
    complete_database = []
    i = 0
    with open(file, 'r', encoding='utf8') as base_vectors_lines:
        for line in base_vectors_lines:
            #if i > 10:
            #    break
            line = line.strip()
            line = line.split(' ')
            complete_database.append(line[0])
            complete_database.append(line[1])
            i += 1

    return complete_database

tests = load_tests()

In [122]:
def resolve_tests(tests, a=0.0, b=3.239, c=8.251, d=0.2, e=0.1538, f=1, g=1, h=1, debug=False):
    global alpha_a
    global alpha_b
    global alpha_c
    alpha_a = f
    alpha_b = g
    alpha_c = h
    score = 0
    for i in range(0, len(tests), 2):
        res = resolve_pair(tests[i], tests[i+1], a, b, c, d, e)
        if res[0] == 1:
            score += 1
        if res[0] == 0:
            score += 0.5
        if debug:
            print(tests[i], tests[i+1], res)

    return (score*2) / len(tests)

In [123]:
k = 500
def optimize_resolve(a, b, c, d, e, f, g, h):
    return resolve_tests(tests[k*10:k*12], a, b, c, d, e, f, g, h)

In [145]:
resolve_tests(tests[k:k*2], debug=True)

swoje chowa (1, 1.0, 0.14120494658191463, 0.15739842577833676, 0.46381582109037145)
polski cznymi (1, 1.0, 0.11066580998289224, 0.21436282158660086, 0.7938480250074085)
nasmarować oposławnio (1, 0.4960170852957674, 0.13820604806675899, 0.9752336570525748, 0.9858727122216413, 0.04444737836324626, 0.05, 0.06901278651099622, 0.06337552207506761)
lepiej ficzne (1, 1.0, 0.7911175446527434, 0.23507165206523925, 0.9101832861054792)
publiczne umiejscow (1, 1.0, 0.2310851512692286, 0.38792912715312855, 0.2839920986110165)
podane podcza (1, 0.3818226003319466, 0.02397604130368258, 0.1825797677729446, 1.4625537435346092, 0.0544833290084581, 0.1292967550686197, 0.052444574877828824, 0.04313036779524498)
nastąpił telskani (1, 1.0, 0.18834464891954317, 0.6142139677264921, 0.8966303124922654)
ich hod (1, 0.8534340861165657, 0.0882550181704508, 0.22824115809338236, 0.907792597085061, 0.04946860235839825, 0.05790591556769513, 0.09165248798883698, 0.3024749942794862)
zostały kluczow (1, 0.14236027298301

0.936

In [46]:
resolve_tests(tests[k*80:k*120], debug=False)

0.9063

In [124]:
# Bounded region of parameter space
pbounds = {'a': (0, 1), 'b': (0,10), 'c': (0,10), 'd': (0,1), 'e': (0,1), 'f': (0,1), 'g': (0,1), 'h': (0, 1)}

optimizer4 = BayesianOptimization(
    f=optimize_resolve,
    pbounds=pbounds,
)

optimizer4.maximize(
    init_points=25,
    n_iter=350,
)

print(optimizer4.max)

|   iter    |  target   |     a     |     b     |     c     |     d     |     e     |     f     |     g     |     h     |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.794   [0m | [0m 0.5627  [0m | [0m 3.993   [0m | [0m 4.77    [0m | [0m 0.1602  [0m | [0m 0.2472  [0m | [0m 0.2217  [0m | [0m 0.6642  [0m | [0m 0.7645  [0m |
| [95m 2       [0m | [95m 0.836   [0m | [95m 0.6609  [0m | [95m 5.456   [0m | [95m 3.796   [0m | [95m 0.02677 [0m | [95m 0.5307  [0m | [95m 0.6108  [0m | [95m 0.8367  [0m | [95m 0.3514  [0m |
| [0m 3       [0m | [0m 0.832   [0m | [0m 0.8227  [0m | [0m 3.4     [0m | [0m 9.352   [0m | [0m 0.507   [0m | [0m 0.3222  [0m | [0m 0.3881  [0m | [0m 0.5918  [0m | [0m 0.3769  [0m |
| [0m 4       [0m | [0m 0.836   [0m | [0m 0.5328  [0m | [0m 2.369   [0m | [0m 3.653   [0m | [0m 0.1272  [0m | [0m 0.3558  [0m 

  if h1/h2 < a:
  if h2/h1 < a:
  if h1/h2 < a:
  if h2/h1 < a:


KeyboardInterrupt: 

In [50]:
optimizer2.maximize(
    n_iter=25
)

|   iter    |  target   |     a     |     b     |     c     |     d     |     e     |
-------------------------------------------------------------------------------------
| [0m 31      [0m | [0m 0.888   [0m | [0m 0.7746  [0m | [0m 8.77    [0m | [0m 5.495   [0m | [0m 0.7412  [0m | [0m 0.01464 [0m |
| [0m 32      [0m | [0m 0.902   [0m | [0m 0.3485  [0m | [0m 0.9864  [0m | [0m 5.861   [0m | [0m 0.3916  [0m | [0m 0.1867  [0m |
| [0m 33      [0m | [0m 0.896   [0m | [0m 0.9486  [0m | [0m 1.311   [0m | [0m 9.906   [0m | [0m 0.2928  [0m | [0m 0.8464  [0m |
| [0m 34      [0m | [0m 0.888   [0m | [0m 0.5447  [0m | [0m 5.746   [0m | [0m 9.874   [0m | [0m 0.8901  [0m | [0m 0.8948  [0m |
| [0m 35      [0m | [0m 0.902   [0m | [0m 0.1643  [0m | [0m 1.126   [0m | [0m 3.932   [0m | [0m 0.007091[0m | [0m 0.3607  [0m |
| [0m 36      [0m | [0m 0.906   [0m | [0m 0.0     [0m | [0m 6.05    [0m | [0m 7.845   [0m | [0m 0.0     [0

In [51]:
print(optimizer2.max)

{'target': 0.91, 'params': {'a': 0.0, 'b': 5.155686764517385, 'c': 5.183080271119191, 'd': 0.0, 'e': 0.0}}


In [None]:
check_phrases("minie", mx=50)

In [48]:
res = optimizer.res
res.sort(key= lambda x: x["target"], reverse=True)
for i in range(25):
    params = res[i]["params"]
    print(params)
    print(optimize_resolve(params["a"], params["b"], params["c"], params["d"], params["e"]))


{'a': 1.0, 'b': 2.0313652123814605, 'c': 7.871869558882175, 'd': 1.0, 'e': 0.0}
0.88
{'a': 1.0, 'b': 1.7242774560196796, 'c': 7.742000801077994, 'd': 0.26088228565795984, 'e': 1.0}
0.874
{'a': 1.0, 'b': 1.8494571095922747, 'c': 8.367724467643741, 'd': 0.0, 'e': 1.0}
0.876
{'a': 0.89344273412914, 'b': 1.676773747090854, 'c': 2.0109017843920007, 'd': 0.6250908895471177, 'e': 0.04132627162571467}
0.88
{'a': 1.0, 'b': 1.659555358435471, 'c': 5.237112017274674, 'd': 0.0, 'e': 0.0}
0.882
{'a': 1.0, 'b': 1.6946088948595042, 'c': 6.550546195504436, 'd': 0.0, 'e': 0.5214915459281337}
0.882
{'a': 0.9902703917433712, 'b': 1.8828795028659706, 'c': 2.262446484717012, 'd': 0.6058242108032431, 'e': 0.09687019875439362}
0.878
{'a': 1.0, 'b': 1.8955154308302125, 'c': 7.668208797269095, 'd': 1.0, 'e': 1.0}
0.878
{'a': 1.0, 'b': 1.8123855803622844, 'c': 3.3982361449263774, 'd': 0.0, 'e': 0.0}
0.874
{'a': 1.0, 'b': 1.8339040659480945, 'c': 1.7135540395376185, 'd': 0.0, 'e': 0.5716405936132313}
0.874
{'a':

In [None]:
phs = find_phrases("dzisiejszego")
print(phs)
split_phrase(phs[1][0], phs[1][1], len(phs))

In [117]:
tt = "piszę"
print(find_split(tt))
print(count_words(tt))
print(check_adj(tt))
print(check_alpha(check_adj(tt)[0]))
print(check_alpha(check_adj(tt)[1]))
print(find_phrases(tt)[:2])
print(check_phrases(tt))

(False, 4, 0.7001670004687783)
[11, 2111061]
[[{' ': 2, 'a': 1, 'ą': 0, 'b': 0, 'c': 0, 'ć': 0, 'd': 1, 'e': 3, 'ę': 0, 'f': 0, 'g': 0, 'h': 0, 'i': 0, 'j': 1, 'k': 0, 'l': 0, 'ł': 0, 'm': 2, 'n': 0, 'ń': 0, 'o': 1, 'ó': 0, 'p': 0, 'q': 0, 'r': 0, 's': 0, 'ś': 0, 't': 0, 'u': 0, 'v': 0, 'w': 0, 'x': 0, 'y': 0, 'z': 0, 'ż': 0, 'ź': 0}, {' ': 3, 'a': 0, 'ą': 0, 'b': 0, 'c': 0, 'ć': 0, 'd': 0, 'e': 1, 'ę': 0, 'f': 0, 'g': 0, 'h': 0, 'i': 4, 'j': 0, 'k': 0, 'l': 0, 'ł': 0, 'm': 0, 'n': 0, 'ń': 0, 'o': 1, 'ó': 0, 'p': 0, 'q': 0, 'r': 0, 's': 0, 'ś': 0, 't': 0, 'u': 0, 'v': 0, 'w': 0, 'x': 0, 'y': 1, 'z': 1, 'ż': 0, 'ź': 0}, {' ': 4, 'a': 0, 'ą': 0, 'b': 0, 'c': 0, 'ć': 0, 'd': 0, 'e': 0, 'ę': 0, 'f': 0, 'g': 0, 'h': 0, 'i': 1, 'j': 0, 'k': 0, 'l': 0, 'ł': 0, 'm': 0, 'n': 4, 'ń': 0, 'o': 0, 'ó': 0, 'p': 1, 'q': 0, 'r': 0, 's': 0, 'ś': 0, 't': 0, 'u': 0, 'v': 0, 'w': 0, 'x': 0, 'y': 0, 'z': 1, 'ż': 0, 'ź': 0}], [{' ': 3, 'a': 0, 'ą': 0, 'b': 0, 'c': 0, 'ć': 0, 'd': 3, 'e': 0, 'ę': 0, 'f': 0, 