In [8]:
import itertools
import re
import random
import pandas as pd
import numpy as np
import math
from sklearn import tree
from decision_trees import *
from timeit import default_timer as timer
from sklearn.ensemble import RandomForestClassifier

base_forms = ["adj", "adja", "adjc", "adjp", "adv", "burk", "depr", "ger", "conj", "comp", "num", "pact",
               "pant", "pcon", "ppas", "ppron12", "ppron3", "pred", "prep", "siebie", "subst", "verb", "brev",
               "interj", "qub"]

verb_forms = ["nom", "gen", "acc", "dat", "inst", "loc", "voc"]

raw_form = {"subst:nom":0, "subst:gen":1, "subst:acc":2, "subst:dat":3, "subst:inst":4, "subst:loc":5, "subst:voc":6,
            "adj":7, "adja":8, "adjc":9, "adjp":10, "adv":11, "burk":12, "depr":13, "ger":14, "conj":15, "comp":16, "num":17, "pact":18,
            "pant":19, "pcon":20, "ppas":21, "ppron12":22, "ppron3":23, "pred":24, "prep":25, "siebie":26, "verb":27, "brev":28,
            "interj":29, "qub":30, "null":31, "na":32}

empty_form = {"subst:nom":0, "subst:gen":0, "subst:acc":0, "subst:dat":0, "subst:inst":0, "subst:loc":0, "subst:voc":0,
            "adj":0, "adja":0, "adjc":0, "adjp":0, "adv":0, "burk":0, "depr":0, "ger":0, "conj":0, "comp":0, "num":0, "pact":0,
            "pant":0, "pcon":0, "ppas":0, "ppron12":0, "ppron3":0, "pred":0, "prep":0, "siebie":0, "verb":0, "brev":0,
            "interj":0, "qub":0, "target":0}


signs = ['.', '(', ')', ';', '"', '[', ']', ',', '?', '!', ':', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
polish = [('ź', 'z'), ('ż', 'z'), ('ą', 'a'), ('ę', 'e'), ('ó', 'o'), ('ł', 'l'), ('ć', 'c'), ('ń', 'n'), ('ś', 's')]

In [9]:
print(signs)

['.', '(', ')', ';', '"', '[', ']', ',', '?', '!', ':', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


In [10]:
print(polish)

[('ź', 'z'), ('ż', 'z'), ('ą', 'a'), ('ę', 'e'), ('ó', 'o'), ('ł', 'l'), ('ć', 'c'), ('ń', 'n'), ('ś', 's')]


In [11]:
def tokenize(line):
    line2 = []
    line = line.split(' ')
    for base in line:
        base = base.lower()
        for sign in signs:
            base = base.replace(sign, ' ')
        base = base.strip()
        base = base.split(' ')
        if base != '' and base != ['']:
            line2.extend(base)

    return line2


def remove_polish(line):
    line2 = []
    for word in line:
        for sign in polish:
            word = word.replace(sign[0], sign[1])
        line2.append(word)
    return line2

In [12]:
def tokenize_big(line):
    line2 = []
    line = line.split(' ')
    for base in line:
        for sign in signs:
            base = base.replace(sign, ' ')
        base = base.strip()
        base = base.split(' ')
        if base != '' and base != ['']:
            line2.extend(base)

    return line2

In [13]:
def load_polimorph(file='polimorfologik-2.1.txt'):
    dictionary = {}
    with open(file, 'r', encoding='utf8') as base_file:
        for line in base_file:
            line = line.strip().lower()
            line = line.split(";")
            line[2] = line[2].split("+")
            nl = []
            for comp in line[2]:
                spl = comp.split(":")
                if spl[0] != "subst":
                    if spl[0] not in nl:
                        nl.append(spl[0])
                else:
                    if spl[0] + ":" + spl[2] not in nl:
                        nl.append(spl[0] + ":" + spl[2])
            line[2] = nl
            dictionary[line[1]] = (line[0], line[2])

    return dictionary


def create_casts(base_poli):
    dictionary = {}
    for key in base_poli:
        weak_key = remove_polish([key])[0]
        if weak_key not in dictionary:
            dictionary[weak_key] = []
        dictionary[weak_key].append(key)

    return dictionary

In [14]:
def load_polimorph2(file='polimorfologik-2.1.txt'):
    dictionary = {}
    with open(file, 'r', encoding='utf8') as base_file:
        for line in base_file:
            line = line.strip()
            line = line.split(";")
            if line[1].lower() != line[1]:
                dictionary[line[1].lower()] = line[1]

    return dictionary

In [15]:
def load_unigrams(file='1grams'):
    dictionary = {}
    with open(file, 'r', encoding='utf8') as base_vectors_lines:
        for line in base_vectors_lines:
            line = line.strip().lower()
            line = line.split(' ')
            dictionary[line[1]] = int(line[0])

    return dictionary


def load_2grams(file='2grams', k=5):
    dictionary = {}
    i = 0
    with open(file, 'r', encoding='utf8') as base_vectors_lines:
        for line in base_vectors_lines:
            line = line.strip().lower()
            line = line.split(' ')
            if int(line[0]) >= k:
                if line[1] not in dictionary:
                    dictionary[line[1]] = {}
                dictionary[line[1]][line[2]] = int(line[0])
            else:
                break
            i += 1

    return dictionary


def load_3grams(file='3grams', k=5):
    dictionary = {}
    i = 0
    with open(file, 'r', encoding='utf8') as base_vectors_lines:
        for line in base_vectors_lines:
            line = line.strip().lower()
            line = line.split(' ')
            if int(line[0]) >= k:
                if line[1] not in dictionary:
                    dictionary[line[1]] = {}
                dictionary[line[1]][line[2:]] = int(line[0])
            else:
                break
            i += 1

    return dictionary

In [16]:
def load_set(file='train_shuf.txt', k=10000):
    i = 0
    lines = []
    with open(file, 'r', encoding='utf8') as base_vectors_lines:
        for line in base_vectors_lines:
            if i == k:
                break
            lines.append(tokenize(line))
            i += 1

    return lines


def divide_set(total_set, k=0.7):
    s = round(len(total_set)*k)
    return total_set[:s], total_set[s:]

In [17]:
def load_big_set(file='train_shuf.txt', k=10000):
    i = 0
    lines = []
    with open(file, 'r', encoding='utf8') as base_vectors_lines:
        for line in base_vectors_lines:
            if i == k:
                break
            lines.append(tokenize_big(line))
            i += 1

    return lines

In [18]:
def flatten(listt):
        a = []
        for itemm in listt:
            if isinstance(itemm, list):
                a += flatten(itemm)
            else:
                a.append(itemm)
        return a

def combine(lines):
    conc = lines[0]
    if len(lines) == 1:
        conc = [conc]
    for part in lines[1:]:
        conc = list(map(list, itertools.product(conc, part)))
    conc2 = []
    for item in conc:
        conc2.append(flatten(item))
    return conc2

def permute(line, casts):
    line2 = []
    for word in line:
        if word in casts:
            line2.append(casts[word])
        else:
            line2.append([word])

    return combine(line2)

In [19]:
combine([[None, "a"], [None]])

[[None, None], ['a', None]]

In [20]:
def wrong(line, casts):
    if line not in permute(line, casts):
        return True
    return False

In [21]:
def windows(line, k=3):
    
    line2 = []
    line2.append(None)
    line2 += line
    line2.append(None)
    line = line2
    
    if len(line) < k:
        line2 = []
        for i in range(math.ceil((k-len(line))/2)):
            line2.append(None)
        line2 += line
        for i in range(math.floor((k-len(line))/2)):
            line2.append(None)
        line = line2
        
    if len(line) == k:
        return [line]
    
    else:
        lines = []
        line2 = line[0:k]
        lines.append(line2.copy())
        for word in line[k:]:
            del line2[0]
            line2.append(word)
            lines.append(line2.copy())
        return lines

In [22]:
def create_dgrams(training_set, casts):
    dgrams = {}
    for line in training_set:
        if len(line) >= 2:
            fst = line[0]
            for word in line[1:]:
                snd = word
                if not wrong([fst, snd], casts):
                    if fst not in dgrams:
                        dgrams[fst] = {}
                    if snd not in dgrams[fst]:
                        dgrams[fst][snd] = 0
                    dgrams[fst][snd] += 1
                fst = snd
    
    return dgrams

In [23]:
def create_database(training_set, poli, casts, digrams1, digrams2):
    # df = pd.DataFrame(data=raw_form)
    # duos = pd.DataFrame(data={"fst":[], "snd":[], "target":[]})
    # trios = pd.DataFrame(data={"fst":[], "snd":[], "trd":[], "target":[]})
    trios = []
    j = -1
    for base_line in training_set:
        j += 1
        if j % 10000 == 0:
            print(j)

        for line in windows(base_line, k=3):
            if not wrong(line, casts):
                for perm in permute(line, casts):
                    trio = {"fst":0, "snd":0, "trd":0, "lgram":0, "rgram":0, "target":0}
                    
                    if line == perm:
                        trio["target"] = "y"
                    else:
                        trio["target"] = "n"
                        
                    #if perm[0] in digrams1:
                    #    if perm[1] in digrams1[perm[0]]:
                    #        trio["lgram"] = "y"
                    #if perm[0] in digrams2:
                    #    if perm[1] in digrams2[perm[0]]:
                    #        trio["lgram"] = "y"
                    #if perm[1] in digrams1:
                    #    if perm[2] in digrams1[perm[1]]:
                    #        trio["rgram"] = "y"
                    #if perm[1] in digrams2:
                    #    if perm[2] in digrams2[perm[1]]:
                    #        trio["rgram"] = "y"
                    
                    pres = 0
                    size = 1
                    if perm[0] in digrams1:
                        if perm[1] in digrams1[perm[0]]:
                            pres += digrams1[perm[0]][perm[1]]
                        size += digrams1[perm[0]][0]
                    if perm[0] in digrams2:
                        if perm[1] in digrams2[perm[0]]:
                            pres += digrams2[perm[0]][perm[1]]
                        size += digrams2[perm[0]][0]
                    sc1 = (pres/size) * math.log(size)
                    trio["lgram"] = round(sc1*100)/100
                    
                    pres = 0
                    size = 1
                    if perm[1] in digrams1:
                        if perm[2] in digrams1[perm[1]]:
                            pres += digrams1[perm[1]][perm[2]]
                        size += digrams1[perm[1]][0]
                    if perm[1] in digrams2:
                        if perm[2] in digrams2[perm[1]]:
                            pres += digrams2[perm[1]][perm[2]]
                        size += digrams2[perm[1]][0]
                    sc1 = (pres/size) * math.log(size)
                    trio["rgram"] = round(sc1*100)/100
                    
                    form = []
                    bad = False
                    for word in perm:
                        if word in poli:
                            form.append(poli[word][1])
                        else:
                            if word == None:
                                form.append(["null"])
                            else:
                                form.append(["na"])
                    
                    if not bad:
                        form = combine(form)
                        for comb in form:
                            trio2 = trio.copy()
                            trio2["fst"] = comb[0]
                            trio2["snd"] = comb[1]
                            trio2["trd"] = comb[2]
                            trios.append(trio2)

    triosdt = pd.DataFrame(trios)
    y = []
    x = []
    for trio in trios:
        y.append(trio["target"])
        del trio["target"]
        x.append([])
        for key in trio:
            if key == "fst" or key == "snd" or key == "trd":
                x[-1].append(raw_form[trio[key]])
            else:
                x[-1].append(trio[key])
    
    return triosdt, x, y

In [24]:
def find_big(training_set):
    dictionary = {}
    for line in training_set:
        if len(line) >= 2:
            for word in line[1:]:
                if len(word) > 0:
                    if word[0].lower() != word[0]:
                        if word.lower() not in dictionary:
                            dictionary[word.lower()] = [word, 0, 0]
                        dictionary[word.lower()][1] += 1
                        dictionary[word.lower()][2] += 1
                    else:
                        if word in dictionary:
                            dictionary[word][2] += 1
    
    return dictionary

In [25]:
def count_digrams(digrams):
    for fst in digrams:
        count = 0
        for snd in digrams[fst]:
            count += digrams[fst][snd]
        digrams[fst][0] = count
    return digrams

In [26]:
def fix_polish2(phrase, casts, digrams1, digrams2):
    perms = []
    i = 0
    for line in windows(phrase, k=2):
        perms.append([])
        for perm in permute(line, casts):
            pres = 0
            size = 1
            if perm[0] in digrams1:
                if perm[1] in digrams1[perm[0]]:
                    pres += digrams1[perm[0]][perm[1]]
                size += digrams1[perm[0]][0]
            if perm[0] in digrams2:
                if perm[1] in digrams2[perm[0]]:
                    pres += digrams2[perm[0]][perm[1]]
                size += digrams2[perm[0]][0]
            score = (pres/size) * math.log(size)
            perms[i].append((perm,score))
        i += 1
    
    output = []
    prev = None
    for i in range(len(perms)):
        mx = -1
        mperm = None
        for perm, sc in perms[i]:
            if perm[0] == prev:
                if sc > mx:
                    mx = sc
                    mperm = perm
        output.append(mperm[1])
        prev = mperm[1]           
    
    return output

In [27]:
def fix_polish(phrase, poli, casts, digrams1, digrams2, main_tree):
    ans = {}
    for i in range(len(phrase)+2):
        ans[i] = []
    
    perms = []
    i = 0
    for line in windows(phrase, k=3):
        perms.append([])
        i += 1
        mx = 0
        mn = 0
        mxperm = line
        for perm in permute(line, casts):          
            entry = {"fst":0, "snd":0, "trd":0, "lgram":"n", "rgram":"n"}
            #if perm[0] in digrams1:
            #    if perm[1] in digrams1[perm[0]]:
            #        entry["lgram"] = "y"
            #if perm[0] in digrams2:
            #    if perm[1] in digrams2[perm[0]]:
            #        entry["lgram"] = "y"
            #if perm[1] in digrams1:
            #    if perm[2] in digrams1[perm[1]]:
            #        entry["rgram"] = "y"
            #if perm[1] in digrams2:
            #    if perm[2] in digrams2[perm[1]]:
            #        entry["rgram"] = "y"
            pres = 0
            size = 1
            if perm[0] in digrams1:
                if perm[1] in digrams1[perm[0]]:
                    pres += digrams1[perm[0]][perm[1]]
                size += digrams1[perm[0]][0]
            if perm[0] in digrams2:
                if perm[1] in digrams2[perm[0]]:
                    pres += digrams2[perm[0]][perm[1]]
                size += digrams2[perm[0]][0]
            sc1 = (pres/size) * math.log(size)
            entry["lgram"] = round(sc1*100)/100   

            pres = 0
            size = 1
            if perm[1] in digrams1:
                if perm[2] in digrams1[perm[1]]:
                    pres += digrams1[perm[1]][perm[2]]
                size += digrams1[perm[1]][0]
            if perm[1] in digrams2:
                if perm[2] in digrams2[perm[1]]:
                    pres += digrams2[perm[1]][perm[2]]
                size += digrams2[perm[1]][0]
            sc1 = (pres/size) * math.log(size)
            entry["rgram"] = round(sc1*100)/100 
                    
            forms = []
            for word in perm:
                if word in poli:
                    forms.append(poli[word][1])
                else:
                    if word == None:
                        forms.append(["null"])
                    else:
                        forms.append(["na"])

            forms = combine(forms)
            
            #mn1 = 1
            #mx1 = 0
            ttl = 0
            for form in forms:
                entry["fst"] = form[0]
                entry["snd"] = form[1]
                entry["trd"] = form[2]
                sc1 = main_tree.classify(entry)
                #print(perm)
                #print(entry, score)
                ttl += sc1*sc1
                #if sc1 > mx1:
                #    mx1 = sc1
                #if sc1 < mn1:
                #    mn1 = sc1
            #if 0.5 - mn1 >= mx1 - 0.5:
            #    entry["trio"] = mn1
            #else:
            #    entry["trio"] = mx1
            sc1 = ttl/len(forms)
            perms[i-1].append((perm, sc1))
            if sc1 > mx:
                mx = sc1
                mperm = perm
                

        #ans[i-1].append((mperm[0], mx))
        #ans[i].append((mperm[1], mx))
        #ans[i+1].append((mperm[2], mx))

    output = []
    prev = None
    for i in range(len(perms)):
        mx = -1
        mperm = None
        for perm, sc in perms[i]:
            if perm[0] == prev:
                if sc > mx:
                    mx = sc
                    mperm = perm
        output.append(mperm[1])
        prev = mperm[1]
    
    return output

In [28]:
def fix_polishF(phrase, poli, casts, digrams1, digrams2, main_tree):
    ans = {}
    for i in range(len(phrase)+2):
        ans[i] = []
    
    perms = []
    i = 0
    for line in windows(phrase, k=3):
        perms.append([])
        i += 1
        mx = 0
        mn = 0
        mxperm = line
        for perm in permute(line, casts):          
            entry = {"fst":0, "snd":0, "trd":0, "lgram":"n", "rgram":"n"}
            
            pres = 0
            size = 1
            if perm[0] in digrams1:
                if perm[1] in digrams1[perm[0]]:
                    pres += digrams1[perm[0]][perm[1]]
                size += digrams1[perm[0]][0]
            if perm[0] in digrams2:
                if perm[1] in digrams2[perm[0]]:
                    pres += digrams2[perm[0]][perm[1]]
                size += digrams2[perm[0]][0]
            sc1 = (pres/size) * math.log(size)
            entry["lgram"] = round(sc1*100)/100   

            pres = 0
            size = 1
            if perm[1] in digrams1:
                if perm[2] in digrams1[perm[1]]:
                    pres += digrams1[perm[1]][perm[2]]
                size += digrams1[perm[1]][0]
            if perm[1] in digrams2:
                if perm[2] in digrams2[perm[1]]:
                    pres += digrams2[perm[1]][perm[2]]
                size += digrams2[perm[1]][0]
            sc1 = (pres/size) * math.log(size)
            entry["rgram"] = round(sc1*100)/100 
                    
            forms = []
            for word in perm:
                if word in poli:
                    forms.append(poli[word][1])
                else:
                    if word == None:
                        forms.append(["null"])
                    else:
                        forms.append(["na"])

            forms = combine(forms)
            
            ttl = 0
            for form in forms:
                entry["fst"] = form[0]
                entry["snd"] = form[1]
                entry["trd"] = form[2]
                fentry = []
                for key in entry:
                    if key == "fst" or key == "snd" or key == "trd":
                        fentry.append(raw_form[entry[key]])
                    else:
                        fentry.append(entry[key])
                sc1 = main_tree.predict_proba([fentry])[0][1]
                ttl += sc1*sc1

            sc1 = ttl/len(forms)
            perms[i-1].append((perm, sc1))
            if sc1 > mx:
                mx = sc1
                mperm = perm

    output = []
    prev = None
    for i in range(len(perms)):
        mx = -1
        mperm = None
        for perm, sc in perms[i]:
            if perm[0] == prev:
                if sc > mx:
                    mx = sc
                    mperm = perm
        output.append(mperm[1])
        prev = mperm[1]
    
    return output

In [29]:
def fix_case(phrase, bigs1, bigs2):
    ww = phrase[0]
    fixed = []
    fixed.append(ww.capitalize())
    if len(phrase) >= 2:
        for word in phrase[1:]:
            if word in bigs1 and (word not in bigs2 or (word in bigs2 and bigs2[word][1] / bigs2[word][2] >= 0.5)):
                fixed.append(bigs1[word])
            else:
                if word in bigs2 and bigs2[word][1] / bigs2[word][2] > 0.5:
                    fixed.append(bigs2[word][0])
                else:
                    fixed.append(word)
    
    return fixed

In [30]:
def score(phrase1, phrase2):
    s = 0
    for i in range(len(phrase1)):
        if phrase1[i] == phrase2[i]:
            s += 1

    return s/len(phrase1)

In [31]:
Gpoli = load_polimorph()
Gcasts = create_casts(Gpoli)
#unigramsS = load_unigrams()
Gdigrams1 = load_2grams(k=3)
#trigramsS = load_3grams()

In [32]:
Gbig = load_polimorph2()

Gtotal_set = load_big_set(k=100000)
Gbig2 = find_big(Gtotal_set)
del Gtotal_set

In [33]:
Gtotal_set = load_set(k=100000)
Gdigrams2 = create_dgrams(Gtotal_set, Gcasts)
del Gtotal_set

In [37]:
print(create_database(load_set(k=3), Gpoli, Gcasts, Gdigrams1, Gdigrams2)[0])

0
           fst        snd   trd  lgram  rgram target
0         null  subst:gen    na   0.00   0.00      y
1    subst:gen         na  verb   0.00   0.35      y
2           na       verb  verb   0.35   0.00      y
3         verb       verb   adv   0.00   0.07      y
4         verb       verb   num   0.00   0.07      y
..         ...        ...   ...    ...    ...    ...
247  subst:gen       prep  verb   0.44   0.00      y
248     interj        ger  null   0.00   0.00      y
249     interj       verb  null   0.00   0.00      y
250       prep        ger  null   0.00   0.00      y
251       prep       verb  null   0.00   0.00      y

[252 rows x 6 columns]


In [26]:
Gvalidation_set = load_set(k=1200000)[1000000:]
Gvalidation_set2 = load_big_set(k=1200000)[1000000:]

In [34]:
Gdigrams1 = count_digrams(Gdigrams1)
Gdigrams2 = count_digrams(Gdigrams2)

In [36]:
print(Gdigrams1["podwójnej"])

{'roli': 46, 'wysokości.': 28, 'linii': 22, 'wysokości': 22, 'roli:': 21, 'ciągłej': 18, 'wagi': 17, 'większości': 16, 'roli.': 15, 'wartości': 13, 'roli,': 12, 'spirali': 12, 'moralności,': 11, 'helisy': 9, 'i': 9, 'moralności': 9, 'w': 9, 'moralności.': 8, 'składki': 8, 'z': 8, 'ilości': 7, 'kobiet': 7, 'przewadze,': 7, 'wysokości,': 7, 'helisy,': 6, 'para': 6, 'przewadze': 6, 'ulgi': 6, '-': 5, 'lojalności.': 5, 'warstwy': 5, 'większości,': 5, 'większości.': 5, 'helisy.': 4, 'koronie.': 4, 'monarchii.': 4, 'natury.': 4, 'negacji.': 4, 'przewadze.': 4, 'tożsamości': 4, 'wygranej': 4, 'ciągłej.': 3, 'dokumentacji': 3, 'dziewcząt': 3, 'gry,': 3, 'ilości.': 3, 'juniorek,': 3, 'juniorów': 3, 'katalizy': 3, 'kobiet:': 3, 'korony': 3, 'lojalności,': 3, 'lub': 3, 'mariusz': 3, 'miary': 3, 'miary,': 3, 'moralności:': 3, 'odpowiedzialności': 3, 'opłaty': 3, 'pętli': 3, 'platynowej': 3, 'podłodze': 3, 'postaci': 3, 'razem': 3, 'składki.': 3, 'stawki': 3, 'szerokości': 3, 'ściany': 3, 'tożsamoś

In [221]:
Gtrain_set = load_set(k=1000)
print(len(Gtrain_set))

Gdatabase, _, _ = create_database(Gtrain_set, Gpoli, Gcasts, Gdigrams1, Gdigrams2)
del Gtrain_set

Gmain_tree = Tree(Gdatabase)
del Gdatabase

1000
0


In [45]:
Gtrain_set = load_set(k=200000)

_, X, y = create_database(Gtrain_set, Gpoli, Gcasts, Gdigrams1, {})
del Gtrain_set

Gmain_tree2 = RandomForestClassifier(max_depth=4, random_state=0, criterion="entropy", n_jobs=6)
Gmain_tree2.fit(X, y)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000


RandomForestClassifier(criterion='entropy', max_depth=4, n_jobs=6,
                       random_state=0)

In [259]:
Gtrain_set = load_set(k=2)

_, X, y = create_database(Gtrain_set, Gpoli, Gcasts, Gdigrams1, Gdigrams2)
del Gtrain_set

Gmain_tree2.predict_proba(X[0:5])

0


array([[0.2162131 , 0.7837869 ],
       [0.1489218 , 0.8510782 ],
       [0.16304982, 0.83695018],
       [0.18159669, 0.81840331],
       [0.18163694, 0.81836306]])

In [109]:
Gtrain_set = load_set(k=120000)[100000:]
trios2, _, _ = create_database(Gtrain_set, Gpoli, Gcasts, Gdigrams1, Gdigrams2)
del Gtrain_set
Gmain_tree.start_prune(trios2)
del trios2

0
10000


In [195]:
Gmain_tree.draw().render('test-output/database_tree.gv', view=False)

'test-output\\database_tree.gv.pdf'

In [192]:
line = load_set(k=3)[1]
print(fix_polish(remove_polish(line), Gpoli, Gcasts, Gdigrams1, Gdigrams2, Gmain_tree))
print(line)
print(remove_polish(line))
print(score(line, fix_polish(remove_polish(line), Gpoli, Gcasts, Gdigrams1, Gdigrams2, Gmain_tree)))

['parlament', 'zdecydował', 'jednak', 'inaczej', 'i', 'przyjął', 'w', 'ustawie', 'z', 'dnia', 'r', 'jednoinstancyjne', 'postepowanie', 'orzeczniczo-lekarskie']
['parlament', 'zdecydował', 'jednak', 'inaczej', 'i', 'przyjął', 'w', 'ustawie', 'z', 'dnia', 'r', 'jednoinstancyjne', 'postępowanie', 'orzeczniczo-lekarskie']
['parlament', 'zdecydowal', 'jednak', 'inaczej', 'i', 'przyjal', 'w', 'ustawie', 'z', 'dnia', 'r', 'jednoinstancyjne', 'postepowanie', 'orzeczniczo-lekarskie']
0.9285714285714286


In [222]:
#k=20000, base decision tree
k = 100
total = 0
for i in range(k):
    line = Gvalidation_set[i]
    line2 = Gvalidation_set2[i]
    broken_line = remove_polish(line)
    fixed_line = fix_polish(broken_line, Gpoli, Gcasts, Gdigrams1, Gdigrams2, Gmain_tree)
    sc1 = score(line, fixed_line)
    fixed_line2 = fix_case(fixed_line, Gbig, Gbig2)
    sc2 = score(line2, fixed_line2)
    total += math.sqrt(sc1*sc1)

print(total/k)

200000
0.9346142255902423


In [47]:
#k=100000, random forest
k = 1000
total = 0
for i in range(k):
    line = Gvalidation_set[i]
    line2 = Gvalidation_set2[i]
    broken_line = remove_polish(line)
    fixed_line = fix_polishF(broken_line, Gpoli, Gcasts, Gdigrams1, Gdigrams2, Gmain_tree2)
    sc1 = score(line, fixed_line)
    fixed_line2 = fix_case(fixed_line, Gbig, Gbig2)
    sc2 = score(line2, fixed_line2)
    total += math.sqrt(sc1*sc2)

print(total/k)

0.9625769088763515


In [48]:
#just use grams lol
k = 1000
total = 0
for i in range(k):
    line = Gvalidation_set[i]
    line2 = Gvalidation_set2[i]
    broken_line = remove_polish(line)
    fixed_line = fix_polish2(broken_line, Gcasts, Gdigrams1, Gdigrams2)
    sc1 = score(line, fixed_line)
    fixed_line2 = fix_case(fixed_line, Gbig, Gbig2)
    sc2 = score(line2, fixed_line2)
    total += math.sqrt(sc1*sc2)

print(total/k)

0.9687744539404973


In [101]:
print(Gbig2["w"])

['W', 3077, 607097]


In [112]:
print(Gdigrams1["pij"])

{'i': 21, 'dużo': 17, 'na': 16, '-': 12, 'za': 11, 'z': 8, 'tak': 7, 'alkoholu': 6, 'do': 6, 'wodę': 6, 'już': 5, 'mleko': 5, 'mleko,': 5, 'pij': 5, 'w': 5}


In [118]:
print(Gdigrams2["pij"])

{'świeżo': 1, 'wodę': 1, 'piotrek-elektryczne': 1, 'tylko': 1, 'nych': 1, 'skim': 1, 'i': 1, 'na': 1}


In [119]:
print(Gdigrams2["pij"])

{'świeżo': 1, 'wodę': 1, 'piotrek-elektryczne': 1, 'tylko': 1, 'nych': 1, 'skim': 1, 'i': 1, 'na': 1}


In [171]:
print(Gcasts["jesli"])


['jeśli']
