In [3]:
import csv
import re, collections
from collections import Counter, defaultdict

In [361]:
def read_input(path):
    """Read a tab-separated training file and return its lemmas and POS tags.

    Each non-blank line is split on runs of tab characters. The first column
    is taken as the lemma and the third column as a ';'-separated tag bundle,
    of which only the first element is kept. The second column is ignored.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded input file.

    Returns
    -------
    (list of str, list of str)
        Two parallel lists: the lemma and the first tag of every data line.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three tab-separated columns.
    """
    lemma = []
    tag = []
    with open(path, encoding="utf8") as file:
        for line in file:
            line = line.rstrip('\r\n')
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no data; skip them instead of failing on a missing column.
            if not line.strip():
                continue
            # The data is tab-separated. Split on tabs only, so that commas or
            # quotes inside a field are never interpreted as CSV syntax.
            fields = re.split(r'\t+', line)
            lemma.append(fields[0])
            # Keep only the first tag of the bundle.
            tag.append(fields[2].split(';')[0])
    return lemma, tag
        

In [364]:
# Coarse part of speech -> set of fine-grained tags that are counted under it.
INVAR_POS = {'V': {'V', 'V.PTCP', 'V.CVB'}, 'N': {'N'}, 'ADJ': {'ADJ'}}
# Marker inserted between stem and suffix (used by BpeModel.transform).
MORBOUND = '^'

def parts_of_speech(lemma, tag, n_samples=10):
    """Group lemmas by coarse part of speech and print a short summary.

    Every tag is mapped to a coarse category through INVAR_POS. Entries whose
    tag is not listed in INVAR_POS are skipped and reported in the printed
    summary instead of aborting the whole run.

    Parameters
    ----------
    lemma : sequence of str
        Lemmas, parallel to `tag`.
    tag : sequence of str
        Fine-grained POS tag of each lemma.
    n_samples : int, optional
        Maximum number of example lemmas printed per category (default 10).

    Returns
    -------
    dict
        Maps each coarse category that occurs in the data to the set of its
        lemmas.
    """
    assert len (lemma) == len (tag)

    # Reverse lookup built once: fine tag -> coarse category. If a tag were
    # listed under several categories, the first one in INVAR_POS wins.
    tag_to_cat = {}
    for coarse, fine_tags in INVAR_POS.items():
        for fine in fine_tags:
            tag_to_cat.setdefault(fine, coarse)

    pos_lemma = {} # pos: {lemmas}
    unique_words = set()
    unmapped_tags = set()
    n_skipped = 0

    # splitting lemmas into parts of speech
    for word, fine in zip(lemma, tag):
        cat = tag_to_cat.get(fine)
        if cat is None:
            unmapped_tags.add(fine)
            n_skipped += 1
            continue
        pos_lemma.setdefault(cat, set()).add(word)
        unique_words.add((word, cat))

    # print the result
    print("POSes in dataset:", set(tag))
    print ("Num unique (word,pos):", len(unique_words), "/", len(lemma))
    if n_skipped:
        print("Skipped entries with unmapped POS tags:", n_skipped, sorted(unmapped_tags))
    print(''.join(cat + '\t\t' for cat in pos_lemma))
    print(''.join(str(len(words)) + ' \t\t' for words in pos_lemma.values()))

    # Materialise each sample column once. A category may hold fewer than
    # n_samples lemmas, in which case its remaining cells are left empty.
    columns = [list(words)[:n_samples] for words in pos_lemma.values()]
    n_rows = max((len(col) for col in columns), default=0)
    for i in range(n_rows):
        print(''.join((col[i] if i < len(col) else '') + '\t' for col in columns))

    return pos_lemma



In [365]:
# Training file to analyse. Swap in the commented-out path to run on the
# Russian data instead of the Portuguese data.
#path = "../../bpe_data/russian-train-high.txt"
path = "../../bpe_data/portuguese-train-high.txt"
lemmas, poses = read_input(path)

# Group the lemmas by coarse part of speech; the call also prints a summary.
pos_lemma = parts_of_speech(lemmas, poses)


POSes in dataset: {'V', 'V.PTCP'}
Num unique (word,pos): 3723 / 10000
V		
3723 		
aligeirar	
ondear	
descercar	
automatizar	
relaxar	
aquartelar	
desanimar	
lotear	
arrazoar	
antecipar	


In [366]:
def test(word, suffs):
    for suff in suffs:
        if word[len(word)-len(suff):] == suff:
            return True
    return False

def bpe(list_of_lemmas, percentage, abs_min, subcoverage, total_n):
    """Recursively collect frequent word-final token combinations (suffixes).

    Each word is a list of string tokens. At every level the last two tokens
    of each word are concatenated and counted. A combination is "best" when
    its count exceeds `abs_min` and its share of the current list exceeds
    `percentage`. For every best combination the words ending in it are
    re-tokenised with those two tokens merged, and the procedure recurses on
    that group to look for longer suffixes ("daughters").

    Parameters
    ----------
    list_of_lemmas : list of list of str
        Tokenised words of the current partition.
    percentage : float
        Minimum share of the current partition a combination must exceed.
    abs_min : int
        Minimum absolute count a combination must exceed.
    subcoverage : float
        A best combination is itself reported only when its daughters cover
        at most this fraction of its words.
    total_n : int
        Size of the full vocabulary, used for the global share.

    Returns
    -------
    list of tuple
        Entries (suffix, count, share of current partition, share of
        `total_n`, fraction of the suffix's words covered by its daughters).
        Daughters are listed before their parent.
    """
    # Group the words by their final two-token combination in a single pass.
    # Words with fewer than two tokens have no such combination; they are
    # skipped here and can therefore never match a best suffix below.
    groups = {}
    for word in list_of_lemmas:
        if len(word) < 2:
            continue
        groups.setdefault(word[-2] + word[-1], []).append(word)

    n_words = len(list_of_lemmas)
    counts = [(comb, len(members), len(members) / n_words, len(members) / total_n)
              for comb, members in groups.items()]
    # Most frequent first; ties keep first-seen order (the sort is stable).
    counts.sort(key=lambda entry: -entry[1])
    bests = [entry for entry in counts if entry[1] > abs_min and entry[2] > percentage]

    if not bests:
        return []

    ret = []
    for best in bests:
        suffix = best[0]
        # Merge the final two tokens of every word that ends in this suffix.
        new_input = [word[:-2] + [suffix] for word in groups[suffix]]
        daughters = bpe(new_input, percentage, abs_min, subcoverage, total_n)

        # Fraction of this group's words that end in some daughter suffix.
        daughter_suffixes = tuple(d[0] for d in daughters)
        n_covered = sum(1 for word in new_input
                        if ''.join(word).endswith(daughter_suffixes))
        perc_covered_by_daughters = n_covered / len(new_input)

        # Daughters are always kept; the parent is kept as well unless the
        # daughters already cover more than `subcoverage` of its words.
        ret += daughters
        if perc_covered_by_daughters <= subcoverage:
            ret.append(best + (perc_covered_by_daughters,))

    # If the low-share combinations together still make up more than
    # `percentage` of this partition, search the words that no best suffix
    # accounted for as a partition of their own.
    if sum(entry[2] for entry in counts if entry[2] <= percentage) > percentage:
        best_suffixes = {entry[0] for entry in bests}
        remainder = [word for word in list_of_lemmas
                     if len(word) < 2 or word[-2] + word[-1] not in best_suffixes]
        ret += bpe(remainder, percentage, abs_min, subcoverage, total_n)

    return ret


class BpeModel(object):
    """Suffix model for one part of speech, learned with `bpe`.

    Parameters
    ----------
    pos : str
        Label of the part of speech the model is built for (stored only).
    percentage : float
        Relative frequency threshold forwarded to `bpe`.
    abs_min : int
        Absolute frequency threshold forwarded to `bpe`.
    subcoverage : float
        Daughter-coverage threshold forwarded to `bpe`.
    """

    def __init__(self, pos, percentage = 0.1, abs_min = 100, subcoverage = 0.8):
        self.pos = pos
        # Placeholder suffix inventory used until fit() replaces it.
        # NOTE(review): {'ab'} looks like a debugging leftover - confirm.
        self.suffixes = {'ab'}
        self.percentage = percentage
        # Honour the caller's threshold (it used to be hard-coded to 100).
        self.abs_min = abs_min
        self.subcoverage = subcoverage


    def fit(self, set_of_lemmas):
        """Learn the suffix inventory from a collection of lemma strings.

        Each lemma is split into characters followed by an empty end token,
        and the result of `bpe` is stored in `self.suffixes`.
        """
        vocab = [list(lem) + [''] for lem in set_of_lemmas]
        self.suffixes = bpe(vocab, self.percentage, self.abs_min, self.subcoverage,len(set_of_lemmas))

    def transform(self,word):
        """Insert MORBOUND between the stem of `word` and a known suffix.

        Returns `word + MORBOUND` when no suffix matches. When several
        suffixes match, the one that comes last in `self.suffixes` wins.
        """
        res = word + MORBOUND
        for suff in self.suffixes:
            # After fit() the entries are the tuples produced by `bpe`, whose
            # first element is the suffix string. Before fit() they are plain
            # strings. Accept both forms.
            suffix = suff if isinstance(suff, str) else suff[0]
            # NOTE(review): last match wins, not longest match - confirm
            # that this is the intended segmentation policy.
            if word.endswith(suffix):
                res = word[:len(word)-len(suffix)] + MORBOUND + suffix
        return res

    

# Learn the suffix inventory from the verb lemmas; the last expression shows
# the learned suffixes as the cell output.
bpe_model = BpeModel('V',percentage=.12,subcoverage=0.75)
bpe_model.fit(pos_lemma['V'])
bpe_model.suffixes

# Same experiment for adjectives (currently disabled):
#bpe_model = BpeModel('ADJ')
#bpe_model.fit(pos_lemma['ADJ'])
#bpe_model.suffixes

[('ntar', 107, 0.22061855670103092, 0.028740263228579102, 0.0),
 ('tar', 485, 0.15645161290322582, 0.13027128659683052, 0.22061855670103092),
 ('rar', 324, 0.12390057361376673, 0.08702659145850121, 0.0),
 ('ar', 3100, 0.83266183185603, 0.83266183185603, 0.2609677419354839),
 ('cer', 111, 0.3447204968944099, 0.029814665592264304, 0.0),
 ('er', 322, 0.5168539325842697, 0.08648939027665861, 0.3447204968944099),
 ('ir', 280, 0.449438202247191, 0.07520816545796401, 0.0)]

In [120]:
# ADJ:
#     ый
#     ий
#     ой
# N:
#     а
#     о
#     ø
#     ь
#     е
#     я
#     ия
#     ие
# V:
#     ать
#     ить
#     ять    ся
#     еть
#     уть
#     ти   сь

NameError: name 'a' is not defined