In [1]:
import csv
import re, collections
from collections import Counter, defaultdict

In [2]:
def reading_file(path):
    """Read lemmas and their leading tags from a tab-separated text file.

    Every non-blank line is expected to hold at least three tab-separated
    fields. The first field is taken as the lemma; the third field is a
    ';'-separated tag string of which only the first tag is kept.

    The file is parsed with ``csv.reader`` (default comma dialect) and only
    the first CSV field of each row is looked at, so anything after the first
    comma on a line is ignored.

    Args:
        path: Path of the UTF-8 encoded input file.

    Returns:
        A ``(lemma, tag)`` tuple of two lists of equal length; entry ``i`` of
        each list comes from the same input line.

    Raises:
        IndexError: If a non-blank line has fewer than three tab-separated
            fields.
    """
    lemma = []
    tag = []
    # newline="" is what the csv module asks for when it is given a file object.
    with open(path, encoding="utf8", newline="") as file:
        for row in csv.reader(file):
            # csv.reader yields [] for a blank line; skip those (and lines
            # holding only whitespace) instead of failing on row[0].
            if not row or not row[0].strip():
                continue
            # consecutive tabs count as a single separator
            fields = re.split(r'\t+', row[0])
            lemma.append(fields[0])
            # keep only the first of the ';'-separated tags
            tag.append(fields[2].split(';')[0])
    return lemma, tag
        
    
    
def unique(list1):
    """Return the distinct items of ``list1`` in order of first appearance.

    Membership is checked with ``in`` against the result list, so the items
    only need to support equality comparison (they do not have to be
    hashable).
    """
    kept = []
    for item in list1:
        if item in kept:
            # already recorded, keep only the first occurrence
            continue
        kept.append(item)
    return kept


def parts_of_speech(lemma, tag):
    """Split lemmas into adjectives, verbs and nouns according to their tag.

    Args:
        lemma: Sequence of lemmas.
        tag: Sequence of tags; ``tag[i]`` belongs to ``lemma[i]``.

    Returns:
        A tuple ``(adj, v, n, list_of_num)``: the lemmas tagged 'ADJ', the
        lemmas tagged 'V', 'V.PTCP' or 'V.CVB', the lemmas tagged 'N', and the
        original positions of those lemmas concatenated in the same
        adjective, verb, noun order. Lemmas with any other tag are dropped.
    """
    verb_tags = ('V', 'V.PTCP', 'V.CVB')

    adjectives, adjective_positions = [], []
    verbs, verb_positions = [], []
    nouns, noun_positions = [], []

    # walk the tags and file each lemma (with its position) under its class
    for position, label in enumerate(tag):
        if label == 'ADJ':
            adjectives.append(lemma[position])
            adjective_positions.append(position)
        elif label in verb_tags:
            verbs.append(lemma[position])
            verb_positions.append(position)
        elif label == 'N':
            nouns.append(lemma[position])
            noun_positions.append(position)

    return adjectives, verbs, nouns, adjective_positions + verb_positions + noun_positions


def build_vocab(corpus: list) -> tuple:
    """Build the symbol-level vocabulary of a word list.

    Every word is turned into a token made of its characters separated by
    single spaces (e.g. 'read' -> 'r e a d').

    Args:
        corpus: Iterable of words (strings).

    Returns:
        A tuple ``(vocab, order)``:

        * ``vocab`` is a ``Counter`` mapping each token to its frequency.
        * ``order`` maps each token to the list of positions (ascending) at
          which it occurs in ``corpus``; it is what lets the caller put the
          processed words back in their original order.

        Both mappings list the tokens in order of first appearance. Example:
        for ['write', 'read', 'drink', 'write'] the counts are
        {'w r i t e': 2, 'r e a d': 1, 'd r i n k': 1} and the positions are
        {'w r i t e': [0, 3], 'r e a d': [1], 'd r i n k': [2]}.
    """
    # Separate each char in word by space
    tokens = [" ".join(word) for word in corpus]

    # Count frequency of tokens in corpus
    vocab = Counter(tokens)

    # One pass over the tokens: dict insertion order gives first-appearance
    # order, and positions are appended in ascending order.
    order = {}
    for position, token in enumerate(tokens):
        order.setdefault(token, []).append(position)

    return vocab, order

#create byte pairs
def get_stats(vocab):
    """Count how often each pair of adjacent symbols occurs in the vocabulary.

    Args:
        vocab: Mapping from a token (symbols separated by single spaces) to
            its frequency.

    Returns:
        A ``defaultdict(int)`` mapping ``(left_symbol, right_symbol)`` to the
        summed frequency of the tokens in which the two symbols are adjacent.

    A token built from a multi-word lemma contains a run of several spaces
    where the lemma had a space. Symbols on opposite sides of such a run are
    not counted as a pair: merge_vocab only joins symbols separated by exactly
    one space, so such a pair could never be merged and, if it were the most
    frequent one, every remaining merge step would be wasted on it.
    """
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        # two or more whitespace characters in a row mark a word boundary
        for chunk in re.split(r'\s{2,}', word):
            symbols = chunk.split()
            for left, right in zip(symbols, symbols[1:]):
                pairs[left, right] += freq
    return pairs

#merge the byte pairs which have maximum count
def merge_vocab(pair, v_in):
    """Merge every occurrence of a symbol pair in the vocabulary tokens.

    Args:
        pair: Two symbols ``(left, right)`` to be joined.
        v_in: Mapping from token (symbols separated by spaces) to frequency.

    Returns:
        A new dict with the same frequencies whose tokens have each
        ``'left right'`` occurrence replaced by ``'leftright'``. The
        lookarounds make sure only whole symbols are matched, i.e. the pair
        must be delimited by whitespace or the token boundary.
    """
    merged_symbol = ''.join(pair)
    whole_pair = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
    return {
        whole_pair.sub(merged_symbol, token): count
        for token, count in v_in.items()
    }

# Joins the symbols of a token back into plain text; if the last symbol is one of the
# merged pairs ("endings"), it is left out of the result.
# Some lemmas consist of two or more words, so the spaces between the words of one lemma are handled as well.
def removing_spaces(parts, endings):
    """Turn a space-separated symbol string back into text, minus its ending.

    Args:
        parts: A token whose symbols are separated by single spaces, e.g.
            'l o w er'. A space that belongs to the text itself (multi-word
            lemma) therefore appears as a run of three spaces, as in
            'b o z   a y ı'.
        endings: Collection of merged symbols. If the last symbol of
            ``parts`` is one of them, it is dropped from the result.

    Returns:
        The symbols joined back together, with the spaces between words
        restored: 'l o w er' with endings ['er'] gives 'low', and
        'b o z   a y ı' gives 'boz ayı'.
    """
    head, _, last_symbol = parts.rpartition(' ')
    if last_symbol in endings:
        # drop the trailing ending together with the separator in front of it
        parts = head

    # A single space is only a symbol separator and disappears. A longer run
    # holds one real space for every two characters (n real spaces are
    # encoded as 2n + 1, or 2n once the trailing ending has been cut off).
    return re.sub(r' +', lambda gap: ' ' * (len(gap.group()) // 2), parts)


def bpe(words, num_merges):
    """Strip learned endings from words using byte-pair-encoding merges.

    The most frequent adjacent symbol pair is merged up to ``num_merges``
    times (each chosen pair is printed). Every merged symbol is remembered as
    an "ending"; a word whose final symbol is such an ending has it removed by
    removing_spaces.

    Args:
        words: List of words to process.
        num_merges: Maximum number of merge steps to perform.

    Returns:
        A list as long as ``words`` where entry ``i`` is the processed form of
        ``words[i]``.
    """
    new_lem = [None] * len(words)
    vocab, order = build_vocab(words)
    endings = []

    for _ in range(num_merges):
        pairs = get_stats(vocab)
        if not pairs:
            # Nothing left to merge (no words, or every word is already a
            # single symbol); max() would raise ValueError on an empty mapping.
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        endings.append(''.join(best))
        print(best)

    # vocab keys look like 'l o w er'; the stems look like 'low'
    stems = [removing_spaces(token, endings) for token in vocab]

    # vocab and order list the distinct words in the same sequence, so the
    # k-th stem belongs to the k-th position list; writing each stem to all
    # of its positions preserves the order of the input words.
    for positions, stem in zip(order.values(), stems):
        for position in positions:
            new_lem[position] = stem

    return new_lem


In [7]:
# Driver cell: load the lemma/tag file, stem the adjectives and verbs with BPE,
# and put the results back in the order of the input file.

# number of BPE merge steps requested for each bpe() call below
num_merges = 5
#path = "/home/aidana/russian_lemma.txt"
path = "/home/aidana/turkish.tex"
lemma, tag = reading_file(path)
# show which distinct tags occur in the file
print(unique(tag))
# list_of_numbers holds the original positions: adjectives first, then verbs, then nouns
adj, v, n, list_of_numbers = parts_of_speech(lemma, tag)

# it seems that for Turkish and Russian it is better not to apply BPE to nouns,
# so n is appended unchanged; the concatenation order (adj, v, n) matches list_of_numbers
unsorted_list_of_lemmas = bpe(adj, num_merges) + bpe(v, num_merges) + n
# sorting the (original position, lemma) pairs by position restores the original order
final_list_of_lemmas = [x for _,x in sorted(zip(list_of_numbers, unsorted_list_of_lemmas))]




['N', 'V', 'ADJ']
('l', 'i')
('l', 'ı')
('b', 'a')
('e', 'm')
('s', 'i')
('a', 'k')
('m', 'ak')
('e', 'k')
('m', 'ek')
('ı', 'r')


In [8]:
# Display the processed lemmas (adjectives and verbs after BPE, nouns unchanged) in input order.
final_list_of_lemmas

['domates',
 'gelin',
 'derleyici',
 'kimyasal tepkime',
 'kısalık',
 'cımbız',
 'çatal',
 'kaçır',
 'mikrop',
 'ropdöşambır',
 'gerin',
 'savun',
 'tümce',
 'var',
 'makine dili',
 'çal',
 'tutuştur',
 'dalgıç',
 'abajurcu',
 'var',
 'gösteri',
 'asteroit',
 'devir',
 'kaçak',
 'vernik',
 'ayna',
 'müge',
 'aktris',
 'böl',
 'bükmeme',
 'okul',
 'sözcük',
 'fikoloji',
 'kurut',
 'hayırsever',
 'silgi',
 'pelte',
 'fren',
 'kartpostal',
 'çoklu kalıtım',
 'makina',
 'yoğurtçuluk',
 'konser',
 'yalancı',
 'rakam',
 'işkenceci',
 'genel kültür',
 'kireç',
 'anket',
 'aç',
 'utan',
 'kromozom',
 'imam',
 'klatrat',
 'girdap',
 'büyücü',
 'yoğurt',
 'yaptırt',
 'disko',
 'boz ayı',
 'fibrinojen',
 'optik karakter tanıma',
 'ayakkabıcı',
 'kırdır',
 'plaj voleybolu',
 'paskalya',
 'istiridye',
 'kalpak',
 'kaside',
 'hamız',
 'örnek',
 'laço',
 'cimcime',
 'gizle',
 'sevdalinka',
 'aç',
 'düşes',
 'çoğul',
 'küre',
 'sululuk',
 'canlandırma',
 'gecelik',
 'puan',
 'diş',
 'ibnelik',
 'rapor

In [6]:
# Display the raw lemmas as returned by reading_file.
lemma

['domates',
 'gelin',
 'derleyici',
 'kimyasal tepkime',
 'kısalık',
 'cımbız',
 'çatal',
 'kaçırmak',
 'mikrop',
 'ropdöşambır',
 'gerinmek',
 'savunmak',
 'tümce',
 'varmak',
 'makine dili',
 'çalmak',
 'tutuşturmak',
 'dalgıç',
 'abajurcu',
 'varmak',
 'gösteri',
 'asteroit',
 'devir',
 'kaçak',
 'vernik',
 'ayna',
 'müge',
 'aktris',
 'bölmek',
 'bükmeme',
 'okul',
 'sözcük',
 'fikoloji',
 'kurutmak',
 'hayırsever',
 'silgi',
 'pelte',
 'fren',
 'kartpostal',
 'çoklu kalıtım',
 'makina',
 'yoğurtçuluk',
 'konser',
 'yalancı',
 'rakam',
 'işkenceci',
 'genel kültür',
 'kireç',
 'anket',
 'açmak',
 'utanmak',
 'kromozom',
 'imam',
 'klatrat',
 'girdap',
 'büyücü',
 'yoğurt',
 'yaptırtmak',
 'disko',
 'boz ayı',
 'fibrinojen',
 'optik karakter tanıma',
 'ayakkabıcı',
 'kırdırmak',
 'plaj voleybolu',
 'paskalya',
 'istiridye',
 'kalpak',
 'kaside',
 'hamız',
 'örnek',
 'laço',
 'cimcime',
 'gizlemek',
 'sevdalinka',
 'açmak',
 'düşes',
 'çoğul',
 'küremek',
 'sululuk',
 'canlandırma',
