## Universal Dependecies Data

From this point, I will switch to the Hungarian language and use a system of Neural Networks to predict the word type and the morphemes of a given word.

In this Notebook, I'm loading the `hu_szeged-ud-train.conllu` Universal Dependencies Data as an input for the upcoming Neural Networks.

However, it turned out that this amount of data wasn't sufficient (only about 20,000 entries), so this Notebook mostly exists to show my trials and first attempts at the problem.

It also contains an attempt at a custom word2vec model, which I didn't end up using.

In [6]:
import numpy as np
import math
import pandas as pd

In [2]:
data = []
with open('../DATA/train/hu_szeged-ud-train.conllu', 'r', encoding='utf-8') as f:
    data = f.read()

In [3]:
rows = data.split('\n')

In [4]:
rows

['# sent_id = train-1',
 '# text = A világban immár tíz éve tartó folyamatos gazdasági fellendülés minden eddigi konjunktúra-időszaknál hosszabb.',
 '1\tA\ta\tDET\t_\tDefinite=Def|PronType=Art\t2\tdet\t_\t_',
 '2\tvilágban\tvilág\tNOUN\t_\tCase=Ine|Number=Sing|Number[psed]=None|Number[psor]=None|Person[psor]=None\t13\tnmod:obl\t_\t_',
 '3\timmár\timmár\tADV\t_\t_\t13\tadvmod:mode\t_\t_',
 '4\ttíz\ttíz\tNUM\t_\tCase=Nom|Number=Sing|Number[psed]=None|Number[psor]=None|NumType=Card|Person[psor]=None\t5\tnummod\t_\t_',
 '5\téve\tév\tNOUN\t_\tCase=Nom|Number=Sing|Number[psed]=None|Number[psor]=Sing|Person[psor]=3\t6\tobl\t_\t_',
 '6\ttartó\ttartó\tADJ\t_\tCase=Nom|Number=Sing|Number[psed]=None|Number[psor]=None|Person[psor]=None|VerbForm=PartPres\t9\tamod:att\t_\t_',
 '7\tfolyamatos\tfolyamatos\tADJ\t_\tCase=Nom|Degree=Pos|Number=Sing|Number[psed]=None|Number[psor]=None|Person[psor]=None\t9\tamod:att\t_\t_',
 '8\tgazdasági\tgazdasági\tADJ\t_\tCase=Nom|Degree=Pos|Number=Sing|Number[psed]=Non

In [12]:
words = []
wordstems = []
pos = []
particles = []

for r in rows:
    if r != '':
        if r[0] != '#':
            curlist = r.split('\t')
            if curlist[3].upper() != 'X':
                words.append(curlist[1].lower())
                wordstems.append(curlist[2].lower())
                pos.append(curlist[3].upper())
                particles.append(curlist[5])

postag_df = pd.DataFrame(data={'word':words, 'stem':wordstems, 'pos':pos, 'parts':particles})

In [13]:
postag_df

Unnamed: 0,word,stem,pos,parts
0,a,a,DET,Definite=Def|PronType=Art
1,világban,világ,NOUN,Case=Ine|Number=Sing|Number[psed]=None|Number[...
2,immár,immár,ADV,_
3,tíz,tíz,NUM,Case=Nom|Number=Sing|Number[psed]=None|Number[...
4,éve,év,NOUN,Case=Nom|Number=Sing|Number[psed]=None|Number[...
...,...,...,...,...
20149,",",",",PUNCT,_
20150,a,a,DET,Definite=Def|PronType=Art
20151,tudományos,tudományos,ADJ,Case=Nom|Degree=Pos|Number=Sing
20152,akadémiára,akadémia,NOUN,Case=Sub|Number=Sing


In [14]:
set(postag_df['pos'])

{'ADJ',
 'ADP',
 'ADV',
 'AUX',
 'CCONJ',
 'DET',
 'INTJ',
 'NOUN',
 'NUM',
 'PART',
 'PRON',
 'PROPN',
 'PUNCT',
 'SCONJ',
 'VERB'}

In [106]:
part_types = list(set([item2 for sublist2 in [list(set([item for sublist in [[b.split('=')[0] for b in a.split('|')] for a in list(postag_df[postag_df['pos'] == t]['parts'])] for item in sublist])) for t in set(postag_df['pos'])] for item2 in sublist2]).difference(set(['_'])))
part_types

['Person',
 'PronType',
 'Aspect',
 'Number',
 'Voice',
 'Definite',
 'Case',
 'Poss',
 'VerbForm',
 'Degree',
 'NumType',
 'Tense',
 'Number[psor]',
 'Reflex',
 'Person[psor]',
 'Mood',
 'Number[psed]']

In [125]:
part_columns = []

In [126]:
for a in part_types:
    cur_list = []
    for p in list(postag_df['parts']):
        found = False
        for t in p.split('|'):
            tl = t.split('=')
            if a == tl[0]:
                cur_list.append(tl[1])
                found = True
                break
        if not found:
            cur_list.append('None')
    part_columns.append(cur_list)

In [128]:
for a in range(len(part_types)):
    postag_df[part_types[a]] = part_columns[a]

In [129]:
postag_df

Unnamed: 0,word,stem,pos,parts,Person,PronType,Aspect,Number,Voice,Definite,...,Poss,VerbForm,Degree,NumType,Tense,Number[psor],Reflex,Person[psor],Mood,Number[psed]
0,a,a,DET,Definite=Def|PronType=Art,,Art,,,,Def,...,,,,,,,,,,
1,világban,világ,NOUN,Case=Ine|Number=Sing|Number[psed]=None|Number[...,,,,Sing,,,...,,,,,,,,,,
2,immár,immár,ADV,_,,,,,,,...,,,,,,,,,,
3,tíz,tíz,NUM,Case=Nom|Number=Sing|Number[psed]=None|Number[...,,,,Sing,,,...,,,,Card,,,,,,
4,éve,év,NOUN,Case=Nom|Number=Sing|Number[psed]=None|Number[...,,,,Sing,,,...,,,,,,Sing,,3,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20149,",",",",PUNCT,_,,,,,,,...,,,,,,,,,,
20150,a,a,DET,Definite=Def|PronType=Art,,Art,,,,Def,...,,,,,,,,,,
20151,tudományos,tudományos,ADJ,Case=Nom|Degree=Pos|Number=Sing,,,,Sing,,,...,,,Pos,,,,,,,
20152,akadémiára,akadémia,NOUN,Case=Sub|Number=Sing,,,,Sing,,,...,,,,,,,,,,


In [130]:
#postag_df.drop(['parts'], axis=1, inplace=True)

In [134]:
postag_df

Unnamed: 0,word,stem,pos,Person,PronType,Aspect,Number,Voice,Definite,Case,Poss,VerbForm,Degree,NumType,Tense,Number[psor],Reflex,Person[psor],Mood,Number[psed]
0,a,a,DET,,Art,,,,Def,,,,,,,,,,,
1,világban,világ,NOUN,,,,Sing,,,Ine,,,,,,,,,,
2,immár,immár,ADV,,,,,,,,,,,,,,,,,
3,tíz,tíz,NUM,,,,Sing,,,Nom,,,,Card,,,,,,
4,éve,év,NOUN,,,,Sing,,,Nom,,,,,,Sing,,3,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20149,",",",",PUNCT,,,,,,,,,,,,,,,,,
20150,a,a,DET,,Art,,,,Def,,,,,,,,,,,
20151,tudományos,tudományos,ADJ,,,,Sing,,,Nom,,,Pos,,,,,,,
20152,akadémiára,akadémia,NOUN,,,,Sing,,,Sub,,,,,,,,,,


In [144]:
postag_df_ohe = postag_df[['word','stem','pos']] # ohe = one-hot encoded
onehot_encoded_dfs = []
for a in part_types:
    cur_df = pd.get_dummies(postag_df[a], prefix=a)
    cur_df.drop([a + '_None'], axis=1, inplace=True)
    postag_df_ohe = pd.concat([postag_df_ohe, cur_df], axis=1, sort=False)

In [148]:
postag_df_ohe = postag_df_ohe[postag_df_ohe['pos'] != 'PUNCT'].reset_index()

In [149]:
postag_df_ohe

Unnamed: 0,index,word,stem,pos,Person_1,Person_2,Person_3,PronType_Art,PronType_Dem,PronType_Ind,...,Reflex_Yes,Person[psor]_1,Person[psor]_3,Mood_Cnd,"Mood_Cnd,Pot",Mood_Imp,"Mood_Imp,Pot",Mood_Ind,Mood_Pot,Number[psed]_Sing
0,0,a,a,DET,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,világban,világ,NOUN,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,immár,immár,ADV,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,tíz,tíz,NUM,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,éve,év,NOUN,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17265,20147,el,el,ADV,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17266,20148,belgrádba,belgrád,PROPN,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17267,20150,a,a,DET,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
17268,20151,tudományos,tudományos,ADJ,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [173]:
s = ''
for w in list(postag_df_ohe['word']):
    s += w
L = sorted(list(set(s)))
encode_dict = {}
decode_dict = {}
N = len(L)
for c in range(N):
    encode_dict[L[c]] = c
    decode_dict[c] = L[c]

print(encode_dict)
print(decode_dict)

{'"': 0, ',': 1, '-': 2, '.': 3, '0': 4, '1': 5, '2': 6, '3': 7, '4': 8, '5': 9, '6': 10, '7': 11, '8': 12, '9': 13, '?': 14, 'a': 15, 'b': 16, 'c': 17, 'd': 18, 'e': 19, 'f': 20, 'g': 21, 'h': 22, 'i': 23, 'j': 24, 'k': 25, 'l': 26, 'm': 27, 'n': 28, 'o': 29, 'p': 30, 'q': 31, 'r': 32, 's': 33, 't': 34, 'u': 35, 'v': 36, 'w': 37, 'x': 38, 'y': 39, 'z': 40, 'á': 41, 'ä': 42, 'é': 43, 'í': 44, 'ó': 45, 'ö': 46, 'ú': 47, 'ü': 48, 'ő': 49, 'ű': 50}
{0: '"', 1: ',', 2: '-', 3: '.', 4: '0', 5: '1', 6: '2', 7: '3', 8: '4', 9: '5', 10: '6', 11: '7', 12: '8', 13: '9', 14: '?', 15: 'a', 16: 'b', 17: 'c', 18: 'd', 19: 'e', 20: 'f', 21: 'g', 22: 'h', 23: 'i', 24: 'j', 25: 'k', 26: 'l', 27: 'm', 28: 'n', 29: 'o', 30: 'p', 31: 'q', 32: 'r', 33: 's', 34: 't', 35: 'u', 36: 'v', 37: 'w', 38: 'x', 39: 'y', 40: 'z', 41: 'á', 42: 'ä', 43: 'é', 44: 'í', 45: 'ó', 46: 'ö', 47: 'ú', 48: 'ü', 49: 'ő', 50: 'ű'}


In [174]:
def encode(w):
    ret = []
    for c in w:
        ret.append(encode_dict[c])
    return np.array(ret)

In [175]:
encode('sziasztok')

array([33, 40, 23, 15, 33, 40, 34, 29, 25])

In [176]:
def decode(a):
    ret = []
    for i in a:
        ret.append(decode_dict[i])
    return ''.join(ret)

In [177]:
decode([33, 40, 23, 15, 33, 40, 34, 29, 25])

'sziasztok'

In [47]:
# word2vec előkészületek

# nem jól címkézett szavak a tanító adatban
wrong_words = [',','-','007-esből','11','1999.június','2','2000','2000"-nek','?','a','a4','és']
w2v_words = []
w2v_wordstems = []
w2v_types = []
for r in rows:
    if r != '':
        if r[0] != '#':
            curlist = r.split('\t')
            wordtype = curlist[3]
            if wordtype in ['ADJ', 'NOUN', 'PROPN', 'VERB']:
                stem = curlist[2].lower()
                if stem not in wrong_words:
                    w2v_words.append(curlist[1].lower())
                    w2v_wordstems.append(stem)
                    w2v_types.append(wordtype)

In [48]:
w2v_wordstems

['világ',
 'év',
 'tartó',
 'folyamatos',
 'gazdasági',
 'fellendülés',
 'eddigi',
 'konjunktúra-időszak',
 'hosszú',
 'gazdaság',
 'mértékű',
 'fejlődés',
 'folyamat',
 'gerjeszt',
 'magánosítás',
 'elterjedés',
 'tőkekoncentráció',
 'globalizáció',
 'magyar',
 'gazdaság',
 'gyakorolt',
 'hatás',
 'békesi',
 'lászló',
 'van',
 'pénzügyminiszter',
 'tart',
 'előadás',
 'pázmány',
 'péter',
 'katolikus',
 'egyetem',
 'jogtudományi',
 'kara',
 'barankovics',
 'faludi',
 'akadémia',
 'pázmány',
 'pódium',
 'közös',
 'szervezett',
 'vitasorozat',
 'keret',
 'világgazdaság',
 'gyors',
 'országhatár',
 'átívelő',
 'fejlődés',
 'tényező',
 'idéz',
 'verseny',
 'szint',
 'folyik',
 'természeti',
 'tényező',
 'nyersanyag',
 'energiahordozó',
 'munkaerő',
 'információ',
 'technológia',
 'vívott',
 'harc',
 'nincs',
 'korlát',
 'piac',
 'folyó',
 'világméretű',
 'verseny',
 'fizetőképes',
 'kereslet',
 'megjelenés',
 'tesz',
 'lehető',
 'piaci',
 'szereplő',
 'verseng',
 'versenyképesség',
 'utób

In [49]:
w2v_types

['NOUN',
 'NOUN',
 'ADJ',
 'ADJ',
 'ADJ',
 'NOUN',
 'ADJ',
 'NOUN',
 'ADJ',
 'NOUN',
 'ADJ',
 'NOUN',
 'NOUN',
 'VERB',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'ADJ',
 'NOUN',
 'ADJ',
 'NOUN',
 'PROPN',
 'PROPN',
 'VERB',
 'NOUN',
 'VERB',
 'NOUN',
 'PROPN',
 'PROPN',
 'PROPN',
 'PROPN',
 'PROPN',
 'PROPN',
 'PROPN',
 'PROPN',
 'PROPN',
 'PROPN',
 'PROPN',
 'ADJ',
 'ADJ',
 'NOUN',
 'NOUN',
 'NOUN',
 'ADJ',
 'NOUN',
 'ADJ',
 'NOUN',
 'NOUN',
 'VERB',
 'NOUN',
 'NOUN',
 'VERB',
 'ADJ',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'ADJ',
 'NOUN',
 'VERB',
 'NOUN',
 'NOUN',
 'ADJ',
 'ADJ',
 'NOUN',
 'ADJ',
 'NOUN',
 'NOUN',
 'VERB',
 'ADJ',
 'ADJ',
 'NOUN',
 'VERB',
 'NOUN',
 'NOUN',
 'VERB',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'ADJ',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'ADJ',
 'NOUN',
 'VERB',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'VERB',
 'NOUN',
 'VERB',
 'NOUN',
 'NOUN',
 'NOUN',
 'VERB',
 'ADJ',
 'NOUN',
 'ADJ',
 'NOUN',
 'NOUN',
 'VERB',
 'ADJ',
 'NOUN',
 'N

In [67]:
w2v_words

['világban',
 'éve',
 'tartó',
 'folyamatos',
 'gazdasági',
 'fellendülés',
 'eddigi',
 'konjunktúra-időszaknál',
 'hosszabb',
 'gazdaság',
 'mértékű',
 'fejlődését',
 'folyamat',
 'gerjeszti',
 'magánosítás',
 'elterjedése',
 'tőkekoncentráció',
 'globalizációról',
 'magyar',
 'gazdaságra',
 'gyakorolt',
 'hatásairól',
 'békesi',
 'lászló',
 'volt',
 'pénzügyminiszter',
 'tartott',
 'előadást',
 'pázmány',
 'péter',
 'katolikus',
 'egyetem',
 'jogtudományi',
 'karán',
 'barankovics',
 'faludi',
 'akadémia',
 'pázmány',
 'pódium',
 'közösen',
 'szervezett',
 'vitasorozat',
 'keretében',
 'világgazdaság',
 'gyors',
 'országhatárokon',
 'átívelő',
 'fejlődését',
 'tényező',
 'idézi',
 'verseny',
 'szinten',
 'folyik',
 'természeti',
 'tényezőkért',
 'nyersanyagért',
 'energiahordozókért',
 'munkaerőért',
 'információért',
 'technológiáért',
 'vívott',
 'harcnak',
 'nincsenek',
 'korlátai',
 'piacokért',
 'folyó',
 'világméretű',
 'versenyt',
 'fizetőképes',
 'kereslet',
 'megjelenése',
 

In [50]:
len(w2v_wordstems)

10144

In [717]:
def dist(w1, w2, stem_dictionary):
    
    if w1 == w2:
        return 0
    
    w1_indexes = [i for i, w in enumerate(stem_dictionary) if w == w1]
    w2_indexes = [i for i, w in enumerate(stem_dictionary) if w == w2]
    
    if len(w1_indexes) == 0 or len(w2_indexes) == 0:
        return '?'
    
    t = 0
    for i in w1_indexes:
        for j in w2_indexes:
            diff = abs(i-j)
            if diff <= 10:
                t += math.pow(0.9,diff-1)
    
    t /= (len(w1_indexes)*len(w2_indexes))
    
    if t == 0:
        return 1024
    else:
        return math.log2(1/t + 1)

In [718]:
dist('év', 'tart', w2v_wordstems)

9.806931135948489

In [719]:
dist('nagy', 'tér', w2v_wordstems)

7.179027673573768

In [720]:
dist('magyar', 'magyarország', w2v_wordstems)

8.868592584146676

In [721]:
dist('térd', 'láb', w2v_wordstems)

2.354798077446704

In [724]:
dist('ismétel', 'unatkozik', w2v_wordstems)

1.8404329361350282

In [738]:
def create_word2vec(distance_matrix):
    
    # Numpyify the distance matrix
    D = np.array(distance_matrix)
    n = len(distance_matrix)
    
    # Create matrix B:
    B = (-1/2) * np.square(D)
    
    # Eigendecomposition of B
    l, v = np.linalg.eig(B)
    
    # Keep only positive eigenvalues and corresponding eigenvectors
    l_pos = []
    v_pos = []
    for i in range(len(l)):
        if l[i] > 0:
            l_pos.append(l[i])
            v_pos.append(v[i]/np.linalg.norm(v[i]))
    
    v_pos = np.transpose(np.array(v_pos))
    
    # Return matrix = v_pos * diag(l_pos) ^ 1/2
    R = np.dot(v_pos,np.diag([math.sqrt(l) for l in l_pos]))
    
    return R

In [739]:
w2v_wordstem_set = list(set(w2v_wordstems))
D_wordstems = [[0 for j in range(len(w2v_wordstem_set))] for i in range(len(w2v_wordstem_set))]
i = 0
for w1 in w2v_wordstem_set:
    print(i)
    for w2 in w2v_wordstem_set:
        D_wordstems[i][j] = dist(w1, w2, w2v_wordstem_set)
    i += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13


KeyboardInterrupt: 

In [676]:
word2vec_test = create_word2vec(D_wordstems)