## More Data

As I have mentioned in the last Notebook, the amount of data wasn't nearly sufficient, so I instead looked for a different source.

I started by looking further afield, but in the end I came back closer to home and found a Hungarian Webcorpus hosted by BME at `ftp://ftp.mokk.bme.hu/Language/Hungarian/Corp/Webcorp/ana/`. I used the file called `all_morphtable`, specifically rows 2,294,693 to 7,657,427. This data is also available in the `DATA` folder.

This gave me about 5,000,000 decomposed words - meaning it's a 250x increase from the previous amount of 20,000.

The following cells simply transform the data into multiple dataframes. However, they don't need to be run by the user: I have already saved the files that the final predictor needs.

In [1]:
import numpy as np
import math
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [5]:
with open('../DATA/all_morphtable.txt', 'r', encoding='ISO-8859-2') as f:
    data = f.read()

In [6]:
rows = data.split('\n')

In [7]:
del data

In [8]:
# Parallel lists: one entry per accepted (word, analysis) pair.
words = []
wordstems = []
morphemes = []

# Not read by the loop below; the same exclusions are spelled out in the filter condition.
# Raw strings keep the backslashes without relying on invalid "\+" / "\?" escapes.
non_monitored_morphemes = [r'\+\?NOUN', 'NUM', 'PUNCT']

for r in rows:

    # A row is tab-separated: the first field is the word, every further field is
    # one candidate analysis of the form "<stem>/<tag>".
    split_row = r.split('\t')
    for analysis in split_row[1:]:

        morph_split = analysis.split('/')
        if len(morph_split) != 2:
            # Fields without exactly one '/' are not stem/tag pairs; skip them.
            continue

        stem, tag = morph_split

        # Skip guessed nouns ("+?NOUN"), plain or suffixed "NUM" tags, and punctuation.
        # Bracketed tags such as "NUM[AGGREG]" are intentionally NOT matched here.
        if '+?NOUN' in tag or tag == 'NUM' or tag[:4] == 'NUM<' or tag == 'PUNCT':
            continue

        words.append(split_row[0].lower())
        wordstems.append(stem.lower())
        morphemes.append(tag)

In [9]:
del rows

In [10]:
words_df = pd.DataFrame({'word':words,'stem':wordstems,'morphemes':morphemes})

In [11]:
words_df

Unnamed: 0,word,stem,morphemes
0,aa,aa,NOUN
1,aa-bb,aa-bb,NOUN
2,aa-lávaként,aa-láv,NOUN<POSS><CAS<FOR>>
3,aa-val,aa-val,NOUN
4,aa.,aa.,NOUN
...,...,...,...
4433353,üönképzőkör,üönképzőkör,NOUN
4433354,üötvefúróhoz,üötvefúró,NOUN<CAS<ALL>>
4433355,üúcijöcuxz,üúcijöcuxz,NOUN
4433356,üű,üű,NOUN


In [12]:
words_df.to_csv('words_df.csv', index = False)

In [13]:
del words
del wordstems
del morphemes

In [11]:
morpheme_combos = set(list(words_df['morphemes']))
morpheme_combos

{'ADJ',
 'NOUN<POSTP<KÖZÜL>><PERS><PLUR>',
 'DET<PLUR><ANP<PLUR>><CAS<ABL>>',
 'NOUN<PERS><PLUR><ANP<PLUR>><CAS<SBL>>',
 'NOUN<POSTP<MELLETT>><PERS<2>><PLUR>',
 'NOUN<POSTP<KÖRÉ>><PERS<1>><PLUR>',
 'NOUN<PLUR><POSS<2><PLUR>><CAS<ELA>>',
 'NOUN<POSTP<MIATT>><PERS<2>>',
 'NOUN<POSTP<FELÉ>><PERS<1>>',
 'NOUN<PERS<2>><ANP<PLUR>><CAS<ELA>>',
 'ADJ<PLUR<FAM>><CAS<ABL>>',
 'NOUN<ANP><CAS<DEL>>',
 'NOUN<PLUR><POSS><ANP><CAS<SBL>>',
 'ADJ<PLUR><POSS<PLUR>><CAS<ALL>>',
 'NOUN<PERS<1>><CAS<INE>>',
 'NOUN<PERS<2>><PLUR><ANP><ANP>',
 'ADJ<PLUR><CAS<DEL>>',
 'NOUN<PERS><ANP<PLUR>><CAS<DAT>>',
 'NOUN<PLUR<FAM>><ANP><CAS<DAT>>',
 'NOUN<PERS><CAS<DEL>>',
 'VERB<SUBJUNC-IMP>{ORTH:substandard}<PLUR>',
 'NOUN<PLUR><POSS<2><PLUR>><CAS<ACC>>',
 'ADJ<PLUR><POSS<1><PLUR>><CAS<ADE>>',
 'NOUN<PERS><CAS<INE>>',
 'DET<PLUR><ANP<PLUR>>',
 'NOUN<PLUR><POSS<1>><CAS<CAU>>',
 'ADJ<PLUR><ANP><CAS<TER>>',
 'NOUN<POSS<2>><CAS<ELA>>',
 'ADJ<POSS<PLUR>><CAS<TRA>>',
 'NOUN<PERS<2>><ANP><CAS<INE>>',
 'NOUN<PLUR><ANP><CAS<SBL

In [12]:
set([w.split('<')[0] for w in morpheme_combos])

{'ADJ',
 'ADV',
 'ART',
 'CONJ',
 'DET',
 'NOUN',
 'NUM[AGGREG]',
 'NUM[MULTIPL]',
 'NUM[ORD-ITER-ACCOMPL]',
 'ONO',
 'POSTP',
 'PREP',
 'PREV',
 'PREV{ORTH:substandard}',
 'UTT-INT',
 'VERB',
 'VERB{ORTH:substandard}'}

In [3]:
words_df = pd.read_csv('words_df.csv')

In [18]:
adverbs = words_df[words_df['morphemes'] == 'ADV'].reset_index(drop=True)

In [19]:
adverbs

Unnamed: 0,word,stem,morphemes
0,abszolút,abszolút,ADV
1,abszolúte,abszolúte,ADV
2,addig-ameddig,addig-ameddig,ADV
3,addiglan,addiglan,ADV
4,addigra,addigra,ADV
...,...,...,...
2121,ültében,ültében,ADV
2122,ültéből,ültéből,ADV
2123,ünnepnap,ünnepnap,ADV
2124,üptre,üptre,ADV


In [21]:
adverbs.to_csv('adverbs.csv', index=False)

In [14]:
words_df[words_df['morphemes'] == 'ART'].reset_index(drop=True)

Unnamed: 0,word,stem,morphemes
0,az,a,ART
1,egy,egy,ART
2,egy,egy,ART
3,egy,egy,ART


In [15]:
conjs = words_df[words_df['morphemes'] == 'CONJ'].reset_index(drop=True)
conjs

Unnamed: 0,word,stem,morphemes
0,addig,addig,CONJ
1,addig-addig,addig-addig,CONJ
2,addigis,addigis,CONJ
3,akár,akár,CONJ
4,akárha,akárha,CONJ
...,...,...,...
250,úgyis,úgyis,CONJ
251,úgymint,úgymint,CONJ
252,úgynevezett,úgynevezett,CONJ
253,úgysem,úgysem,CONJ


In [16]:
conjs.to_csv('conjs.csv', index=False)

In [17]:
dets = words_df[words_df['morphemes'] == 'DET'].reset_index(drop=True)
dets

Unnamed: 0,word,stem,morphemes
0,ama,ama,DET
1,azon,azon,DET
2,egyazon,egyazon,DET
3,egyugyanazon,egyugyanazon,DET
4,eme,eme,DET
5,emez,emez,DET
6,ezen,ezen,DET
7,mindama,mindama,DET
8,mindaz,mindaz,DET
9,mindazon,mindazon,DET


In [18]:
dets.to_csv('dets.csv', index=False)

In [19]:
nums_aggreg = words_df[words_df['morphemes'] == 'NUM[AGGREG]'].reset_index(drop=True)
nums_aggreg

Unnamed: 0,word,stem,morphemes
0,ahányan,ahány,NUM[AGGREG]
1,akárhanyan,akárhány,NUM[AGGREG]
2,akárhányan,akárhány,NUM[AGGREG]
3,annyian,annyi,NUM[AGGREG]
4,bárhányan,bárhány,NUM[AGGREG]
5,csomóan,csomó,NUM[AGGREG]
6,egyen,egy,NUM[AGGREG]
7,egynéhányan,egynéhány,NUM[AGGREG]
8,egypáran,egypár,NUM[AGGREG]
9,ennyien,ennyi,NUM[AGGREG]


In [20]:
nums_aggreg.to_csv('nums_aggreg.csv', index=False)

In [21]:
nums_multipl = words_df[words_df['morphemes'] == 'NUM[MULTIPL]'].reset_index(drop=True)
nums_multipl

Unnamed: 0,word,stem,morphemes
0,ahányszorta,ahány,NUM[MULTIPL]
1,annyiszorta,annyi,NUM[MULTIPL]
2,egyszerte,egy,NUM[MULTIPL]
3,hatszorta,hat,NUM[MULTIPL]
4,hányszorta,hány,NUM[MULTIPL]
5,háromszorta,három,NUM[MULTIPL]
6,hétszerte,hét,NUM[MULTIPL]
7,hússzorta,húsz,NUM[MULTIPL]
8,kétszerte,kettő,NUM[MULTIPL]
9,milliárdszorta,milliárd,NUM[MULTIPL]


In [22]:
nums_multipl.to_csv('nums_multipl.csv', index=False)

In [36]:
nums_iter = words_df[words_df['morphemes'] == 'NUM[ORD-ITER-ACCOMPL]'].reset_index(drop=True)
nums_iter

Unnamed: 0,word,stem,morphemes
0,akárhányadszorra,akárhány,NUM[ORD-ITER-ACCOMPL]
1,hanyadszorra,hány,NUM[ORD-ITER-ACCOMPL]
2,harmadszorra,három,NUM[ORD-ITER-ACCOMPL]
3,harmincadszorra,harminc,NUM[ORD-ITER-ACCOMPL]
4,hatodszorra,hat,NUM[ORD-ITER-ACCOMPL]
5,hatvanadszorra,hatvan,NUM[ORD-ITER-ACCOMPL]
6,huszadszorra,húsz,NUM[ORD-ITER-ACCOMPL]
7,hányadszorra,hány,NUM[ORD-ITER-ACCOMPL]
8,milliomodszorra,millió,NUM[ORD-ITER-ACCOMPL]
9,másodszorra,kettő,NUM[ORD-ITER-ACCOMPL]


In [37]:
nums_iter.to_csv('nums_iter.csv', index=False)

In [25]:
onos = words_df[words_df['morphemes'] == 'ONO'].reset_index(drop=True)
onos

Unnamed: 0,word,stem,morphemes
0,bee,bee,ONO
1,brekeke,brekeke,ONO
2,brr,brr,ONO
3,bruhaha,bruhaha,ONO
4,brumm-brumm,brumm-brumm,ONO
...,...,...,...
75,zupp,zupp,ONO
76,zutty,zutty,ONO
77,züm-züm,züm-züm,ONO
78,ú,ú,ONO


In [26]:
onos.to_csv('onos.csv', index=False)

In [27]:
postps = words_df[words_df['morphemes'] == 'POSTP'].reset_index(drop=True)
postps

Unnamed: 0,word,stem,morphemes
0,adódóan,adódóan,POSTP
1,alapján,alapján,POSTP
2,alatt,alatt,POSTP
3,alul,alul,POSTP
4,alá,alá,POSTP
...,...,...,...
160,érdekében,érdekében,POSTP
161,ízben,ízben,POSTP
162,óta,óta,POSTP
163,útján,útján,POSTP


In [28]:
postps.to_csv('postps.csv', index=False)

In [29]:
preps = words_df[words_df['morphemes'] == 'PREP'].reset_index(drop=True)
preps

Unnamed: 0,word,stem,morphemes
0,alias,alias,PREP
1,kivéve,kivéve,PREP
2,kontra,kontra,PREP
3,pró,pró,PREP


In [30]:
preps.to_csv('preps.csv', index=False)

In [31]:
prevs = words_df[words_df['morphemes'] == 'PREV'].reset_index(drop=True)
prevs

Unnamed: 0,word,stem,morphemes
0,abba,abba,PREV
1,agyon,agyon,PREV
2,alul,alul,PREV
3,alá,alá,PREV
4,be,be,PREV
...,...,...,...
123,őrizetlen,őrizetlen,PREV
124,össze,össze,PREV
125,újjá,újjá,PREV
126,újra,újra,PREV


In [32]:
prevs.to_csv('prevs.csv', index=False)

In [33]:
utt_ints = words_df[words_df['morphemes'] == 'UTT-INT'].reset_index(drop=True)
utt_ints

Unnamed: 0,word,stem,morphemes
0,abcúg,abcúg,UTT-INT
1,addsza,addsza,UTT-INT
2,adieu,adieu,UTT-INT
3,adjonisten,adjonisten,UTT-INT
4,adta,adta,UTT-INT
...,...,...,...
451,üdv,üdv,UTT-INT
452,üdv,üdv,UTT-INT
453,üdvözlet,üdvözlet,UTT-INT
454,üdvözlöm,üdvözlöm,UTT-INT


In [34]:
utt_ints.to_csv('utt_ints.csv', index=False)

In [35]:
nums_regular = words_df[words_df['morphemes'] == 'NUM'].reset_index(drop=True)
nums_regular

Unnamed: 0,word,stem,morphemes


In [276]:
words_df

Unnamed: 0,word,stem,morphemes
0,aa,aa,NOUN
1,aa-bb,aa-bb,NOUN
2,aa-lávaként,aa-láv,NOUN<POSS><CAS<FOR>>
3,aa-val,aa-val,NOUN
4,aa.,aa.,NOUN
...,...,...,...
4433353,üönképzőkör,üönképzőkör,NOUN
4433354,üötvefúróhoz,üötvefúró,NOUN<CAS<ALL>>
4433355,üúcijöcuxz,üúcijöcuxz,NOUN
4433356,üű,üű,NOUN


In [277]:
# Remove the "{ORTH:substandard}" marker from the tags and from the stems.
# The stems were lower-cased while parsing, hence the lower-case variant for 'stem'.
# regex=False makes this a plain substring replacement, so the result no longer
# depends on whether the installed pandas treats the pattern as a regular expression.
words_df['morphemes'] = words_df['morphemes'].str.replace('{ORTH:substandard}', '', regex=False)
words_df['stem'] = words_df['stem'].str.replace('{orth:substandard}', '', regex=False)

In [278]:
morpheme_combos = set(list(words_df['morphemes']))
set([w.split('<')[0] for w in morpheme_combos])

{'ADJ',
 'ADV',
 'ART',
 'CONJ',
 'DET',
 'NOUN',
 'NUM[AGGREG]',
 'NUM[MULTIPL]',
 'NUM[ORD-ITER-ACCOMPL]',
 'ONO',
 'POSTP',
 'PREP',
 'PREV',
 'UTT-INT',
 'VERB'}

In [279]:
words_df

Unnamed: 0,word,stem,morphemes
0,aa,aa,NOUN
1,aa-bb,aa-bb,NOUN
2,aa-lávaként,aa-láv,NOUN<POSS><CAS<FOR>>
3,aa-val,aa-val,NOUN
4,aa.,aa.,NOUN
...,...,...,...
4433353,üönképzőkör,üönképzőkör,NOUN
4433354,üötvefúróhoz,üötvefúró,NOUN<CAS<ALL>>
4433355,üúcijöcuxz,üúcijöcuxz,NOUN
4433356,üű,üű,NOUN


In [83]:
def morpheme_list_transform(w):
    """Return the contents of every top-level ``<...>`` group in ``w``.

    Text outside angle brackets is ignored, nested groups are kept verbatim
    inside their enclosing group, and empty or unclosed groups produce nothing.

    Example: ``'NOUN<POSS><CAS<FOR>>'`` -> ``['POSS', 'CAS<FOR>']``.
    """
    groups = []
    depth = 0          # current bracket nesting level (may go negative on stray '>')
    group_start = None  # position right after the '<' that opened the current top-level group
    for pos, ch in enumerate(w):
        if ch == '>':
            depth -= 1
            if depth == 0:
                # A top-level group just closed: take everything between its brackets.
                content = w[group_start:pos]
                if content:
                    groups.append(content)
        elif ch == '<':
            if depth == 0:
                group_start = pos + 1
            depth += 1
    return groups

In [84]:
def word_type_transform(w):
    """Return the part of the tag string ``w`` that precedes its first ``'<'``.

    If ``w`` contains no ``'<'`` the whole string is returned.
    """
    head, _, _ = w.partition('<')
    return head

In [282]:
w = 'NOUN<POSS><CAS<FOR>>'
print(word_type_transform(w))
print(morpheme_list_transform(w))

NOUN
['POSS', 'CAS<FOR>']


In [283]:
#[morpheme_list_transform(w) for w in list(words_df['morphemes']) if morpheme_list_transform(w) != 0]

In [304]:
# Sorted list of every distinct morpheme found in the tag column. Each distinct tag string
# is parsed once; the former "!= 0" filter compared a list to an int and was always true.
sorted({morpheme for tag in words_df['morphemes'].unique() for morpheme in morpheme_list_transform(tag)})

['ANP',
 'ANP<PLUR>',
 'CAS<ABL>',
 'CAS<ACC>',
 'CAS<ADE>',
 'CAS<ALL>',
 'CAS<CAU>',
 'CAS<DAT>',
 'CAS<DEL>',
 'CAS<ELA>',
 'CAS<ESS>',
 'CAS<FOR>',
 'CAS<ILL>',
 'CAS<INE>',
 'CAS<INS>',
 'CAS<SBL>',
 'CAS<SUE>',
 'CAS<TEM>',
 'CAS<TER>',
 'CAS<TRA>',
 'COND',
 'COND<PAST>',
 'DEF',
 'INF',
 'MODAL',
 'PAST',
 'PERS',
 'PERS<1<OBJ<2>>>',
 'PERS<1>',
 'PERS<2>',
 'PLUR',
 'PLUR<ANP>',
 'PLUR<FAM>',
 'POSS',
 'POSS<1>',
 'POSS<1><PLUR>',
 'POSS<2>',
 'POSS<2><PLUR>',
 'POSS<PLUR>',
 'POSTP<ALATT>',
 'POSTP<ALÁ>',
 'POSTP<ALÓL>',
 'POSTP<ELLEN>',
 'POSTP<ELLENÉRE>',
 'POSTP<ELÉ>',
 'POSTP<ELÉBE>',
 'POSTP<ELŐL>',
 'POSTP<ELŐTT>',
 'POSTP<FELETT>',
 'POSTP<FELÉ>',
 'POSTP<FELÜL>',
 'POSTP<FELŐL>',
 'POSTP<FÖLIBE>',
 'POSTP<FÖLÉ>',
 'POSTP<FÖLÜL>',
 'POSTP<HELYETT>',
 'POSTP<IRÁNT>',
 'POSTP<KÖRÉ>',
 'POSTP<KÖRÖTT>',
 'POSTP<KÖRÜL>',
 'POSTP<KÖZBEN>',
 'POSTP<KÖZIBE>',
 'POSTP<KÖZÉ>',
 'POSTP<KÖZÖTT>',
 'POSTP<KÖZÜL>',
 'POSTP<LÉTÉRE>',
 'POSTP<MELLETT>',
 'POSTP<MELLÉ>',
 'POSTP<MELLŐL

In [291]:
nouns = words_df[words_df['morphemes'].str.contains('NOUN')].reset_index()
nouns

Unnamed: 0,index,word,stem,morphemes
0,0,aa,aa,NOUN
1,1,aa-bb,aa-bb,NOUN
2,2,aa-lávaként,aa-láv,NOUN<POSS><CAS<FOR>>
3,3,aa-val,aa-val,NOUN
4,4,aa.,aa.,NOUN
...,...,...,...,...
3963996,4433353,üönképzőkör,üönképzőkör,NOUN
3963997,4433354,üötvefúróhoz,üötvefúró,NOUN<CAS<ALL>>
3963998,4433355,üúcijöcuxz,üúcijöcuxz,NOUN
3963999,4433356,üű,üű,NOUN


In [292]:
verbs = words_df[words_df['morphemes'].str.contains('VERB')].reset_index()
verbs

Unnamed: 0,index,word,stem,morphemes
0,240,abajgasd,abajgat,VERB<SUBJUNC-IMP><PERS<2>><DEF>
1,241,abajgassák,abajgat,VERB<SUBJUNC-IMP><PLUR><DEF>
2,242,abajgat,abajgat,VERB
3,243,abajgatja,abajgat,VERB<DEF>
4,244,abajgatják,abajgat,VERB<PLUR><DEF>
...,...,...,...,...
382350,4433321,üzérkednek,üzérkedik,VERB<PLUR>
382351,4433322,üzérkedni,üzérkedik,VERB<INF>
382352,4433323,üzérkedsz,üzérkedik,VERB<PERS<2>>
382353,4433324,üzérkedtek,üzérkedik,VERB<PAST><PLUR>


In [108]:
nums = words_df[words_df['morphemes'].str.contains('NUM')].reset_index()
nums

Unnamed: 0,index,word,stem,morphemes
0,40951,ahányan,ahány,NUM[AGGREG]
1,40959,ahányszorta,ahány,NUM[MULTIPL]
2,56431,akárhanyan,akárhány,NUM[AGGREG]
3,56447,akárhányadszorra,akárhány,NUM[ORD-ITER-ACCOMPL]
4,56448,akárhányan,akárhány,NUM[AGGREG]
...,...,...,...,...
76,4131167,öten,öt,NUM[AGGREG]
77,4219084,édeskevesen,édeskevés,NUM[AGGREG]
78,4376737,öten,öt,NUM[AGGREG]
79,4378357,ötszörte,öt,NUM[MULTIPL]


In [79]:
adjectives = words_df[words_df['morphemes'].str.contains('ADJ')].reset_index()
adjectives

Unnamed: 0,index,word,stem,morphemes
0,526,abbeli,abbeli,ADJ
1,770,aberrált,aberrált,ADJ
2,771,aberráltak,aberrált,ADJ<PLUR>
3,772,aberráltakat,aberrált,ADJ<PLUR><CAS<ACC>>
4,773,aberráltját,aberrált,ADJ<POSS><CAS<ACC>>
...,...,...,...,...
83629,4431797,üzletivé,üzleti,ADJ<CAS<TRA>>
83630,4432753,üzletszerű,üzletszerű,ADJ
83631,4432754,üzletszerűek,üzletszerű,ADJ<PLUR>
83632,4432755,üzletszerűnek,üzletszerű,ADJ<CAS<DAT>>


In [295]:
determiners = words_df[words_df['morphemes'].str.contains('DET')].reset_index()
determiners

Unnamed: 0,index,word,stem,morphemes
0,51995,akként,az,DET<CAS<FOR>>
1,105563,ama,ama,DET
2,105688,amannak,amaz,DET<CAS<DAT>>
3,105693,amannál,amaz,DET<CAS<ADE>>
4,106293,amazok,amaz,DET<PLUR>
...,...,...,...,...
63,3771641,ugyanazoknak,ugyanaz,DET<PLUR><CAS<DAT>>
64,3771653,ugyanazon,ugyanazon,DET
65,3771715,ugyanekkor,ugyanez,DET<CAS<TEM>>
66,3771767,ugyanezeken,ugyanez,DET<PLUR><CAS<SUE>>


In [131]:
# Rows whose tag starts with one of these prefixes are gathered into `other`.
other_prefixes = ('ADV', 'ART', 'CONJ', 'ONO', 'POSTP', 'PREP', 'PREV', 'UTT-INT')

morphemes_col = list(words_df['morphemes'])
other_positions = [
    position
    for position, tag in enumerate(morphemes_col)
    if tag.startswith(other_prefixes)
]
# Positional selection; reset_index() keeps the former index as an 'index' column.
other = words_df.iloc[other_positions, :].reset_index()

In [132]:
other

Unnamed: 0,index,word,stem,morphemes
0,350,abba,abba,PREV
1,673,abcúg,abcúg,UTT-INT
2,4102,abszolút,abszolút,ADV
3,4117,abszolúte,abszolúte,ADV
4,17051,addig,addig,CONJ
...,...,...,...,...
3214,4413671,ültében,ültében,ADV
3215,4413672,ültéből,ültéből,ADV
3216,4416403,ünnepnap,ünnepnap,ADV
3217,4417022,üptre,üptre,ADV


In [133]:
other.drop(['index'],axis=1,inplace=True)

In [134]:
other.to_csv('other.csv',index=False)

In [300]:
'''
nouns.drop(['index'], axis=1, inplace=True)
verbs.drop(['index'], axis=1, inplace=True)
adjectives.drop(['index'], axis=1, inplace=True)
nums.drop(['index'], axis=1, inplace=True)
determiners.drop(['index'], axis=1, inplace=True)
'''

In [81]:
def one_hot_encode_morphemes(df, verbose=True):
    """Replace the ``'morphemes'`` column of ``df`` with one 0/1 column per morpheme.

    The set of morphemes is collected from ``df['morphemes']`` itself (via
    ``morpheme_list_transform``) and the new columns are added in sorted order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'morphemes'`` column of tag strings. It is modified in place.
    verbose : bool, default True
        When true, print ``"<i>  /  <total>"`` as each column is added.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the indicator columns added and
        ``'morphemes'`` dropped.
    """
    # Parse every distinct tag string once instead of re-parsing each row per morpheme.
    parsed_by_tag = {
        tag: frozenset(morpheme_list_transform(tag))
        for tag in df['morphemes'].unique()
    }
    row_morphemes = [parsed_by_tag[tag] for tag in df['morphemes']]

    possible_morphemes = sorted(set().union(*parsed_by_tag.values()))
    total = len(possible_morphemes)

    for position, morpheme in enumerate(possible_morphemes):
        if verbose:
            print(position, ' / ', total)
        # A plain list is assigned by position, so this does not depend on the
        # index of `df` being 0..n-1.
        df[morpheme] = [1 if morpheme in present else 0 for present in row_morphemes]

    df.drop(['morphemes'], axis=1, inplace=True)

    return df

In [309]:
nouns

Unnamed: 0,word,stem,morphemes
0,aa,aa,NOUN
1,aa-bb,aa-bb,NOUN
2,aa-lávaként,aa-láv,NOUN<POSS><CAS<FOR>>
3,aa-val,aa-val,NOUN
4,aa.,aa.,NOUN
...,...,...,...
3963996,üönképzőkör,üönképzőkör,NOUN
3963997,üötvefúróhoz,üötvefúró,NOUN<CAS<ALL>>
3963998,üúcijöcuxz,üúcijöcuxz,NOUN
3963999,üű,üű,NOUN


In [328]:
nouns = one_hot_encode_morphemes(nouns)
nouns

0  /  77
1  /  77
2  /  77
3  /  77
4  /  77
5  /  77
6  /  77
7  /  77
8  /  77
9  /  77
10  /  77
11  /  77
12  /  77
13  /  77
14  /  77
15  /  77
16  /  77
17  /  77
18  /  77
19  /  77
20  /  77
21  /  77
22  /  77
23  /  77
24  /  77
25  /  77
26  /  77
27  /  77
28  /  77
29  /  77
30  /  77
31  /  77
32  /  77
33  /  77
34  /  77
35  /  77
36  /  77
37  /  77
38  /  77
39  /  77
40  /  77
41  /  77
42  /  77
43  /  77
44  /  77
45  /  77
46  /  77
47  /  77
48  /  77
49  /  77
50  /  77
51  /  77
52  /  77
53  /  77
54  /  77
55  /  77
56  /  77
57  /  77
58  /  77
59  /  77
60  /  77
61  /  77
62  /  77
63  /  77
64  /  77
65  /  77
66  /  77
67  /  77
68  /  77
69  /  77
70  /  77
71  /  77
72  /  77
73  /  77
74  /  77
75  /  77
76  /  77


Unnamed: 0,word,stem,ANP,ANP<PLUR>,CAS<ABL>,CAS<ACC>,CAS<ADE>,CAS<ALL>,CAS<CAU>,CAS<DAT>,...,POSTP<RÉSZÉRE>,POSTP<RÉSZÉRŐL>,POSTP<SZERINT>,POSTP<SZÁMÁRA>,POSTP<UTÁN>,POSTP<VÉGBŐL>,POSTP<VÉGETT>,POSTP<VÉGRE>,POSTP<ÁLTAL>,POSTP<ÓTA>
0,aa,aa,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,aa-bb,aa-bb,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,aa-lávaként,aa-láv,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,aa-val,aa-val,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,aa.,aa.,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3963996,üönképzőkör,üönképzőkör,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3963997,üötvefúróhoz,üötvefúró,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3963998,üúcijöcuxz,üúcijöcuxz,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3963999,üű,üű,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [329]:
nouns.to_csv('nouns.csv')

In [330]:
verbs = one_hot_encode_morphemes(verbs)
verbs

0  /  12
1  /  12
2  /  12
3  /  12
4  /  12
5  /  12
6  /  12
7  /  12
8  /  12
9  /  12
10  /  12
11  /  12


Unnamed: 0,word,stem,COND,COND<PAST>,DEF,INF,MODAL,PAST,PERS,PERS<1<OBJ<2>>>,PERS<1>,PERS<2>,PLUR,SUBJUNC-IMP
0,abajgasd,abajgat,0,0,1,0,0,0,0,0,0,1,0,1
1,abajgassák,abajgat,0,0,1,0,0,0,0,0,0,0,1,1
2,abajgat,abajgat,0,0,0,0,0,0,0,0,0,0,0,0
3,abajgatja,abajgat,0,0,1,0,0,0,0,0,0,0,0,0
4,abajgatják,abajgat,0,0,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382350,üzérkednek,üzérkedik,0,0,0,0,0,0,0,0,0,0,1,0
382351,üzérkedni,üzérkedik,0,0,0,1,0,0,0,0,0,0,0,0
382352,üzérkedsz,üzérkedik,0,0,0,0,0,0,0,0,0,1,0,0
382353,üzérkedtek,üzérkedik,0,0,0,0,0,1,0,0,0,0,1,0


In [331]:
verbs.to_csv('verbs.csv')

In [94]:
adjectives = one_hot_encode_morphemes(adjectives)
adjectives

0  /  28
1  /  28
2  /  28
3  /  28
4  /  28
5  /  28
6  /  28
7  /  28
8  /  28
9  /  28
10  /  28
11  /  28
12  /  28
13  /  28
14  /  28
15  /  28
16  /  28
17  /  28
18  /  28
19  /  28
20  /  28
21  /  28
22  /  28
23  /  28
24  /  28
25  /  28
26  /  28
27  /  28


Unnamed: 0,index,word,stem,ANP,ANP<PLUR>,CAS<ABL>,CAS<ACC>,CAS<ADE>,CAS<ALL>,CAS<CAU>,...,CAS<TER>,CAS<TRA>,PLUR,PLUR<FAM>,POSS,POSS<1>,POSS<1><PLUR>,POSS<2>,POSS<2><PLUR>,POSS<PLUR>
0,526,abbeli,abbeli,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,770,aberrált,aberrált,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,771,aberráltak,aberrált,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,772,aberráltakat,aberrált,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,773,aberráltját,aberrált,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83629,4431797,üzletivé,üzleti,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
83630,4432753,üzletszerű,üzletszerű,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
83631,4432754,üzletszerűek,üzletszerű,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
83632,4432755,üzletszerűnek,üzletszerű,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [95]:
adjectives.to_csv('adjectives.csv')

In [337]:
determiners = one_hot_encode_morphemes(determiners)
determiners

0  /  20
1  /  20
2  /  20
3  /  20
4  /  20
5  /  20
6  /  20
7  /  20
8  /  20
9  /  20
10  /  20
11  /  20
12  /  20
13  /  20
14  /  20
15  /  20
16  /  20
17  /  20
18  /  20
19  /  20


Unnamed: 0,word,stem,ANP,ANP<PLUR>,CAS<ABL>,CAS<ACC>,CAS<ADE>,CAS<ALL>,CAS<CAU>,CAS<DAT>,...,CAS<FOR>,CAS<ILL>,CAS<INE>,CAS<INS>,CAS<SBL>,CAS<SUE>,CAS<TEM>,CAS<TRA>,PLUR,POSS<2><PLUR>
0,akként,az,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,ama,ama,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,amannak,amaz,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,amannál,amaz,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,amazok,amaz,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,ugyanazoknak,ugyanaz,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
64,ugyanazon,ugyanazon,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65,ugyanekkor,ugyanez,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
66,ugyanezeken,ugyanez,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [338]:
determiners.to_csv('determiners.csv')

### -----------------------------------------------------------------------------------------

In [339]:
verbs

Unnamed: 0,word,stem,COND,COND<PAST>,DEF,INF,MODAL,PAST,PERS,PERS<1<OBJ<2>>>,PERS<1>,PERS<2>,PLUR,SUBJUNC-IMP
0,abajgasd,abajgat,0,0,1,0,0,0,0,0,0,1,0,1
1,abajgassák,abajgat,0,0,1,0,0,0,0,0,0,0,1,1
2,abajgat,abajgat,0,0,0,0,0,0,0,0,0,0,0,0
3,abajgatja,abajgat,0,0,1,0,0,0,0,0,0,0,0,0
4,abajgatják,abajgat,0,0,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382350,üzérkednek,üzérkedik,0,0,0,0,0,0,0,0,0,0,1,0
382351,üzérkedni,üzérkedik,0,0,0,1,0,0,0,0,0,0,0,0
382352,üzérkedsz,üzérkedik,0,0,0,0,0,0,0,0,0,1,0,0
382353,üzérkedtek,üzérkedik,0,0,0,0,0,1,0,0,0,0,1,0


In [342]:
verbs[verbs['word'] == 'csináltatok']

Unnamed: 0,word,stem,COND,COND<PAST>,DEF,INF,MODAL,PAST,PERS,PERS<1<OBJ<2>>>,PERS<1>,PERS<2>,PLUR,SUBJUNC-IMP
39965,csináltatok,csinál,0,0,0,0,0,1,0,0,0,1,1,0
39966,csináltatok,csináltat,0,0,0,0,0,0,0,0,1,0,0,0


In [349]:
# Build the alphabet of the corpus and the character <-> integer lookup tables.
wordlist = list(words_df['word'])
# Join once instead of growing a string with "+=", and include the first word as well
# (the earlier loop started at index 1). Non-string entries are skipped
# (presumably NaN values from a CSV round trip; verify against the loaded data).
s = ''.join(word for word in wordlist if isinstance(word, str))
L = sorted(set(s))
N = len(L)
encode_dict = {character: code for code, character in enumerate(L)}
decode_dict = dict(enumerate(L))

print(encode_dict)
print(decode_dict)

{' ': 0, '!': 1, '%': 2, '&': 3, "'": 4, '(': 5, ')': 6, '*': 7, ',': 8, '-': 9, '.': 10, '0': 11, '1': 12, '2': 13, '3': 14, '4': 15, '5': 16, '6': 17, '7': 18, '8': 19, '9': 20, ':': 21, ';': 22, '=': 23, '[': 24, '\\': 25, ']': 26, '_': 27, 'a': 28, 'b': 29, 'c': 30, 'd': 31, 'e': 32, 'f': 33, 'g': 34, 'h': 35, 'i': 36, 'j': 37, 'k': 38, 'l': 39, 'm': 40, 'n': 41, 'o': 42, 'p': 43, 'q': 44, 'r': 45, 's': 46, 't': 47, 'u': 48, 'v': 49, 'w': 50, 'x': 51, 'y': 52, 'z': 53, '{': 54, '|': 55, '}': 56, '~': 57, '\x7f': 58, '\x81': 59, '\x83': 60, '\x84': 61, '\x85': 62, '\x86': 63, '\x87': 64, '\x88': 65, '\x89': 66, '\x8a': 67, '\x8c': 68, '\x8d': 69, '\x8e': 70, '\x8f': 71, '\x90': 72, '\x92': 73, '\x94': 74, '\x95': 75, '\x96': 76, '\x97': 77, '\x98': 78, '\x9a': 79, '\x9b': 80, '\x9c': 81, '\x9d': 82, '\x9e': 83, '\x9f': 84, '\xa0': 85, '¤': 86, '§': 87, '¨': 88, '\xad': 89, '°': 90, '´': 91, '¸': 92, '×': 93, 'ß': 94, 'á': 95, 'â': 96, 'ä': 97, 'ç': 98, 'é': 99, 'ë': 100, 'í': 101, '

In [352]:
''.join(L)

" !%&'()*,-.0123456789:;=[\\]_abcdefghijklmnopqrstuvwxyz{|}~\x7f\x81\x83\x84\x85\x86\x87\x88\x89\x8a\x8c\x8d\x8e\x8f\x90\x92\x94\x95\x96\x97\x98\x9a\x9b\x9c\x9d\x9e\x9f\xa0¤§¨\xad°´¸×ßáâäçéëíîóôö÷úüýăąćčďđęěĺľłńňőŕřśşšţťůűźżžˇ˘˛˝"

### ------------------------------------------------------------------------------------

In [9]:
unwanted_characters = '\x7f\x81\x83\x84\x85\x86\x87\x88\x89\x8a\x8c\x8d\x8e\x8f\x90\x92\x94\x95\x96\x97\x98\x9a\x9b\x9c\x9d\x9e\x9f\xa0¤§¨\xad°´¸×ßâçëî÷ýăąćčďđęěĺľłńňŕřśşšţťůźżžôˇ˘˛˝'

In [5]:
def drop_unwanted(df, unwanted=None, drop_index=False):
    """Drop, in place, every row whose ``'word'`` contains an unwanted character.

    Rows whose ``'word'`` cannot be searched at all (a ``TypeError`` on the
    membership test, e.g. a float NaN) are dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'word'`` column. It is modified in place.
    unwanted : iterable of str, optional
        Characters that disqualify a word. Defaults to the module-level
        ``unwanted_characters``.
    drop_index : bool, default False
        Passed to ``reset_index(drop=...)``. With the default, the previous
        index is kept as a new column, exactly as before.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after dropping and re-indexing.
    """
    if unwanted is None:
        unwanted = unwanted_characters

    def contains_unwanted(word):
        try:
            return any(character in word for character in unwanted)
        except TypeError:
            # `word` does not support the `in` test (not a string): treat it as unwanted.
            return True

    # Collect real index labels (not positions) so that the drop is correct
    # for any index, and list every offending row only once.
    labels_to_drop = [
        label
        for label, word in zip(df.index, df['word'])
        if contains_unwanted(word)
    ]
    df.drop(labels_to_drop, inplace=True)
    df.reset_index(drop=drop_index, inplace=True)
    return df

In [6]:
nouns = pd.read_csv('nouns.csv')
nouns.drop(['Unnamed: 0'], axis=1, inplace=True)

In [7]:
nouns

Unnamed: 0,word,stem,ANP,ANP<PLUR>,CAS<ABL>,CAS<ACC>,CAS<ADE>,CAS<ALL>,CAS<CAU>,CAS<DAT>,...,POSTP<RÉSZÉRE>,POSTP<RÉSZÉRŐL>,POSTP<SZERINT>,POSTP<SZÁMÁRA>,POSTP<UTÁN>,POSTP<VÉGBŐL>,POSTP<VÉGETT>,POSTP<VÉGRE>,POSTP<ÁLTAL>,POSTP<ÓTA>
0,aa,aa,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,aa-bb,aa-bb,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,aa-lávaként,aa-láv,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,aa-val,aa-val,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,aa.,aa.,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3963996,üönképzőkör,üönképzőkör,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3963997,üötvefúróhoz,üötvefúró,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3963998,üúcijöcuxz,üúcijöcuxz,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3963999,üű,üű,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
drop_unwanted(nouns)

Unnamed: 0,index,word,stem,ANP,ANP<PLUR>,CAS<ABL>,CAS<ACC>,CAS<ADE>,CAS<ALL>,CAS<CAU>,...,POSTP<RÉSZÉRE>,POSTP<RÉSZÉRŐL>,POSTP<SZERINT>,POSTP<SZÁMÁRA>,POSTP<UTÁN>,POSTP<VÉGBŐL>,POSTP<VÉGETT>,POSTP<VÉGRE>,POSTP<ÁLTAL>,POSTP<ÓTA>
0,0,aa,aa,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,aa-bb,aa-bb,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,aa-lávaként,aa-láv,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,aa-val,aa-val,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,aa.,aa.,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3942931,3963996,üönképzőkör,üönképzőkör,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3942932,3963997,üötvefúróhoz,üötvefúró,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3942933,3963998,üúcijöcuxz,üúcijöcuxz,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3942934,3963999,üű,üű,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
nouns.to_csv('nouns2.csv', index=False)

In [12]:
verbs = pd.read_csv('verbs.csv')
verbs.drop(['Unnamed: 0'], axis=1, inplace=True)

In [13]:
drop_unwanted(verbs)

Unnamed: 0,index,word,stem,COND,COND<PAST>,DEF,INF,MODAL,PAST,PERS,PERS<1<OBJ<2>>>,PERS<1>,PERS<2>,PLUR,SUBJUNC-IMP
0,0,abajgasd,abajgat,0,0,1,0,0,0,0,0,0,1,0,1
1,1,abajgassák,abajgat,0,0,1,0,0,0,0,0,0,0,1,1
2,2,abajgat,abajgat,0,0,0,0,0,0,0,0,0,0,0,0
3,3,abajgatja,abajgat,0,0,1,0,0,0,0,0,0,0,0,0
4,4,abajgatják,abajgat,0,0,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382350,382350,üzérkednek,üzérkedik,0,0,0,0,0,0,0,0,0,0,1,0
382351,382351,üzérkedni,üzérkedik,0,0,0,1,0,0,0,0,0,0,0,0
382352,382352,üzérkedsz,üzérkedik,0,0,0,0,0,0,0,0,0,1,0,0
382353,382353,üzérkedtek,üzérkedik,0,0,0,0,0,1,0,0,0,0,1,0


In [14]:
verbs.to_csv('verbs2.csv', index=False)

In [None]:
adjectives = pd.read_csv('adjectives.csv')
adjectives.drop(['Unnamed: 0'], axis=1, inplace=True)

In [97]:
drop_unwanted(adjectives)

Unnamed: 0,level_0,index,word,stem,ANP,ANP<PLUR>,CAS<ABL>,CAS<ACC>,CAS<ADE>,CAS<ALL>,...,CAS<TER>,CAS<TRA>,PLUR,PLUR<FAM>,POSS,POSS<1>,POSS<1><PLUR>,POSS<2>,POSS<2><PLUR>,POSS<PLUR>
0,0,526,abbeli,abbeli,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,770,aberrált,aberrált,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,771,aberráltak,aberrált,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,3,772,aberráltakat,aberrált,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
4,4,773,aberráltját,aberrált,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83628,83629,4431797,üzletivé,üzleti,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
83629,83630,4432753,üzletszerű,üzletszerű,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
83630,83631,4432754,üzletszerűek,üzletszerű,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
83631,83632,4432755,üzletszerűnek,üzletszerű,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
adjectives.to_csv('adjectives2.csv', index=False)

In [18]:
nums = pd.read_csv('nums.csv')
nums.drop(['Unnamed: 0'], axis=1, inplace=True)

In [109]:
drop_unwanted(nums)

Unnamed: 0,level_0,index,word,stem,morphemes
0,0,40951,ahányan,ahány,NUM[AGGREG]
1,1,40959,ahányszorta,ahány,NUM[MULTIPL]
2,2,56431,akárhanyan,akárhány,NUM[AGGREG]
3,3,56447,akárhányadszorra,akárhány,NUM[ORD-ITER-ACCOMPL]
4,4,56448,akárhányan,akárhány,NUM[AGGREG]
...,...,...,...,...,...
76,76,4131167,öten,öt,NUM[AGGREG]
77,77,4219084,édeskevesen,édeskevés,NUM[AGGREG]
78,78,4376737,öten,öt,NUM[AGGREG]
79,79,4378357,ötszörte,öt,NUM[MULTIPL]


In [111]:
#nums.drop(['level_0','index'],axis=1,inplace=True)

In [20]:
nums.to_csv('nums2.csv', index=False)

In [21]:
determiners = pd.read_csv('determiners.csv')
determiners.drop(['Unnamed: 0'], axis=1, inplace=True)

In [22]:
drop_unwanted(determiners)

Unnamed: 0,index,word,stem,ANP,ANP<PLUR>,CAS<ABL>,CAS<ACC>,CAS<ADE>,CAS<ALL>,CAS<CAU>,...,CAS<FOR>,CAS<ILL>,CAS<INE>,CAS<INS>,CAS<SBL>,CAS<SUE>,CAS<TEM>,CAS<TRA>,PLUR,POSS<2><PLUR>
0,0,akként,az,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1,ama,ama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,amannak,amaz,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,amannál,amaz,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,amazok,amaz,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,63,ugyanazoknak,ugyanaz,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
64,64,ugyanazon,ugyanazon,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65,65,ugyanekkor,ugyanez,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
66,66,ugyanezeken,ugyanez,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [23]:
determiners.to_csv('determiners2.csv')

In [30]:
chars = " 0123456789.:,;!%&'*_-=~\\()|[]{}aáäbcdeéfghiíjklmnoóöőpqrstuúüűvwxyz"

In [31]:
# Lookup tables between the characters of `chars` and their positions in it.
encode_dict = {symbol: position for position, symbol in enumerate(chars)}
decode_dict = {position: symbol for position, symbol in enumerate(chars)}

print(encode_dict)
print(decode_dict)

{' ': 0, '0': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8, '8': 9, '9': 10, '.': 11, ':': 12, ',': 13, ';': 14, '!': 15, '%': 16, '&': 17, "'": 18, '*': 19, '_': 20, '-': 21, '=': 22, '~': 23, '\\': 24, '(': 25, ')': 26, '|': 27, '[': 28, ']': 29, '{': 30, '}': 31, 'a': 32, 'á': 33, 'ä': 34, 'b': 35, 'c': 36, 'd': 37, 'e': 38, 'é': 39, 'f': 40, 'g': 41, 'h': 42, 'i': 43, 'í': 44, 'j': 45, 'k': 46, 'l': 47, 'm': 48, 'n': 49, 'o': 50, 'ó': 51, 'ö': 52, 'ő': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'ú': 60, 'ü': 61, 'ű': 62, 'v': 63, 'w': 64, 'x': 65, 'y': 66, 'z': 67}
{0: ' ', 1: '0', 2: '1', 3: '2', 4: '3', 5: '4', 6: '5', 7: '6', 8: '7', 9: '8', 10: '9', 11: '.', 12: ':', 13: ',', 14: ';', 15: '!', 16: '%', 17: '&', 18: "'", 19: '*', 20: '_', 21: '-', 22: '=', 23: '~', 24: '\\', 25: '(', 26: ')', 27: '|', 28: '[', 29: ']', 30: '{', 31: '}', 32: 'a', 33: 'á', 34: 'ä', 35: 'b', 36: 'c', 37: 'd', 38: 'e', 39: 'é', 40: 'f', 41: 'g', 42: 'h', 43: 'i', 44: 'í',

In [32]:
def encode(w):
    """Translate the string `w` into a NumPy array of integer codes.

    Every character is looked up in `encode_dict`; a character that is not a
    key of that table raises KeyError.
    """
    return np.array([encode_dict[symbol] for symbol in w])

In [34]:
encode('itt vagyok, menjünk!')

array([43, 58, 58,  0, 63, 32, 41, 66, 50, 46, 13,  0, 48, 38, 49, 45, 61,
       49, 46, 15])

In [33]:
def decode(a):
    """Turn an iterable of integer codes back into a string.

    Every code is looked up in `decode_dict` and the resulting characters are
    concatenated; a code that is not a key of that table raises KeyError.
    """
    return ''.join(decode_dict[code] for code in a)

In [35]:
decode([43, 58, 58,  0, 63, 32, 41, 66, 50, 46, 13,  0, 48, 38, 49, 45, 61,
       49, 46, 15])

'itt vagyok, menjünk!'

In [46]:
# Length of the longest entry in the word column.
all_words = words_df['word']
max(map(len, all_words))

1000

In [62]:
# How many words are at least 44 characters long.
sum(1 for word in all_words if len(word) >= 44)

497

In [59]:
# Inspect the words that are exactly 40 characters long.
list(filter(lambda word: len(word) == 40, all_words))

['alternatvizmus-liberalizmus-radikalizmus',
 'banán-alma-cseresznye-kalabár-együttesek',
 'behívás-nyilvántartás-ellenőrzés-ellátás',
 'bevallhatatlansága-földolgozhatatlansága',
 'bulldog-masztiff-pitbull-németjuhász-box',
 'bááááááááááááááááááááááááááááááááááááááá',
 'bőrgyógyászati-kozmentológia-venerológia',
 'egészségneveléshez-egészségfejlesztéshez',
 'elmélet&ndash;módszertan&ndash;gyakorlat',
 'energia-visszanyeréssel(-hasznosítással)',
 'feleségével-üzlettársával-kolléganőjével',
 'felnőttképzésben&ndash;felnőttoktatásban',
 'festőművész&ndash;szobrászművész-oktatói',
 'fideszvagytudomisénhogyhívjákmostőket-re',
 'fizikus&mdash;geométer&mdash;matematikus',
 'francia-német-brit-spanyol-olasz-lengyel',
 'földtani-hidrogeológiai-környezetállapot',
 'gazdaságfejlesztés-vállalkozásfejlesztés',
 'gyurcsány-ferenczklaudia-mobilitás-gyism',
 'gyógyszerfejlesztés-gyógyszervizsgálatok',
 'gépjárművezetőiengedély-nyilvántartásban',
 'hatezrelékes-roma-felsőoktatásbeli-arány',
 'hatszázn

In [61]:
len('megszentségteleníthetetlenségeskedéseitekért')

44

In [None]:
def keep_short_words(df, max_len=44):
    """Drop rows whose `word` is longer than `max_len` characters.

    The rows are removed from `df` itself (in place), so callers that ignore
    the return value still see the filtered frame. A re-indexed copy is
    returned as well; `reset_index()` keeps the previous index as an extra
    column of that copy (and, per pandas, fails with ValueError when the
    frame already has both an `index` and a `level_0` column).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `word` column. Modified in place.
    max_len : int, default 44
        Longest word length that is kept.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with a fresh 0..n-1 index.

    Notes
    -----
    Rows are selected by their index *labels* rather than by position, so the
    function also works on frames whose index is not 0..n-1 (for example
    after an earlier filtering step). Index labels are assumed to be unique.
    Entries that are not strings (such as NaN) have no length and are kept.
    """
    # `DataFrame.drop` expects index labels, so pair every word with its own
    # label instead of counting positions.
    labels_to_remove = [
        label
        for label, word in zip(df.index, df['word'])
        if isinstance(word, str) and len(word) > max_len
    ]
    df.drop(labels_to_remove, inplace=True)
    return df.reset_index()

In [64]:
keep_short_words(nouns)

Unnamed: 0,level_0,index,word,stem,ANP,ANP<PLUR>,CAS<ABL>,CAS<ACC>,CAS<ADE>,CAS<ALL>,...,POSTP<RÉSZÉRE>,POSTP<RÉSZÉRŐL>,POSTP<SZERINT>,POSTP<SZÁMÁRA>,POSTP<UTÁN>,POSTP<VÉGBŐL>,POSTP<VÉGETT>,POSTP<VÉGRE>,POSTP<ÁLTAL>,POSTP<ÓTA>
0,0,0,aa,aa,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,aa-bb,aa-bb,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2,aa-lávaként,aa-láv,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,3,aa-val,aa-val,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,4,aa.,aa.,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3942512,3942931,3963996,üönképzőkör,üönképzőkör,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3942513,3942932,3963997,üötvefúróhoz,üötvefúró,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3942514,3942933,3963998,üúcijöcuxz,üúcijöcuxz,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3942515,3942934,3963999,üű,üű,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
nouns.to_csv('nouns3.csv', index=False)

In [66]:
keep_short_words(verbs)

Unnamed: 0,level_0,index,word,stem,COND,COND<PAST>,DEF,INF,MODAL,PAST,PERS,PERS<1<OBJ<2>>>,PERS<1>,PERS<2>,PLUR,SUBJUNC-IMP
0,0,0,abajgasd,abajgat,0,0,1,0,0,0,0,0,0,1,0,1
1,1,1,abajgassák,abajgat,0,0,1,0,0,0,0,0,0,0,1,1
2,2,2,abajgat,abajgat,0,0,0,0,0,0,0,0,0,0,0,0
3,3,3,abajgatja,abajgat,0,0,1,0,0,0,0,0,0,0,0,0
4,4,4,abajgatják,abajgat,0,0,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382350,382350,382350,üzérkednek,üzérkedik,0,0,0,0,0,0,0,0,0,0,1,0
382351,382351,382351,üzérkedni,üzérkedik,0,0,0,1,0,0,0,0,0,0,0,0
382352,382352,382352,üzérkedsz,üzérkedik,0,0,0,0,0,0,0,0,0,1,0,0
382353,382353,382353,üzérkedtek,üzérkedik,0,0,0,0,0,1,0,0,0,0,1,0


In [67]:
verbs.to_csv('verbs3.csv', index=False)

In [102]:
#adjectives.drop(['level_0','index'],axis=1,inplace=True)

In [103]:
keep_short_words(adjectives)

Unnamed: 0,index,word,stem,ANP,ANP<PLUR>,CAS<ABL>,CAS<ACC>,CAS<ADE>,CAS<ALL>,CAS<CAU>,...,CAS<TER>,CAS<TRA>,PLUR,PLUR<FAM>,POSS,POSS<1>,POSS<1><PLUR>,POSS<2>,POSS<2><PLUR>,POSS<PLUR>
0,0,abbeli,abbeli,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,aberrált,aberrált,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,aberráltak,aberrált,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,3,aberráltakat,aberrált,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,4,aberráltját,aberrált,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83628,83628,üzletivé,üzleti,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
83629,83629,üzletszerű,üzletszerű,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
83630,83630,üzletszerűek,üzletszerű,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
83631,83631,üzletszerűnek,üzletszerű,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [104]:
adjectives.to_csv('adjectives3.csv', index=False)

In [112]:
keep_short_words(nums)

Unnamed: 0,index,word,stem,morphemes
0,0,ahányan,ahány,NUM[AGGREG]
1,1,ahányszorta,ahány,NUM[MULTIPL]
2,2,akárhanyan,akárhány,NUM[AGGREG]
3,3,akárhányadszorra,akárhány,NUM[ORD-ITER-ACCOMPL]
4,4,akárhányan,akárhány,NUM[AGGREG]
...,...,...,...,...
76,76,öten,öt,NUM[AGGREG]
77,77,édeskevesen,édeskevés,NUM[AGGREG]
78,78,öten,öt,NUM[AGGREG]
79,79,ötszörte,öt,NUM[MULTIPL]


In [114]:
nums.drop(['index'],axis=1,inplace=True)

In [115]:
nums

Unnamed: 0,word,stem,morphemes
0,ahányan,ahány,NUM[AGGREG]
1,ahányszorta,ahány,NUM[MULTIPL]
2,akárhanyan,akárhány,NUM[AGGREG]
3,akárhányadszorra,akárhány,NUM[ORD-ITER-ACCOMPL]
4,akárhányan,akárhány,NUM[AGGREG]
...,...,...,...
76,öten,öt,NUM[AGGREG]
77,édeskevesen,édeskevés,NUM[AGGREG]
78,öten,öt,NUM[AGGREG]
79,ötszörte,öt,NUM[MULTIPL]


In [116]:
nums.to_csv('nums3.csv', index=False)

In [91]:
keep_short_words(determiners)

Unnamed: 0,index,word,stem,ANP,ANP<PLUR>,CAS<ABL>,CAS<ACC>,CAS<ADE>,CAS<ALL>,CAS<CAU>,...,CAS<FOR>,CAS<ILL>,CAS<INE>,CAS<INS>,CAS<SBL>,CAS<SUE>,CAS<TEM>,CAS<TRA>,PLUR,POSS<2><PLUR>
0,0,akként,az,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1,ama,ama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,amannak,amaz,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,amannál,amaz,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,amazok,amaz,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,63,ugyanazoknak,ugyanaz,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
64,64,ugyanazon,ugyanazon,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65,65,ugyanekkor,ugyanez,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
66,66,ugyanezeken,ugyanez,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [92]:
determiners.to_csv('determiners3.csv', index=False)

In [None]:
# Unfinished draft of a one-hot encoder, disabled by wrapping it in a string
# literal; the body of the inner loop was never written.
# NOTE(review): finish this or delete the cell.
'''
M = len(encode_dict)
W = 44
def one_hot_encode(w):
    e = encode(w)
    out = [0 for i in range(M)]
    for j in range(len(w)):
'''

In [118]:
nouns.drop(['level_0','index'],axis=1,inplace=True)

In [119]:
nouns.to_csv('nouns3.csv',index=False)

In [120]:
verbs

Unnamed: 0,level_0,index,word,stem,COND,COND<PAST>,DEF,INF,MODAL,PAST,PERS,PERS<1<OBJ<2>>>,PERS<1>,PERS<2>,PLUR,SUBJUNC-IMP
0,0,0,abajgasd,abajgat,0,0,1,0,0,0,0,0,0,1,0,1
1,1,1,abajgassák,abajgat,0,0,1,0,0,0,0,0,0,0,1,1
2,2,2,abajgat,abajgat,0,0,0,0,0,0,0,0,0,0,0,0
3,3,3,abajgatja,abajgat,0,0,1,0,0,0,0,0,0,0,0,0
4,4,4,abajgatják,abajgat,0,0,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382350,382350,382350,üzérkednek,üzérkedik,0,0,0,0,0,0,0,0,0,0,1,0
382351,382351,382351,üzérkedni,üzérkedik,0,0,0,1,0,0,0,0,0,0,0,0
382352,382352,382352,üzérkedsz,üzérkedik,0,0,0,0,0,0,0,0,0,1,0,0
382353,382353,382353,üzérkedtek,üzérkedik,0,0,0,0,0,1,0,0,0,0,1,0


In [121]:
verbs.drop(['level_0','index'],axis=1,inplace=True)

In [122]:
verbs.to_csv('verbs3.csv',index=False)

In [123]:
adjectives

Unnamed: 0,index,word,stem,ANP,ANP<PLUR>,CAS<ABL>,CAS<ACC>,CAS<ADE>,CAS<ALL>,CAS<CAU>,...,CAS<TER>,CAS<TRA>,PLUR,PLUR<FAM>,POSS,POSS<1>,POSS<1><PLUR>,POSS<2>,POSS<2><PLUR>,POSS<PLUR>
0,0,abbeli,abbeli,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,aberrált,aberrált,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,aberráltak,aberrált,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,3,aberráltakat,aberrált,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,4,aberráltját,aberrált,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83628,83628,üzletivé,üzleti,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
83629,83629,üzletszerű,üzletszerű,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
83630,83630,üzletszerűek,üzletszerű,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
83631,83631,üzletszerűnek,üzletszerű,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [124]:
adjectives.drop(['index'],axis=1,inplace=True)

In [125]:
adjectives.to_csv('adjectives3.csv',index=False)

In [126]:
nums

Unnamed: 0,word,stem,morphemes
0,ahányan,ahány,NUM[AGGREG]
1,ahányszorta,ahány,NUM[MULTIPL]
2,akárhanyan,akárhány,NUM[AGGREG]
3,akárhányadszorra,akárhány,NUM[ORD-ITER-ACCOMPL]
4,akárhányan,akárhány,NUM[AGGREG]
...,...,...,...
76,öten,öt,NUM[AGGREG]
77,édeskevesen,édeskevés,NUM[AGGREG]
78,öten,öt,NUM[AGGREG]
79,ötszörte,öt,NUM[MULTIPL]


In [127]:
determiners

Unnamed: 0,index,word,stem,ANP,ANP<PLUR>,CAS<ABL>,CAS<ACC>,CAS<ADE>,CAS<ALL>,CAS<CAU>,...,CAS<FOR>,CAS<ILL>,CAS<INE>,CAS<INS>,CAS<SBL>,CAS<SUE>,CAS<TEM>,CAS<TRA>,PLUR,POSS<2><PLUR>
0,0,akként,az,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1,ama,ama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,amannak,amaz,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,amannál,amaz,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,amazok,amaz,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,63,ugyanazoknak,ugyanaz,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
64,64,ugyanazon,ugyanazon,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65,65,ugyanekkor,ugyanez,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
66,66,ugyanezeken,ugyanez,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [128]:
determiners.drop(['index'],axis=1,inplace=True)

In [129]:
determiners.to_csv('determiners3.csv',index=False)

In [130]:
# NOTE(review): `other` is not defined in the visible part of the notebook and
# the recorded output of this cell is a NameError; define it or remove the cell.
other

NameError: name 'other' is not defined