## Word Type recognition using a Neural Network

This Notebook accomplishes one thing: Builds up a model that's capable of distinguishing between the following word types: nouns, verbs, adjectives, determiners and numbers. For the rest of the word types, sufficient data wasn't available, so I used a non-Neural approach found in the following Notebook: A set-similarity method.

This file also doesn't need to be run by the user - the final trained model is saved in the same folder.

In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import random
import math
random_state = 777

Using TensorFlow backend.


In [2]:
nouns = pd.read_csv('nouns4.csv')

In [3]:
nouns_list = list(nouns['word'])

In [20]:
len(nouns_list)

3942517

In [5]:
del nouns

In [6]:
verbs = pd.read_csv('verbs3.csv')

In [7]:
verbs_list = list(verbs['word'])

In [21]:
len(verbs_list)

382355

In [8]:
del verbs

In [9]:
adjectives = pd.read_csv('adjectives3.csv')

In [10]:
adjectives_list = list(adjectives['word'])

In [22]:
len(adjectives_list)

83633

In [11]:
del adjectives

In [12]:
nums = pd.read_csv('nums4.csv')

In [13]:
nums_list = list(nums['word'])

In [14]:
del nums

In [15]:
determiners = pd.read_csv('determiners3.csv')

In [16]:
determiners_list = list(determiners['word'])

In [17]:
del determiners

In [75]:
all_words = list(set(nouns_list + verbs_list + adjectives_list + nums_list + determiners_list))
frequencies = [[None for i in range(len(all_words))] for j in range(5)]

In [68]:
len(all_words)

4237626

In [35]:
word_categories = [nouns_list,verbs_list,adjectives_list,nums_list,determiners_list]

In [72]:
word_sets = [set(cat) for cat in word_categories]
combinations = [[0 for k in range(4 - int(math.log(n,2)))] + [int(i) for i in list('{0:0b}'.format(n))] for n in range(1,32)]

In [74]:
sorted(combinations, reverse=True)

[[1, 1, 1, 1, 1],
 [1, 1, 1, 1, 0],
 [1, 1, 1, 0, 1],
 [1, 1, 1, 0, 0],
 [1, 1, 0, 1, 1],
 [1, 1, 0, 1, 0],
 [1, 1, 0, 0, 1],
 [1, 1, 0, 0, 0],
 [1, 0, 1, 1, 1],
 [1, 0, 1, 1, 0],
 [1, 0, 1, 0, 1],
 [1, 0, 1, 0, 0],
 [1, 0, 0, 1, 1],
 [1, 0, 0, 1, 0],
 [1, 0, 0, 0, 1],
 [1, 0, 0, 0, 0],
 [0, 1, 1, 1, 1],
 [0, 1, 1, 1, 0],
 [0, 1, 1, 0, 1],
 [0, 1, 1, 0, 0],
 [0, 1, 0, 1, 1],
 [0, 1, 0, 1, 0],
 [0, 1, 0, 0, 1],
 [0, 1, 0, 0, 0],
 [0, 0, 1, 1, 1],
 [0, 0, 1, 1, 0],
 [0, 0, 1, 0, 1],
 [0, 0, 1, 0, 0],
 [0, 0, 0, 1, 1],
 [0, 0, 0, 1, 0],
 [0, 0, 0, 0, 1]]

In [101]:
word_types = {}
for comb in sorted(combinations, reverse=True):
    current_selection = [word_sets[i] for i in range(len(word_sets)) if comb[i] == 1]
    current_intersection = current_selection[0].intersection(*current_selection[1:])
    sum_comb = sum(comb)
    probs = [c / sum_comb for c in comb]
    for w in list(current_intersection):
        if w not in word_types:
            word_types[w] = probs

In [102]:
word_types

{'degenerált': [0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.0,
  0.0],
 'szabad': [0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.0,
  0.0],
 'tébolyultak': [0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.0,
  0.0],
 'csoportosul': [0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.0,
  0.0],
 'csendesül': [0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.0,
  0.0],
 'jelennél': [0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.0,
  0.0],
 'felkentek': [0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.0,
  0.0],
 'kisemmizett': [0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.0,
  0.0],
 'parázna': [0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.0,
  0.0],
 'érdekeltek': [0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.0,
  0.0],
 'lakták': [0.3333333333333333,
  0.3333333333333333,
  0.

In [112]:
word_types_df = pd.DataFrame.from_dict(word_types, orient='index',
                       columns = ['noun_freq', 'verb_freq', 'adjective_freq', 'determiner_freq', 'num_freq'])

In [113]:
word_types_df.head()

Unnamed: 0,noun_freq,verb_freq,adjective_freq,determiner_freq,num_freq
degenerált,0.333333,0.333333,0.333333,0.0,0.0
szabad,0.333333,0.333333,0.333333,0.0,0.0
tébolyultak,0.333333,0.333333,0.333333,0.0,0.0
csoportosul,0.333333,0.333333,0.333333,0.0,0.0
csendesül,0.333333,0.333333,0.333333,0.0,0.0


In [114]:
word_types_df['word'] = word_types_df.index

In [115]:
word_types_df.head()

Unnamed: 0,noun_freq,verb_freq,adjective_freq,determiner_freq,num_freq,word
degenerált,0.333333,0.333333,0.333333,0.0,0.0,degenerált
szabad,0.333333,0.333333,0.333333,0.0,0.0,szabad
tébolyultak,0.333333,0.333333,0.333333,0.0,0.0,tébolyultak
csoportosul,0.333333,0.333333,0.333333,0.0,0.0,csoportosul
csendesül,0.333333,0.333333,0.333333,0.0,0.0,csendesül


In [116]:
word_types_df.reset_index(inplace=True, drop=True)

In [117]:
word_types_df

Unnamed: 0,noun_freq,verb_freq,adjective_freq,determiner_freq,num_freq,word
0,0.333333,0.333333,0.333333,0.0,0.0,degenerált
1,0.333333,0.333333,0.333333,0.0,0.0,szabad
2,0.333333,0.333333,0.333333,0.0,0.0,tébolyultak
3,0.333333,0.333333,0.333333,0.0,0.0,csoportosul
4,0.333333,0.333333,0.333333,0.0,0.0,csendesül
...,...,...,...,...,...,...
4237621,0.000000,0.000000,0.000000,0.0,1.0,azéért
4237622,0.000000,0.000000,0.000000,0.0,1.0,ezekére
4237623,0.000000,0.000000,0.000000,0.0,1.0,mindeme
4237624,0.000000,0.000000,0.000000,0.0,1.0,eme


In [118]:
word_types_df.to_csv('word_types_df.csv', index=False)

## ---------------------------------------------

In [2]:
word_types_df = pd.read_csv('word_types_df.csv')

In [3]:
word_types_df

Unnamed: 0,noun_freq,verb_freq,adjective_freq,determiner_freq,num_freq,word
0,0.333333,0.333333,0.333333,0.0,0.0,degenerált
1,0.333333,0.333333,0.333333,0.0,0.0,szabad
2,0.333333,0.333333,0.333333,0.0,0.0,tébolyultak
3,0.333333,0.333333,0.333333,0.0,0.0,csoportosul
4,0.333333,0.333333,0.333333,0.0,0.0,csendesül
...,...,...,...,...,...,...
4237621,0.000000,0.000000,0.000000,0.0,1.0,azéért
4237622,0.000000,0.000000,0.000000,0.0,1.0,ezekére
4237623,0.000000,0.000000,0.000000,0.0,1.0,mindeme
4237624,0.000000,0.000000,0.000000,0.0,1.0,eme


In [88]:
chars = " 0123456789.:,;!%&'*_-=~\\()|[]{}aáäbcdeéfghiíjklmnoóöőpqrstuúüűvwxyz"
encode_dict = {}
decode_dict = {}

for c in range(len(chars)):
    encode_dict[chars[c]] = c
    decode_dict[c] = chars[c]

def encode(w):
    ret = []
    for c in w:
        ret.append(encode_dict[c])
    return np.array(ret)

def decode(a):
    ret = []
    for i in a:
        ret.append(decode_dict[i])
    return ''.join(ret)

M = len(encode_dict)
W = 44
def one_hot_encode(w):
    e = encode(w)
    ohe = np.zeros((W, M))
    ohe[np.arange(len(e)),e] = 1
    return ohe 

def one_hot_encode_col(col):
    return np.array([one_hot_encode(w) for w in col])

In [6]:
for i in range(100):
    temp_types = word_types_df[int(len(word_types_df)*i/100):int(len(word_types_df)*(i+1)/100)].reset_index(drop=True)
    temp_types.to_csv('word_types_shorter_part_' + str(i) + '.csv', index=False)

In [10]:
inp = tf.keras.Input(shape=(44,68), name='input')

def conv_pool_layer(kernel_size,pool_size,input_layer,name):
    
    conv = tf.keras.layers.Conv1D(68,kernel_size=kernel_size,
        input_shape=(44,68), activation='relu', padding='same',
        kernel_regularizer=regularizers.l1_l2(l1=1e-4, l2=1e-3),
        bias_regularizer=regularizers.l2(1e-3),
        activity_regularizer=regularizers.l2(1e-4),
        data_format='channels_last', name='1D_conv' + name)(input_layer)
    
    pool = max_pool3 = tf.keras.layers.MaxPooling1D(pool_size=pool_size,
        data_format='channels_first', name= '1D_maxpool' + name)(conv)

    return pool

model1 = tf.keras.Model(inputs=inp, outputs=conv_pool_layer(1,1,inp,'1'))
model3 = tf.keras.Model(inputs=inp, outputs=conv_pool_layer(3,2,inp,'3'))
model5 = tf.keras.Model(inputs=inp, outputs=conv_pool_layer(5,2,inp,'5'))
model7 = tf.keras.Model(inputs=inp, outputs=conv_pool_layer(7,4,inp,'7'))
model9 = tf.keras.Model(inputs=inp, outputs=conv_pool_layer(9,4,inp,'9'))

combined = tf.keras.layers.concatenate([model1.output, model3.output, model5.output,
                                        model7.output, model9.output], name='concatenate')

flat = tf.keras.layers.Flatten(name='flatten')(combined)
drop = tf.keras.layers.Dropout(0.57,name='droput_0.57')(flat)
dense5 = tf.keras.layers.Dense(5, activation='softmax',name='dense5')(drop)

types_conv_model = tf.keras.Model(inputs=[model1.input],
                            outputs=dense5,name='Types_Multiconv_Model')
types_conv_model.output_shape



(None, 5)

In [11]:
types_conv_model.compile(optimizer='adam', loss='mse')

In [14]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

In [15]:
for i in range(100):
    
    print("STARTING PHASE " + str(i+1))
    
    temp_types = pd.read_csv('word_types_shorter_part_' + str(i) + '.csv')
    
    types_X = one_hot_encode_col(list(temp_types['word']))
    types_y = np.array(temp_types[temp_types.columns.difference(['word'])])
    
    types_X_train, types_X_test, types_y_train, types_y_test = train_test_split(types_X, types_y, test_size=0.1, random_state=random_state)
    
    types_conv_model.fit(types_X_train, types_y_train, callbacks=[early_stopping], epochs=1, batch_size=100, validation_split=0.05)
    
    types_conv_model.evaluate(types_X_test, types_y_test)
    
    del temp_types
    del types_X
    del types_y
    del types_X_train
    del types_X_test
    del types_y_train
    del types_y_test

STARTING PHASE 1
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 2
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 3
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 4
Train on 36232 samples, validate on 1907 samples
STARTING PHASE 5
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 6
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 7
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 8
Train on 36232 samples, validate on 1907 samples
STARTING PHASE 9
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 10
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 11
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 12
Train on 36232 samples, validate on 1907 samples
STARTING PHASE 13
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 14
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 15
Train on 36231 samples, validate on 1907 samples
STAR

STARTING PHASE 70
Train on 36232 samples, validate on 1907 samples
STARTING PHASE 71
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 72
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 73
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 74
Train on 36232 samples, validate on 1907 samples
STARTING PHASE 75
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 76
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 77
Train on 36232 samples, validate on 1907 samples
STARTING PHASE 78
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 79
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 80
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 81
Train on 36232 samples, validate on 1907 samples
STARTING PHASE 82
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 83
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 84
Train on 36231 samples, validate on 1907 sam

In [16]:
types_conv_model.save('types_conv_model.h5')

In [28]:
for i in range(10):
    
    print("STARTING PHASE " + str(i+1))
    
    temp_types = pd.read_csv('word_types_shorter_part_' + str(i) + '.csv')
    
    types_X = one_hot_encode_col(list(temp_types['word']))
    types_y = np.array(temp_types[temp_types.columns.difference(['word'])])
    
    types_X_train, types_X_test, types_y_train, types_y_test = train_test_split(types_X, types_y, test_size=0.1, random_state=random_state)
    
    types_conv_model.fit(types_X_train, types_y_train, callbacks=[early_stopping], epochs=1, batch_size=100, validation_split=0.05)
    
    types_conv_model.evaluate(types_X_test, types_y_test)
    
    del temp_types
    del types_X
    del types_y
    del types_X_train
    del types_X_test
    del types_y_train
    del types_y_test

STARTING PHASE 1
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 2
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 3
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 4
Train on 36232 samples, validate on 1907 samples
STARTING PHASE 5
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 6
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 7
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 8
Train on 36232 samples, validate on 1907 samples
STARTING PHASE 9
Train on 36231 samples, validate on 1907 samples
STARTING PHASE 10
Train on 36231 samples, validate on 1907 samples


In [29]:
def predict_word_type(w):
    nn_input = one_hot_encode_col([w])
    print(types_conv_model.predict(nn_input))

In [30]:
predict_word_type('csinál')

[[4.1469093e-07 4.8747199e-07 9.9999785e-01 4.8634178e-07 7.5786659e-07]]


## ---------

- Remove most words that can only be 1 type (nouns, verbs, adjectives)
- Shuffle the orginal database
- Then split into multiple parts
- Then run the NN with the new data

In [18]:
word_types_df = pd.read_csv('word_types_df.csv')

In [19]:
word_types_df

Unnamed: 0,noun_freq,verb_freq,adjective_freq,determiner_freq,num_freq,word
0,0.333333,0.333333,0.333333,0.0,0.0,degenerált
1,0.333333,0.333333,0.333333,0.0,0.0,szabad
2,0.333333,0.333333,0.333333,0.0,0.0,tébolyultak
3,0.333333,0.333333,0.333333,0.0,0.0,csoportosul
4,0.333333,0.333333,0.333333,0.0,0.0,csendesül
...,...,...,...,...,...,...
4237621,0.000000,0.000000,0.000000,0.0,1.0,azéért
4237622,0.000000,0.000000,0.000000,0.0,1.0,ezekére
4237623,0.000000,0.000000,0.000000,0.0,1.0,mindeme
4237624,0.000000,0.000000,0.000000,0.0,1.0,eme


In [28]:
remove_indicies = []
for i, row in word_types_df.iterrows():
    
    if i % 1000 == 0:
        print(i)
    
    if row['noun_freq'] == 1:
        if random.random() < 0.9:
            remove_indicies.append(i)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000


1165000
1166000
1167000
1168000
1169000
1170000
1171000
1172000
1173000
1174000
1175000
1176000
1177000
1178000
1179000
1180000
1181000
1182000
1183000
1184000
1185000
1186000
1187000
1188000
1189000
1190000
1191000
1192000
1193000
1194000
1195000
1196000
1197000
1198000
1199000
1200000
1201000
1202000
1203000
1204000
1205000
1206000
1207000
1208000
1209000
1210000
1211000
1212000
1213000
1214000
1215000
1216000
1217000
1218000
1219000
1220000
1221000
1222000
1223000
1224000
1225000
1226000
1227000
1228000
1229000
1230000
1231000
1232000
1233000
1234000
1235000
1236000
1237000
1238000
1239000
1240000
1241000
1242000
1243000
1244000
1245000
1246000
1247000
1248000
1249000
1250000
1251000
1252000
1253000
1254000
1255000
1256000
1257000
1258000
1259000
1260000
1261000
1262000
1263000
1264000
1265000
1266000
1267000
1268000
1269000
1270000
1271000
1272000
1273000
1274000
1275000
1276000
1277000
1278000
1279000
1280000
1281000
1282000
1283000
1284000
1285000
1286000
1287000
1288000
1289000


2190000
2191000
2192000
2193000
2194000
2195000
2196000
2197000
2198000
2199000
2200000
2201000
2202000
2203000
2204000
2205000
2206000
2207000
2208000
2209000
2210000
2211000
2212000
2213000
2214000
2215000
2216000
2217000
2218000
2219000
2220000
2221000
2222000
2223000
2224000
2225000
2226000
2227000
2228000
2229000
2230000
2231000
2232000
2233000
2234000
2235000
2236000
2237000
2238000
2239000
2240000
2241000
2242000
2243000
2244000
2245000
2246000
2247000
2248000
2249000
2250000
2251000
2252000
2253000
2254000
2255000
2256000
2257000
2258000
2259000
2260000
2261000
2262000
2263000
2264000
2265000
2266000
2267000
2268000
2269000
2270000
2271000
2272000
2273000
2274000
2275000
2276000
2277000
2278000
2279000
2280000
2281000
2282000
2283000
2284000
2285000
2286000
2287000
2288000
2289000
2290000
2291000
2292000
2293000
2294000
2295000
2296000
2297000
2298000
2299000
2300000
2301000
2302000
2303000
2304000
2305000
2306000
2307000
2308000
2309000
2310000
2311000
2312000
2313000
2314000


3216000
3217000
3218000
3219000
3220000
3221000
3222000
3223000
3224000
3225000
3226000
3227000
3228000
3229000
3230000
3231000
3232000
3233000
3234000
3235000
3236000
3237000
3238000
3239000
3240000
3241000
3242000
3243000
3244000
3245000
3246000
3247000
3248000
3249000
3250000
3251000
3252000
3253000
3254000
3255000
3256000
3257000
3258000
3259000
3260000
3261000
3262000
3263000
3264000
3265000
3266000
3267000
3268000
3269000
3270000
3271000
3272000
3273000
3274000
3275000
3276000
3277000
3278000
3279000
3280000
3281000
3282000
3283000
3284000
3285000
3286000
3287000
3288000
3289000
3290000
3291000
3292000
3293000
3294000
3295000
3296000
3297000
3298000
3299000
3300000
3301000
3302000
3303000
3304000
3305000
3306000
3307000
3308000
3309000
3310000
3311000
3312000
3313000
3314000
3315000
3316000
3317000
3318000
3319000
3320000
3321000
3322000
3323000
3324000
3325000
3326000
3327000
3328000
3329000
3330000
3331000
3332000
3333000
3334000
3335000
3336000
3337000
3338000
3339000
3340000


In [69]:
word_types_df2 = word_types_df[~word_types_df.index.isin(remove_indicies)]

In [70]:
word_types_df2.reset_index(inplace = True, drop = True)

In [71]:
word_types_df2

Unnamed: 0,noun_freq,verb_freq,adjective_freq,determiner_freq,num_freq,word
0,0.333333,0.333333,0.333333,0.0,0.0,degenerált
1,0.333333,0.333333,0.333333,0.0,0.0,szabad
2,0.333333,0.333333,0.333333,0.0,0.0,tébolyultak
3,0.333333,0.333333,0.333333,0.0,0.0,csoportosul
4,0.333333,0.333333,0.333333,0.0,0.0,csendesül
...,...,...,...,...,...,...
820473,0.000000,0.000000,0.000000,0.0,1.0,azéért
820474,0.000000,0.000000,0.000000,0.0,1.0,ezekére
820475,0.000000,0.000000,0.000000,0.0,1.0,mindeme
820476,0.000000,0.000000,0.000000,0.0,1.0,eme


In [72]:
word_types_df2.to_csv('word_types_df2.csv', index = False)

In [73]:
word_type_index = list(range(len(word_types_df2)))
random.shuffle(word_type_index)

In [74]:
word_type_index

[448412,
 500800,
 404097,
 317878,
 507099,
 104973,
 382364,
 350071,
 49047,
 172204,
 341835,
 568344,
 677973,
 618262,
 172798,
 601600,
 60258,
 216186,
 672960,
 776794,
 255181,
 515434,
 788763,
 249671,
 181882,
 16745,
 657429,
 424888,
 685851,
 68430,
 10240,
 262975,
 413582,
 100311,
 762357,
 374160,
 605125,
 348600,
 725872,
 33062,
 689347,
 186573,
 553693,
 706572,
 374236,
 1055,
 756614,
 631546,
 626085,
 278709,
 316294,
 642601,
 22178,
 14830,
 811269,
 390068,
 586209,
 428055,
 480001,
 721777,
 783228,
 741579,
 427406,
 795883,
 320495,
 436666,
 419095,
 40668,
 267479,
 226565,
 81018,
 180866,
 180201,
 64723,
 802256,
 184077,
 38645,
 751553,
 647156,
 268372,
 179497,
 377064,
 609407,
 483277,
 502550,
 525450,
 733217,
 151755,
 547944,
 739929,
 685646,
 697084,
 131015,
 185205,
 499491,
 501173,
 314125,
 494191,
 84110,
 480790,
 194877,
 117871,
 477648,
 251308,
 695389,
 401196,
 349901,
 215418,
 58292,
 386968,
 85052,
 196646,
 189890,


In [75]:
word_types_df2 = word_types_df2.reindex(word_type_index)

In [76]:
word_types_df2

Unnamed: 0,noun_freq,verb_freq,adjective_freq,determiner_freq,num_freq,word
448412,0.0,1.0,0.0,0.0,0.0,fakadhasson
500800,0.0,1.0,0.0,0.0,0.0,rugdostál
404097,0.0,1.0,0.0,0.0,0.0,eltunyulni
317878,1.0,0.0,0.0,0.0,0.0,megnyilvánulásaiként
507099,0.0,1.0,0.0,0.0,0.0,osztogassatok
...,...,...,...,...,...,...
538294,0.0,1.0,0.0,0.0,0.0,diszkrimináltak
139708,1.0,0.0,0.0,0.0,0.0,építészetszemlélet
246062,1.0,0.0,0.0,0.0,0.0,olvadékkal
574211,0.0,1.0,0.0,0.0,0.0,biztatnék


In [77]:
word_types_df2.reset_index(inplace = True, drop = True)

In [78]:
word_types_df2

Unnamed: 0,noun_freq,verb_freq,adjective_freq,determiner_freq,num_freq,word
0,0.0,1.0,0.0,0.0,0.0,fakadhasson
1,0.0,1.0,0.0,0.0,0.0,rugdostál
2,0.0,1.0,0.0,0.0,0.0,eltunyulni
3,1.0,0.0,0.0,0.0,0.0,megnyilvánulásaiként
4,0.0,1.0,0.0,0.0,0.0,osztogassatok
...,...,...,...,...,...,...
820473,0.0,1.0,0.0,0.0,0.0,diszkrimináltak
820474,1.0,0.0,0.0,0.0,0.0,építészetszemlélet
820475,1.0,0.0,0.0,0.0,0.0,olvadékkal
820476,0.0,1.0,0.0,0.0,0.0,biztatnék


In [79]:
word_types_df2.to_csv('word_types_df3.csv', index = False)

## ------------------------------------------

In [80]:
word_types_df3 = pd.read_csv('word_types_df3.csv')

In [81]:
word_types_df3

Unnamed: 0,noun_freq,verb_freq,adjective_freq,determiner_freq,num_freq,word
0,0.0,1.0,0.0,0.0,0.0,fakadhasson
1,0.0,1.0,0.0,0.0,0.0,rugdostál
2,0.0,1.0,0.0,0.0,0.0,eltunyulni
3,1.0,0.0,0.0,0.0,0.0,megnyilvánulásaiként
4,0.0,1.0,0.0,0.0,0.0,osztogassatok
...,...,...,...,...,...,...
820473,0.0,1.0,0.0,0.0,0.0,diszkrimináltak
820474,1.0,0.0,0.0,0.0,0.0,építészetszemlélet
820475,1.0,0.0,0.0,0.0,0.0,olvadékkal
820476,0.0,1.0,0.0,0.0,0.0,biztatnék


In [83]:
for i in range(10):
    temp_types = word_types_df3[int(len(word_types_df3)*i/10):int(len(word_types_df3)*(i+1)/10)].reset_index(drop=True)
    temp_types.to_csv('word_types3_shorter_part_' + str(i) + '.csv', index=False)

In [84]:
inp = tf.keras.Input(shape=(44,68), name='input')

def conv_pool_layer(kernel_size,pool_size,input_layer,name):
    
    conv = tf.keras.layers.Conv1D(68,kernel_size=kernel_size,
        input_shape=(44,68), activation='relu', padding='same',
        kernel_regularizer=regularizers.l1_l2(l1=1e-4, l2=1e-3),
        bias_regularizer=regularizers.l2(1e-3),
        activity_regularizer=regularizers.l2(1e-4),
        data_format='channels_last', name='1D_conv' + name)(input_layer)
    
    pool = max_pool3 = tf.keras.layers.MaxPooling1D(pool_size=pool_size,
        data_format='channels_first', name= '1D_maxpool' + name)(conv)

    return pool

model1 = tf.keras.Model(inputs=inp, outputs=conv_pool_layer(1,1,inp,'1'))
model3 = tf.keras.Model(inputs=inp, outputs=conv_pool_layer(3,2,inp,'3'))
model5 = tf.keras.Model(inputs=inp, outputs=conv_pool_layer(5,2,inp,'5'))
model7 = tf.keras.Model(inputs=inp, outputs=conv_pool_layer(7,4,inp,'7'))
model9 = tf.keras.Model(inputs=inp, outputs=conv_pool_layer(9,4,inp,'9'))

combined = tf.keras.layers.concatenate([model1.output, model3.output, model5.output,
                                        model7.output, model9.output], name='concatenate')

flat = tf.keras.layers.Flatten(name='flatten')(combined)
drop = tf.keras.layers.Dropout(0.57,name='droput_0.57')(flat)
dense5 = tf.keras.layers.Dense(5, activation='softmax',name='dense5')(drop)

types_conv_model = tf.keras.Model(inputs=[model1.input],
                            outputs=dense5,name='Types_Multiconv_Model')
types_conv_model.output_shape

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


(None, 5)

In [85]:
types_conv_model.compile(optimizer='adam', loss='mse')

In [86]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

In [90]:
for e in range(10):
    for i in range(10):

        print("EPOCH " + str(e+1) + ", PHASE " + str(i+1))

        temp_types = pd.read_csv('word_types3_shorter_part_' + str(i) + '.csv')

        types_X = one_hot_encode_col(list(temp_types['word']))
        types_y = np.array(temp_types[temp_types.columns.difference(['word'])])

        types_X_train, types_X_test, types_y_train, types_y_test = train_test_split(types_X, types_y, test_size=0.1, random_state=random_state)

        types_conv_model.fit(types_X_train, types_y_train, callbacks=[early_stopping], epochs=1, batch_size=300, validation_split=0.05)

        types_conv_model.evaluate(types_X_test, types_y_test)

        del temp_types
        del types_X
        del types_y
        del types_X_train
        del types_X_test
        del types_y_train
        del types_y_test

EPOCH 1, PHASE 1
Train on 70149 samples, validate on 3693 samples
EPOCH 1, PHASE 2
Train on 70150 samples, validate on 3693 samples
EPOCH 1, PHASE 3
Train on 70150 samples, validate on 3693 samples
EPOCH 1, PHASE 4
Train on 70150 samples, validate on 3693 samples
EPOCH 1, PHASE 5
Train on 70150 samples, validate on 3693 samples
EPOCH 1, PHASE 6
Train on 70149 samples, validate on 3693 samples
EPOCH 1, PHASE 7
Train on 70150 samples, validate on 3693 samples
EPOCH 1, PHASE 8
Train on 70150 samples, validate on 3693 samples
EPOCH 1, PHASE 9
Train on 70150 samples, validate on 3693 samples
EPOCH 1, PHASE 10
Train on 70150 samples, validate on 3693 samples
EPOCH 2, PHASE 1
Train on 70149 samples, validate on 3693 samples
EPOCH 2, PHASE 2
Train on 70150 samples, validate on 3693 samples
EPOCH 2, PHASE 3
Train on 70150 samples, validate on 3693 samples
EPOCH 2, PHASE 4
Train on 70150 samples, validate on 3693 samples
EPOCH 2, PHASE 5
Train on 70150 samples, validate on 3693 samples
EPOCH 2, 

KeyboardInterrupt: 

In [91]:
types_conv_model.save('types_conv_model2.h5')

### Need: Categorical Crossentropy

In [105]:
types_conv_model.compile(optimizer='adam', loss='categorical_crossentropy')

In [106]:
for e in range(7):
    for i in range(10):

        print("EPOCH " + str(e+1) + ", PHASE " + str(i+1))

        temp_types = pd.read_csv('word_types3_shorter_part_' + str(i) + '.csv')

        types_X = one_hot_encode_col(list(temp_types['word']))
        types_y = np.array(temp_types[temp_types.columns.difference(['word'])])

        types_X_train, types_X_test, types_y_train, types_y_test = train_test_split(types_X, types_y, test_size=0.1, random_state=random_state)

        types_conv_model.fit(types_X_train, types_y_train, callbacks=[early_stopping], epochs=1, batch_size=300, validation_split=0.05)

        types_conv_model.evaluate(types_X_test, types_y_test)

        del temp_types
        del types_X
        del types_y
        del types_X_train
        del types_X_test
        del types_y_train
        del types_y_test

EPOCH 1, PHASE 1
Train on 70149 samples, validate on 3693 samples
EPOCH 1, PHASE 2
Train on 70150 samples, validate on 3693 samples
EPOCH 1, PHASE 3
Train on 70150 samples, validate on 3693 samples
EPOCH 1, PHASE 4
Train on 70150 samples, validate on 3693 samples
EPOCH 1, PHASE 5
Train on 70150 samples, validate on 3693 samples
EPOCH 1, PHASE 6
Train on 70149 samples, validate on 3693 samples
EPOCH 1, PHASE 7
Train on 70150 samples, validate on 3693 samples
EPOCH 1, PHASE 8
Train on 70150 samples, validate on 3693 samples
EPOCH 1, PHASE 9
Train on 70150 samples, validate on 3693 samples
EPOCH 1, PHASE 10
Train on 70150 samples, validate on 3693 samples
EPOCH 2, PHASE 1
Train on 70149 samples, validate on 3693 samples
EPOCH 2, PHASE 2
Train on 70150 samples, validate on 3693 samples
EPOCH 2, PHASE 3
Train on 70150 samples, validate on 3693 samples
EPOCH 2, PHASE 4
Train on 70150 samples, validate on 3693 samples
EPOCH 2, PHASE 5
Train on 70150 samples, validate on 3693 samples
EPOCH 2, 

EPOCH 4, PHASE 6
Train on 70149 samples, validate on 3693 samples
EPOCH 4, PHASE 7
Train on 70150 samples, validate on 3693 samples
EPOCH 4, PHASE 8
Train on 70150 samples, validate on 3693 samples
EPOCH 4, PHASE 9
Train on 70150 samples, validate on 3693 samples
EPOCH 4, PHASE 10
Train on 70150 samples, validate on 3693 samples
EPOCH 5, PHASE 1
Train on 70149 samples, validate on 3693 samples
EPOCH 5, PHASE 2
Train on 70150 samples, validate on 3693 samples
EPOCH 5, PHASE 3
Train on 70150 samples, validate on 3693 samples
EPOCH 5, PHASE 4
Train on 70150 samples, validate on 3693 samples
EPOCH 5, PHASE 5
Train on 70150 samples, validate on 3693 samples
EPOCH 5, PHASE 6
Train on 70149 samples, validate on 3693 samples
EPOCH 5, PHASE 7
Train on 70150 samples, validate on 3693 samples
EPOCH 5, PHASE 8
Train on 70150 samples, validate on 3693 samples
EPOCH 5, PHASE 9
Train on 70150 samples, validate on 3693 samples
EPOCH 5, PHASE 10
Train on 70150 samples, validate on 3693 samples
EPOCH 6,

In [107]:
types_conv_model.save('types_conv_model3.h5')

In [108]:
def predict_word_type(w):
    nn_input = one_hot_encode_col([w])
    print(types_conv_model.predict(nn_input))

In [None]:
# order: adjective, determiner, noun, num, verb

In [133]:
predict_word_type('autó')

[[1.6696820e-01 1.0946930e-03 8.1266320e-01 5.8295025e-04 1.8690931e-02]]


In [119]:
'degenerált' in nums_list

False

In [123]:
temp_types = pd.read_csv('word_types3_shorter_part_9.csv')

In [130]:
temp_types[:30]

Unnamed: 0,noun_freq,verb_freq,adjective_freq,determiner_freq,num_freq,word
0,1.0,0.0,0.0,0.0,0.0,szintfelmérésünk
1,0.0,0.5,0.5,0.0,0.0,töltött
2,1.0,0.0,0.0,0.0,0.0,alkotóeszköz
3,1.0,0.0,0.0,0.0,0.0,licitrendszert
4,1.0,0.0,0.0,0.0,0.0,telepkutatást
5,0.0,0.0,1.0,0.0,0.0,halálsápadtnak
6,0.0,1.0,0.0,0.0,0.0,pördülni
7,1.0,0.0,0.0,0.0,0.0,novella-válogatásban
8,1.0,0.0,0.0,0.0,0.0,felvételeibe
9,0.0,1.0,0.0,0.0,0.0,ültethetném


In [126]:
types_y = np.array(temp_types[temp_types.columns.difference(['word'])])

In [131]:
temp_types.columns.difference(['word'])

Index(['adjective_freq', 'determiner_freq', 'noun_freq', 'num_freq',
       'verb_freq'],
      dtype='object')

In [129]:
types_y[:30]

array([[0. , 0. , 1. , 0. , 0. ],
       [0.5, 0. , 0. , 0. , 0.5],
       [0. , 0. , 1. , 0. , 0. ],
       [0. , 0. , 1. , 0. , 0. ],
       [0. , 0. , 1. , 0. , 0. ],
       [1. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 1. ],
       [0. , 0. , 1. , 0. , 0. ],
       [0. , 0. , 1. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 1. ],
       [0. , 0. , 1. , 0. , 0. ],
       [0. , 0. , 1. , 0. , 0. ],
       [0. , 0. , 1. , 0. , 0. ],
       [0. , 0. , 1. , 0. , 0. ],
       [1. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 1. ],
       [0. , 0. , 1. , 0. , 0. ],
       [0. , 0. , 1. , 0. , 0. ],
       [0. , 0. , 1. , 0. , 0. ],
       [0. , 0. , 1. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 1. ],
       [0. , 0. , 1. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 1. ],
       [0. , 0. , 0. , 0. , 1. ],
       [1. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 1. , 0. , 0. ],
       [1. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 1. ],
       [0. , 0. , 0. , 0. , 1. ],
       [0. , 0