In [1]:
import pandas as pd
import gzip
import pickle
import numpy as np
from sklearn.model_selection import train_test_split

In [82]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
with gzip.open('morph_df.pkl.gz', 'rb') as f:
    morph_df = pickle.load(f)

In [3]:
morph_df

Unnamed: 0,POS,lemma,form,tense,mode,act/mid/p,gender,case,person,number,lemma_tok,form_tok
0,2,ἅλλομαι,ἅλεται,1,5,2,0,0,3,2,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 8, 23, 4, 12]"
1,2,ἅλλομαι,ἅληται,1,5,2,0,0,3,2,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 10, 23, 4, 12]"
2,2,ἅλλομαι,ἅλῃ,1,5,2,0,0,2,2,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 95]"
3,2,ἅλλομαι,ἅλλεσθε,4,2,2,0,0,2,1,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 14, 8, 22, 11, 8]"
4,2,ἅλλομαι,ἅλλεσθε,7,1,2,0,0,2,1,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 14, 8, 22, 11, 8]"
...,...,...,...,...,...,...,...,...,...,...,...,...
329771,0,ζῳώδης,ζῳῶδες,0,0,0,2,5,4,2,"[9, 105, 33, 7, 10, 21]","[9, 105, 107, 7, 8, 21]"
329772,0,ζῳώδης,ζῳῶδες,0,0,0,1,5,4,2,"[9, 105, 33, 7, 10, 21]","[9, 105, 107, 7, 8, 21]"
329773,0,ζῳώδης,ζῳῶδες,0,0,0,3,4,4,2,"[9, 105, 33, 7, 10, 21]","[9, 105, 107, 7, 8, 21]"
329774,0,ζῳώδης,ζῳῶδες,0,0,0,3,5,4,2,"[9, 105, 33, 7, 10, 21]","[9, 105, 107, 7, 8, 21]"


In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Lambda
from tensorflow.keras.models import Model




In [5]:
# creo un vocabolario dei caratteri
all_characters = set()

def collect_unique_chars(word):
    for char in word:
        all_characters.add(char)

morph_df["lemma"].apply(collect_unique_chars)
morph_df["form"].apply(collect_unique_chars)

char_vocab = (sorted(list(all_characters)))
char_vocab

['ά',
 'έ',
 'ή',
 'ί',
 'α',
 'β',
 'γ',
 'δ',
 'ε',
 'ζ',
 'η',
 'θ',
 'ι',
 'κ',
 'λ',
 'μ',
 'ν',
 'ξ',
 'ο',
 'π',
 'ρ',
 'ς',
 'σ',
 'τ',
 'υ',
 'φ',
 'χ',
 'ψ',
 'ω',
 'ϊ',
 'ϋ',
 'ό',
 'ύ',
 'ώ',
 'ἀ',
 'ἁ',
 'ἄ',
 'ἅ',
 'ἆ',
 'ἇ',
 'ἐ',
 'ἑ',
 'ἔ',
 'ἕ',
 'ἠ',
 'ἡ',
 'ἤ',
 'ἥ',
 'ἦ',
 'ἧ',
 'ἰ',
 'ἱ',
 'ἴ',
 'ἵ',
 'ἶ',
 'ἷ',
 'ὀ',
 'ὁ',
 'ὄ',
 'ὅ',
 'ὐ',
 'ὑ',
 'ὔ',
 'ὕ',
 'ὖ',
 'ὗ',
 'ὠ',
 'ὡ',
 'ὤ',
 'ὥ',
 'ὦ',
 'ὧ',
 'ὶ',
 'ᾀ',
 'ᾄ',
 'ᾅ',
 'ᾆ',
 'ᾇ',
 'ᾐ',
 'ᾑ',
 'ᾔ',
 'ᾕ',
 'ᾖ',
 'ᾗ',
 'ᾠ',
 'ᾡ',
 'ᾤ',
 'ᾦ',
 'ᾧ',
 'ᾰ',
 'ᾱ',
 'ᾳ',
 'ᾴ',
 'ᾶ',
 'ᾷ',
 'ῃ',
 'ῄ',
 'ῆ',
 'ῇ',
 'ῐ',
 'ΐ',
 'ῖ',
 'ῠ',
 'ῥ',
 'ῦ',
 'ῳ',
 'ῴ',
 'ῶ',
 'ῷ',
 '’']

In [6]:
# creo un dizionario che assegna ad ogni carattere un indice
char_to_idx = {char: idx for idx, char in enumerate(char_vocab)}

vocab_chars = len(char_to_idx)

In [7]:
morph_df.columns

Index(['POS', 'lemma', 'form', 'tense', 'mode', 'act/mid/p', 'gender', 'case',
       'person', 'number', 'lemma_tok', 'form_tok'],
      dtype='object')

In [8]:

char_emb_dim = 64
max_len = 14
input_layers = []
# input and embeddings for lemma's characters
lemma_input = Input(shape=(14,), name = "lemma_input")
input_layers.append(lemma_input)
lemma_emb = Embedding(input_dim = vocab_chars, output_dim = char_emb_dim, name = "lemma_emb")(lemma_input)

# #input and embeddings for form's characters
# form_input = Input(shape=(None,), name = "form_input")
# form_emb = Embedding(input_dim = vocab_chars, output_dim = char_emb_dim, name = "form_emb")(form_input)


# for the features the embeddings will be repeated for every character
# input and embeddings for POS
pos_input = Input(shape = (1,), name = "pos_input")
input_layers.append(pos_input)
vocab_pos = len(morph_df['POS'].unique())
pos_emb_dim = 8
pos_emb = Embedding(input_dim = vocab_pos, output_dim = pos_emb_dim, name = "pos_emb")(pos_input)
pos_emb_rep = Lambda(lambda x: tf.repeat(x, repeats=max_len, axis=1))(pos_emb)

# input and embeddings for tense
tense_input = Input(shape = (1,), name = "tense_input")
input_layers.append(tense_input)
vocab_tense = len(morph_df['tense'].unique())
tense_emb_dim = 16
tense_emb = Embedding(input_dim = vocab_tense, output_dim = tense_emb_dim, name = "tense_emb")(tense_input)
tense_emb_rep = Lambda(lambda x: tf.repeat(x, repeats=max_len, axis=1))(tense_emb)

# input and embedding for mode 
mode_input = Input(shape = (1,), name = "mode_input")
input_layers.append(mode_input)
vocab_mode = len(morph_df['mode'].unique())
mode_emb_dim = 16
mode_emb = Embedding(input_dim = vocab_mode, output_dim = mode_emb_dim, name = "mode_emb")(mode_input)
mode_emb_rep = Lambda(lambda x: tf.repeat(x, repeats=max_len, axis=1))(mode_emb)

# input and embedding for diathesis
diath_input = Input(shape = (1,), name = "diath_input")
input_layers.append(diath_input)
vocab_diath = len(morph_df['act/mid/p'].unique())
diath_emb_dim = 8
diath_emb = Embedding(input_dim = vocab_diath, output_dim = diath_emb_dim, name = "diath_emb")(diath_input)
diath_emb_rep = Lambda(lambda x: tf.repeat(x, repeats=max_len, axis=1))(diath_emb)

# input and embedding for gender
gender_input = Input(shape = (1,), name = "gender_input")
input_layers.append(gender_input)
vocab_gender = len(morph_df['gender'].unique())
gender_emb_dim = 8
gender_emb = Embedding(input_dim = vocab_gender, output_dim = gender_emb_dim, name = "gender_emb")(gender_input)
gender_emb_rep = Lambda(lambda x: tf.repeat(x, repeats=max_len, axis=1))(gender_emb)

# input and embedding for case
case_input = Input(shape = (1,), name = "case_input")
input_layers.append(case_input)
vocab_case = len(morph_df['case'].unique())
case_emb_dim = 16
case_emb = Embedding(input_dim = vocab_case, output_dim = case_emb_dim, name = "case_emb")(case_input)
case_emb_rep = Lambda(lambda x: tf.repeat(x, repeats=max_len, axis=1))(case_emb)

# input and embedding for person
person_input = Input(shape = (1,), name = "person_input")
input_layers.append(person_input)
vocab_person = len(morph_df['person'].unique())
person_emb_dim = 8
person_emb = Embedding(input_dim = vocab_person, output_dim = person_emb_dim, name = "person_emb")(person_input)
person_emb_rep = Lambda(lambda x: tf.repeat(x, repeats=max_len, axis=1))(person_emb)

# input and embedding for number
number_input = Input(shape = (1,), name = "number_input")
input_layers.append(number_input)
vocab_number = len(morph_df['number'].unique())
number_emb_dim = 8
number_emb = Embedding(input_dim = vocab_number, output_dim = number_emb_dim, name = "number_emb")(number_input)
number_emb_rep = Lambda(lambda x: tf.repeat(x, repeats=max_len, axis=1))(number_emb)







In [9]:
print(f"Shape of lemma_emb: {lemma_emb.shape}")
print(f"Shape of pos_emb_repeated: {pos_emb_rep.shape}")
print(f"Shape of tense_emb_repeated: {tense_emb_rep.shape}")
print(f"Shape of mode_emb_repeated: {mode_emb_rep.shape}")
print(f"Shape of diath_emb: {diath_emb_rep.shape}")
print(f"Shape of gender_emb_repeated: {gender_emb_rep.shape}")
print(f"Shape of case_emb_repeated: {case_emb_rep.shape}")
print(f"Shape of person_emb_repeated: {person_emb_rep.shape}")
print(f"Shape of number_emb: {number_emb_rep.shape}")


Shape of lemma_emb: (None, 14, 64)
Shape of pos_emb_repeated: (None, 14, 8)
Shape of tense_emb_repeated: (None, 14, 16)
Shape of mode_emb_repeated: (None, 14, 16)
Shape of diath_emb: (None, 14, 8)
Shape of gender_emb_repeated: (None, 14, 8)
Shape of case_emb_repeated: (None, 14, 16)
Shape of person_emb_repeated: (None, 14, 8)
Shape of number_emb: (None, 14, 8)


In [10]:
# we now concatenate all the embeddings:

combined_embedding = Concatenate(axis = -1)([
    lemma_emb,  # character embeddings for lemma
    pos_emb_rep,  # POS embedding
    tense_emb_rep,  # tense embedding for verbs/participles
    mode_emb_rep,  # mode embedding for verbs/participles
    diath_emb_rep,  # diathesis for verbs/participles
    gender_emb_rep,  # gender for nouns/participles
    case_emb_rep,  # case for nouns/participles
    person_emb_rep,  # person for verbs
    number_emb_rep, # number for verbs/nouns/participles
])

In [11]:
# we now add lstm and dense layer

lstm_output = LSTM(128, return_sequences=True)(combined_embedding)

output = Dense(vocab_chars, activation = "softmax")(lstm_output)

In [40]:
model = Model(inputs = input_layers, outputs = output)
model.compile(optimizer = "adam", loss = "sparse_categorical_crossentropy", metrics = ['accuracy'])
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 pos_input (InputLayer)      [(None, 1)]                  0         []                            
                                                                                                  
 tense_input (InputLayer)    [(None, 1)]                  0         []                            
                                                                                                  
 mode_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 diath_input (InputLayer)    [(None, 1)]                  0         []                            
                                                                                            

In [12]:
# now we split the dataframe into training, validation and test set
train_df, temp_df = train_test_split(morph_df, test_size=0.3, stratify=morph_df['POS'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['POS'], random_state=42)

In [13]:
# we try to determine the best max_len to pad the inflected forms and lemmata
# first we understand the distribution of lemmata and inflected forms' length

morph_df['lemmata_length'] = morph_df["lemma_tok"].apply(len)
morph_df['forms_length'] = morph_df["form_tok"].apply(len)

max_len_lemmata = int(morph_df['lemmata_length'].quantile(0.95))
print(f"Max length covering 95% of lemmata: {max_len_lemmata}")
max_len_forms = int(morph_df['forms_length'].quantile(0.95))
print(f"Max length covering 95% of forms: {max_len_forms}")



Max length covering 95% of lemmata: 12
Max length covering 95% of forms: 14


In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:

# we now create arrays for all the df_train columns
pos_train = np.array(train_df["POS"]).reshape(-1, 1)
lemma_train = pad_sequences(np.array(train_df["lemma_tok"]), maxlen= max_len, padding = 'post', truncating = 'pre')
tense_train = np.array(train_df["tense"]).reshape(-1, 1)
mode_train = np.array(train_df["mode"]).reshape(-1, 1)
diath_train = np.array(train_df["act/mid/p"]).reshape(-1, 1)
gender_train = np.array(train_df["gender"]).reshape(-1, 1)
case_train = np.array(train_df["case"]).reshape(-1, 1)
person_train = np.array(train_df["person"]).reshape(-1, 1)
number_train = np.array(train_df["number"]).reshape(-1, 1)

# then we create the labels
y_train = pad_sequences(np.array(train_df["form_tok"]), maxlen = max_len, padding = 'post', truncating = 'pre')


In [16]:
# now we do the same for the validation set
pos_val = np.array(val_df["POS"]).reshape(-1, 1)
lemma_val = pad_sequences(np.array(val_df["lemma_tok"]), maxlen= max_len, padding = 'post', truncating = 'pre')
tense_val = np.array(val_df["tense"]).reshape(-1, 1)
mode_val = np.array(val_df["mode"]).reshape(-1, 1)
diath_val = np.array(val_df["act/mid/p"]).reshape(-1, 1)
gender_val = np.array(val_df["gender"]).reshape(-1, 1)
case_val = np.array(val_df["case"]).reshape(-1, 1)
person_val = np.array(val_df["person"]).reshape(-1, 1)
number_val = np.array(val_df["number"]).reshape(-1, 1)

y_val = pad_sequences(np.array(val_df["form_tok"]), maxlen = max_len, padding = 'post', truncating = 'pre')

In [17]:
X_train = {
    'pos_input': pos_train,
    'lemma_input': lemma_train,
    'tense_input': tense_train,
    'mode_input': mode_train,
    'diath_input': diath_train,
    'gender_input': gender_train,
    'case_input': case_train,
    'person_input': person_train,
    'number_input': number_train
}

In [18]:
X_val = {
    'pos_input': pos_val,
    'lemma_input': lemma_val,
    'tense_input': tense_val,
    'mode_input': mode_val,
    'diath_input': diath_val,
    'gender_input': gender_val,
    'case_input': case_val,
    'person_input': person_val,
    'number_input': number_val
}

In [24]:
history = model.fit(X_train, y_train, epochs = 5, validation_data = (X_val, y_val), batch_size = 32, verbose = 1)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
from keras.callbacks import EarlyStopping

In [20]:
early_stopping = EarlyStopping(monitor="val_loss", patience = 5, verbose = 1, mode = "min", restore_best_weights=True)

In [23]:
history = model.fit(X_train, y_train, 
                    epochs = 20, 
                    validation_data = (X_val, y_val), 
                    batch_size = 32, 
                    verbose = 1,
                    callbacks = [early_stopping])

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [21]:
from sklearn.metrics import classification_report

In [27]:
y_pred = model.predict(X_val)



In [22]:
idx_to_char = {index: char for char, index in char_to_idx.items()}

In [23]:
def decode_seq(seq, idx_to_char):
    return "".join([idx_to_char.get(index, "") for index in seq])

In [24]:
def decode_pred(y_pred, idx_to_char):
    y_pred_idx = np.argmax(y_pred, axis = -1)
    # we now eliminate the padding
    # pad_token = 0
    # y_pred_idx_no_pad = [char for seq in y_pred_idx for char in seq if char != pad_token]
    return [decode_seq(seq, idx_to_char) for seq in y_pred_idx]

In [25]:
y_true_decoded = [decode_seq(seq, idx_to_char) for seq in y_val]
# y_pred_decoded = decode_pred(y_pred, idx_to_char)


In [26]:
y_true_chars = [char for seq in y_true_decoded for char in seq]
# y_pred_chars = [char for seq in y_pred_decoded for char in seq]

In [37]:
print(classification_report(y_true_chars, y_pred_chars))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           ά       0.96      0.97      0.96    860603
           έ       0.81      0.78      0.79     36835
           ή       0.75      0.72      0.74     15358
           ί       0.82      0.81      0.82     31885
           α       0.82      0.84      0.83    152202
           β       0.92      0.91      0.92     13170
           γ       0.89      0.88      0.88     27088
           δ       0.91      0.92      0.91     30238
           ε       0.81      0.81      0.81    127631
           ζ       0.92      0.92      0.92     11421
           η       0.79      0.70      0.74     33765
           θ       0.87      0.81      0.84     24968
           ι       0.82      0.83      0.82    122062
           κ       0.90      0.90      0.90     62215
           λ       0.90      0.91      0.90     50805
           μ       0.91      0.91      0.91     75617
           ν       0.87      0.87      0.87    167920
           ξ       0.87    

  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
for i in range(15):
    print(f"true : {y_true_decoded[i]}  predicted: {y_pred_decoded[i]}")

true : ἁρμοστοῦάάάάάά  predicted: ἁρμοστοῦάάάάάά
true : μειλίχιεάάάάάά  predicted: μειλίχιεάάάάάά
true : ἐνιδόντεςάάάάά  predicted: ἐνδόντεςάάάάάά
true : προαπέλαυονάάά  predicted: προαπέλαυονάάά
true : φερομέναάάάάάά  predicted: φερομέναάάάάάά
true : θνητοίάάάάάάάά  predicted: θνητοίάάάάάάάά
true : ἐπαχθεῖσινάάάά  predicted: ἐπαχθήῖινάάάάά
true : ἀνοιδέειάάάάάά  predicted: ἀνοιδεῖάάάάάάά
true : διακεφαλαιοῦνά  predicted: διακεφαλαιοῦνά
true : ἐπαναβᾶσαιάάάά  predicted: ἐπαναβάσαιάάάά
true : ᾐσχύγκειάάάάάά  predicted: ᾔιθδυκειάάάάάά
true : ἀπογεγραμμέναά  predicted: ἀπογεγραμμέναά
true : γήιναάάάάάάάάά  predicted: γήίναάάάάάάάάά
true : καταφυσῶνάάάάά  predicted: καταφυσῶνάάάάά
true : προσελομένοιςά  predicted: προσγιομένοιςά


In [38]:
from sklearn.metrics import accuracy_score

# Calculate sequence-level accuracy.
sequence_accuracy = accuracy_score(y_true_decoded, y_pred_decoded)
print(f'Sequence-level Accuracy: {sequence_accuracy:.2f}')

Sequence-level Accuracy: 0.46


In [27]:
from tensorflow.keras.layers import Dropout

In [55]:
# we now create a model with two lstm layers and a Dropout layer
lstm_input = LSTM(128, return_sequences=True)(combined_embedding)

dropout = Dropout(0.2)(lstm_input)

lstm_output = LSTM(64, return_sequences=True)(dropout)

second_output = Dense(vocab_chars, activation = "softmax")(lstm_output)

In [56]:
second_model = Model(inputs = input_layers, outputs = second_output)
second_model.compile(optimizer = "adam", loss = "sparse_categorical_crossentropy", metrics = ['accuracy'])
second_model.summary()



Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 pos_input (InputLayer)      [(None, 1)]                  0         []                            
                                                                                                  
 tense_input (InputLayer)    [(None, 1)]                  0         []                            
                                                                                                  
 mode_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 diath_input (InputLayer)    [(None, 1)]                  0         []                            
                                                                                            

In [57]:
history = second_model.fit(X_train, y_train, 
                    epochs = 100, 
                    validation_data = (X_val, y_val), 
                    batch_size = 32, 
                    verbose = 1,
                    callbacks = [early_stopping])

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 42: early stopping


In [58]:
second_y_pred = second_model.predict(X_val)



In [61]:
second_y_pred

array([[[5.82880348e-05, 4.85383509e-07, 1.02278580e-06, ...,
         9.04996341e-07, 4.69977431e-06, 4.28635906e-03],
        [3.85297790e-05, 3.64853477e-05, 2.29586349e-05, ...,
         8.42665850e-06, 1.72788521e-06, 8.43912931e-05],
        [1.54008158e-05, 1.86385605e-05, 2.39327728e-05, ...,
         3.60017730e-05, 1.44081832e-06, 1.38114165e-05],
        ...,
        [9.99955654e-01, 6.30950467e-07, 3.73427014e-07, ...,
         1.58731375e-08, 5.40381073e-08, 5.91598052e-07],
        [9.99985814e-01, 1.40754054e-07, 1.35468014e-07, ...,
         1.10358767e-09, 1.12942953e-08, 2.02389742e-07],
        [9.99992371e-01, 5.33752704e-08, 7.87581627e-08, ...,
         2.29026034e-10, 3.39627193e-09, 9.11897615e-08]],

       [[2.61087971e-05, 9.13921951e-07, 2.24265114e-06, ...,
         2.94311440e-06, 1.08078826e-08, 5.38074564e-05],
        [3.69079612e-06, 6.08376833e-03, 7.90619306e-05, ...,
         5.46143310e-05, 3.54110693e-06, 4.66764868e-05],
        [9.06730293e-06, 

In [77]:
second_y_pred_decoded = decode_pred(second_y_pred, idx_to_char)

In [78]:
second_y_pred_chars = [char for seq in second_y_pred_decoded for char in seq]

In [79]:
print(classification_report(y_true_chars, second_y_pred_chars))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           ά       0.96      0.98      0.97    860603
           έ       0.84      0.79      0.81     36835
           ή       0.76      0.79      0.78     15358
           ί       0.84      0.83      0.83     31885
           α       0.83      0.85      0.84    152202
           β       0.93      0.91      0.92     13170
           γ       0.89      0.89      0.89     27088
           δ       0.91      0.92      0.91     30238
           ε       0.82      0.83      0.82    127631
           ζ       0.90      0.94      0.92     11421
           η       0.79      0.75      0.77     33765
           θ       0.88      0.82      0.85     24968
           ι       0.85      0.82      0.83    122062
           κ       0.90      0.91      0.90     62215
           λ       0.91      0.91      0.91     50805
           μ       0.92      0.92      0.92     75617
           ν       0.90      0.86      0.88    167920
           ξ       0.89    

  _warn_prf(average, modifier, msg_start, len(result))


In [80]:
from sklearn.metrics import accuracy_score

# Calculate sequence-level accuracy.
sequence_accuracy = accuracy_score(y_true_decoded, second_y_pred_decoded)
print(f'Sequence-level Accuracy: {sequence_accuracy:.2f}')

Sequence-level Accuracy: 0.50


In [81]:
with open("results.txt", "w", encoding = "utf-8") as outfile:
    for i in range(len(y_true_decoded)):
        outfile.write(f"true:  {y_true_decoded[i]} predicted:  {second_y_pred_decoded[i]}\n")

In [28]:
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.layers import Bidirectional

In [29]:
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
adam = Adam(learning_rate=0.001)
callbacks = [reduce_lr, early_stopping]

In [31]:
# we now create a model with two lstm layers and a Dropout layer
lstm_input = LSTM(128, return_sequences=True)(combined_embedding)

dropout = Dropout(0.2)(lstm_input)

bi_lstm_output = Bidirectional(LSTM(64, return_sequences=True))(dropout)

third_output = Dense(vocab_chars, activation = "softmax")(lstm_output)

In [33]:
third_model = Model(inputs = input_layers, outputs = third_output)
third_model.compile(optimizer = adam, loss = "sparse_categorical_crossentropy", metrics = ['accuracy'])
third_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 pos_input (InputLayer)      [(None, 1)]                  0         []                            
                                                                                                  
 tense_input (InputLayer)    [(None, 1)]                  0         []                            
                                                                                                  
 mode_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 diath_input (InputLayer)    [(None, 1)]                  0         []                            
                                                                                              

                                                                                                  
 number_input (InputLayer)   [(None, 1)]                  0         []                            
                                                                                                  
 lemma_input (InputLayer)    [(None, 14)]                 0         []                            
                                                                                                  
 pos_emb (Embedding)         (None, 1, 8)                 24        ['pos_input[0][0]']           
                                                                                                  
 tense_emb (Embedding)       (None, 1, 16)                128       ['tense_input[0][0]']         
                                                                                                  
 mode_emb (Embedding)        (None, 1, 16)                96        ['mode_input[0][0]']          
          

In [35]:
history = third_model.fit(X_train, y_train, 
                    epochs = 100, 
                    validation_data = (X_val, y_val), 
                    batch_size = 32, 
                    verbose = 1,
                    callbacks = callbacks)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100


In [36]:
third_y_pred = third_model.predict(X_val)



In [38]:
third_y_pred_decoded = decode_pred(third_y_pred, idx_to_char)
third_y_pred_chars = [char for seq in third_y_pred_decoded for char in seq]

In [39]:
print(classification_report(y_true_chars, third_y_pred_chars))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           ά       0.96      0.97      0.97    860603
           έ       0.82      0.79      0.81     36835
           ή       0.76      0.76      0.76     15358
           ί       0.83      0.82      0.83     31885
           α       0.83      0.85      0.84    152202
           β       0.92      0.91      0.92     13170
           γ       0.90      0.89      0.90     27088
           δ       0.91      0.92      0.92     30238
           ε       0.81      0.82      0.82    127631
           ζ       0.92      0.93      0.92     11421
           η       0.79      0.73      0.76     33765
           θ       0.88      0.82      0.85     24968
           ι       0.83      0.83      0.83    122062
           κ       0.90      0.91      0.90     62215
           λ       0.91      0.91      0.91     50805
           μ       0.92      0.92      0.92     75617
           ν       0.89      0.87      0.88    167920
           ξ       0.90    

  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
from sklearn.metrics import accuracy_score

# Calculate sequence-level accuracy.
sequence_accuracy = accuracy_score(y_true_decoded, third_y_pred_decoded)
print(f'Sequence-level Accuracy: {sequence_accuracy:.2f}')

Sequence-level Accuracy: 0.49


In [41]:
with open("third_model_results.txt", "w", encoding = "utf-8") as outfile:
    for i in range(len(y_true_decoded)):
        outfile.write(f"true:  {y_true_decoded[i]} predicted:  {third_y_pred_decoded[i]}\n")