In [81]:
import pandas as pd
import gzip
import pickle
import numpy as np
from sklearn.model_selection import train_test_split

In [82]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [83]:
with gzip.open('morph_df.pkl.gz', 'rb') as f:
    morph_df = pickle.load(f)

In [84]:
morph_df

Unnamed: 0,POS,lemma,form,tense,mode,act/mid/p,gender,case,person,number,lemma_tok,form_tok
0,2,ἅλλομαι,ἅλεται,1,5,2,0,0,3,2,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 8, 23, 4, 12]"
1,2,ἅλλομαι,ἅληται,1,5,2,0,0,3,2,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 10, 23, 4, 12]"
2,2,ἅλλομαι,ἅλῃ,1,5,2,0,0,2,2,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 95]"
3,2,ἅλλομαι,ἅλλεσθε,4,2,2,0,0,2,1,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 14, 8, 22, 11, 8]"
4,2,ἅλλομαι,ἅλλεσθε,7,1,2,0,0,2,1,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 14, 8, 22, 11, 8]"
...,...,...,...,...,...,...,...,...,...,...,...,...
329771,0,ζῳώδης,ζῳῶδες,0,0,0,2,5,4,2,"[9, 105, 33, 7, 10, 21]","[9, 105, 107, 7, 8, 21]"
329772,0,ζῳώδης,ζῳῶδες,0,0,0,1,5,4,2,"[9, 105, 33, 7, 10, 21]","[9, 105, 107, 7, 8, 21]"
329773,0,ζῳώδης,ζῳῶδες,0,0,0,3,4,4,2,"[9, 105, 33, 7, 10, 21]","[9, 105, 107, 7, 8, 21]"
329774,0,ζῳώδης,ζῳῶδες,0,0,0,3,5,4,2,"[9, 105, 33, 7, 10, 21]","[9, 105, 107, 7, 8, 21]"


In [85]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Lambda
from tensorflow.keras.models import Model

In [86]:
# creo un vocabolario dei caratteri
all_characters = set()

def collect_unique_chars(word):
    for char in word:
        all_characters.add(char)

morph_df["lemma"].apply(collect_unique_chars)
morph_df["form"].apply(collect_unique_chars)

char_vocab = (sorted(list(all_characters)))
char_vocab

['ά',
 'έ',
 'ή',
 'ί',
 'α',
 'β',
 'γ',
 'δ',
 'ε',
 'ζ',
 'η',
 'θ',
 'ι',
 'κ',
 'λ',
 'μ',
 'ν',
 'ξ',
 'ο',
 'π',
 'ρ',
 'ς',
 'σ',
 'τ',
 'υ',
 'φ',
 'χ',
 'ψ',
 'ω',
 'ϊ',
 'ϋ',
 'ό',
 'ύ',
 'ώ',
 'ἀ',
 'ἁ',
 'ἄ',
 'ἅ',
 'ἆ',
 'ἇ',
 'ἐ',
 'ἑ',
 'ἔ',
 'ἕ',
 'ἠ',
 'ἡ',
 'ἤ',
 'ἥ',
 'ἦ',
 'ἧ',
 'ἰ',
 'ἱ',
 'ἴ',
 'ἵ',
 'ἶ',
 'ἷ',
 'ὀ',
 'ὁ',
 'ὄ',
 'ὅ',
 'ὐ',
 'ὑ',
 'ὔ',
 'ὕ',
 'ὖ',
 'ὗ',
 'ὠ',
 'ὡ',
 'ὤ',
 'ὥ',
 'ὦ',
 'ὧ',
 'ὶ',
 'ᾀ',
 'ᾄ',
 'ᾅ',
 'ᾆ',
 'ᾇ',
 'ᾐ',
 'ᾑ',
 'ᾔ',
 'ᾕ',
 'ᾖ',
 'ᾗ',
 'ᾠ',
 'ᾡ',
 'ᾤ',
 'ᾦ',
 'ᾧ',
 'ᾰ',
 'ᾱ',
 'ᾳ',
 'ᾴ',
 'ᾶ',
 'ᾷ',
 'ῃ',
 'ῄ',
 'ῆ',
 'ῇ',
 'ῐ',
 'ΐ',
 'ῖ',
 'ῠ',
 'ῥ',
 'ῦ',
 'ῳ',
 'ῴ',
 'ῶ',
 'ῷ',
 '’']

In [87]:
# creo un dizionario che assegna ad ogni carattere un indice
char_to_idx = {char: idx for idx, char in enumerate(char_vocab)}

vocab_chars = len(char_to_idx)

In [88]:
morph_df.columns

Index(['POS', 'lemma', 'form', 'tense', 'mode', 'act/mid/p', 'gender', 'case',
       'person', 'number', 'lemma_tok', 'form_tok'],
      dtype='object')

In [89]:

char_emb_dim = 64
max_len = 14
input_layers = []
# input and embeddings for lemma's characters
lemma_input = Input(shape=(14,), name = "lemma_input")
input_layers.append(lemma_input)
lemma_emb = Embedding(input_dim = vocab_chars, output_dim = char_emb_dim, name = "lemma_emb")(lemma_input)

# #input and embeddings for form's characters
# form_input = Input(shape=(None,), name = "form_input")
# form_emb = Embedding(input_dim = vocab_chars, output_dim = char_emb_dim, name = "form_emb")(form_input)


# for the features the embeddings will be repeated for every character
# input and embeddings for POS
pos_input = Input(shape = (1,), name = "pos_input")
input_layers.append(pos_input)
vocab_pos = len(morph_df['POS'].unique())
pos_emb_dim = 8
pos_emb = Embedding(input_dim = vocab_pos, output_dim = pos_emb_dim, name = "pos_emb")(pos_input)
pos_emb_rep = Lambda(lambda x: tf.repeat(x, repeats=max_len, axis=1))(pos_emb)

# input and embeddings for tense
tense_input = Input(shape = (1,), name = "tense_input")
input_layers.append(tense_input)
vocab_tense = len(morph_df['tense'].unique())
tense_emb_dim = 16
tense_emb = Embedding(input_dim = vocab_tense, output_dim = tense_emb_dim, name = "tense_emb")(tense_input)
tense_emb_rep = Lambda(lambda x: tf.repeat(x, repeats=max_len, axis=1))(tense_emb)

# input and embedding for mode 
mode_input = Input(shape = (1,), name = "mode_input")
input_layers.append(mode_input)
vocab_mode = len(morph_df['mode'].unique())
mode_emb_dim = 16
mode_emb = Embedding(input_dim = vocab_mode, output_dim = mode_emb_dim, name = "mode_emb")(mode_input)
mode_emb_rep = Lambda(lambda x: tf.repeat(x, repeats=max_len, axis=1))(mode_emb)

# input and embedding for diathesis
diath_input = Input(shape = (1,), name = "diath_input")
input_layers.append(diath_input)
vocab_diath = len(morph_df['act/mid/p'].unique())
diath_emb_dim = 8
diath_emb = Embedding(input_dim = vocab_diath, output_dim = diath_emb_dim, name = "diath_emb")(diath_input)
diath_emb_rep = Lambda(lambda x: tf.repeat(x, repeats=max_len, axis=1))(diath_emb)

# input and embedding for gender
gender_input = Input(shape = (1,), name = "gender_input")
input_layers.append(gender_input)
vocab_gender = len(morph_df['gender'].unique())
gender_emb_dim = 8
gender_emb = Embedding(input_dim = vocab_gender, output_dim = gender_emb_dim, name = "gender_emb")(gender_input)
gender_emb_rep = Lambda(lambda x: tf.repeat(x, repeats=max_len, axis=1))(gender_emb)

# input and embedding for case
case_input = Input(shape = (1,), name = "case_input")
input_layers.append(case_input)
vocab_case = len(morph_df['case'].unique())
case_emb_dim = 16
case_emb = Embedding(input_dim = vocab_case, output_dim = case_emb_dim, name = "case_emb")(case_input)
case_emb_rep = Lambda(lambda x: tf.repeat(x, repeats=max_len, axis=1))(case_emb)

# input and embedding for person
person_input = Input(shape = (1,), name = "person_input")
input_layers.append(person_input)
vocab_person = len(morph_df['person'].unique())
person_emb_dim = 8
person_emb = Embedding(input_dim = vocab_person, output_dim = person_emb_dim, name = "person_emb")(person_input)
person_emb_rep = Lambda(lambda x: tf.repeat(x, repeats=max_len, axis=1))(person_emb)

# input and embedding for number
number_input = Input(shape = (1,), name = "number_input")
input_layers.append(number_input)
vocab_number = len(morph_df['number'].unique())
number_emb_dim = 8
number_emb = Embedding(input_dim = vocab_number, output_dim = number_emb_dim, name = "number_emb")(number_input)
number_emb_rep = Lambda(lambda x: tf.repeat(x, repeats=max_len, axis=1))(number_emb)




In [90]:
print(f"Shape of lemma_emb: {lemma_emb.shape}")
print(f"Shape of pos_emb_repeated: {pos_emb_rep.shape}")
print(f"Shape of tense_emb_repeated: {tense_emb_rep.shape}")
print(f"Shape of mode_emb_repeated: {mode_emb_rep.shape}")
print(f"Shape of diath_emb: {diath_emb_rep.shape}")
print(f"Shape of gender_emb_repeated: {gender_emb_rep.shape}")
print(f"Shape of case_emb_repeated: {case_emb_rep.shape}")
print(f"Shape of person_emb_repeated: {person_emb_rep.shape}")
print(f"Shape of number_emb: {number_emb_rep.shape}")


Shape of lemma_emb: (None, 14, 64)
Shape of pos_emb_repeated: (None, 14, 8)
Shape of tense_emb_repeated: (None, 14, 16)
Shape of mode_emb_repeated: (None, 14, 16)
Shape of diath_emb: (None, 14, 8)
Shape of gender_emb_repeated: (None, 14, 8)
Shape of case_emb_repeated: (None, 14, 16)
Shape of person_emb_repeated: (None, 14, 8)
Shape of number_emb: (None, 14, 8)


In [106]:
# we now concatenate all the embeddings:

combined_embedding = Concatenate(axis = -1)([
    lemma_emb,  # character embeddings for lemma
    pos_emb_rep,  # POS embedding
    tense_emb_rep,  # tense embedding for verbs/participles
    mode_emb_rep,  # mode embedding for verbs/participles
    diath_emb_rep,  # diathesis for verbs/participles
    gender_emb_rep,  # gender for nouns/participles
    case_emb_rep,  # case for nouns/participles
    person_emb_rep,  # person for verbs
    number_emb_rep, # number for verbs/nouns/participles
])

In [107]:
# we now add lstm and dense layer

lstm_output = LSTM(128, return_sequences=True)(combined_embedding)

output = Dense(vocab_chars, activation = "softmax")(lstm_output)

In [108]:
model = Model(inputs = input_layers, outputs = output)
model.compile(optimizer = "adam", loss = "sparse_categorical_crossentropy", metrics = ['accuracy'])
model.summary()

In [109]:
# now we split the dataframe into training, validation and test set
train_df, temp_df = train_test_split(morph_df, test_size=0.3, stratify=morph_df['POS'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['POS'], random_state=42)

In [110]:
# we try to determine the best max_len to pad the inflected forms and lemmata
# first we understand the distribution of lemmata and inflected forms' length

morph_df['lemmata_length'] = morph_df["lemma_tok"].apply(len)
morph_df['forms_length'] = morph_df["form_tok"].apply(len)

max_len_lemmata = int(morph_df['lemmata_length'].quantile(0.95))
print(f"Max length covering 95% of lemmata: {max_len_lemmata}")
max_len_forms = int(morph_df['forms_length'].quantile(0.95))
print(f"Max length covering 95% of forms: {max_len_forms}")



Max length covering 95% of lemmata: 12
Max length covering 95% of forms: 14


In [111]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [112]:

# we now create arrays for all the df_train columns
pos_train = np.array(train_df["POS"]).reshape(-1, 1)
lemma_train = pad_sequences(np.array(train_df["lemma_tok"]), maxlen= max_len, padding = 'post', truncating = 'pre')
tense_train = np.array(train_df["tense"]).reshape(-1, 1)
mode_train = np.array(train_df["mode"]).reshape(-1, 1)
diath_train = np.array(train_df["act/mid/p"]).reshape(-1, 1)
gender_train = np.array(train_df["gender"]).reshape(-1, 1)
case_train = np.array(train_df["case"]).reshape(-1, 1)
person_train = np.array(train_df["person"]).reshape(-1, 1)
number_train = np.array(train_df["number"]).reshape(-1, 1)

# then we create the labels
y_train = pad_sequences(np.array(train_df["form_tok"]), maxlen = max_len, padding = 'post', truncating = 'pre')


In [113]:
# now we do the same for the validation set
pos_val = np.array(val_df["POS"]).reshape(-1, 1)
lemma_val = pad_sequences(np.array(val_df["lemma_tok"]), maxlen= max_len, padding = 'post', truncating = 'pre')
tense_val = np.array(val_df["tense"]).reshape(-1, 1)
mode_val = np.array(val_df["mode"]).reshape(-1, 1)
diath_val = np.array(val_df["act/mid/p"]).reshape(-1, 1)
gender_val = np.array(val_df["gender"]).reshape(-1, 1)
case_val = np.array(val_df["case"]).reshape(-1, 1)
person_val = np.array(val_df["person"]).reshape(-1, 1)
number_val = np.array(val_df["number"]).reshape(-1, 1)

y_val = pad_sequences(np.array(val_df["form_tok"]), maxlen = max_len, padding = 'post', truncating = 'pre')

In [114]:
X_train = {
    'pos_input': pos_train,
    'lemma_input': lemma_train,
    'tense_input': tense_train,
    'mode_input': mode_train,
    'diath_input': diath_train,
    'gender_input': gender_train,
    'case_input': case_train,
    'person_input': person_train,
    'number_input': number_train
}

In [115]:
X_val = {
    'pos_input': pos_val,
    'lemma_input': lemma_val,
    'tense_input': tense_val,
    'mode_input': mode_val,
    'diath_input': diath_val,
    'gender_input': gender_val,
    'case_input': case_val,
    'person_input': person_val,
    'number_input': number_val
}

In [116]:
model.summary()

In [118]:
from tensorflow.keras import backend as K
K.clear_session()

In [119]:
history = model.fit(X_train, y_train, epochs = 5, validation_data = (X_val, y_val), batch_size = 32, verbose = 1)

Epoch 1/5


ValueError: Exception encountered when calling Concatenate.call().

[1mDimension 1 in both shapes must be equal, but are 196 and 14. Shapes are [?,196] and [?,14]. for '{{node functional_5_1/concatenate_5_1/concat}} = ConcatV2[N=9, T=DT_FLOAT, Tidx=DT_INT32](functional_5_1/lemma_emb_1/GatherV2, functional_5_1/lambda_24_1/Repeat/Reshape_1, functional_5_1/lambda_25_1/Repeat/Reshape_1, functional_5_1/lambda_26_1/Repeat/Reshape_1, functional_5_1/lambda_27_1/Repeat/Reshape_1, functional_5_1/lambda_28_1/Repeat/Reshape_1, functional_5_1/lambda_29_1/Repeat/Reshape_1, functional_5_1/lambda_30_1/Repeat/Reshape_1, functional_5_1/lambda_31_1/Repeat/Reshape_1, functional_5_1/concatenate_5_1/concat/axis)' with input shapes: [?,1,64], [?,14,8], [?,14,16], [?,196,16], [?,14,8], [?,14,8], [?,14,16], [?,14,8], [?,14,8], [] and with computed input tensors: input[9] = <-1>.[0m

Arguments received by Concatenate.call():
  • inputs=['tf.Tensor(shape=(None, 1, 64), dtype=float32)', 'tf.Tensor(shape=(None, 14, 8), dtype=float32)', 'tf.Tensor(shape=(None, 14, 16), dtype=float32)', 'tf.Tensor(shape=(None, 196, 16), dtype=float32)', 'tf.Tensor(shape=(None, 14, 8), dtype=float32)', 'tf.Tensor(shape=(None, 14, 8), dtype=float32)', 'tf.Tensor(shape=(None, 14, 16), dtype=float32)', 'tf.Tensor(shape=(None, 14, 8), dtype=float32)', 'tf.Tensor(shape=(None, 14, 8), dtype=float32)']