In [34]:
import pandas as pd
import gzip
import pickle
import numpy as np
from sklearn.model_selection import train_test_split

In [13]:
# Load the pre-built morphology DataFrame from its gzip-compressed pickle.
# NOTE: unpickling can execute arbitrary code, so only load this file from a trusted source.
with gzip.open('morph_df.pkl.gz', 'rb') as pickle_file:
    morph_df = pickle.load(pickle_file)

In [14]:
# Display the loaded DataFrame (the bare trailing expression is rendered as the cell output).
morph_df

Unnamed: 0,POS,lemma,form,tense,mode,act/mid/p,gender,case,person,number,lemma_tok,form_tok
0,2,ἅλλομαι,ἅλεται,1,5,2,0,0,3,2,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 8, 23, 4, 12]"
1,2,ἅλλομαι,ἅληται,1,5,2,0,0,3,2,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 10, 23, 4, 12]"
2,2,ἅλλομαι,ἅλῃ,1,5,2,0,0,2,2,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 95]"
3,2,ἅλλομαι,ἅλλεσθε,4,2,2,0,0,2,1,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 14, 8, 22, 11, 8]"
4,2,ἅλλομαι,ἅλλεσθε,7,1,2,0,0,2,1,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 14, 8, 22, 11, 8]"
...,...,...,...,...,...,...,...,...,...,...,...,...
329771,0,ζῳώδης,ζῳῶδες,0,0,0,2,5,4,2,"[9, 105, 33, 7, 10, 21]","[9, 105, 107, 7, 8, 21]"
329772,0,ζῳώδης,ζῳῶδες,0,0,0,1,5,4,2,"[9, 105, 33, 7, 10, 21]","[9, 105, 107, 7, 8, 21]"
329773,0,ζῳώδης,ζῳῶδες,0,0,0,3,4,4,2,"[9, 105, 33, 7, 10, 21]","[9, 105, 107, 7, 8, 21]"
329774,0,ζῳώδης,ζῳῶδες,0,0,0,3,5,4,2,"[9, 105, 33, 7, 10, 21]","[9, 105, 107, 7, 8, 21]"


In [15]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from tensorflow.keras.models import Model

In [16]:
# Build the character vocabulary from every lemma and every inflected form.
all_characters = set()


def collect_unique_chars(word):
    """Add each character of `word` to the module-level `all_characters` set."""
    all_characters.update(word)


# Visit the lemma column first, then the form column, one word at a time.
for text_column in ("lemma", "form"):
    for word in morph_df[text_column]:
        collect_unique_chars(word)

# Sorted so that every character gets a stable, reproducible position.
char_vocab = sorted(all_characters)
char_vocab

['ά',
 'έ',
 'ή',
 'ί',
 'α',
 'β',
 'γ',
 'δ',
 'ε',
 'ζ',
 'η',
 'θ',
 'ι',
 'κ',
 'λ',
 'μ',
 'ν',
 'ξ',
 'ο',
 'π',
 'ρ',
 'ς',
 'σ',
 'τ',
 'υ',
 'φ',
 'χ',
 'ψ',
 'ω',
 'ϊ',
 'ϋ',
 'ό',
 'ύ',
 'ώ',
 'ἀ',
 'ἁ',
 'ἄ',
 'ἅ',
 'ἆ',
 'ἇ',
 'ἐ',
 'ἑ',
 'ἔ',
 'ἕ',
 'ἠ',
 'ἡ',
 'ἤ',
 'ἥ',
 'ἦ',
 'ἧ',
 'ἰ',
 'ἱ',
 'ἴ',
 'ἵ',
 'ἶ',
 'ἷ',
 'ὀ',
 'ὁ',
 'ὄ',
 'ὅ',
 'ὐ',
 'ὑ',
 'ὔ',
 'ὕ',
 'ὖ',
 'ὗ',
 'ὠ',
 'ὡ',
 'ὤ',
 'ὥ',
 'ὦ',
 'ὧ',
 'ὶ',
 'ᾀ',
 'ᾄ',
 'ᾅ',
 'ᾆ',
 'ᾇ',
 'ᾐ',
 'ᾑ',
 'ᾔ',
 'ᾕ',
 'ᾖ',
 'ᾗ',
 'ᾠ',
 'ᾡ',
 'ᾤ',
 'ᾦ',
 'ᾧ',
 'ᾰ',
 'ᾱ',
 'ᾳ',
 'ᾴ',
 'ᾶ',
 'ᾷ',
 'ῃ',
 'ῄ',
 'ῆ',
 'ῇ',
 'ῐ',
 'ΐ',
 'ῖ',
 'ῠ',
 'ῥ',
 'ῦ',
 'ῳ',
 'ῴ',
 'ῶ',
 'ῷ',
 '’']

In [17]:
# Build a dictionary that maps every character to its position in char_vocab.
# NOTE(review): position 0 is a real character, so 0 cannot double as a padding
# value without colliding with it - confirm how sequences are padded.
char_to_idx = dict(zip(char_vocab, range(len(char_vocab))))

# Number of distinct characters; used as the size of the character embeddings
# and of the softmax output.
vocab_chars = len(char_to_idx)

In [18]:
# List the column labels of the DataFrame.
morph_df.columns

Index(['POS', 'lemma', 'form', 'tense', 'mode', 'act/mid/p', 'gender', 'case',
       'person', 'number', 'lemma_tok', 'form_tok'],
      dtype='object')

In [25]:
# Build one Keras Input + Embedding pair per feature.
# `input_layers` collects the Inputs in the order they are later passed to Model().
char_emb_dim = 64
input_layers = []


def _char_sequence_feature(prefix):
    """Create a variable-length character-index input and its embedding.

    Args:
        prefix: layer-name prefix; layers are named `<prefix>_input` / `<prefix>_emb`.

    Returns:
        `(input_tensor, embedding_tensor)`.
    """
    seq_input = Input(shape=(None,), name=f"{prefix}_input")
    seq_emb = Embedding(
        input_dim=vocab_chars, output_dim=char_emb_dim, name=f"{prefix}_emb"
    )(seq_input)
    return seq_input, seq_emb


def _categorical_feature(column, prefix, emb_dim):
    """Create a single-index input and its embedding for one categorical column.

    The embedding table is sized as `max code + 1` rather than the number of
    distinct codes: Embedding's `input_dim` must exceed the largest index it
    will ever look up, and counting unique values under-sizes the table
    whenever the integer codes have gaps. Assumes the column holds
    non-negative integer codes.

    Args:
        column: name of the column in `morph_df`.
        prefix: layer-name prefix; layers are named `<prefix>_input` / `<prefix>_emb`.
        emb_dim: embedding output dimension.

    Returns:
        `(input_tensor, vocab_size, embedding_tensor)`.
    """
    vocab_size = int(morph_df[column].max()) + 1
    cat_input = Input(shape=(1,), name=f"{prefix}_input")
    cat_emb = Embedding(
        input_dim=vocab_size, output_dim=emb_dim, name=f"{prefix}_emb"
    )(cat_input)
    return cat_input, vocab_size, cat_emb


# input and embeddings for the lemma's characters
lemma_input, lemma_emb = _char_sequence_feature("lemma")
input_layers.append(lemma_input)

# input and embeddings for the form's characters
form_input, form_emb = _char_sequence_feature("form")
input_layers.append(form_input)

# input and embeddings for POS
pos_emb_dim = 8
pos_input, vocab_pos, pos_emb = _categorical_feature("POS", "pos", pos_emb_dim)
input_layers.append(pos_input)

# input and embeddings for tense
tense_emb_dim = 16
tense_input, vocab_tense, tense_emb = _categorical_feature("tense", "tense", tense_emb_dim)
input_layers.append(tense_input)

# input and embedding for mode
mode_emb_dim = 16
mode_input, vocab_mode, mode_emb = _categorical_feature("mode", "mode", mode_emb_dim)
input_layers.append(mode_input)

# input and embedding for diathesis
diath_emb_dim = 8
diath_input, vocab_diath, diath_emb = _categorical_feature("act/mid/p", "diath", diath_emb_dim)
input_layers.append(diath_input)

# input and embedding for gender
gender_emb_dim = 8
gender_input, vocab_gender, gender_emb = _categorical_feature("gender", "gender", gender_emb_dim)
input_layers.append(gender_input)

# input and embedding for case
case_emb_dim = 16
case_input, vocab_case, case_emb = _categorical_feature("case", "case", case_emb_dim)
input_layers.append(case_input)

# input and embedding for person
person_emb_dim = 8
person_input, vocab_person, person_emb = _categorical_feature("person", "person", person_emb_dim)
input_layers.append(person_input)

# input and embedding for number
number_emb_dim = 8
number_input, vocab_number, number_emb = _categorical_feature("number", "number", number_emb_dim)
input_layers.append(number_input)




In [27]:
# We now concatenate all the embeddings along the feature (last) axis.
#
# The categorical embeddings come from Inputs of shape (1,), so their time axis
# has length 1, while the character embeddings have a variable-length time
# axis. Concatenate only joins tensors whose other axes agree, so the original
# graph failed at run time for any sequence longer than one character. Each
# categorical embedding is therefore repeated along the time axis of lemma_emb
# first (a no-op when the sequence length is 1).
# NOTE(review): lemma_emb and form_emb must still have the same sequence length
# at run time (e.g. both padded to a common length) - confirm.


def _repeat_over_time(tensors):
    """Tile a (batch, 1, dim) tensor along axis 1 to the length of a reference sequence.

    Args:
        tensors: `[reference_seq, feature]`; only the time length of
            `reference_seq` is used.

    Returns:
        `feature` repeated so that its axis 1 matches `reference_seq`.
    """
    reference_seq, feature = tensors
    n_steps = tf.shape(reference_seq)[1]
    return tf.tile(feature, tf.stack([1, n_steps, 1]))


def _broadcast_to_lemma(feature_emb, layer_name):
    """Wrap `_repeat_over_time` in a Lambda layer, using lemma_emb as the reference."""
    return tf.keras.layers.Lambda(_repeat_over_time, name=layer_name)([lemma_emb, feature_emb])


combined_embedding = Concatenate()([
    lemma_emb,  # character embeddings for lemma
    form_emb,  # character embeddings for form
    _broadcast_to_lemma(pos_emb, "pos_emb_tiled"),  # POS embedding
    _broadcast_to_lemma(tense_emb, "tense_emb_tiled"),  # tense embedding for verbs/participles
    _broadcast_to_lemma(mode_emb, "mode_emb_tiled"),  # mode embedding for verbs/participles
    _broadcast_to_lemma(diath_emb, "diath_emb_tiled"),  # diathesis for verbs/participles
    _broadcast_to_lemma(gender_emb, "gender_emb_tiled"),  # gender for nouns/participles
    _broadcast_to_lemma(case_emb, "case_emb_tiled"),  # case for nouns/participles
    _broadcast_to_lemma(person_emb, "person_emb_tiled"),  # person for verbs
    _broadcast_to_lemma(number_emb, "number_emb_tiled"),  # number for verbs/nouns/participles
])

In [28]:
# Sequence model on top of the combined embeddings: an LSTM that emits one
# state per time step, followed by a per-step softmax over the character vocabulary.
sequence_encoder = LSTM(128, return_sequences=True)
lstm_output = sequence_encoder(combined_embedding)

char_classifier = Dense(vocab_chars, activation="softmax")
output = char_classifier(lstm_output)

In [29]:
# Assemble the functional model from the collected inputs and the softmax head,
# configure training, then print the layer overview.
model = Model(inputs=input_layers, outputs=output)
model.compile(
    loss="sparse_categorical_crossentropy",  # targets are integer character indices
    optimizer="adam",
    metrics=["accuracy"],
)
model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 lemma_input (InputLayer)    [(None, None)]               0         []                            
                                                                                                  
 form_input (InputLayer)     [(None, None)]               0         []                            
                                                                                                  
 pos_input (InputLayer)      [(None, 1)]                  0         []                            
                                                                                                  
 tense_input (InputLayer)    [(None, 1)]                  0         []                            
                                                                                           

In [32]:
# Split the DataFrame into training (70%), validation (15%) and test (15%) sets,
# stratified on POS: 30% is held out first, then that hold-out is halved.
# NOTE(review): the split is row-wise, so forms of the same lemma can end up in
# different sets - confirm that is acceptable for the evaluation.
train_df, temp_df = train_test_split(
    morph_df,
    test_size=0.3,
    stratify=morph_df['POS'],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['POS'],
    random_state=42,
)

In [33]:
# List the column labels of the DataFrame.
morph_df.columns

Index(['POS', 'lemma', 'form', 'tense', 'mode', 'act/mid/p', 'gender', 'case',
       'person', 'number', 'lemma_tok', 'form_tok'],
      dtype='object')

In [37]:
# Extract the training-split columns as NumPy arrays, one per feature.
# NOTE(review): lemma_tok holds lists of differing lengths (see the DataFrame
# preview), so lemma_train is presumably a 1-D object array of lists and likely
# needs padding to a rectangular array before it can be fed to the model - confirm.
(
    pos_train,
    lemma_train,
    tense_train,
    mode_train,
    diath_train,
    gender_train,
    case_train,
    person_train,
    number_train,
) = (
    np.array(train_df[column_name])
    for column_name in (
        "POS",
        "lemma_tok",
        "tense",
        "mode",
        "act/mid/p",
        "gender",
        "case",
        "person",
        "number",
    )
)

In [39]:
# Training inputs, keyed by the `name` of the model Input each array feeds.
# The model declares a `form_input`, so it needs an entry here as well;
# without it the dictionary does not cover every model input.
# NOTE(review): form_tok is also the training target below, so the model is
# given its own target as an input - confirm this is intended (teacher forcing
# would normally feed a shifted copy).
x_train = {
    'pos_input': pos_train,
    'lemma_input': lemma_train,
    'form_input': np.array(train_df["form_tok"]),
    'tense_input': tense_train,
    'mode_input': mode_train,
    'diath_input': diath_train,
    'gender_input': gender_train,
    'case_input': case_train,
    'person_input': person_train,
    'number_input': number_train
}

# Training targets: the token indices of the inflected form.
y_train = np.array(train_df["form_tok"])