In [1]:
import gzip
import json
import pandas as pd
from my_functions_improved import *

  backends.update(_get_backends("networkx.backends"))
  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


‎𐤀 CLTK version '1.3.0'. When using the CLTK in research, please cite: https://aclanthology.org/2021.acl-demo.3/

Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekSpacyProcess`, `GreekEmbeddingsProcess`, `StopsProcess`.

⸖ ``GreekSpacyProcess`` using OdyCy model by Center for Humanities Computing Aarhus from https://huggingface.co/chcaa . Please cite: https://aclanthology.org/2023.latechclfl-1.14
⸖ ``LatinEmbeddingsProcess`` using word2vec model by University of Oslo from http://vectors.nlpl.eu/ . Please cite: https://aclanthology.org/W17-0237/

⸎ To suppress these messages, instantiate ``NLP()`` with ``suppress_banner=True``.


In [2]:
# Read the gzip-compressed JSON noun table (text mode, UTF-8) and parse it into noun_dict.
with gzip.open('noun_dict.json.gz', mode='rt', encoding='utf-8') as noun_file:
    noun_dict = json.loads(noun_file.read())

In [3]:
# Read the gzip-compressed JSON verb table (text mode, UTF-8) and parse it into verb_dict.
with gzip.open('verb_dict.json.gz', mode='rt', encoding='utf-8') as verb_file:
    verb_dict = json.loads(verb_file.read())

In [4]:
# Turn the parsed noun JSON into a DataFrame for inspection.
noun_df = pd.DataFrame(data=noun_dict)

In [5]:
# Turn the parsed verb JSON into a DataFrame for inspection.
verb_df = pd.DataFrame(data=verb_dict)

In [6]:
# Preview the noun table; the recorded output is truncated to the first and last rows.
noun_df

Unnamed: 0,lemma,form,gender,case,number,dialects
0,ἅβρα,ἅβρα,fem,nom/voc/acc,dual,
1,ἅβρα,ἅβρα,fem,nom/voc,sg,"[(attic, doric, aeolic]"
2,ἅβρα,ἅβραι,fem,nom/voc,pl,
3,ἅβρα,ἅβραι,fem,dat,sg,"[(attic, doric, aeolic]"
4,ἅβρα,ἅβραις,fem,dat,pl,
...,...,...,...,...,...,...
216247,ἵστωρ,ῐ̔́στορες,,nom,pl,
216248,ἵστωρ,ῐ̔́στωρ,,nom,sg,
216249,ἵστωρ,ῐ̔́στορε,,voc,dual,
216250,ἵστωρ,ῐ̔́στορες,,voc,pl,


In [7]:
# Preview the verb table; the recorded output is truncated to the first and last rows.
verb_df

Unnamed: 0,lemma,form,tense,mode,act/mid/p,gender,case,person,number,dialects
0,ἅλλομαι,ἅλεται,aor,subj,mid,,,3rd,sg,
1,ἅλλομαι,ἅληται,aor,subj,mid,,,3rd,sg,
2,ἅλλομαι,ἅλῃ,aor,subj,mid,,,2nd,sg,
3,ἅλλομαι,ἅλλεσθαι,pres,inf,mid,,,,,
4,ἅλλομαι,ἅλλεσθε,imperf,ind,mid,,,2nd,pl,"[(doric, aeolic]"
...,...,...,...,...,...,...,...,...,...,...
859327,ζῳοτροφέω,ζῳοτροφεῖν,pres,inf,act,,,,,[doric]
859328,ζῳοτροφέω,ζῳοτροφούντων,pres,part,act,masc/neut,gen,,pl,[doric]
859329,ζῳοτροφέω,ζῳοτροφούντων,pres,imperat,act,,,3rd,pl,[doric]
859330,ζῳοτροφέω,ζῳοτροφοῦσιν,pres,part,act,masc/neut,dat,,pl,[doric]


In [15]:
# Show the noun table again (same preview as the earlier noun_df cell) before
# inspecting its individual columns.
noun_df

Unnamed: 0,lemma,form,gender,case,number,dialects
0,ἅβρα,ἅβρα,fem,nom/voc/acc,dual,
1,ἅβρα,ἅβρα,fem,nom/voc,sg,"[(attic, doric, aeolic]"
2,ἅβρα,ἅβραι,fem,nom/voc,pl,
3,ἅβρα,ἅβραι,fem,dat,sg,"[(attic, doric, aeolic]"
4,ἅβρα,ἅβραις,fem,dat,pl,
...,...,...,...,...,...,...
216247,ἵστωρ,ῐ̔́στορες,,nom,pl,
216248,ἵστωρ,ῐ̔́στωρ,,nom,sg,
216249,ἵστωρ,ῐ̔́στορε,,voc,dual,
216250,ἵστωρ,ῐ̔́στορες,,voc,pl,


In [None]:
# Expand the 'dialects' column so that each element of a list-valued cell gets its own row
# NOTE(review): this only has an effect if the cells hold real lists rather than strings - confirm
exploded_df = noun_df.explode(column='dialects')

# Collect the distinct entries that remain in the expanded 'dialects' column
unique_values = exploded_df.loc[:, 'dialects'].unique()

# Show the distinct dialect entries
print(unique_values)

In [18]:
# List the distinct values stored in the 'gender' column.
noun_df.loc[:, 'gender'].unique()

array(['fem', 'masc', 'neut', 'masc/neut', 'masc/fem', 'masc/fem/neut',
       ' '], dtype=object)

In [17]:
# Count the distinct values per morphological column; dropna=False keeps missing
# values in the count, matching the length of the column's .unique() array.
unique_gender, unique_case, unique_number = (
    noun_df[column].nunique(dropna=False)
    for column in ('gender', 'case', 'number')
)
unique_gender, unique_case, unique_number

(7, 9, 3)

In [None]:
# Encoder-decoder LSTM sketch: the encoder reads embedded lemma + tag sequences, and its
# final hidden/cell states initialise a decoder that emits a softmax over lemma_vocab_size
# at every decoder time step.
# Keras is imported inside this cell rather than in the notebook's first import cell.
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Concatenate

# Parameters (the two vocabulary sizes are hard-coded placeholders, not derived from the data)
lemma_vocab_size = 20000  # adjust based on your vocabulary size for lemmas
tag_vocab_size = 500     # adjust based on your total number of unique morphological tags
lemma_embedding_dim = 256
tag_embedding_dim = 50   # smaller dimension for categorical data
lstm_units = 256  # used for both the encoder and the decoder LSTM

# Define inputs: both are variable-length sequences (shape=(None,))
# NOTE(review): presumably integer ids, since they feed Embedding layers - confirm
lemma_inputs = Input(shape=(None,))  # Input for lemmas
tag_inputs = Input(shape=(None,))    # Input for morphological tags

# Embeddings: one separate lookup table per input
lemma_embedding = Embedding(lemma_vocab_size, lemma_embedding_dim)(lemma_inputs)
tag_embedding = Embedding(tag_vocab_size, tag_embedding_dim)(tag_inputs)

# Combine the embeddings
# NOTE(review): presumably the lemma and tag sequences must have the same length for
# this concatenation to work - confirm against the data preparation code
combined_embeddings = Concatenate()([lemma_embedding, tag_embedding])

# Encoder LSTM: return_state=True exposes the final hidden and cell states;
# encoder_outputs is not used again in this cell, only the two states are
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(combined_embeddings)
encoder_states = [state_h, state_c]

# Decoder setup: the decoder has its own embedding, sized with lemma_vocab_size, and
# starts from the encoder's final states; its returned states are discarded here
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(lemma_vocab_size, lemma_embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# Per-time-step softmax over lemma_vocab_size classes
decoder_dense = TimeDistributed(Dense(lemma_vocab_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

# Define and create the model: three inputs (lemmas, tags, decoder input), one output
# NOTE(review): the sparse loss presumably expects integer target ids - confirm
model = Model([lemma_inputs, tag_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()
