# Basic Auto Running Vocab with Classical Language Toolkit (CLTK)

Note: CLTK runs only on Linux and macOS. See https://github.com/cltk/cltk/blob/master/notebooks/CLTK%20Demonstration.ipynb for a quick-start guide.

In [1]:
# Build the CLTK NLP pipeline for Ancient Greek (ISO code "grc").
# The resulting object is reused below to analyse the sample text.
from cltk import NLP
cltk_nlp = NLP(language="grc")

  return torch._C._cuda_getDeviceCount() > 0


‎𐤀 CLTK version '1.1.3'.
Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`.


Using the first paragraph of Beresford and Douglas 2.1 as an example.

In [2]:
# Read the source text line by line (line endings kept) and use the
# line at index 1 as the sample passage.
with open("../text/Beresford.02.txt", encoding="utf-8") as text_file:
    beresford = list(text_file)
test = beresford[1]
print(test)

2.1.1 Ὀρφεὺς πολὺ ἐτιμήθη ἐν τοῖς πάλαι· λύραν χρυσῆν εἶχε δῶρον Ἀπόλλωνος· αἱ δὲ Μοῦσαι αὐτὸν ἐδίδαξαν τὴν λύραν ψάλλειν, καὶ οὐ διὰ μακροῦ ἐμπειρότατος ἐγένετο τῆς τέχνης, καὶ ἐς τοσοῦτον ἧκεν ἐπιστήμης ὥστε θέλγειν οὐ μόνον τὰ ζῷα ἀλλὰ καὶ τὰ δένδρα τῇ μουσικῇ. Οἱ δὲ θῆρες, ὡς λέγουσιν, ἀκούσαντες τὸ μέλος τῷ Ὀρφεῖ ἠκολούθουν.



Running the Ancient Greek pipeline and showing the `Word` data type.

In [3]:
# Run the pipeline on the sample passage, then inspect the word at
# index 3: its full record, surface form, lemma and morphology.
cltk_doc_grc = cltk_nlp.analyze(text=test)
print("Tokens:", cltk_doc_grc.tokens)
sample_word = cltk_doc_grc.words[3]
print("\nWord index=3:\n", sample_word)
print("\nToken:", sample_word.string)
print("Lemma:", sample_word.lemma)
print("Morphology:", sample_word.features)

Tokens: ['2.1.1', 'Ὀρφεὺς', 'πολὺ', 'ἐτιμήθη', 'ἐν', 'τοῖς', 'πάλαι·', 'λύραν', 'χρυσῆν', 'εἶχε', 'δῶρον', 'Ἀπόλλωνος·', 'αἱ', 'δὲ', 'Μοῦσαι', 'αὐτὸν', 'ἐδίδαξαν', 'τὴν', 'λύραν', 'ψάλλειν,', 'καὶ', 'οὐ', 'διὰ', 'μακροῦ', 'ἐμπειρότατος', 'ἐγένετο', 'τῆς', 'τέχνης,', 'καὶ', 'ἐς', 'τοσοῦτον', 'ἧκεν', 'ἐπιστήμης', 'ὥστε', 'θέλγειν', 'οὐ', 'μόνον', 'τὰ', 'ζῷα', 'ἀλλὰ', 'καὶ', 'τὰ', 'δένδρα', 'τῇ', 'μουσικῇ.', 'Οἱ', 'δὲ', 'θῆρες,', 'ὡς', 'λέγουσιν,', 'ἀκούσαντες', 'τὸ', 'μέλος', 'τῷ', 'Ὀρφεῖ', 'ἠκολούθουν.']

Word index=3:
 Word(index_char_start=None, index_char_stop=None, index_token=3, index_sentence=0, string='ἐτιμήθη', pos=verb, lemma='τιμάω', stem=None, scansion=None, xpos='V-', upos='VERB', dependency_relation='root', governor=-1, features={Aspect: [perfective], Mood: [indicative], Number: [singular], Person: [third], Tense: [past], VerbForm: [finite], Voice: [passive]}, category={F: [neg], N: [neg], V: [pos]}, stop=False, named_entity=None, syllables=None, phonetic_transcription=None

Putting tokens, lemmata and morphology of non-stop words into lists.

In [16]:
# Collect the surface form, lemma and morphological features of every
# word that is not flagged as a stop word. The three lists are filled in
# lockstep, so position i in each refers to the same word.
tokens = []
lemmata = []
morph = []
for word in cltk_doc_grc:
    # Truthiness test instead of comparing against the literal False.
    if not word.stop:
        tokens.append(word.string)
        lemmata.append(word.lemma)
        morph.append(word.features)

Using the Perseus and Logeion Greek Short Definitions to add basic glosses.

https://github.com/helmadik/shortdefs

In [37]:
import csv

# Load the short definitions (tab-separated, lemma in the first column and
# definition in the second) into a lemma -> gloss lookup table.
# newline="" is the mode the csv module asks for when it is given a file.
# Note that the file's header row ('lemma', 'def') is loaded as an
# ordinary entry.
with open("shortdefsGreekEnglishLogeion", "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f, delimiter="\t")
    # Ignore blank or single-column rows, which would otherwise raise
    # IndexError on row[1].
    shortdefs = {row[0]: row[1] for row in reader if len(row) >= 2}
print("Short defs:", list(shortdefs.items())[0:10], "\n")

# One gloss per lemma, in the same order; lemmata without an entry get "".
glosses = [shortdefs.get(lemma, "") for lemma in lemmata]

# Each row is (token, lemma, morphology, gloss). Materialised as a list so
# it can be iterated again after this loop.
rows = list(zip(tokens, lemmata, morph, glosses))

# Preview the morphology column as text.
for row in rows:
    m = str(row[2])
    # NOTE(review): only the opening brace is removed, so the closing "}"
    # stays in the printed text -- confirm whether both should be stripped.
    m = m.replace("{", "")
    print(m)

Short defs: [('lemma', 'def'), ('ἆ', 'ah!'), ('ἃ', 'ha ha'), ('ἀάατος', 'not to be injured, inviolable'), ('ἀαγής', 'unbroken, not to be broken, hard, strong'), ('ἄαδα', 'unpleasant'), ('ἀάζω', 'breathe with the mouth wide open'), ('ἀακίδωτος', 'barbless'), ('ἄανθα', 'ear-ring'), ('ἄαπτος', 'not to be touched, resistless, invincible')] 

}
Case: [nominative], Gender: [masculine], Number: [singular]}
Case: [accusative], Degree: [positive], Gender: [neuter], Number: [singular]}
Aspect: [perfective], Mood: [indicative], Number: [singular], Person: [third], Tense: [past], VerbForm: [finite], Voice: [passive]}
Case: [dative], Gender: [masculine], Number: [plural]}
Case: [accusative], Gender: [feminine], Number: [singular]}
Case: [accusative], Gender: [feminine], Number: [singular]}
Aspect: [imperfective], Mood: [indicative], Number: [singular], Person: [third], Tense: [past], VerbForm: [finite], Voice: [active]}
Case: [accusative], Gender: [neuter], Number: [singular]}
Case: [genitive], Gen

Export to a TSV file.

In [28]:
# Write the vocabulary rows to a tab-separated file, one line per row.
# newline="" leaves line endings to the csv writer; without it, platforms
# that translate "\n" on write would emit an extra "\r" per row.
with open("test_vocab.tsv", "w", encoding="utf-8", newline="") as g:
    writer = csv.writer(g, delimiter="\t")
    writer.writerows(rows)