In [1]:
import spacy
from spacy.tokens import Doc
from conllu import parse
import os
import unicodedata

In [2]:
# Load the `grc_proiel_sm` spaCy pipeline; further down this notebook only uses
# its shared vocabulary (`nlp.vocab`) when constructing Doc objects.
nlp = spacy.load("grc_proiel_sm")


# Load CoNLL-U data

In [3]:
# Root directories that are searched recursively for .conllu files.
SOURCES = ["../assets/INCEpTION_Conllu/"]

In [4]:
# Read every .conllu file found under SOURCES into one string.
#
# * Directories and files are visited in sorted order so that the sentence
#   order of the resulting corpus is reproducible across machines/filesystems
#   (os.walk otherwise yields entries in arbitrary filesystem order).
# * Files are joined with a blank line: CoNLL-U separates sentences with a
#   blank line, so a file that lacks a trailing blank line would otherwise have
#   its last sentence fused with the first sentence of the next file.
chunks = []
for source in SOURCES:
    for root, dirs, files in os.walk(source):
        dirs.sort()  # in-place sort steers the order in which os.walk descends
        for file in sorted(files):
            if file.endswith(".conllu"):
                with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                    chunks.append(f.read())
data = "\n\n".join(chunks)

In [8]:
# Bring the whole corpus text into a single Unicode normalisation form before
# parsing. NFD is canonical decomposition (base letters followed by their
# combining marks).
NORM = "NFD"
data = unicodedata.normalize(NORM, data)

In [9]:
# Parse the concatenated CoNLL-U text; the result is indexed as
# sentences[sentence_index][token_index][field_name] in the cells below.
sentences = parse(data)

In [10]:
# Inspect which CoNLL-U fields a parsed token exposes (token index 9 of the first sentence).
sentences[0][9].keys()

dict_keys(['id', 'form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'])

In [11]:
# Look at the MISC column of token index 8 in the first sentence; it carries
# SpaceAfter=No, which the Doc-building cell uses to decide token spacing.
sentences[0][8]['misc']

{'SpaceAfter': 'No'}

In [12]:
# Collect the surface form of every token in the first sentence
tokens = [token['form'] for token in sentences[0]]
tokens

['Ὑπὸ',
 'δὲ',
 'τὸ',
 'ὑπόζωμα',
 'κεῖται',
 'ἡ',
 'κοιλία',
 'τοῖς',
 'ζῴοις',
 ',',
 'τοῖς',
 'μὲν',
 'ἔχουσιν',
 'οἰσοφάγον',
 'ᾗ',
 'τελευτᾷ',
 'τοῦτο',
 'τὸ',
 'μόριον',
 ',',
 'τοῖς',
 'δὲ',
 'μὴ',
 'ἔχουσιν',
 'εὐθὺς',
 'πρὸς',
 'τῷ',
 'στόματι',
 '·']

In [13]:
# Check the `upos` field of the very first token of the first sentence.
sentences[0][0]['upos']

'ADP'

In [14]:
# Convert each parsed CoNLL-U sentence into a spaCy Doc that keeps the original
# token spacing and carries the annotation from the upos/xpos/lemma columns.
docs = []
for s in sentences:
    words = [t['form'] for t in s]
    # A token is followed by a space unless its MISC column says SpaceAfter=No.
    # MISC can be None or can hold other keys without SpaceAfter, so the key is
    # looked up with .get(); plain indexing raised KeyError in the latter case.
    spaces = [(t['misc'] or {}).get('SpaceAfter') != 'No' for t in s]
    doc = Doc(nlp.vocab, words=words, spaces=spaces)
    # Copy the annotation onto the tokens, skipping columns that are empty.
    # NOTE(review): `upos` is written to tag_ and `xpos` to pos_, which is the
    # reverse of spaCy's usual convention (pos_ = UPOS, tag_ = fine-grained tag).
    # Kept unchanged here because consumers of the exported file may rely on
    # it -- confirm which attribute the downstream training expects.
    for token, t in zip(doc, s):
        if t['upos'] is not None:
            token.tag_ = t['upos']
        if t['xpos'] is not None:
            token.pos_ = t['xpos']
        if t['lemma'] is not None:
            token.lemma_ = t['lemma']

    docs.append(doc)

In [15]:
# print the plain text of the first sentence, as rebuilt by spaCy from the
# words and their spacing flags (no POS information is shown here)
print(docs[0].text)

Ὑπὸ δὲ τὸ ὑπόζωμα κεῖται ἡ κοιλία τοῖς ζῴοις, τοῖς μὲν ἔχουσιν οἰσοφάγον ᾗ τελευτᾷ τοῦτο τὸ μόριον, τοῖς δὲ μὴ ἔχουσιν εὐθὺς πρὸς τῷ στόματι· 


In [16]:
# Show what is stored on each token of the first Doc:
# text, pos_, tag_, lemma_ and the trailing whitespace.
tokens = list(docs[0])
for t in tokens:
    print(t.text, t.pos_, t.tag_, t.lemma_, t.whitespace_)

Ὑπὸ  ADP ὑπό  
δὲ  PART δὲ  
τὸ  DET ὁ  
ὑπόζωμα  NOUN ὑπόζωμα  
κεῖται  VERB κεῖμαι  
ἡ  DET ὁ  
κοιλία  NOUN κοιλία  
τοῖς  DET ὁ  
ζῴοις  NOUN ζῷον 
,  PUNCT _  
τοῖς  DET ὁ  
μὲν  PART μὲν  
ἔχουσιν  VERB ἔχω  
οἰσοφάγον  NOUN οἰσοφάγος  
ᾗ  CCONJ ᾗ  
τελευτᾷ  VERB τελευτάω  
τοῦτο  PRON οὗτος  
τὸ  DET ὁ  
μόριον  NOUN μόριον 
,  PUNCT _  
τοῖς  DET ὁ  
δὲ  PART δὲ  
μὴ  ADV μὴ  
ἔχουσιν  VERB ἔχω  
εὐθὺς  ADV εὐθύς  
πρὸς  ADP πρός  
τῷ  DET ὁ  
στόματι  NOUN στόμα 
·  PUNCT _  


# Save docs to binary file

In [None]:
from spacy.tokens import DocBin

In [None]:
# Bundle all converted Docs into a single DocBin container for serialisation.
doc_bin = DocBin(docs=docs)

In [61]:
# Write the DocBin to a binary .spacy file under ../assets.
doc_bin.to_disk("../assets/INCEpTION_POS.spacy")