In [1]:
import spacy
from spacy.tokens import Doc
from conllu import parse
import os
import unicodedata

In [2]:
# Load the spaCy pipeline; in the cells shown here only its vocab is used,
# when building Docs and when reading them back from the DocBin.
nlp = spacy.load("grc_proiel_trf")


# Load CoNLL-U data

In [3]:
# Directories that are searched recursively for *.conllu files.
SOURCES = ["../assets/INCEpTION_Conllu/"]

In [4]:
# Read every .conllu file below each source directory into one string.
# Directories and files are visited in sorted order so the corpus (and the
# order of the Docs built from it) is the same on every machine, instead of
# depending on the filesystem's listing order.
conllu_chunks = []
for source in SOURCES:
    for root, dirs, files in os.walk(source):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if file.endswith(".conllu"):
                with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                    conllu_chunks.append(f.read())

# CoNLL-U sentences are separated by a blank line. Joining with one guarantees
# that the last sentence of a file missing its trailing blank line is not
# merged with the first sentence of the next file.
data = "\n\n".join(conllu_chunks)

In [5]:
# Unicode normalization form applied to the raw CoNLL-U text before parsing,
# so that forms and lemmas all share one representation.
NORM = "NFKD"
data = unicodedata.normalize(NORM, data)

In [6]:
# Parse the concatenated text into a list of sentences; each sentence is a
# sequence of token mappings (see the keys inspected below).
sentences = parse(data)

In [7]:
# Inspect which fields a parsed token exposes.
sentences[0][9].keys()

dict_keys(['id', 'form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'])

In [8]:
# Inspect the MISC column of one token; SpaceAfter is read from this column
# when the Docs are built.
sentences[0][8]['misc']

In [9]:
# Surface forms of the first sentence, in document order
first_sentence = sentences[0]
tokens = [token['form'] for token in first_sentence]
tokens

['Αἱ',
 'δ',
 '’',
 'ὑστέραι',
 'τῶν',
 'ἐχόντων',
 'ὑστέρας',
 'ζῴων',
 'οὔτε',
 'τὸν',
 'αὐτὸν',
 'τρόπον',
 'ἔχουσιν',
 'οὔθ',
 '’',
 'ὅμοιαι',
 'πάντων',
 'εἰσίν',
 ',',
 'ἀλλὰ',
 'διαφέρουσι',
 'καὶ',
 'τῶν',
 'ζῳοτοκούντων',
 'πρὸς',
 'ἄλληλα',
 'καὶ',
 'τῶν',
 'ᾠοτοκούντων',
 '.']

In [10]:
import string

# Characters that force the PUNCT tag below (ASCII punctuation only).
ascii_punct = frozenset(string.punctuation)

# Build one spaCy Doc per parsed sentence and copy the gold annotations
# (UPOS, XPOS, lemma) onto its tokens.
docs = []
for s in sentences:
    words = [t['form'] for t in s]
    # A token is followed by a space unless its MISC column says SpaceAfter=No.
    # MISC may be None, or a dict that carries other keys but no SpaceAfter,
    # so it is read defensively instead of indexed directly.
    spaces = [(t['misc'] or {}).get('SpaceAfter') != 'No' for t in s]
    doc = Doc(nlp.vocab, words=words, spaces=spaces)
    # add tags to doc
    for i, t in enumerate(s):
        form = t['form']
        if t['upos'] is not None:
            # A form made up solely of punctuation characters is tagged PUNCT
            # regardless of the file's UPOS. Checking every character (rather
            # than a substring test against string.punctuation) keeps an empty
            # form from matching and lets runs such as "..." match.
            if form and all(ch in ascii_punct for ch in form):
                doc[i].pos_ = 'PUNCT'
            else:
                # "_" is the CoNLL-U placeholder for "no value"
                doc[i].pos_ = '' if t['upos'] == '_' else t['upos']
        if t['xpos'] is not None:
            doc[i].tag_ = t['xpos']
        if t['lemma'] is not None:
            doc[i].lemma_ = '' if t['lemma'] == '_' else t['lemma']
    docs.append(doc)

In [11]:
# Print the text of the first sentence as reconstructed by spaCy from the
# words and spaces (no annotations are shown here).
print(docs[0].text)

Αἱ δ’ ὑστέραι τῶν ἐχόντων ὑστέρας ζῴων οὔτε τὸν αὐτὸν τρόπον ἔχουσιν οὔθ’ ὅμοιαι πάντων εἰσίν, ἀλλὰ διαφέρουσι καὶ τῶν ζῳοτοκούντων πρὸς ἄλληλα καὶ τῶν ᾠοτοκούντων. 


In [12]:
# Show every token of the first sentence together with its annotations
tokens = list(docs[0])
for token in tokens:
    print(token.text, token.pos_, token.tag_, token.lemma_, token.whitespace_)

Αἱ DET  ὁ  
δ CCONJ  δέ 
’     
ὑστέραι NOUN  ὑστέρα  
τῶν DET  ὁ  
ἐχόντων VERB  ἔχω  
ὑστέρας NOUN  ὑστέρα  
ζῴων NOUN  ζῷον  
οὔτε CCONJ  οὔτε  
τὸν DET  ὁ  
αὐτὸν PRON  αὐτός  
τρόπον NOUN  τρόπος  
ἔχουσιν VERB  ἔχω  
οὔθ CCONJ  οὔθ 
’     
ὅμοιαι ADJ  ὅμοιος  
πάντων ADJ  πᾶς  
εἰσίν VERB  εἰμί 
, PUNCT    
ἀλλὰ CCONJ  ἀλλὰ  
διαφέρουσι VERB  διαφέρω  
καὶ CCONJ  καί  
τῶν DET  ὁ  
ζῳοτοκούντων VERB  ζᾠοτοκέω  
πρὸς ADP  πρός  
ἄλληλα PRON  ἀλλήλων  
καὶ CCONJ  καί  
τῶν DET  ὁ  
ᾠοτοκούντων VERB  ᾠοτοκέω 
. PUNCT    


# Save docs to binary file

In [13]:
from spacy.tokens import DocBin

In [14]:
# Collect every Doc built above into a single DocBin for serialization.
doc_bin = DocBin(docs=docs)

In [15]:
# Write the corpus to disk; the "NFKD" suffix in the filename mirrors NORM.
doc_bin.to_disk("../corpus/INCEpTION_POS_NFKD.spacy")

## Check spaCy DocBin file

In [None]:
# Reload the DocBin from the file this notebook writes, to verify the round
# trip. The path matches the one passed to `doc_bin.to_disk`, and a separate
# name is used so the in-memory `docs` list of Doc objects is not overwritten
# by a DocBin.
loaded_docbin = DocBin().from_disk("../corpus/INCEpTION_POS_NFKD.spacy")
test_docbin_docs = list(loaded_docbin.get_docs(nlp.vocab))


In [None]:
import pandas as pd

# One record per token across all reloaded docs
rows = [
    [token.orth_, token.lemma_, token.pos_, token.tag_, token.dep_, token.head.orth_]
    for doc in test_docbin_docs
    for token in doc
]

# Tabulate the records with one named column per attribute
column_names = ["Orth", "Lemma", "POS", "Tag", "Dep", "Head"]
df = pd.DataFrame(rows, columns=column_names)

# Plain-text dump of the table
print(df)

In [None]:
# Display the token table using the notebook's rich DataFrame rendering.
df

In [None]:
# Print the annotations of every token in the first reloaded doc
first_loaded_doc = test_docbin_docs[0]
for token in first_loaded_doc:
    print(
        'text: ', token.text,
        'lemma :', token.lemma_,
        'POS: ', token.pos_,
        'tag: ', token.tag_,
        'DEP: ', token.dep_,
    )

In [None]:
# Print the first Doc reloaded from the DocBin file.
print(test_docbin_docs[0])

In [None]:
# Print token-level attributes for every doc reloaded from the DocBin.
# The original loop iterated `new_docbin`, which is never defined in this
# notebook and raises NameError on a fresh run; it now walks
# `test_docbin_docs`, the list of docs read back from disk.
for doc in test_docbin_docs:
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)