In [1]:
import spacy
from spacy.tokens import Doc
from spacy.tokens import DocBin
from conllu import parse
import os
import unicodedata
import string

In [2]:
nlp = spacy.load("grc_proiel_trf")


# Load ConllU data

In [3]:
SOURCES = ["../assets/INCEpTION_Conllu/"]

In [4]:
data = ""
for source in SOURCES:
    for root, dirs, files in os.walk(source):
        for file in files:
            if file.endswith(".conllu"):
                with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                    data += f.read()
                    #print("Read", os.path.join(root, file))
                    #print("Length of data:", len(data))

In [5]:
NORM_list = ['NFKD', 'NFKC']
for NORM in NORM_list:
    data = unicodedata.normalize(NORM, data)
    sentences = parse(data)

    docs = []
    for s in sentences:
        words = [t['form'] for t in s]
        # spaces are always True unless in t['misc'] and ['SpaceAfter'] is 'No'
        spaces = [True for t in s]
        for i, t in enumerate(s):
            if t['misc'] and t['misc']['SpaceAfter'] == 'No':
                spaces[i] = False
        doc = Doc(nlp.vocab, words=[t['form'] for t in s], spaces=spaces)
        # add tags to doc
        for i, t in enumerate(s):
            if t['upos'] != None:
                # if t['form'] is a punctuation mark, t['upos'] is 'PUNCT'
                if t['form'] in string.punctuation:
                    doc[i].pos_ = 'PUNCT'
                else:
                    doc[i].pos_ = '' if t['upos'] == '_' else t['upos']
            if t['xpos'] != None:
                doc[i].tag_ = t['xpos']
            if t['lemma'] != None:
                doc[i].lemma_ = '' if t['lemma'] == '_' else t['lemma']
        docs.append(doc)
        
        
    # split docs to train, dev, test randomly
    from sklearn.model_selection import train_test_split
    from pathlib import Path

    # split docs to train, dev, test randomly, for each normalization
    train_docs_norm, test_docs_norm = train_test_split(docs, test_size=0.2, random_state=42)
    train_docs_norm, dev_docs_norm = train_test_split(train_docs_norm, test_size=0.2, random_state=42)
    
    #print count of docs and characters in each set
    print ("{0}\n".format(NORM) + f"train: {len(train_docs_norm)} ({len(''.join([doc.text_with_ws for doc in train_docs_norm]))} characters)\ndev: {len(dev_docs_norm)} ({len(''.join([doc.text_with_ws for doc in dev_docs_norm]))} characters)\ntest: {len(test_docs_norm)} ({len(''.join([doc.text_with_ws for doc in test_docs_norm]))} characters)")

    Path("../corpus/train/pos_train").mkdir(parents=True, exist_ok=True)
    Path("../corpus/dev/pos_dev").mkdir(parents=True, exist_ok=True)
    Path("../corpus/test/pos_test").mkdir(parents=True, exist_ok=True)
    
    train_bin_norm = DocBin(docs=train_docs_norm)
    train_bin_norm.to_disk("../corpus/train/pos_train/pos_train_" + "{0}.spacy".format(NORM))
    test_bin_norm = DocBin(docs=test_docs_norm)
    test_bin_norm.to_disk("../corpus/test/pos_test/pos_test_" + "{0}.spacy".format(NORM))
    dev_bin_norm = DocBin(docs=dev_docs_norm)
    dev_bin_norm.to_disk("../corpus/dev/pos_dev/pos_dev_" + "{0}.spacy".format(NORM))

NFKD
train: 107 (13380 characters)
dev: 27 (4163 characters)
test: 34 (4553 characters)
NFKC
train: 107 (10952 characters)
dev: 27 (3413 characters)
test: 34 (3736 characters)


In [6]:
NORM = "NFKD"
data = unicodedata.normalize(NORM, data)

In [7]:
sentences = parse(data)

In [8]:
sentences[0][9].keys()

dict_keys(['id', 'form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'])

In [9]:
sentences[0][8]['misc']

In [10]:
# get all tokens from first sentence
tokens = [t['form'] for t in sentences[0]]
tokens

['Αἱ',
 'δ',
 '’',
 'ὑστέραι',
 'τῶν',
 'ἐχόντων',
 'ὑστέρας',
 'ζῴων',
 'οὔτε',
 'τὸν',
 'αὐτὸν',
 'τρόπον',
 'ἔχουσιν',
 'οὔθ',
 '’',
 'ὅμοιαι',
 'πάντων',
 'εἰσίν',
 ',',
 'ἀλλὰ',
 'διαφέρουσι',
 'καὶ',
 'τῶν',
 'ζῳοτοκούντων',
 'πρὸς',
 'ἄλληλα',
 'καὶ',
 'τῶν',
 'ᾠοτοκούντων',
 '.']

In [11]:
import string

docs = []
for s in sentences:
    words = [t['form'] for t in s]
    # spaces are always True unless in t['misc'] and ['SpaceAfter'] is 'No'
    spaces = [True for t in s]
    for i, t in enumerate(s):
        if t['misc'] and t['misc']['SpaceAfter'] == 'No':
            spaces[i] = False
    doc = Doc(nlp.vocab, words=[t['form'] for t in s], spaces=spaces)
    # add tags to doc
    for i, t in enumerate(s):
        if t['upos'] != None:
            # if t['form'] is a punctuation mark, t['upos'] is 'PUNCT'
            if t['form'] in string.punctuation:
                doc[i].pos_ = 'PUNCT'
            else:
                doc[i].pos_ = '' if t['upos'] == '_' else t['upos']
        if t['xpos'] != None:
            doc[i].tag_ = t['xpos']
        if t['lemma'] != None:
            doc[i].lemma_ = '' if t['lemma'] == '_' else t['lemma']
    docs.append(doc)

In [12]:
# print first sentence with POS
print(docs[0].text)

Αἱ δ’ ὑστέραι τῶν ἐχόντων ὑστέρας ζῴων οὔτε τὸν αὐτὸν τρόπον ἔχουσιν οὔθ’ ὅμοιαι πάντων εἰσίν, ἀλλὰ διαφέρουσι καὶ τῶν ζῳοτοκούντων πρὸς ἄλληλα καὶ τῶν ᾠοτοκούντων. 


In [13]:
# get tokens of first sentence
tokens = [t for t in docs[0]]
for t in tokens:
    print(t.text, t.pos_, t.tag_, t.lemma_, t.whitespace_)

Αἱ DET  ὁ  
δ CCONJ  δέ 
’     
ὑστέραι NOUN  ὑστέρα  
τῶν DET  ὁ  
ἐχόντων VERB  ἔχω  
ὑστέρας NOUN  ὑστέρα  
ζῴων NOUN  ζῷον  
οὔτε CCONJ  οὔτε  
τὸν DET  ὁ  
αὐτὸν PRON  αὐτός  
τρόπον NOUN  τρόπος  
ἔχουσιν VERB  ἔχω  
οὔθ CCONJ  οὔθ 
’     
ὅμοιαι ADJ  ὅμοιος  
πάντων ADJ  πᾶς  
εἰσίν VERB  εἰμί 
, PUNCT    
ἀλλὰ CCONJ  ἀλλὰ  
διαφέρουσι VERB  διαφέρω  
καὶ CCONJ  καί  
τῶν DET  ὁ  
ζῳοτοκούντων VERB  ζᾠοτοκέω  
πρὸς ADP  πρός  
ἄλληλα PRON  ἀλλήλων  
καὶ CCONJ  καί  
τῶν DET  ὁ  
ᾠοτοκούντων VERB  ᾠοτοκέω 
. PUNCT    


# Save docs to binary file

In [14]:
# split docs to train, dev, test randomly
from sklearn.model_selection import train_test_split
from pathlib import Path

# split docs to train, dev, test randomly, for each normalization



In [15]:
train_docs_nfkd, test_docs_nfkd = train_test_split(docs, test_size=0.2, random_state=42)
train_docs_nfkd, dev_docs_nfkd = train_test_split(train_docs_nfkd, test_size=0.2, random_state=42)

#train_docs_nfkc, test_docs_nfkc = train_test_split(docs_nfkc, test_size=0.2, random_state=42)
#train_docs_nfkc, dev_docs_nfkc = train_test_split(train_docs_nfkc, test_size=0.2, random_state=42)

print (f"train: {len(train_docs_nfkd)}\ndev: {len(dev_docs_nfkd)}\ntest: {len(test_docs_nfkd)} for nfkd")
#print (f"train: {len(train_docs_nfkc)}\ndev: {len(dev_docs_nfkc)}\ntest: {len(test_docs_nfkc)} for nfkc")
# save each one to DocBin

train: 107
dev: 27
test: 34 for nfkd


In [16]:
from spacy.tokens import DocBin

In [17]:
doc_bin = DocBin(docs=docs)

In [18]:
doc_bin.to_disk("../corpus/INCEpTION_POS_NFKD.spacy")

## Check Spacy docbin file

In [19]:
# load spacy object
# load docs from file
docs = DocBin().from_disk("../corpus//train/pos_train/pos_train_NFKD.spacy")
test_docbin_docs = list(docs.get_docs(nlp.vocab))


In [20]:
import pandas as pd

# create list of rows
rows = []
for doc in test_docbin_docs:
    for token in doc:
        row = [token.orth_, token.lemma_, token.pos_, token.tag_, token.dep_, token.head.orth_]
        rows.append(row)

# create dataframe
df = pd.DataFrame(rows, columns=["Orth", "Lemma", "POS", "Tag", "Dep", "Head"])

# print dataframe
print(df)

              Orth    Lemma    POS Tag Dep          Head
0         Ἔχουσι    ἔχω   VERB              Ἔχουσι
1             γὰρ     γάρ  CCONJ                  γὰρ
2              οἱ       ὁ    DET                   οἱ
3          νεφροὶ  νεφρός   NOUN               νεφροὶ
4              ἐν      ἐν    ADP                   ἐν
...            ...      ...    ...  ..  ..           ...
2139    δίκραιός                            δίκραιός
2140         ἐστι                                 ἐστι
2141      Τοιῇδε                              Τοιῇδε
2142  δικραιότητε            NOUN          δικραιότητε
2143             ·                                     ·

[2144 rows x 6 columns]


In [21]:
df

Unnamed: 0,Orth,Lemma,POS,Tag,Dep,Head
0,Ἔχουσι,ἔχω,VERB,,,Ἔχουσι
1,γὰρ,γάρ,CCONJ,,,γὰρ
2,οἱ,ὁ,DET,,,οἱ
3,νεφροὶ,νεφρός,NOUN,,,νεφροὶ
4,ἐν,ἐν,ADP,,,ἐν
...,...,...,...,...,...,...
2139,δίκραιός,,,,,δίκραιός
2140,ἐστι,,,,,ἐστι
2141,Τοιῇδε,,,,,Τοιῇδε
2142,δικραιότητε,,NOUN,,,δικραιότητε


In [22]:
for token in test_docbin_docs[0]:
#print attributes
    print('text: ', token.text, 'lemma :', token.lemma_, 'POS: ', token.pos_, 'tag: ', token.tag_, 'DEP: ', token.dep_)

text:  Ἔχουσι lemma : ἔχω POS:  VERB tag:   DEP:  
text:  γὰρ lemma : γάρ POS:  CCONJ tag:   DEP:  
text:  οἱ lemma : ὁ POS:  DET tag:   DEP:  
text:  νεφροὶ lemma : νεφρός POS:  NOUN tag:   DEP:  
text:  ἐν lemma : ἐν POS:  ADP tag:   DEP:  
text:  μέσῳ lemma : μέσος POS:  ADJ tag:   DEP:  
text:  κοῖλον lemma : κοῖλον POS:  NOUN tag:   DEP:  
text:  , lemma :  POS:  PUNCT tag:   DEP:  
text:  οἱ lemma : ὁ POS:  DET tag:   DEP:  
text:  μὲν lemma : μέν POS:  CCONJ tag:   DEP:  
text:  μεῖζον lemma : μέγας POS:  ADJ tag:   DEP:  
text:  οἱ lemma : ὁ POS:  DET tag:   DEP:  
text:  δ lemma : δέ POS:  CCONJ tag:   DEP:  
text:  ’ lemma :  POS:   tag:   DEP:  
text:  ἔλαττον lemma : ἐλάσσων POS:  ADJ tag:   DEP:  
text:  , lemma :  POS:  PUNCT tag:   DEP:  
text:  πλὴν lemma : πλήν POS:  ADP tag:   DEP:  
text:  οἱ lemma : ὁ POS:  DET tag:   DEP:  
text:  τῆς lemma : ὁ POS:  DET tag:   DEP:  
text:  φώκης lemma : φώκη POS:  NOUN tag:   DEP:  
text:  · 

In [23]:
# print the first doc in spacy docbin docs
print(test_docbin_docs[0])

Ἔχουσι γὰρ οἱ νεφροὶ ἐν μέσῳ κοῖλον, οἱ μὲν μεῖζον οἱ δ’ ἔλαττον, πλὴν οἱ τῆς φώκης· 


In [24]:
# print docs in new docbin
for doc in new_docbin.get_docs(nlp.vocab):
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)

NameError: name 'new_docbin' is not defined