In [None]:
import spacy
from spacy.tokens import Doc
from spacy.tokens import DocBin
from conllu import parse
import os
import unicodedata
import string

In [None]:
# Load the "grc_proiel_trf" spaCy pipeline; later cells use its vocab (nlp.vocab)
# to build Docs and to deserialize the saved DocBin.
nlp = spacy.load("grc_proiel_trf")


# Load CoNLL-U data

In [None]:
# Root directories that are walked recursively for *.conllu files.
SOURCES = ["../assets/INCEpTION_Conllu/"]

In [None]:
# Read every .conllu file found under SOURCES into a single CoNLL-U string.
conllu_chunks = []
for source in SOURCES:
    for root, dirs, files in os.walk(source):
        # os.walk lists entries in arbitrary, filesystem-dependent order. Sorting
        # (dirs in place, so the traversal itself is ordered) keeps the sentence
        # order stable, and with it the seeded train/dev/test split made later.
        dirs.sort()
        for file in sorted(files):
            if file.endswith(".conllu"):
                # print file name
                print(file)
                with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                    text = f.read().strip()
                if text:
                    conllu_chunks.append(text)

# CoNLL-U sentences are separated by blank lines; joining with one guarantees that
# the last sentence of a file never fuses with the first sentence of the next file
# when a file does not end with a blank line.
data = "\n\n".join(conllu_chunks)

In [None]:
from pathlib import Path

from sklearn.model_selection import train_test_split

NORM_list = ['NFKD', 'NFKC']
apostrophes = [' ̓', "᾿", "᾽", "'", "’", "‘"]  # all possible apostrophes


def _conllu_sentence_to_doc(s, vocab):
    """Convert one parsed CoNLL-U sentence into a spaCy Doc with POS, tag and lemma set.

    Args:
        s: one sentence returned by ``conllu.parse``; each token is read through the
            keys 'form', 'misc', 'upos', 'xpos' and 'lemma'.
        vocab: the spaCy vocab the Doc is created with.

    Returns:
        A ``Doc`` whose tokens mirror the sentence tokens one-to-one.
    """
    # Every apostrophe variant is unified to the single character "ʼ".
    words = ["ʼ" if t['form'] in apostrophes else t['form'] for t in s]
    # A token is followed by a space unless MISC carries SpaceAfter=No. `.get` is
    # used because MISC may hold other keys without SpaceAfter, which would raise
    # KeyError with plain indexing.
    spaces = [not (t['misc'] and t['misc'].get('SpaceAfter') == 'No') for t in s]
    doc = Doc(vocab, words=words, spaces=spaces)

    # add tags to doc
    for token, t in zip(doc, s):
        if t['upos'] is not None:
            # if t['form'] is a punctuation mark, the POS is forced to 'PUNCT'
            if t['form'] in string.punctuation:
                token.pos_ = 'PUNCT'
            else:
                token.pos_ = '' if t['upos'] == '_' else t['upos']
        if t['xpos'] is not None:
            token.tag_ = t['xpos']
        if t['lemma'] is not None:
            token.lemma_ = '' if t['lemma'] == '_' else t['lemma']
    return doc


# Output directories do not depend on the normalization form, so create them once.
for out_dir in ("../corpus/train/pos_train", "../corpus/dev/pos_dev", "../corpus/test/pos_test"):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# Build one train/dev/test corpus per Unicode normalization form.
# After the loop, `docs` holds the Docs of the last form in NORM_list.
for NORM in NORM_list:
    # Normalize into a new name: `data` stays the raw text, so this cell can be
    # re-run and each form is always derived from the same input.
    data_norm = unicodedata.normalize(NORM, data)
    sentences = parse(data_norm)

    docs = []
    for s in sentences:
        doc = _conllu_sentence_to_doc(s, nlp.vocab)
        print ('words: ', [tok.text for tok in doc])
        print (doc, [bool(tok.whitespace_) for tok in doc])
        docs.append(doc)

    for doc in docs:
        print(f"Source file: {doc}",)
        # collapse line breaks and runs of whitespace into single spaces
        cleaned_sentence = ' '.join(str(doc).replace('\r', ' ').replace('\n', ' ').split())
        print(f"Cleaned sentence: {cleaned_sentence}")

    # split docs to train, dev, test randomly (seeded), for each normalization:
    # 20% test, then 20% of the remainder as dev
    train_docs_norm, test_docs_norm = train_test_split(docs, test_size=0.2, random_state=42)
    train_docs_norm, dev_docs_norm = train_test_split(train_docs_norm, test_size=0.2, random_state=42)

    # print count of docs and characters in each set
    print("\n".join([NORM] + [
        f"{split_name}: {len(split_docs)} ({sum(len(d.text_with_ws) for d in split_docs)} characters)"
        for split_name, split_docs in (
            ("train", train_docs_norm),
            ("dev", dev_docs_norm),
            ("test", test_docs_norm),
        )
    ]))

    # serialize each split to its own .spacy file, named after the normalization form
    train_bin_norm = DocBin(docs=train_docs_norm)
    train_bin_norm.to_disk(f"../corpus/train/pos_train/pos_train_{NORM}.spacy")
    test_bin_norm = DocBin(docs=test_docs_norm)
    test_bin_norm.to_disk(f"../corpus/test/pos_test/pos_test_{NORM}.spacy")
    dev_bin_norm = DocBin(docs=dev_docs_norm)
    dev_bin_norm.to_disk(f"../corpus/dev/pos_dev/pos_dev_{NORM}.spacy")

In [None]:
# For every Doc, print one list per token attribute: text, trailing whitespace,
# coarse POS, fine-grained tag and lemma (labels kept exactly as before).
labelled_attrs = (
    ("Tokens", "text"),
    ("Spaces", "whitespace_"),
    ("Tags", "pos_"),
    ("Tags", "tag_"),
    ("Tags", "lemma_"),
)
for doc in docs:
    for label, attr in labelled_attrs:
        print(f"{label}: {[getattr(tok, attr) for tok in doc]}")
    


In [None]:
# display the whole list of Docs (not just the first sentence)
docs

In [None]:
# Inspect the sentence at index 24: print its text, then one line per token with
# text, POS, tag, lemma and trailing whitespace.
print(docs[24])
tokens = list(docs[24])
for tok in tokens:
    print(tok.text, tok.pos_, tok.tag_, tok.lemma_, tok.whitespace_)


## Check the spaCy DocBin file

In [None]:
# Reload the serialized NFKD training split from disk and turn it back into Docs.
# The DocBin gets its own name instead of rebinding `docs`, which is the list of
# Docs the inspection cells above iterate and index; reusing that name made those
# cells fail when re-run after this one.
train_docbin = DocBin().from_disk("../corpus/train/pos_train/pos_train_NFKD.spacy")
# NOTE: despite the "test" in its name, this list holds the *training* split.
test_docbin_docs = list(train_docbin.get_docs(nlp.vocab))


In [None]:
# Print the text of every Doc restored from the DocBin, one per line.
for reloaded_doc in test_docbin_docs:
    print(reloaded_doc)

In [None]:
import pandas as pd

# One row per token of the reloaded Docs: surface form, lemma, POS, tag,
# dependency label and the surface form of the token's head.
rows = [
    [token.orth_, token.lemma_, token.pos_, token.tag_, token.dep_, token.head.orth_]
    for doc in test_docbin_docs
    for token in doc
]

# Assemble the table in a single call and show its plain-text rendering.
df = pd.DataFrame(rows, columns=["Orth", "Lemma", "POS", "Tag", "Dep", "Head"])
print(df)

In [None]:
# rich display of the token table built in the previous cell
df

In [None]:
# Print text, lemma, POS, tag and dependency label for each token of the first reloaded Doc.
for token in test_docbin_docs[0]:
    labelled_values = (
        'text: ', token.text,
        'lemma :', token.lemma_,
        'POS: ', token.pos_,
        'tag: ', token.tag_,
        'DEP: ', token.dep_,
    )
    print(*labelled_values)

In [None]:
# print the whole list of Docs restored from the DocBin (not only the first one)
print(test_docbin_docs)