In [7]:
from conllu import parse

# Read the whole CoNLL-U file as text and parse it with conllu; `sentences`
# holds one TokenList per sentence and is used by the cells below to look up
# sentences by their `sent_id` metadata and to print the raw annotation.
with open('data/hungarian_ext.conllu', 'r', encoding='utf8') as file:
    data = file.read()

sentences = parse(data)

import spacy
from spacy.tokens import Doc, Token
from conllu import parse_incr
from conllu import parse
from spacy.vocab import Vocab
from spacy import displacy
from pprint import pprint

# Blank pipeline used only for its vocabulary. The treebank being loaded is
# Hungarian, so the language code must be "hu" (it was "fi", i.e. Finnish).
nlp = spacy.blank("hu")
vocab = nlp.vocab

# Register the custom token attributes that conllu_to_spacy_doc fills in
# (token._.xpos, token._.deps, token._.misc). Each registration is guarded so
# that re-running this cell in the same kernel does not register it twice.
if not Token.has_extension("xpos"):
    Token.set_extension("xpos", default=None)
if not Token.has_extension("deps"):
    Token.set_extension("deps", default=None)
if not Token.has_extension("misc"):
    Token.set_extension("misc", default=None)


def process_conllu_file(file_path):
    """Convert every sentence of a CoNLL-U file into a spaCy Doc.

    The file at ``file_path`` is read as UTF-8 and parsed incrementally; each
    parsed sentence is turned into a Doc with ``conllu_to_spacy_doc`` using the
    module-level ``vocab``.

    Returns:
        A list of Docs, in the same order as the sentences in the file.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        # Build the list while the file is still open: parse_incr reads lazily.
        return [
            conllu_to_spacy_doc(sentence, vocab)
            for sentence in parse_incr(handle)
        ]

def conllu_to_spacy_doc(conllu_sentence, vocab):
    """Build a spaCy ``Doc`` from one parsed CoNLL-U sentence.

    Args:
        conllu_sentence: Iterable of token mappings as produced by ``conllu``.
            Keys read here: ``id``, ``form``, ``lemma``, ``upostag``, ``xpos``,
            ``feats``, ``head``, ``deprel``, ``deps`` and ``misc``.
        vocab: spaCy ``Vocab`` the new ``Doc`` is created with.

    Returns:
        A ``Doc`` with one token per syntactic word. Each token carries the
        lemma, UPOS (or XPOS as ``tag_`` when UPOS is "_"), the dependency
        label without its subtype, its head, morphological features, and the
        custom attributes ``_.xpos``, ``_.deps`` and ``_.misc``.
    """
    # Multiword-token ranges ("1-2") and empty nodes ("1.1") are parsed by
    # conllu with a tuple id. They are not syntactic words: keeping them would
    # shift every following position and make `doc[head - 1]` point at the
    # wrong token, so they are left out.
    word_tokens = [
        token for token in conllu_sentence
        if not isinstance(token.get("id"), tuple)
    ]

    words = [token["form"] for token in word_tokens]
    heads = [token["head"] if token["head"] is not None else 0 for token in word_tokens]  # head = 0 marks a root
    deps = [token["deprel"].split(":")[0] for token in word_tokens]  # drop the relation subtype
    lemmas = [token["lemma"] for token in word_tokens]
    pos_tags = [token["upostag"] for token in word_tokens]
    xpos_tags = [token["xpos"] for token in word_tokens]
    morph_feats = [token["feats"] or {} for token in word_tokens]
    misc_info = [token["misc"] or {} for token in word_tokens]
    deps_info = [token["deps"] or {} for token in word_tokens]

    # MISC "SpaceAfter=No" marks a token that is not followed by whitespace;
    # passing it on keeps doc.text equal to the original sentence text instead
    # of inserting a space before punctuation.
    spaces = [misc.get("SpaceAfter") != "No" for misc in misc_info]

    # Create the Doc object
    doc = Doc(vocab, words=words, spaces=spaces)

    # Assign the annotation to each token
    for i, token in enumerate(doc):
        token.lemma_ = lemmas[i]

        # Use the universal POS for `pos_`; when it is "_" fall back to the
        # language-specific tag (if there is one) as `tag_`.
        if pos_tags[i] != "_":
            token.pos_ = pos_tags[i]
        elif xpos_tags[i] is not None:
            token.tag_ = xpos_tags[i]

        token.dep_ = deps[i]

        # CoNLL-U heads are 1-based word ids; Doc positions are 0-based.
        if isinstance(heads[i], int) and heads[i] > 0:
            token.head = doc[heads[i] - 1]
        else:
            # Head 0 (or a non-integer head): the token is a root, which is
            # expressed here by making the token its own head.
            token.head = token

        # Keep xpos, deps and misc available as custom attributes
        token._.xpos = xpos_tags[i]
        token._.deps = deps_info[i]
        token._.misc = misc_info[i]

        # Serialise the feature dict back to "Key=Value|Key=Value" for set_morph
        if morph_feats[i]:
            feats_str = "|".join(f"{k}={v}" for k, v in morph_feats[i].items())
            token.set_morph(feats_str)

    return doc

def render_doc_by_sent_id(docs, sentences, target_sent_id):
    """Render the dependency tree of the sentence with the given ``sent_id``.

    ``sentences`` is searched in order for the first entry whose ``sent_id``
    metadata equals ``target_sent_id``; the Doc at the same position in
    ``docs`` is then rendered with displacy. If no sentence matches, a message
    is printed instead. Nothing is returned in either case.
    """
    # Position of the first sentence carrying the requested sent_id, if any.
    match_index = next(
        (
            position
            for position, candidate in enumerate(sentences)
            if candidate.metadata.get("sent_id") == target_sent_id
        ),
        None,
    )

    if match_index is None:
        print(f"Oración con sent_id '{target_sent_id}' no encontrada.")
    else:
        # docs and sentences are parallel: the same position is the same sentence.
        displacy.render(docs[match_index], style="dep", jupyter=True)


# Convert the CoNLL-U file into spaCy Docs. render_doc_by_sent_id pairs
# docs[i] with sentences[i], so this should be the same file that `sentences`
# was parsed from.
file_path = "data/hungarian_ext.conllu"
docs = process_conllu_file(file_path)


In [2]:
# Draw the dependency tree of the sentence with sent_id '1'.
render_doc_by_sent_id(docs, sentences, '1')
# English gloss: "I go into the house."

# Print the parsed sentence and every field of each token, for comparison
# with the rendered tree.
print(sentences[0])

for token in sentences[0]:
    print(token.items())

TokenList<Bemegyek, a, házba, ., metadata={generator: "UDPipe 2, https://lindat.mff.cuni.cz/services/udpipe", udpipe_model: "hungarian-szeged-ud-2.12-230717", udpipe_model_licence: "CC BY-NC-SA", newdoc: "None", newpar: "None", sent_id: "1", text: "Bemegyek a házba."}>
dict_items([('id', 1), ('form', 'Bemegyek'), ('lemma', 'bemegy'), ('upos', 'VERB'), ('xpos', None), ('feats', {'Definite': 'Ind', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '1', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}), ('head', 0), ('deprel', 'root'), ('deps', None), ('misc', {'TokenRange': '0:8'})])
dict_items([('id', 2), ('form', 'a'), ('lemma', 'a'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite': 'Def', 'PronType': 'Art'}), ('head', 3), ('deprel', 'det'), ('deps', None), ('misc', {'TokenRange': '9:10'})])
dict_items([('id', 3), ('form', 'házba'), ('lemma', 'ház'), ('upos', 'NOUN'), ('xpos', None), ('feats', {'Case': 'Ill', 'Number': 'Sing'}), ('head', 1), ('deprel', 'obl'), ('deps', None), ('misc

In [3]:
# Draw the dependency tree of the sentence with sent_id '2'.
render_doc_by_sent_id(docs, sentences, '2')
# English gloss: "I put the book into the wardrobe."


# Print the parsed sentence and every field of each token.
print(sentences[1])

for token in sentences[1]:
    print(token.items())

TokenList<A, könyvet, beleteszem, a, szekrénybe, ., metadata={sent_id: "2", text: "A könyvet beleteszem a szekrénybe."}>
dict_items([('id', 1), ('form', 'A'), ('lemma', 'a'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite': 'Def', 'PronType': 'Art'}), ('head', 2), ('deprel', 'det'), ('deps', None), ('misc', {'TokenRange': '18:19'})])
dict_items([('id', 2), ('form', 'könyvet'), ('lemma', 'könyv'), ('upos', 'NOUN'), ('xpos', None), ('feats', {'Case': 'Acc', 'Number': 'Sing'}), ('head', 3), ('deprel', 'obj'), ('deps', None), ('misc', {'TokenRange': '20:27'})])
dict_items([('id', 3), ('form', 'beleteszem'), ('lemma', 'beletesz'), ('upos', 'VERB'), ('xpos', None), ('feats', {'Definite': 'Def', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '1', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}), ('head', 0), ('deprel', 'root'), ('deps', None), ('misc', {'TokenRange': '28:38'})])
dict_items([('id', 4), ('form', 'a'), ('lemma', 'a'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite'

In [4]:
# Draw the dependency tree of the sentence with sent_id '3'.
render_doc_by_sent_id(docs, sentences, '3')
# English gloss: "The car parks in the garage."


# Print the parsed sentence and every field of each token.
print(sentences[2])

for token in sentences[2]:
    print(token.items())

TokenList<Az, autó, beparkol, a, garázsba, ., metadata={sent_id: "3", text: "Az autó beparkol a garázsba."}>
dict_items([('id', 1), ('form', 'Az'), ('lemma', 'az'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite': 'Def', 'PronType': 'Art'}), ('head', 2), ('deprel', 'det'), ('deps', None), ('misc', {'TokenRange': '53:55'})])
dict_items([('id', 2), ('form', 'autó'), ('lemma', 'autó'), ('upos', 'NOUN'), ('xpos', None), ('feats', {'Case': 'Nom', 'Number': 'Sing'}), ('head', 3), ('deprel', 'nsubj'), ('deps', None), ('misc', {'TokenRange': '56:60'})])
dict_items([('id', 3), ('form', 'beparkol'), ('lemma', 'be+parkol'), ('upos', 'VERB'), ('xpos', None), ('feats', {'Definite': 'Ind', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}), ('head', 0), ('deprel', 'root'), ('deps', None), ('misc', {'TokenRange': '61:69'})])
dict_items([('id', 4), ('form', 'a'), ('lemma', 'a'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite': 'Def', 'Pro

In [None]:
# Draw the dependency tree of the sentence with sent_id '4'.
render_doc_by_sent_id(docs, sentences, '4')

# English gloss: "The children go into the playground."
# Uses the sublative case.

# Print the parsed sentence and every field of each token.
print(sentences[3])

for token in sentences[3]:
    print(token.items())

TokenList<A, gyerekek, bemennek, a, játszótérre, ., metadata={sent_id: "4", text: "A gyerekek bemennek a játszótérre."}>
dict_items([('id', 1), ('form', 'A'), ('lemma', 'a'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite': 'Def', 'PronType': 'Art'}), ('head', 2), ('deprel', 'det'), ('deps', None), ('misc', {'TokenRange': '82:83'})])
dict_items([('id', 2), ('form', 'gyerekek'), ('lemma', 'gyerek'), ('upos', 'NOUN'), ('xpos', None), ('feats', {'Case': 'Nom', 'Number': 'Plur'}), ('head', 3), ('deprel', 'nsubj'), ('deps', None), ('misc', {'TokenRange': '84:92'})])
dict_items([('id', 3), ('form', 'bemennek'), ('lemma', 'bemen'), ('upos', 'VERB'), ('xpos', None), ('feats', {'Definite': 'Ind', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}), ('head', 0), ('deprel', 'root'), ('deps', None), ('misc', {'TokenRange': '93:101'})])
dict_items([('id', 4), ('form', 'a'), ('lemma', 'a'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite'

In [None]:
# Draw the dependency tree of the sentence with sent_id '5'.
render_doc_by_sent_id(docs, sentences, '5')
# English gloss: "The dog runs into the garden."

# Print the parsed sentence and every field of each token.
print(sentences[4])

for token in sentences[4]:
    print(token.items())

TokenList<A, kutya, beszalad, a, kertbe, ., metadata={sent_id: "5", text: "A kutya beszalad a kertbe."}>
dict_items([('id', 1), ('form', 'A'), ('lemma', 'a'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite': 'Def', 'PronType': 'Art'}), ('head', 2), ('deprel', 'det'), ('deps', None), ('misc', {'TokenRange': '117:118'})])
dict_items([('id', 2), ('form', 'kutya'), ('lemma', 'kutya'), ('upos', 'NOUN'), ('xpos', None), ('feats', {'Case': 'Nom', 'Number': 'Sing'}), ('head', 3), ('deprel', 'nsubj'), ('deps', None), ('misc', {'TokenRange': '119:124'})])
dict_items([('id', 3), ('form', 'beszalad'), ('lemma', 'beszalad'), ('upos', 'VERB'), ('xpos', None), ('feats', {'Definite': 'Ind', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}), ('head', 0), ('deprel', 'root'), ('deps', None), ('misc', {'TokenRange': '125:133'})])
dict_items([('id', 4), ('form', 'a'), ('lemma', 'a'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite': 'Def', 'Pr

In [8]:
# Draw the dependency tree of the sentence with sent_id '6'.
render_doc_by_sent_id(docs, sentences, '6')
# English gloss: "The book is at the table."


# Print the parsed sentence and every field of each token.
print(sentences[5])

for token in sentences[5]:
    print(token.items())

TokenList<A, könyv, az, asztalnál, van, ., metadata={sent_id: "6", text: "A könyv az asztalnál van."}>
dict_items([('id', 1), ('form', 'A'), ('lemma', 'a'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite': 'Def', 'PronType': 'Art'}), ('head', 2), ('deprel', 'det'), ('deps', None), ('misc', {'TokenRange': '144:145'})])
dict_items([('id', 2), ('form', 'könyv'), ('lemma', 'könyv'), ('upos', 'NOUN'), ('xpos', None), ('feats', {'Case': 'Nom', 'Number': 'Sing'}), ('head', 5), ('deprel', 'nsubj'), ('deps', None), ('misc', {'TokenRange': '146:151'})])
dict_items([('id', 3), ('form', 'az'), ('lemma', 'az'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite': 'Def', 'PronType': 'Art'}), ('head', 4), ('deprel', 'det'), ('deps', None), ('misc', {'TokenRange': '152:154'})])
dict_items([('id', 4), ('form', 'asztalnál'), ('lemma', 'asztal'), ('upos', 'NOUN'), ('xpos', None), ('feats', {'Case': 'Ade', 'Number': 'Sing'}), ('head', 5), ('deprel', 'obl'), ('deps', None), ('misc', {'TokenRange'

In [9]:
# Draw the dependency tree of the sentence with sent_id '7'.
render_doc_by_sent_id(docs, sentences, '7')
# English gloss: "The children are playing at the playground."



# Print the parsed sentence and every field of each token.
print(sentences[6])

for token in sentences[6]:
    print(token.items())

TokenList<A, gyerekek, a, játszótérnél, játszanak, ., metadata={sent_id: "7", text: "A gyerekek a játszótérnél játszanak."}>
dict_items([('id', 1), ('form', 'A'), ('lemma', 'a'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite': 'Def', 'PronType': 'Art'}), ('head', 2), ('deprel', 'det'), ('deps', None), ('misc', {'TokenRange': '170:171'})])
dict_items([('id', 2), ('form', 'gyerekek'), ('lemma', 'gyerek'), ('upos', 'NOUN'), ('xpos', None), ('feats', {'Case': 'Nom', 'Number': 'Plur'}), ('head', 5), ('deprel', 'nsubj'), ('deps', None), ('misc', {'TokenRange': '172:180'})])
dict_items([('id', 3), ('form', 'a'), ('lemma', 'a'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite': 'Def', 'PronType': 'Art'}), ('head', 4), ('deprel', 'det'), ('deps', None), ('misc', {'TokenRange': '181:182'})])
dict_items([('id', 4), ('form', 'játszótérnél'), ('lemma', 'játszótér'), ('upos', 'NOUN'), ('xpos', None), ('feats', {'Case': 'Ade', 'Number': 'Sing'}), ('head', 5), ('deprel', 'obl'), ('deps',

In [11]:
# Draw the dependency tree of the sentence with sent_id '8'.
render_doc_by_sent_id(docs, sentences, '8')
# English gloss: "The dog is waiting by the gate."

# Incorrect analysis.
# NOTE(review): the printed output has 'kapunál' with lemma 'kapunál' and
# Case=Nom; presumably that is the error meant here - confirm.

# Print the parsed sentence and every field of each token.
print(sentences[7])

for token in sentences[7]:
    print(token.items())

TokenList<A, kutya, a, kapunál, vár, ., metadata={sent_id: "8", text: "A kutya a kapunál vár."}>
dict_items([('id', 1), ('form', 'A'), ('lemma', 'a'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite': 'Def', 'PronType': 'Art'}), ('head', 2), ('deprel', 'det'), ('deps', None), ('misc', {'TokenRange': '207:208'})])
dict_items([('id', 2), ('form', 'kutya'), ('lemma', 'kutya'), ('upos', 'NOUN'), ('xpos', None), ('feats', {'Case': 'Nom', 'Number': 'Sing'}), ('head', 5), ('deprel', 'nsubj'), ('deps', None), ('misc', {'TokenRange': '209:214'})])
dict_items([('id', 3), ('form', 'a'), ('lemma', 'a'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite': 'Def', 'PronType': 'Art'}), ('head', 4), ('deprel', 'det'), ('deps', None), ('misc', {'TokenRange': '215:216'})])
dict_items([('id', 4), ('form', 'kapunál'), ('lemma', 'kapunál'), ('upos', 'NOUN'), ('xpos', None), ('feats', {'Case': 'Nom', 'Number': 'Sing'}), ('head', 5), ('deprel', 'obl'), ('deps', None), ('misc', {'TokenRange': '217:22

In [13]:
# Draw the dependency tree of the sentence with sent_id '9'.
render_doc_by_sent_id(docs, sentences, '9')
# English gloss: "We'll meet at the school."

# Incorrect analysis.
# NOTE(review): the printed output has 'Találkozunk' with the inflected form
# itself as lemma; presumably that is the error meant here - confirm.

# Print the parsed sentence and every field of each token.
print(sentences[8])

for token in sentences[8]:
    print(token.items())

TokenList<Találkozunk, az, iskolánál, ., metadata={sent_id: "9", text: "Találkozunk az iskolánál."}>
dict_items([('id', 1), ('form', 'Találkozunk'), ('lemma', 'Találkozunk'), ('upos', 'VERB'), ('xpos', None), ('feats', {'Definite': 'Ind', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '1', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}), ('head', 0), ('deprel', 'root'), ('deps', None), ('misc', {'TokenRange': '230:241'})])
dict_items([('id', 2), ('form', 'az'), ('lemma', 'az'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite': 'Def', 'PronType': 'Art'}), ('head', 3), ('deprel', 'det'), ('deps', None), ('misc', {'TokenRange': '242:244'})])
dict_items([('id', 3), ('form', 'iskolánál'), ('lemma', 'iskola'), ('upos', 'NOUN'), ('xpos', None), ('feats', {'Case': 'Ade', 'Number': 'Sing'}), ('head', 1), ('deprel', 'obl'), ('deps', None), ('misc', {'SpaceAfter': 'No', 'TokenRange': '245:254'})])
dict_items([('id', 4), ('form', '.'), ('lemma', '.'), ('upos', 'PUNCT'), ('xpos', None), ('fe

In [14]:
# Draw the dependency tree of the sentence with sent_id '10'.
render_doc_by_sent_id(docs, sentences, '10')
# English gloss: "The car is parked by the house."

# Incorrect analysis.
# TODO(review): state which part of the analysis is wrong; it is not evident
# from the portion of the output shown below.

# Print the parsed sentence and every field of each token.
print(sentences[9])

for token in sentences[9]:
    print(token.items())

TokenList<Az, autó, a, háznál, parkol, ., metadata={sent_id: "10", text: "Az autó a háznál parkol."}>
dict_items([('id', 1), ('form', 'Az'), ('lemma', 'az'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite': 'Def', 'PronType': 'Art'}), ('head', 2), ('deprel', 'det'), ('deps', None), ('misc', {'TokenRange': '256:258'})])
dict_items([('id', 2), ('form', 'autó'), ('lemma', 'autó'), ('upos', 'NOUN'), ('xpos', None), ('feats', {'Case': 'Nom', 'Number': 'Sing'}), ('head', 5), ('deprel', 'nsubj'), ('deps', None), ('misc', {'TokenRange': '259:263'})])
dict_items([('id', 3), ('form', 'a'), ('lemma', 'a'), ('upos', 'DET'), ('xpos', None), ('feats', {'Definite': 'Def', 'PronType': 'Art'}), ('head', 4), ('deprel', 'det'), ('deps', None), ('misc', {'TokenRange': '264:265'})])
dict_items([('id', 4), ('form', 'háznál'), ('lemma', 'ház'), ('upos', 'NOUN'), ('xpos', None), ('feats', {'Case': 'Ade', 'Number': 'Sing'}), ('head', 5), ('deprel', 'obl'), ('deps', None), ('misc', {'TokenRange': '266:27