In [1]:
# Import potrebných knižníc
import stanza
import pandas as pd
import requests
import json  # Zabezpečenie správneho importu JSON knižnice

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load a text file
def read_text_file(filepath):
    """Return the full contents of the file at ``filepath``.

    The file is opened in text mode and decoded as UTF-8; the whole file is
    read into memory at once and returned as a single string.
    """
    with open(filepath, mode="r", encoding="utf-8") as handle:
        return handle.read()

# Input text to analyse, read fully into memory.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that exact file exists — consider a configurable location.
filepath = r"C:\Users\marti\Downloads\DiploDiktaty.txt"
text = read_text_file(filepath)

# Download the Slovak model for Stanza (if needed)
stanza.download('sk')
# No `processors` argument is given, so Stanza chooses the processors itself.
nlp = stanza.Pipeline(lang='sk')

# Run the whole text through the Stanza pipeline.
# NOTE(review): the loop further down in this cell reads only
# `dokument.sentences` and each sentence's `.text`; if nothing else uses the
# other annotations, a tokenize-only pipeline may be enough — confirm first.
dokument = nlp(text)

# Rows for the output table: one list per token line returned by UDPipe, laid
# out as [token, lemma, xpos1..xpos7, deprel, head, id].
data = []

# UDPipe REST endpoint, the Slovak model to use, and a per-request timeout so a
# stalled connection cannot hang the cell indefinitely.
UDPIPE_URL = "http://lindat.mff.cuni.cz/services/udpipe/api/process"
UDPIPE_MODEL = "slovak-snk-ud-2.15-241121"
UDPIPE_TIMEOUT = 30  # seconds

# Send every sentence found by Stanza to the UDPipe API and parse the
# tab-separated result it returns.
for veta in dokument.sentences:
    try:
        # The query is passed via `params` so that requests percent-encodes the
        # sentence. Interpolating raw text into the URL breaks as soon as the
        # sentence contains '&', '#', '+', '%' or similar characters.
        response = requests.get(
            UDPIPE_URL,
            params={
                "tokenizer": "",
                "tagger": "",
                "parser": "",
                "data": veta.text,
                "model": UDPIPE_MODEL,
            },
            timeout=UDPIPE_TIMEOUT,
        )
        if response.status_code != 200:
            print(f"Chyba API pri spracovaní vety: {veta.text}")
            continue

        # The service answers with JSON; the annotated text is under 'result'.
        vysledok = response.json()

        for riadok in vysledok['result'].split('\n'):
            # Skip comment lines and blank separator lines.
            if riadok.startswith("#") or not riadok.strip():
                continue

            stlpce = riadok.split('\t')
            if len(stlpce) < 8:
                # Skip only the malformed line instead of letting an IndexError
                # discard the remaining tokens of this sentence.
                print(f"Preskakujem neplatný riadok výsledku: {riadok!r}")
                continue

            # Columns 0-7 of the result line; columns 3 and 5 are not exported.
            id_slova, token, lemma, _stlpec3, xpos, _stlpec5, head, deprel = stlpce[:8]

            # First seven characters of the XPOS tag, one per output column,
            # padded with '-' when the tag is shorter than seven characters.
            xpos_znaky = list(xpos[:7].ljust(7, '-'))

            data.append([token, lemma, *xpos_znaky, deprel, head, id_slova])
    except Exception as e:
        # Deliberately best-effort: report the failing sentence (network error,
        # timeout, invalid JSON, missing 'result' key) and carry on.
        print(f"Chyba pri spracovaní vety '{veta.text}': {e}")

# Build the DataFrame from the collected rows and save it as an Excel workbook.
# The file name is defined once so the confirmation message cannot drift from
# the file that is actually written (it previously announced 'output.csv').
vystupny_subor = "output.xlsx"
df = pd.DataFrame(data, columns=['Token', 'Lemma', 'Xpos1', 'Xpos2', 'Xpos3', 'Xpos4', 'Xpos5', 'Xpos6', 'Xpos7', 'Deprel', 'Head', 'ID'])
df.to_excel(vystupny_subor, index=False, engine="openpyxl")

print(f"Výstup uložený do '{vystupny_subor}'.")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 424kB [00:00, 5.65MB/s]                    
2025-01-12 14:12:09 INFO: Downloaded file to C:\Users\marti\stanza_resources\resources.json
2025-01-12 14:12:09 INFO: Downloading default packages for language: sk (Slovak) ...
2025-01-12 14:12:10 INFO: File exists: C:\Users\marti\stanza_resources\sk\default.zip
2025-01-12 14:12:11 INFO: Finished downloading models and saved to C:\Users\marti\stanza_resources
2025-01-12 14:12:11 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 424kB [00:00, 12.6MB/s]                    
2025-01-12 14:12:11 INFO: Downloaded file to C:\Users\marti\stanza_resources\resources.json
2025-01-12 14:12:11 INFO: Loading thes

Výstup uložený do 'output.csv'.
