In [1]:
from nltk import sent_tokenize
from nltk import word_tokenize
import requests
from xml.etree import ElementTree
import time

# Endpoint and model of the NameTag web service queried when use_ufal=True.
_NAMETAG_URL = 'http://lindat.mff.cuni.cz/services/nametag/api/recognize'
_NAMETAG_MODEL = 'czech-cnec2.0-140304'
# Seconds to wait for the service before the message is treated as failed;
# without a timeout a stalled connection would block the notebook forever.
_NAMETAG_TIMEOUT = 10
# Pause (seconds) inserted after every request so the service is not flooded.
_NAMETAG_DELAY = 0.1

# First character of an <ne type="..."> attribute -> placeholder that replaces
# the entity in the output. Matching is case-sensitive; types whose first
# character is not listed here get no placeholder.
_ENTITY_PLACEHOLDERS = {
    "a": "adresa",
    "g": "místo",
    "i": "instituce",
    "m": "soubor",
    "n": "číslo",
    "o": "produkt",
    "p": "jméno",
    "t": "čas",
}


def _collect_tokens(element, inside_entity, out):
    """Append tokens and entity placeholders found below ``element`` to ``out``.

    Elements are visited in document order. ``inside_entity`` is True while
    walking the subtree of an ``<ne>`` that already produced a placeholder;
    every ``<token>`` in such a subtree is suppressed, however many there are.
    """
    for child in element:
        if child.tag == "ne":
            # A missing "type" attribute simply means "no placeholder".
            placeholder = _ENTITY_PLACEHOLDERS.get((child.get("type") or "")[:1])
            if placeholder is not None:
                out.append(placeholder)
            # Nested entities still emit their own placeholder; an entity
            # without a mapping keeps its tokens unless an enclosing entity
            # was already replaced.
            _collect_tokens(child, inside_entity or placeholder is not None, out)
        elif child.tag == "token":
            if not inside_entity:
                out.append(child.text)
        else:
            _collect_tokens(child, inside_entity, out)


def _parse_nametag_result(xml_fragment):
    """Turn the XML fragment returned by the service into lists of tokens.

    The fragment has no single root element, so it is wrapped in ``<root>``
    before parsing. Returns one list of strings per ``<sentence>`` element.
    """
    root = ElementTree.fromstring("<root>" + xml_fragment + "</root>")
    parsed = []
    for sent in root.iter("sentence"):
        tokens = []
        _collect_tokens(sent, False, tokens)
        parsed.append(tokens)
    return parsed


def _recognize_with_nametag(msg):
    """Send ``msg`` to the NameTag service and return its tokenized sentences.

    Raises whatever ``requests`` or the XML/JSON parsing raises; the caller
    decides how to handle failures.
    """
    query = {
        'data': msg,
        'output': 'xml',
        'model': _NAMETAG_MODEL,
    }
    response = requests.get(_NAMETAG_URL, params=query, timeout=_NAMETAG_TIMEOUT)
    # Surface HTTP errors as such instead of as a confusing JSON/KeyError.
    response.raise_for_status()
    return _parse_nametag_result(response.json()["result"])


def tokenize_sentences(sentences, use_ufal = False):
    """Split every message into sentences and every sentence into tokens.

    Parameters
    ----------
    sentences : iterable of str
        The messages to process (each may contain several sentences).
    use_ufal : bool, optional
        False (default): tokenize locally with NLTK, using the Czech sentence
        model for both sentence and word tokenization.
        True: query the NameTag web service once per message; named entities
        with a known type are replaced by a placeholder word (see
        ``_ENTITY_PLACEHOLDERS``) and all tokens inside such an entity are
        dropped.

    Returns
    -------
    list of (str, list of list of str)
        One ``(message, tokenized_sentences)`` pair per input message, in
        input order. With ``use_ufal=True`` the message is stripped of
        surrounding whitespace, and a message whose request or parsing failed
        is reported on stdout and paired with an empty list.
    """
    tokenized_sentences = []

    if not use_ufal:
        for msg in sentences:
            msg_sentences = [
                list(word_tokenize(sent, language="czech"))
                for sent in sent_tokenize(msg, language="czech")
            ]
            tokenized_sentences.append((msg, msg_sentences))
        return tokenized_sentences

    for msg in sentences:
        msg = msg.strip()
        # Reset before the request so a failure never reuses the input list
        # or the previous message's tokens as this message's result.
        msg_sentences = []

        try:
            msg_sentences = _recognize_with_nametag(msg)
        except Exception as e:
            # Best effort: report the failure and carry on with the next message.
            print("Error with " + msg)
            print(str(e))

        tokenized_sentences.append((msg, msg_sentences))
        time.sleep(_NAMETAG_DELAY)

    return tokenized_sentences

In [2]:
# Demo of the NameTag-backed path (needs network access). The sample message
# contains the same first name capitalised ("Jakub") and lower-cased ("jakub")
# so the printed tokens show how each spelling is handled.
for orig, sentences in tokenize_sentences(
    [
        "Ahoj, jak se mas? Když napíšu Jakub tak to jde, ale pokud jenom jakub tak to nic nenajde. No, prostě to neni dokonalé"
    ],
    use_ufal=True,
):
    # One output line per sentence, tokens separated by "|".
    for sent in sentences:
        print("|".join(sent))

Ahoj|,|jak|se|mas|?
Když|napíšu|jméno|tak|to|jde|,|ale|pokud|jenom|jakub|tak|to|nic|nenajde|.
No|,|prostě|to|neni|dokonalé
