In [19]:
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)
import json
import os
import re
from pymorphy2 import MorphAnalyzer
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
from pymystem3 import Mystem
from flair.data import Sentence
from flair.models import SequenceTagger
import spacy

morph = MorphAnalyzer()

## Русский

Некоторые модули не работали в колабе, а другие локально, поэтому код совмещенный (но он честно работает). Для русского взяла текст с кучей омографов(разных слов, которые пишутся одинаково), т.к однозначно различить их по тексту можно только зная окружающий контекст.

In [None]:
with open('rus_text.txt',encoding='utf-8') as f:
    text = f.read()
words = [w.lower() for w in word_tokenize(text) if w.isalpha()]
doc = Doc(' '.join(words))

In [None]:
text

'Пойдем, сходим к берегу. Я берегу деньги на черный день. Нам любые дороги дороги. Солнце село, стало темно, собралось всё село. Внизу огромная пропасть, не хотелось бы в нее попасть и пропасть. Печь пирожки в печи. Видны голубые дали. Ему дали совет. Время и стекло. Вода стекла на пол. Мой яблоко. Иванов пил сок. Никто не нашёл острых пил. Ветер стих. Сорок стихов. Сейчас – начало осени. Начало смеркаться. Ели растут в лесу. Мы ели сыр и выпили много вина. Это не твоя вина, что воздух сыр. Мы видим солнце, море и чаек над водой. Совсем уже не видим месяц за облаками.'

### Наташа

In [None]:
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
segmenter = Segmenter()

In [None]:
doc.segment(segmenter)
doc.tag_morph(morph_tagger)
nat_pos_list = []
for token in doc.tokens:
    nat_pos_list.append(token.pos)

### Mystem

In [None]:
#mystem работает в тысячу раз быстрее через консоль
inp = 'rus_text.txt'
outp = "rus_text.json"
mystem_path = os.path.join('/Users/User', 'mystem.exe')

input_filename = os.path.abspath(inp)
output_filename = os.path.abspath(outp)
os.system(f"{mystem_path} {'-gin'} {'--format json'} {input_filename} {output_filename}") 

0

In [None]:
ana_list = []
myst_pos_list = []
with open('rus_text.json', encoding='utf-8') as f:
    text2 = f.read()
    lines = text2.splitlines()

for line in lines:
    data = json.loads(line)['analysis']
    ana_list.append(data)
    
for i in range(len(ana_list)):
    if ana_list[i] == []:
        pos = 'UNK'
    else:
        pos = re.match('.+?(?=[,=])',ana_list[i][0]['gr']).group()
    myst_pos_list.append(pos)

### Pymorphy

In [None]:
pym_pos_list = []

for word in words:
    ana = morph.parse(word)
    first = ana[0] # использую только 1-й вариант разбора
    pos = first.tag.POS  
    pym_pos_list.append(str(pos))

### Accuracy

In [None]:
#текст с моей ручной разметкой
with open('gold.txt',encoding='utf-8') as f:
    gold = f.read()
    gold = gold.split('\n') 

In [25]:
def convert(pos_list):
    conv_pos_list = []
    for tag in pos_list:
        if tag in ['A','ADJ','ADJF','ADJS','JJ','JJR','JJS']:
            new_tag = 'ADJ'
        elif tag in ['S','NOUN','NN','NNS']:
            new_tag = 'NOUN'
        elif tag in ['V','VERB','INFN','VBG','MD','VB','VBD','VBN','VBP','VBZ']:
            new_tag = 'VERB'
        elif tag in ['NPRO','PRON','PRP']:
            new_tag = 'PRON'
        elif tag in ['CONJ','CCONJ','SCONJ','CC']:
            new_tag = 'CONJ'
        elif tag in ['PREP','ADP','PR','IN','RP']:
            new_tag = 'ADP'
        elif tag in ['ADV','ADVB','RB','RBR']:
            new_tag = 'ADV'
        elif tag in ['PART','PRCL','POS','TO']:
            new_tag = 'PART'
        elif tag in ['PROPN','NNP','NNPS']:
            new_tag = 'PROPN'
        elif tag in ['NUM','NUMR','CD']:
            new_tag = 'NUM'
        elif tag in ['AUX']:
            new_tag = 'AUX'
        elif tag in ['DET','DT','PDT','PRP$']:
            new_tag = 'DET'
        
        else:
            new_tag = 'OTHER'
        conv_pos_list.append(new_tag) 
    return conv_pos_list

In [None]:
print('Mystem',"Accuracy: %.4f" % accuracy_score(convert(myst_pos_list), gold))
print('Pymorphy',"Accuracy: %.4f" % accuracy_score(convert(pym_pos_list), gold))
print('Natasha',"Accuracy: %.4f" % accuracy_score(convert(nat_pos_list), gold))

Mystem Accuracy: 0.7327
Pymorphy Accuracy: 0.8020
Natasha Accuracy: 0.7822


## Английский 

С английским тоже взяла текст с кучей омографов.

In [10]:
with open('homographes.txt',encoding='utf-8') as f:
    text = f.read()

words = [w.lower() for w in word_tokenize(text) if w.isalpha()]

In [11]:
text

'Gold is heavier than lead. They are going to mass this morning. The tiger was now so close that I could smell it... I will be there in a minute. That is a very minute amount. Wind your watch. The wind howled through the woodlands. The mass is sufficient for nuclear fission. The mother duck will lead her ducklings around. "Will you close that door!" In the word ‘dinner’ the accent is on the first syllable. Andy was a good husband, and Nicky was clearly very content. We all have different learned responses to anger. The plane left for Dallas last night. Look left and right before you cross the road.'

### NLTK

In [12]:
nltk_pos_list = []
tagged = nltk.pos_tag(words)
for word in tagged:
    pos = word[1]
    nltk_pos_list.append(pos)

### Flair

In [17]:
tagger = SequenceTagger.load('upos')

2020-10-18 20:42:25,106 https://nlp.informatik.hu-berlin.de/resources/models/upos/en-pos-ontonotes-v0.4.pt not found in cache, downloading to /tmp/tmp25_aexok


100%|██████████| 432218302/432218302 [00:19<00:00, 22682550.62B/s]

2020-10-18 20:42:44,661 copying /tmp/tmp25_aexok to cache at /root/.flair/models/en-pos-ontonotes-v0.4.pt





2020-10-18 20:42:45,868 removing temp file /tmp/tmp25_aexok
2020-10-18 20:42:45,919 loading file /root/.flair/models/en-pos-ontonotes-v0.4.pt


In [18]:
flair_pos_list = []
for word in words:
    sentence = Sentence(word)
    tagger.predict(sentence)
    ana = sentence.to_tagged_string()
    pos = ana.split()[1]
    pos = re.sub('[<>]','', pos)
    flair_pos_list.append(pos)

### Spacy

In [20]:
import spacy

In [21]:
spacy_pos_list = []
nlp = spacy.load("en_core_web_sm")

doc = nlp(text)

for word in doc:
    if word.pos_ != 'PUNCT':
        spacy_pos_list.append(word.pos_)

### Accuracy

In [23]:
#текст с моей ручной разметкой
with open('eng_gold.txt',encoding='utf-8') as f:
    eng_gold = f.read()
    eng_gold = eng_gold.split('\n')

In [28]:
print('NLTK',"Accuracy: %.4f" % accuracy_score(convert(nltk_pos_list), eng_gold))
print('Flair',"Accuracy: %.4f" % accuracy_score(convert(flair_pos_list), eng_gold))
print('Spacy',"Accuracy: %.4f" % accuracy_score(convert(spacy_pos_list), eng_gold))

NLTK Accuracy: 0.7838
Flair Accuracy: 0.6577
Spacy Accuracy: 0.9640


Spacy получился самым эффективным (по крайней мере, на этом тексте)