### NLTK

In [1]:
## Import the library and download the models and the WordNet dictionary
import nltk
nltk.download('punkt')  # presumably the tokenizer models needed by nltk.word_tokenize below
nltk.download('averaged_perceptron_tagger')  # presumably the model needed by nltk.pos_tag below
nltk.download('wordnet')  # WordNet data, used by the WordNetLemmatizer cells below

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\velic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\velic\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\velic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Text tokenization: split the sample sentence into word tokens.
# `tokenized` is reused by the tagging, lemmatization and stemming cells below.
text = 'The way to get started is to quit talking and begin doing'
tokenized = nltk.word_tokenize(text)
tokenized


['The',
 'way',
 'to',
 'get',
 'started',
 'is',
 'to',
 'quit',
 'talking',
 'and',
 'begin',
 'doing']

In [3]:
# Part-of-speech tagging: produces a list of (token, tag) pairs for the tokens above.
tagged = nltk.pos_tag(tokenized)
tagged

[('The', 'DT'),
 ('way', 'NN'),
 ('to', 'TO'),
 ('get', 'VB'),
 ('started', 'VBN'),
 ('is', 'VBZ'),
 ('to', 'TO'),
 ('quit', 'VB'),
 ('talking', 'VBG'),
 ('and', 'CC'),
 ('begin', 'VB'),
 ('doing', 'VBG')]

In [8]:
# Inspect the 4th (token, tag) pair: index [0] is the token itself,
# index [1] would be its part-of-speech tag.
tagged[3][0]

'get'

In [5]:
# Lemmatization without part-of-speech information: every token is passed to
# the lemmatizer with its default arguments.
from nltk.stem import WordNetLemmatizer
lemm = WordNetLemmatizer()
lemmatized = list(map(lemm.lemmatize, tokenized))
lemmatized


['The',
 'way',
 'to',
 'get',
 'started',
 'is',
 'to',
 'quit',
 'talking',
 'and',
 'begin',
 'doing']

In [6]:
# Lemmatization that takes the part of speech into account.
# Step 1: convert the tagger's tags into WordNet part-of-speech constants.
def pos_tagger(nltk_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The decision is made on the tag's leading letter:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.

    Args:
        nltk_tag: tag string as found in the (token, tag) pairs of `tagged`.

    Returns:
        The corresponding `nltk.corpus.wordnet` constant, or None when the
        tag does not start with any of the four letters above.
    """
    # Attribute names are resolved lazily so that `nltk.corpus.wordnet` is
    # only touched for tags that actually match.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            return getattr(nltk.corpus.wordnet, attr_name)
    return None

# Display the (token, tag) pairs again for reference.
tagged

[('The', 'DT'),
 ('way', 'NN'),
 ('to', 'TO'),
 ('get', 'VB'),
 ('started', 'VBN'),
 ('is', 'VBZ'),
 ('to', 'TO'),
 ('quit', 'VB'),
 ('talking', 'VBG'),
 ('and', 'CC'),
 ('begin', 'VB'),
 ('doing', 'VBG')]

In [9]:
# Lemmatize using the part of speech of each token.
# The converted pairs are stored under a new name instead of overwriting
# `tagged`: overwriting made this cell fail on a second run, because
# pos_tagger would then receive the already-converted tags (including None).
wordnet_tagged = [(word, pos_tagger(tag)) for word, tag in tagged]
lemmatized = [
    # Tokens without a WordNet POS fall back to the lemmatizer's default call.
    lemm.lemmatize(word) if tag is None else lemm.lemmatize(word, tag)
    for word, tag in wordnet_tagged
]
lemmatized


['The',
 'way',
 'to',
 'get',
 'start',
 'be',
 'to',
 'quit',
 'talk',
 'and',
 'begin',
 'do']

In [None]:
# English stemming with the Snowball stemmer.
# Input sentence: 'The way to get started is to quit talking and begin doing'
# (kept as a comment: as a bare string on the last line it would be displayed
# instead of `stemmed`).
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer(language='english')
stemmed = [stemmer.stem(word) for word in tokenized]
stemmed

['the',
 'way',
 'to',
 'get',
 'start',
 'is',
 'to',
 'quit',
 'talk',
 'and',
 'begin',
 'do']

In [None]:
# Russian stemming with the Snowball stemmer.
# Note: this cell rebinds `text`, `tokenized`, `stemmer` and `stemmed`
# that were set by the English cells above.
text = 'У Лукоморья дуб зеленый, златая цепь на дубе том'
tokenized = nltk.word_tokenize(text)
stemmer = SnowballStemmer(language='russian')
stemmed = list(map(stemmer.stem, tokenized))
stemmed

['у', 'лукомор', 'дуб', 'зелен', ',', 'злат', 'цеп', 'на', 'дуб', 'том']

### pymorphy2
Documentation: https://pymorphy2.readthedocs.io/en/stable/

In [23]:
# Create a pymorphy2 morphological analyzer and read the part-of-speech
# attribute of the first parse returned for the word 'красивые'.
import pymorphy2
morph = pymorphy2.MorphAnalyzer()
morph.parse('красивые')[0].tag.POS

'ADJF'

In [25]:
# List every parse the analyzer returns for one word; each entry carries the
# tag, normal form and score (see the output below).
# Note: this rebinds `text`, which earlier cells used for whole sentences.
text = 'ключ'
morph.parse(text)

[Parse(word='ключ', tag=OpencorporaTag('NOUN,inan,masc sing,nomn'), normal_form='ключ', score=0.76923, methods_stack=((DictionaryAnalyzer(), 'ключ', 136, 0),)),
 Parse(word='ключ', tag=OpencorporaTag('NOUN,inan,masc sing,accs'), normal_form='ключ', score=0.230769, methods_stack=((DictionaryAnalyzer(), 'ключ', 136, 3),))]