#### Токенизация  
Токенизация — процесс разбиения текстового документа на отдельные единицы (слова или предложения), которые называются токенами.

In [91]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the Punkt models used by the NLTK tokenizers below.
nltk.download('punkt')

# Three sentences; adjacent literals are concatenated into a single string.
text = (
    'Backgammon is one of the oldest known board games. '
    'Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East. '
    'It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.'
)

# Sentence-level tokenization.
# For word-level tokens use nltk.word_tokenize(text) instead.
tokens = nltk.sent_tokenize(text)

tokens


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dolzh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['Backgammon is one of the oldest known board games.',
 'Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East.',
 'It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.']

#### Лемматизация и стемминг текста

Приведение всех форм слова к одной нормальной форме

In [92]:
import nltk
# nltk.download('wordnet')  # uncomment on first run: WordNet data for the lemmatizer
# nltk.download('punkt')    # uncomment on first run: tokenizer models
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Another input worth trying: 'dog dogs dog’s dogs’'
text = "The striped bats are hanging on their feet for best"
word = 'dogs'

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Single word: keep the result in a name so it can be inspected
# (a bare expression in the middle of a cell is not displayed).
word_lemma = lemmatizer.lemmatize(word)

# A sentence has to be split into word tokens before it can be lemmatized.
word_list = word_tokenize(text)

# Lemmatize every token (no POS hint is passed here) and join them back.
lemmatized_out = ' '.join(lemmatizer.lemmatize(w) for w in word_list)
print(text)
print(lemmatized_out)

The striped bats are hanging on their feet for best
The striped bat are hanging on their foot for best


In [93]:
# No POS hint is given here; the recorded result is 'are', i.e. the verb form
# is returned unchanged.
lemmatizer = WordNetLemmatizer()
word = 'are'
lemmatizer.lemmatize(word)

'are'

Чтобы получить корректную лемму (например, «be» для «are»), необходимо передать лемматизатору аргумент part-of-speech (POS-тег).

In [94]:
# Same surface form, different lemma depending on the POS hint:
# first as a verb ('v'), then as a noun ('n'); one result per line.
print(*(lemmatizer.lemmatize("stripes", pos_hint) for pos_hint in ('v', 'n')), sep="\n")

strip
stripe


Как получить POS-тег для каждого слова:

In [95]:
# pos_tag works on a list of tokens, so a single word is wrapped in a list;
# the result is a list of (token, tag) pairs.
feet_tagged = nltk.pos_tag(['feet'])
print(feet_tagged)

[('feet', 'NNS')]


In [96]:
text = "The striped bats are hanging on their feet for best"

# Split into word tokens first, then tag the whole sentence in one call.
text_words = nltk.word_tokenize(text)
print(nltk.pos_tag(text_words))

[('The', 'DT'), ('striped', 'JJ'), ('bats', 'NNS'), ('are', 'VBP'), ('hanging', 'VBG'), ('on', 'IN'), ('their', 'PRP$'), ('feet', 'NNS'), ('for', 'IN'), ('best', 'JJS')]


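`pos_tag()` возвращает список кортежей (слово, тег) с тегами Penn Treebank, но лемматизатор WordNet принимает тег в другом виде (константы `wordnet.ADJ`, `wordnet.NOUN`, `wordnet.VERB`, `wordnet.ADV`), поэтому тег нужно преобразовать.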

In [104]:
def get_wordnet_pos(word):
    """Map the NLTK POS tag of ``word`` to the POS constant ``lemmatize()`` accepts.

    The word is tagged on its own with ``nltk.pos_tag([word])``, and the first
    letter of the resulting tag selects the WordNet constant.

    Args:
        word: A single token (string).

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or ``wordnet.ADV``
        for tags starting with J, N, V or R respectively; ``wordnet.NOUN`` for
        any other tag.
    """
    # Imported here so the function does not depend on another cell having
    # brought `nltk` / `wordnet` into the notebook namespace.
    import nltk
    from nltk.corpus import wordnet

    # pos_tag returns [(token, tag)]; only the tag's first letter is needed.
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [100]:
# word = 'slowly'
# # tag = nltk.pos_tag([word])[0][1][0].upper()
# tag = nltk.pos_tag([word])[0][1][0].upper()
# print(tag)

# tag_dict = {"J": wordnet.ADJ,
#             "N": wordnet.NOUN,
#             "V": wordnet.VERB,
#             "R": wordnet.ADV}

# print(tag_dict.get(tag, wordnet.NOUN))

In [112]:
# Step 1: create the lemmatizer.
lemmatizer = WordNetLemmatizer()

# Step 2: lemmatize one word, passing the POS derived from its tag.
word = 'are'
word_pos = get_wordnet_pos(word)
print(lemmatizer.lemmatize(word, word_pos))

# Step 3: do the same for every token of a sentence.
# NOTE(review): get_wordnet_pos tags each token in isolation, so the tag can
# differ from the in-sentence one (the earlier pos_tag output tags 'striped'
# as JJ, while the recorded result of this cell lemmatizes it to 'strip').
sentence = "The striped bats are hanging on their feet for best"
sentence_words = nltk.word_tokenize(sentence)
sentence_lemmas = [lemmatizer.lemmatize(tok, get_wordnet_pos(tok)) for tok in sentence_words]
print(sentence_lemmas)


be
['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']


In [118]:
import time

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# The VADER lexicon is downloaded before the analyzer is constructed.
nltk.downloader.download('vader_lexicon')
sent_analyzer = SentimentIntensityAnalyzer()

# Two sample reviews.
# Alternative first review to try: "The hotel was very good, I love it!"
rev1 = "I'm not glad to see you!"
rev2 = "It was just horrible, the worst ever."

# Build both reports first, then print them one after the other; the second
# report starts with a newline, which yields the blank line between them.
report_rev1 = f"review 1:\n{rev1}\nScore: {sent_analyzer.polarity_scores(rev1)}"
report_rev2 = f"\nreview 2:\n{rev2}\nScore: {sent_analyzer.polarity_scores(rev2)}"
print(report_rev1, report_rev2, sep="\n")

review 1:
I'm not glad to see you!
Score: {'neg': 0.357, 'neu': 0.643, 'pos': 0.0, 'compound': -0.416}

review 2:
It was just horrible, the worst ever.
Score: {'neg': 0.603, 'neu': 0.397, 'pos': 0.0, 'compound': -0.8225}


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dolzh\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
