In [0]:
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the 'punkt' tokenizer models; fetch them into /tmp/.
nltk.download('punkt', download_dir='/tmp/')
# Register that same directory on NLTK's resource search path. The entry must
# be absolute: a relative "tmp" would be resolved against the current working
# directory and would not point at the download location.
if '/tmp/' not in nltk.data.path:
    nltk.data.path.append('/tmp/')

sentence = "I am not going to work today."

# Tokenize the sentence and show the resulting list of tokens.
tokens = word_tokenize(sentence)
print(tokens)

## Tags
We can tag words using NLTK's pos_tag() function. Here is the list of tags:
* CC	coordinating conjunction
* CD	cardinal digit
* DT	determiner
* EX	existential there (like: "there is" ... think of it like "there exists")
* FW	foreign word
* IN	preposition/subordinating conjunction
* JJ	adjective	'big'
* JJR	adjective, comparative	'bigger'
* JJS	adjective, superlative	'biggest'
* LS	list marker	1)
* MD	modal	could, will
* NN	noun, singular 'desk'
* NNS	noun plural	'desks'
* NNP	proper noun, singular	'Harrison'
* NNPS	proper noun, plural	'Americans'
* PDT	predeterminer	'all the kids'
* POS	possessive ending	parent's
* PRP	personal pronoun	I, he, she
* PRP$	possessive pronoun	my, his, hers
* RB	adverb	very, silently,
* RBR	adverb, comparative	better
* RBS	adverb, superlative	best
* RP	particle	give up
* TO	to	go 'to' the store.
* UH	interjection	errrrrrrrm
* VB	verb, base form	take
* VBD	verb, past tense	took
* VBG	verb, gerund/present participle	taking
* VBN	verb, past participle	taken
* VBP	verb, sing. present, non-3rd person	take
* VBZ	verb, 3rd person sing. present	takes
* WDT	wh-determiner	which
* WP	wh-pronoun	who, what
* WP$	possessive wh-pronoun	whose
* WRB	wh-adverb	where, when

In [0]:
# pos_tag needs the averaged perceptron tagger model; fetch it into /tmp/.
nltk.download('averaged_perceptron_tagger', download_dir='/tmp/')
# Make sure the download directory is on NLTK's resource search path so the
# tagger model can be located (absolute path, added only once on re-runs).
if '/tmp/' not in nltk.data.path:
    nltk.data.path.append('/tmp/')

# "refuse" and "permit" each appear twice in this sentence, which makes it a
# handy example for looking at the tags assigned to each occurrence.
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)

## Stop Words
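Stop words are very common words (such as "the", "is" and "a") that usually carry little meaning on their own. NLTK provides a list of them that we can use to remove such words from the text.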

In [0]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the stop-word corpus into /tmp/ and make sure that directory is on
# NLTK's resource search path (absolute path, added only once on re-runs).
nltk.download('stopwords', download_dir='/tmp/')
if '/tmp/' not in nltk.data.path:
    nltk.data.path.append('/tmp/')

sentence = "This is a sample sentence, let's test the stopword filtretation."
# A set gives fast membership checks in the filter below.
stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(sentence)

# Drop every token that is a stop word. Tokens are lower-cased for the lookup
# so that capitalized stop words such as "This" are filtered out as well; the
# tokens that are kept retain their original casing.
filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]

print(word_tokens)
print(filtered_sentence)

We can print the full list of English stop words that ships with the NLTK library:

In [0]:
from nltk.corpus import stopwords

# Collect NLTK's English stop words into a set and display the whole set.
english_words = stopwords.words('english')
stop_words = set(english_words)
print(stop_words)

## Stemming:
We can use NLTK's stemming library for stemming. A stemmer converts tokens to a simpler form (the stem) by stripping their endings, such as plural or verb suffixes.

In [0]:
import nltk
from nltk.stem import PorterStemmer

ps = PorterStemmer()
words = ["eat", "eats", "eating"]

# Stem each inflected form with the Porter stemmer and print one stem per line.
for stem in map(ps.stem, words):
    print(stem)

Let's use stemmer with a sentence.

In [0]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# word_tokenize needs the 'punkt' tokenizer models; fetch them into /tmp/.
nltk.download('punkt', download_dir='/tmp/')
# Register that same directory on NLTK's resource search path. The entry must
# be absolute: a relative "tmp" would be resolved against the current working
# directory and would not point at the download location.
if '/tmp/' not in nltk.data.path:
    nltk.data.path.append('/tmp/')

ps = PorterStemmer()

sentence = "the children are playing and running. The weather was better yesterday."
words = word_tokenize(sentence)

# Print every token next to its Porter stem as "token:stem".
for word in words:
    print(word + ":" + ps.stem(word))

## Lemmatization
We can use NLTK's lemmatization library for lemmatization. A lemmatizer uses a dictionary to map each token to its root form (lemma). It also works better when the token's part-of-speech tag ("noun", "verb", etc.) is provided. The first example is without part-of-speech tags.

In [0]:
# Download the wordnet package into /tmp/
nltk.download('wordnet', download_dir='/tmp/')
# Register that same directory on NLTK's resource search path. The entry must
# be absolute: a relative "tmp" would be resolved against the current working
# directory and would not point at the download location.
if '/tmp/' not in nltk.data.path:
    nltk.data.path.append('/tmp/')

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

wl = WordNetLemmatizer()

sentence = "the children are playing and running. the weather was better yesterday."
words = word_tokenize(sentence)

# Print every token next to its lemma as "token:lemma". No part-of-speech
# argument is passed here, so the lemmatizer falls back to its default.
for word in words:
    print(word + ":" + wl.lemmatize(word))

Let's use part-of-speech tags this time. The lemmatizer does a better job when it is given the tag of each token, which we can compute with NLTK's part-of-speech tagger.

In [0]:
import nltk
from nltk.corpus import wordnet

# Download the tagging package into /tmp/
nltk.download('averaged_perceptron_tagger', download_dir='/tmp/')
# Register that same directory on NLTK's resource search path. The entry must
# be absolute: a relative "tmp" would be resolved against the current working
# directory and would not point at the download location.
if '/tmp/' not in nltk.data.path:
    nltk.data.path.append('/tmp/')

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Lemmatizer instance used by the tagged lemmatization below.
wl = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Translate a part-of-speech tag into a WordNet category constant.

    The decision is based on the leading letter of ``tag``:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'R' -> wordnet.ADV.
    Tags starting with 'N', and every other tag, map to wordnet.NOUN.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, category in prefix_table:
        if tag.startswith(prefix):
            return category
    # Nouns and all unrecognized tags share the same fallback category.
    return wordnet.NOUN
        
sentence = "the children are playing and running. the weather was better yesterday."
words = word_tokenize(sentence)
# Tag the tokens; each entry of the result is a (token, tag) pair.
word_pos_tags = nltk.pos_tag(words)

print("Tags", word_pos_tags)

# Print each token followed by its lemma, passing the lemmatizer the WordNet
# category derived from the token's tag.
for token, treebank_tag in word_pos_tags:
    print(token, wl.lemmatize(token, get_wordnet_pos(treebank_tag)))