In [1]:
#Reference from https://realpython.com/nltk-nlp-python/

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [3]:
example_string = """
... Muad'Dib learned rapidly because his first training was in how to learn.
... And the first lesson of all was the basic trust that he could learn.
... It's shocking to find how many people do not believe they can learn,
... and how many more believe learning to be difficult."""

In [4]:
#Tokenizing the example string into sentences
sent_tokenize(example_string)

["\nMuad'Dib learned rapidly because his first training was in how to learn.",
 'And the first lesson of all was the basic trust that he could learn.',
 "It's shocking to find how many people do not believe they can learn,\nand how many more believe learning to be difficult."]

In [5]:
#Tokenize the example string into words
word_tokenize(example_string)

["Muad'Dib",
 'learned',
 'rapidly',
 'because',
 'his',
 'first',
 'training',
 'was',
 'in',
 'how',
 'to',
 'learn',
 '.',
 'And',
 'the',
 'first',
 'lesson',
 'of',
 'all',
 'was',
 'the',
 'basic',
 'trust',
 'that',
 'he',
 'could',
 'learn',
 '.',
 'It',
 "'s",
 'shocking',
 'to',
 'find',
 'how',
 'many',
 'people',
 'do',
 'not',
 'believe',
 'they',
 'can',
 'learn',
 ',',
 'and',
 'how',
 'many',
 'more',
 'believe',
 'learning',
 'to',
 'be',
 'difficult',
 '.']

In [6]:
# To Filter stop words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [7]:
quote = "Sir, I protest. I am not a merry man!"

In [8]:
words_in_quote = word_tokenize(quote)
words_in_quote

['Sir', ',', 'I', 'protest', '.', 'I', 'am', 'not', 'a', 'merry', 'man', '!']

In [9]:
stop_words = set(stopwords.words("english"))
filtered_list = []
for word in words_in_quote:
    if word.casefold() not in stop_words:
        filtered_list.append(word)

In [10]:
filtered_list

['Sir', ',', 'protest', '.', 'merry', 'man', '!']

In [11]:
# Stemming to reduce words to their root form
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
stemmer = PorterStemmer()

In [12]:
words = """
... The crew of the USS Discovery discovered many discoveries.
... Discovering is what explorers do."""
# word tokenizing the string before stemming
words = word_tokenize(string_for_stemming)

NameError: name 'string_for_stemming' is not defined

In [None]:
print("String in words tokenization:", words)

String in words tokenization: ['The', 'crew', 'of', 'the', 'USS', 'Discovery', 'discovered', 'many', 'discoveries', '.', 'Discovering', 'is', 'what', 'explorers', 'do', '.']


In [None]:
stemmed_words = [stemmer.stem(word) for word in words]
print("Stemmed words of the tokenizied words", stemmed_words)

Stemmed words of the tokenizied words ['the', 'crew', 'of', 'the', 'uss', 'discoveri', 'discov', 'mani', 'discoveri', '.', 'discov', 'is', 'what', 'explor', 'do', '.']


In [None]:
# POS tagging (Parts of Speech tagging)
sagan_quote = """If you wish to make an apple pie from scratch, you must first invent the universe."""

In [None]:
words_in_sagan_quote = word_tokenize(sagan_quote)

In [None]:
print(words_in_sagan_quote)

['If', 'you', 'wish', 'to', 'make', 'an', 'apple', 'pie', 'from', 'scratch', ',', 'you', 'must', 'first', 'invent', 'the', 'universe', '.']


In [None]:
import nltk
print("POS tagging of the words in the quote")
nltk.pos_tag(words_in_sagan_quote)

POS tagging of the words in the quote


[('If', 'IN'),
 ('you', 'PRP'),
 ('wish', 'VBP'),
 ('to', 'TO'),
 ('make', 'VB'),
 ('an', 'DT'),
 ('apple', 'NN'),
 ('pie', 'NN'),
 ('from', 'IN'),
 ('scratch', 'NN'),
 (',', ','),
 ('you', 'PRP'),
 ('must', 'MD'),
 ('first', 'VB'),
 ('invent', 'VB'),
 ('the', 'DT'),
 ('universe', 'NN'),
 ('.', '.')]

In [None]:
# TO get a list of tags and their meanings
nltk.help.upenn_tagset("VB")
nltk.help.upenn_tagset("VBP")
nltk.help.upenn_tagset("NN")
nltk.help.upenn_tagset("IN")
nltk.help.upenn_tagset("PRP")
nltk.help.upenn_tagset("DT")
nltk.help.upenn_tagset("MD")

VB: verb, base form
    ask assemble assess assign assume atone attention avoid bake balkanize
    bank begin behold believe bend benefit bevel beware bless boil bomb
    boost brace break bring broil brush build ...
VBP: verb, present tense, not 3rd person singular
    predominate wrap resort sue twist spill cure lengthen brush terminate
    appear tend stray glisten obtain comprise detest tease attract
    emphasize mold postpone sever return wag ...
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
IN: preposition or conjunction, subordinating
    astride among uppon whether out inside pro despite on by throughout
    below within for towards near behind atop around if like until below
    next into if beside ...
PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self s

In [None]:
jabberwocky_excerpt = "'Twas brillig, and the slithy toves did gyre and gimble in the wabe: all mimsy were the borogoves, and the mome raths outgrabe."

words_in_jabberwocky = word_tokenize(jabberwocky_excerpt)

nltk.pos_tag(words_in_jabberwocky)

[("'T", 'NN'),
 ('was', 'VBD'),
 ('brillig', 'VBN'),
 (',', ','),
 ('and', 'CC'),
 ('the', 'DT'),
 ('slithy', 'JJ'),
 ('toves', 'NNS'),
 ('did', 'VBD'),
 ('gyre', 'NN'),
 ('and', 'CC'),
 ('gimble', 'JJ'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('wabe', 'NN'),
 (':', ':'),
 ('all', 'DT'),
 ('mimsy', 'NNS'),
 ('were', 'VBD'),
 ('the', 'DT'),
 ('borogoves', 'NNS'),
 (',', ','),
 ('and', 'CC'),
 ('the', 'DT'),
 ('mome', 'JJ'),
 ('raths', 'NNS'),
 ('outgrabe', 'RB'),
 ('.', '.')]

In [None]:
# Lemmatizing to reduce words to their base form with a complete English word.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("scarves")

'scarf'

In [None]:
string_for_lemmatizing = "The friends of DeSoto love scarves"
lemmatizer.lemmatize(string_for_lemmatizing)

'The friends of DeSoto love scarves'

In [None]:
words = word_tokenize(string_for_lemmatizing)
words

['The', 'friends', 'of', 'DeSoto', 'love', 'scarves']

In [None]:
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

In [None]:
lemmatized_words

['The', 'friend', 'of', 'DeSoto', 'love', 'scarf']