## Tokenization Example - nltk

In [1]:
# Sample paragraph fed to the tokenizer cells. The text is kept verbatim;
# note that the string both starts and ends with a newline character.
corpus = (
    "\n"
    "Hi my name is Ankit! I ama software developer, I work at a Tech company and I am huge AI enthusiast. I love Energy industry along with the inersection\n"
    "of Telecom, AI, ML, IoT.\n"
)
print(corpus)


Hi my name is Ankit! I ama software developer, I work at a Tech company and I am huge AI enthusiast. I love Energy industry along with the inersection
of Telecom, AI, ML, IoT.



In [2]:
from nltk.tokenize import sent_tokenize  # "sent" is short for "sentence"

# Split the corpus into sentences. Despite its singular name, `sentence`
# holds a list with one string per detected sentence.
sentence = sent_tokenize(corpus)
print(sentence)

['\nHi my name is Ankit!', 'I ama software developer, I work at a Tech company and I am huge AI enthusiast.', 'I love Energy industry along with the inersection\nof Telecom, AI, ML, IoT.']


In [3]:
from nltk.tokenize import word_tokenize

# Split the corpus into word-level tokens. Punctuation marks come back as
# tokens of their own, so they are included in the count printed on the
# second output line.
word_token = word_tokenize(corpus)
print(word_token, len(word_token), sep="\n")

['Hi', 'my', 'name', 'is', 'Ankit', '!', 'I', 'ama', 'software', 'developer', ',', 'I', 'work', 'at', 'a', 'Tech', 'company', 'and', 'I', 'am', 'huge', 'AI', 'enthusiast', '.', 'I', 'love', 'Energy', 'industry', 'along', 'with', 'the', 'inersection', 'of', 'Telecom', ',', 'AI', ',', 'ML', ',', 'IoT', '.']
41


In [4]:
from nltk.tokenize import wordpunct_tokenize

# Store the result under its own name. Binding it to `wordpunct_tokenize`
# would replace the imported function with a list, and any later call to
# wordpunct_tokenize(...) in the same kernel would raise
# "TypeError: 'list' object is not callable".
wordpunct_tokens = wordpunct_tokenize(corpus)
print(wordpunct_tokens)
print(len(wordpunct_tokens))

['Hi', 'my', 'name', 'is', 'Ankit', '!', 'I', 'ama', 'software', 'developer', ',', 'I', 'work', 'at', 'a', 'Tech', 'company', 'and', 'I', 'am', 'huge', 'AI', 'enthusiast', '.', 'I', 'love', 'Energy', 'industry', 'along', 'with', 'the', 'inersection', 'of', 'Telecom', ',', 'AI', ',', 'ML', ',', 'IoT', '.']
41


## Stemming

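Stemming reduces a word to its root (stem) form, e.g. Eat, Eaten, Eating, Eater ----> stem word = Eat (the root word). It removes prefix/suffix additions in a word, such as ly, ing, es, etc. Stemmers are rule-based, so some forms may be left unchanged, as the outputs below show. Stemmers demonstrated here: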
- PorterStemmer
- RegexpStemmer
- SnowballStemmer

In [5]:
# Porter stemmer: stem every sample word and print "<word> ---> <stem>".
from nltk.stem import PorterStemmer

stemming = PorterStemmer()

# Sample words grouped by family; the other stemmer cells reuse this list.
words = [
    "eating", "eaten", "eater", "eat",
    "fair", "fairly", "fairness",
    "getting", "gotten",
    "congratulations", "congratulate", "congrats",
    "are", "is", "to",
]
for word in words:
    print(f"{word} ---> {stemming.stem(word)}")

eating ---> eat
eaten ---> eaten
eater ---> eater
eat ---> eat
fair ---> fair
fairly ---> fairli
fairness ---> fair
getting ---> get
gotten ---> gotten
congratulations ---> congratul
congratulate ---> congratul
congrats ---> congrat


In [6]:
# Regex stemmer: removes whatever part of a word matches the given pattern.
from nltk.stem import RegexpStemmer

# Every alternative ends in `$`, so it can only match at the end of a word.
regex_stemmer = RegexpStemmer('ing$|es$|ed$|lly$|ness$')

# `words` is the sample list defined in the Porter stemmer cell.
for word in words:
    print(f"{word} ---> {regex_stemmer.stem(word)}")

eating ---> eat
eaten ---> eaten
eater ---> eater
eat ---> eat
fair ---> fair
fairly ---> fairly
fairness ---> fair
getting ---> gett
gotten ---> gotten
congratulations ---> congratulations
congratulate ---> congratulate
congrats ---> congrats


In [7]:
# Snowball stemmer, configured for English.
from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer('english')

# `words` is the sample list defined in the Porter stemmer cell.
for word in words:
    print(f"{word} ---> {snowball_stemmer.stem(word)}")

eating ---> eat
eaten ---> eaten
eater ---> eater
eat ---> eat
fair ---> fair
fairly ---> fair
fairness ---> fair
getting ---> get
gotten ---> gotten
congratulations ---> congratul
congratulate ---> congratul
congrats ---> congrat


## Lemmatization

In [25]:
## Lemmatization - reduce each word to its base/dictionary form
# Signature: lemmatize(word: str, pos: str = 'n') -> str
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Same sample words as in the stemming section, so results can be compared.
text = [
    "eating", "eaten", "eater", "eat",
    "fair", "fairly", "fairness",
    "getting", "gotten",
    "congratulations", "congratulate", "congrats",
    "are", "is", "to",
]

# pos='v' makes the lemmatizer treat every word as a verb, chosen because
# most of the samples are verbs.
lemma = [lemmatizer.lemmatize(token, pos='v') for token in text]
lemma

['eat',
 'eat',
 'eater',
 'eat',
 'fair',
 'fairly',
 'fairness',
 'get',
 'get',
 'congratulations',
 'congratulate',
 'congrats',
 'be',
 'be',
 'to']

## POS - Parts of Speech

In [32]:
# Signature: nltk.tag.pos_tag(tokens, tagset=None, lang='eng')
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
# Download the 'universal_tagset' NLTK data package; the tagging cells call
# pos_tag with tagset='universal'. The call's return value is the cell output.
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/ankit/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [33]:
sent = "The Taj Mahal is a beautifull monument"

# Tokenize first, then tag. tagset='universal' gives the coarse, widely
# understood labels, e.g. NOUN instead of NN.
sent_tokens = word_tokenize(sent)
print(pos_tag(sent_tokens, tagset='universal'))

[('The', 'DET'), ('Taj', 'NOUN'), ('Mahal', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('beautifull', 'ADJ'), ('monument', 'NOUN')]


## NAMED ENTITY RECOGNITION

In [36]:
# nltk.chunk.ne_chunk(tagged_tokens, binary=False)
import nltk
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.tokenize import word_tokenize

In [52]:
from nltk.corpus import stopwords

sent = """ India buys 2 million bpd Russian oil in August
The increase in Russian flow was at the cost of purchases from Iraq, which declined to 730,000 bpd in August """

# The token list gets a plural-style name of its own so it cannot be confused
# with the per-token variable used in the comprehension below.
tokens = word_tokenize(sent)

# Removing stopwords
stop_words = set(stopwords.words('english'))

# NOTE(review): the membership test is case-sensitive, so a capitalised token
# such as 'The' is kept. Compare `token.lower()` instead if that is not
# intended.
# NOTE: this rebinds `words`, which the stemming cells also read; re-running
# those cells after this one will stem these tokens instead of the samples.
words = [token for token in tokens if token not in stop_words]
words

['India',
 'buys',
 '2',
 'million',
 'bpd',
 'Russian',
 'oil',
 'August',
 'The',
 'increase',
 'Russian',
 'flow',
 'cost',
 'purchases',
 'Iraq',
 ',',
 'declined',
 '730,000',
 'bpd',
 'August']

In [53]:
# POS-tag the stop-word-filtered tokens using the universal tagset. The bare
# `tag_pos` on the last line makes the notebook display the (token, tag) pairs.
tag_pos = pos_tag(words, tagset='universal')
tag_pos

[('India', 'NOUN'),
 ('buys', 'VERB'),
 ('2', 'NUM'),
 ('million', 'NUM'),
 ('bpd', 'NOUN'),
 ('Russian', 'ADJ'),
 ('oil', 'NOUN'),
 ('August', 'NOUN'),
 ('The', 'DET'),
 ('increase', 'NOUN'),
 ('Russian', 'NOUN'),
 ('flow', 'NOUN'),
 ('cost', 'NOUN'),
 ('purchases', 'NOUN'),
 ('Iraq', 'NOUN'),
 (',', '.'),
 ('declined', 'VERB'),
 ('730,000', 'NUM'),
 ('bpd', 'NOUN'),
 ('August', 'NOUN')]

In [42]:
# Download the NLTK data packages 'maxent_ne_chunker_tab' and 'words'
# (presumably the resources ne_chunk needs in the NER cell — confirm).
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')


[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /home/ankit/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to /home/ankit/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [55]:
# ne_chunk returns a tree over the tagged tokens: recognised entities become
# labelled subtrees (printed like "(GPE India/NOUN)"), every other token stays
# a plain (token, tag) tuple.
# NOTE(review): tag_pos carries universal tags (NOUN, VERB, ...). ne_chunk is
# normally given the default pos_tag output (Penn Treebank tags) — confirm
# that universal tags are intended here.
ner = ne_chunk(tag_pos)

# GPE indicates countries, states, cities, and places
for node in ner:
    print(node)

(GPE India/NOUN)
('buys', 'VERB')
('2', 'NUM')
('million', 'NUM')
('bpd', 'NOUN')
('Russian', 'ADJ')
('oil', 'NOUN')
('August', 'NOUN')
('The', 'DET')
('increase', 'NOUN')
('Russian', 'NOUN')
('flow', 'NOUN')
('cost', 'NOUN')
('purchases', 'NOUN')
(GPE Iraq/NOUN)
(',', '.')
('declined', 'VERB')
('730,000', 'NUM')
('bpd', 'NOUN')
('August', 'NOUN')
