In [5]:
# Text Analysis Operations using NLTK
# Tokenization
# Stopwords
# Lexicon Normalization such as Stemming and Lemmatization
# POS Tagging


# Importing necessary library
import pandas as pd
import numpy as np
import nltk
import os
import nltk.corpus
# sample text for performing tokenization
text = "In Brazil they drive on the right-hand side of the road. Brazil has a large coastline on the eastern side of South America"
# importing word_tokenize from nltk
from nltk.tokenize import word_tokenize

# Tokenization
# Passing the string text into word tokenize for breaking the sentences
token = word_tokenize(text)
print(token)

['In', 'Brazil', 'they', 'drive', 'on', 'the', 'right-hand', 'side', 'of', 'the', 'road', '.', 'Brazil', 'has', 'a', 'large', 'coastline', 'on', 'the', 'eastern', 'side', 'of', 'South', 'America']


In [15]:
# Finding frequency distinct in the text

# finding the frequency distinct in the tokens
# Importing FreqDist library from nltk and passing token into FreqDist
from nltk.probability import FreqDist
fdist = FreqDist(token) 
print("Freuency distinct in the text",dict(fdist))

Freuency distinct in the text {'In': 1, 'Brazil': 2, 'they': 1, 'drive': 1, 'on': 2, 'the': 3, 'right-hand': 1, 'side': 2, 'of': 2, 'road': 1, '.': 1, 'has': 1, 'a': 1, 'large': 1, 'coastline': 1, 'eastern': 1, 'South': 1, 'America': 1}


In [16]:
# ‘the’ is found 3 times in the text, ‘Brazil’ is found 2 times in the text, etc.

# To find the frequency of top 10 words
fdist1 = fdist.most_common(10)
print("frequency of top 10 words:",dict(fdist1))

most common words: {'the': 3, 'Brazil': 2, 'on': 2, 'side': 2, 'of': 2, 'In': 1, 'they': 1, 'drive': 1, 'right-hand': 1, 'road': 1}


In [18]:
# Stemming
# Stemming usually refers to normalizing words into its base form or root form.
# 1.Porter Stemming (removes common morphological and inflectional endings from words)
# 2.Lancaster Stemming (a more aggressive stemming algorithm).


# Importing Porterstemmer from nltk library
# Checking for the word ‘giving’ 
from nltk.stem import PorterStemmer
pst = PorterStemmer()
pst.stem("waiting")

'wait'

In [19]:
# Checking for the list of words
stm = ["waited", "waiting", "waits"]
for word in stm :
   print(word+ ":" +pst.stem(word))

waited:wait
waiting:wait
waits:wait


In [20]:
# Lancaster is more aggressive than Porter stemmer
# Importing LancasterStemmer from nltk
from nltk.stem import LancasterStemmer
lst = LancasterStemmer()
stm = ["giving", "given", "given", "gave"]
for word in stm :
 print(word+ ":" + lst.stem(word))

giving:giv
given:giv
given:giv
gave:gav


In [21]:
# Checking for the list of words
stm = ["waited", "waiting", "waits"]
for word in stm :
   print(word+ ":" +lst.stem(word))

waited:wait
waiting:wait
waits:wait


In [29]:
# Lemmatization
# In simpler terms, it is the process of converting a word to its base form. 
# The difference between stemming and lemmatization is, lemmatization considers the context 
# and converts the word to its meaningful base form, whereas stemming just removes the last few characters,
# often leading to incorrect meanings and spelling errors.

# importing Lemmatizer library from nltk

from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer()

print("rocks :", lemmatizer.lemmatize("rocks"))
print("copora :", lemmatizer.lemmatize("corpora"))

rocks : rock
copora : corpus


In [34]:
# Stop Words
# “Stop words” are the most common words in a language like “the”, “a”, “at”, “for”, “above”, “on”, “is”, “all”. 
# These words do not provide any meaning and are usually removed from texts.
# We can remove these stop words using nltk library
#nltk.download('stopwords') nltk needs to download special requirement using download method for once.

# import stopwords from nltk.corpus
from nltk import word_tokenize
from nltk.corpus import stopwords
# nltk.download('stopwords')
a= set(stopwords.words('english'))

text = "Christiano Ronaldo was born on Febraury 5, 1986, in Funchal, Madeira, Portugal"

text1 = word_tokenize(text.lower())
print("Convert all letter to lower case: ", text1)

stopwords= [x for x in text1 if x not in a]
print("/n tokens after stopwords removal: "  , stopwords)

Convert all letter to lower case:  ['christiano', 'ronaldo', 'was', 'born', 'on', 'febraury', '5', ',', '1986', ',', 'in', 'funchal', ',', 'madeira', ',', 'portugal']
tokens after stopwords removal:  ['christiano', 'ronaldo', 'born', 'febraury', '5', ',', '1986', ',', 'funchal', ',', 'madeira', ',', 'portugal']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bhim.DESKTOP-3662N79\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bhim.DESKTOP-3662N79\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [41]:
# Part of speech tagging (POS)
# Part-of-speech tagging is used to assign parts of speech to each word of a given text (
#     such as nouns, verbs, pronouns, adverbs, conjunction, adjectives, interjection) 
# based on its definition and its context. 
# There are many tools available for POS taggers 
# and some of the widely used taggers are NLTK, Spacy, TextBlob, Standford CoreNLP, etc.

# nltk.download('averaged_perceptron_tagger')
text = "vote to choose a particular man or a group (party) to represent them in parliament"
#Tokenize the text
tex = word_tokenize(text)
for token in tex:
    print(nltk.pos_tag([token]))

[('vote', 'NN')]
[('to', 'TO')]
[('choose', 'NN')]
[('a', 'DT')]
[('particular', 'JJ')]
[('man', 'NN')]
[('or', 'CC')]
[('a', 'DT')]
[('group', 'NN')]
[('(', '(')]
[('party', 'NN')]
[(')', ')')]
[('to', 'TO')]
[('represent', 'NN')]
[('them', 'PRP')]
[('in', 'IN')]
[('parliament', 'NN')]


In [43]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\bhim.DESKTOP-3662N79\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.


True

In [None]:
# Named entity recognition
# It is the process of detecting the named entities
# such as the person name,the location name, the company name, the quantities and the monetary value.

text = "Google's CEO Sunder Pichai introduced new Pixel at Minnesota Roi Centre Event"

#importing chunk library from nltk
from nltk import ne_chunk
# tokenize and POS Tagging before doing chunk
token = word_tokenize(text)
tags = nltk.pos_tag(token)
chunk = ne_chunk(tags)
chunk