In [3]:
# Walk-through of basic NLTK text preprocessing on a short sample paragraph:
# sentence/word tokenization, stop-word and punctuation filtering, stemming,
# lemmatization, POS tagging and noun-phrase chunking.
from string import punctuation

import nltk
from nltk import RegexpParser
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

# Every NLTK data package is fetched up front so that nothing below fails
# halfway through on a missing resource. 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' are the resource names used by newer NLTK
# releases; the older names are kept for older installations.
NLTK_RESOURCES = (
    'punkt',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

# Sample text
# The line breaks and the trailing spaces before them are part of the sample,
# so they are spelled out explicitly instead of relying on invisible
# whitespace inside a triple-quoted literal.
text = (
    "A newspaper is the strongest medium for news. People are \n"
    "reading newspapers for decades.  \n"
    "It has a huge contribution to globalization. Right now because of \n"
    "easy internet connection,  \n"
    "people don't read printed newspapers often. They read the online \n"
    "version."
)
print("Sample text: \n", text, "\n")

# Tokenizing by sentence
sent_tokenized = sent_tokenize(text)
print("Tokenizing by sentence: \n", sent_tokenized, "\n")

# Tokenizing by word
word_tokenized = word_tokenize(text)
print("Tokenizing by word: \n", word_tokenized, "\n")

# Removing stop words and punctuation
stop_words = set(stopwords.words('english'))
punctuation_set = set(punctuation)
# One combined set means a single membership test per token.
excluded_tokens = stop_words | punctuation_set

print("After filtering the stop words and punctuation: ")
# casefold() makes the comparison case-insensitive ("A" matches "a").
# Only single-character punctuation tokens are dropped; multi-character
# tokens that are not in the stop-word list are kept.
filtered_words = [
    word for word in word_tokenized
    if word.casefold() not in excluded_tokens
]
for word in filtered_words:
    print(word)

# Stemming
ps = PorterStemmer()
words = ["reading", "globalization", "Being", "Went", "gone", "going"]
print("\nGiven words: ", words)
stemm = [ps.stem(w) for w in words]
print("After stemming: ", stemm, "\n")

# Lemmatization
# Without an explicit pos argument the lemmatizer treats each word as a noun.
lem = WordNetLemmatizer()
print("rocks:", lem.lemmatize("rocks"))
print("corpora:", lem.lemmatize("corpora"))
print("better:", lem.lemmatize("better"))
print("believes:", lem.lemmatize("believes"), "\n")

# Lemmatization with POS tag
# The same surface form is looked up as adjective ("a"), verb ("v") and
# noun ("n") to show how the part of speech changes the result.
print("went as adjective:", lem.lemmatize("went", pos="a"))
print("went as verb:", lem.lemmatize("went", pos="v"))
print("went as noun:", lem.lemmatize("went", pos="n"), "\n")

# POS tagging
postag = nltk.pos_tag(word_tokenized)
print("POS tagging: \n")
for tagged_token in postag:
    print(tagged_token)
print("\n")

# Chunking
# Noun-phrase pattern: optional determiner, any number of adjectives tagged
# JJ, then one noun tagged NN. Other tags such as JJS or NNS do not match.
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunker = RegexpParser(grammar)
output = chunker.parse(postag)
print("After Chunking:\n", output)
output.pretty_print()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhir\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhir\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\abhir\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\abhir\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\abhir\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already

Sample text: 
 A newspaper is the strongest medium for news. People are 
reading newspapers for decades.  
It has a huge contribution to globalization. Right now because of 
easy internet connection,  
people don't read printed newspapers often. They read the online 
version. 

Tokenizing by sentence: 
 ['A newspaper is the strongest medium for news.', 'People are \nreading newspapers for decades.', 'It has a huge contribution to globalization.', "Right now because of \neasy internet connection,  \npeople don't read printed newspapers often.", 'They read the online \nversion.'] 

Tokenizing by word: 
 ['A', 'newspaper', 'is', 'the', 'strongest', 'medium', 'for', 'news', '.', 'People', 'are', 'reading', 'newspapers', 'for', 'decades', '.', 'It', 'has', 'a', 'huge', 'contribution', 'to', 'globalization', '.', 'Right', 'now', 'because', 'of', 'easy', 'internet', 'connection', ',', 'people', 'do', "n't", 'read', 'printed', 'newspapers', 'often', '.', 'They', 'read', 'the', 'online', 'versi

[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


POS tagging: 

('A', 'DT')
('newspaper', 'NN')
('is', 'VBZ')
('the', 'DT')
('strongest', 'JJS')
('medium', 'NN')
('for', 'IN')
('news', 'NN')
('.', '.')
('People', 'NNS')
('are', 'VBP')
('reading', 'VBG')
('newspapers', 'NNS')
('for', 'IN')
('decades', 'NNS')
('.', '.')
('It', 'PRP')
('has', 'VBZ')
('a', 'DT')
('huge', 'JJ')
('contribution', 'NN')
('to', 'TO')
('globalization', 'NN')
('.', '.')
('Right', 'RB')
('now', 'RB')
('because', 'IN')
('of', 'IN')
('easy', 'JJ')
('internet', 'JJ')
('connection', 'NN')
(',', ',')
('people', 'NNS')
('do', 'VBP')
("n't", 'RB')
('read', 'VB')
('printed', 'JJ')
('newspapers', 'NNS')
('often', 'RB')
('.', '.')
('They', 'PRP')
('read', 'VBD')
('the', 'DT')
('online', 'JJ')
('version', 'NN')
('.', '.')


After Chunking:
 (S
  (NP A/DT newspaper/NN)
  is/VBZ
  the/DT
  strongest/JJS
  (NP medium/NN)
  for/IN
  (NP news/NN)
  ./.
  People/NNS
  are/VBP
  reading/VBG
  newspapers/NNS
  for/IN
  decades/NNS
  ./.
  It/PRP
  has/VBZ
  (NP a/DT huge/JJ contri