In [1]:
import nltk
# Make sure the 'punkt' NLTK data package is available before tokenizing.
# download() returns True (echoed as the cell output) when the package is present.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Tokenization

![image-2.png](attachment:image-2.png)

In [2]:
# Sample text used by the tokenization and stop-word cells.
# The triple-quoted literal keeps its line breaks and leading indentation, which is
# why the sentence output contains embedded '\n' followed by runs of spaces.
paragraph = """Python developers are in high demand - not only because the language is so popular and widely used 
            but mostly due to the fact that Python became a solution in many different areas. From web applications to 
            data science and machine learning. However, it is not enough to just master the language itself. Surprisingly, 
            that might be the easiest step in becoming a Python developer. What else should you know to become a really good 
            one?."""

In [3]:
# Sentence tokenization: split the paragraph into a list of sentence strings.
from nltk.tokenize import sent_tokenize

# Call the imported function directly so the import above is actually used
# (nltk.sent_tokenize is the same function re-exported at package level).
sentences = sent_tokenize(paragraph)
sentences

['Python developers are in high demand - not only because the language is so popular and widely used \n            but mostly due to the fact that Python became a solution in many different areas.',
 'From web applications to \n            data science and machine learning.',
 'However, it is not enough to just master the language itself.',
 'Surprisingly, \n            that might be the easiest step in becoming a Python developer.',
 'What else should you know to become a really good \n            one?.']

In [4]:
# Word tokenization: split the paragraph into word and punctuation tokens.
from nltk.tokenize import word_tokenize

# Call the imported function directly so the import above is actually used
# (nltk.word_tokenize is the same function re-exported at package level).
words = word_tokenize(paragraph)
words

['Python',
 'developers',
 'are',
 'in',
 'high',
 'demand',
 '-',
 'not',
 'only',
 'because',
 'the',
 'language',
 'is',
 'so',
 'popular',
 'and',
 'widely',
 'used',
 'but',
 'mostly',
 'due',
 'to',
 'the',
 'fact',
 'that',
 'Python',
 'became',
 'a',
 'solution',
 'in',
 'many',
 'different',
 'areas',
 '.',
 'From',
 'web',
 'applications',
 'to',
 'data',
 'science',
 'and',
 'machine',
 'learning',
 '.',
 'However',
 ',',
 'it',
 'is',
 'not',
 'enough',
 'to',
 'just',
 'master',
 'the',
 'language',
 'itself',
 '.',
 'Surprisingly',
 ',',
 'that',
 'might',
 'be',
 'the',
 'easiest',
 'step',
 'in',
 'becoming',
 'a',
 'Python',
 'developer',
 '.',
 'What',
 'else',
 'should',
 'you',
 'know',
 'to',
 'become',
 'a',
 'really',
 'good',
 'one',
 '?',
 '.']

Stopwords

Stopwords are words in a language that do not add much meaning to a sentence.

In [5]:
# Import the stop-word corpus reader and make sure its data package is installed.
# download() returns True (echoed as the cell output) when the package is present.
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Peek at the first ten entries of the English stop-word list.
stopwords.words("english")[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [7]:
# Build the stop-word collection as a set so each membership test is a fast lookup.
stop_words = set(stopwords.words('english'))

# Keep every token that is not a stop word, in a single pass over `words`.
# NOTE(review): the comparison is case-sensitive and the stop-word list is lower-case,
# so capitalised tokens such as 'From' are kept. Compare against w.lower() if
# case-insensitive filtering is wanted.
sentence_without_stopwords = [w for w in words if w not in stop_words]

# Report the token list and its size before and after filtering.
print("Word Tokens: " + str(words)+ '\n')
print(len(words))
print("Sentence without stopwords: "+ str(sentence_without_stopwords))
print(len(sentence_without_stopwords))

Word Tokens: ['Python', 'developers', 'are', 'in', 'high', 'demand', '-', 'not', 'only', 'because', 'the', 'language', 'is', 'so', 'popular', 'and', 'widely', 'used', 'but', 'mostly', 'due', 'to', 'the', 'fact', 'that', 'Python', 'became', 'a', 'solution', 'in', 'many', 'different', 'areas', '.', 'From', 'web', 'applications', 'to', 'data', 'science', 'and', 'machine', 'learning', '.', 'However', ',', 'it', 'is', 'not', 'enough', 'to', 'just', 'master', 'the', 'language', 'itself', '.', 'Surprisingly', ',', 'that', 'might', 'be', 'the', 'easiest', 'step', 'in', 'becoming', 'a', 'Python', 'developer', '.', 'What', 'else', 'should', 'you', 'know', 'to', 'become', 'a', 'really', 'good', 'one', '?', '.']

84
Sentence without stopwords: ['Python', 'developers', 'high', 'demand', '-', 'language', 'popular', 'widely', 'used', 'mostly', 'due', 'fact', 'Python', 'became', 'solution', 'many', 'different', 'areas', '.', 'From', 'web', 'applications', 'data', 'science', 'machine', 'learning', '.',

Stemming and Lemmatization

![image.png](attachment:image.png)

![image.png](attachment:image.png)

In [8]:
# Inflected forms of "change" used to compare stemming with lemmatization.
example = [
    "Change",
    "Changing",
    "Changes",
    "Changed",
    "Changer",
]

In [9]:
# A second, lower-case word list used to compare stemming with lemmatization.
example1 = [
    "was",
    "studies",
    "studying",
]

In [10]:
# Stemming
from nltk.stem import PorterStemmer

In [11]:
# Porter stemmer instance reused by the two stemming loops that follow.
obj = PorterStemmer()

In [12]:
# Show each word of `example` next to its Porter stem.
for word in example:
    print(word, " : ", obj.stem(word))

Change  :  chang
Changing  :  chang
Changes  :  chang
Changed  :  chang
Changer  :  changer


In [13]:
# Show each word of `example1` next to its Porter stem.
for word in example1:
    print(word, " : ", obj.stem(word))

was  :  wa
studies  :  studi
studying  :  studi


In [14]:
# Lemmatization
from nltk.stem import WordNetLemmatizer 

In [15]:
# Make sure the 'wordnet' NLTK data package is installed before lemmatizing.
# download() returns True (echoed as the cell output) when the package is present.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
# WordNet lemmatizer instance reused by the two lemmatization loops that follow.
lemmatizer = WordNetLemmatizer()

In [17]:
# Show each word of `example` next to the lemma returned by lemmatize().
# NOTE(review): lemmatize() is called without a part-of-speech argument and with
# capitalised input, and the recorded output shows every word returned unchanged.
# Presumably lower-casing the word and passing pos='v' would yield "change" - confirm.
for word in example:
    print(word, " : ", lemmatizer.lemmatize(word))

Change  :  Change
Changing  :  Changing
Changes  :  Changes
Changed  :  Changed
Changer  :  Changer


In [18]:
# Show each word of `example1` next to the lemma returned by lemmatize().
# NOTE(review): no part-of-speech argument is passed; the recorded output maps
# 'was' to 'wa' and leaves 'studying' unchanged, which suggests the words are being
# treated as nouns. Presumably pos='v' is what is wanted for verbs - confirm.
for word in example1:
    print(word, " : ", lemmatizer.lemmatize(word))

was  :  wa
studies  :  study
studying  :  studying


Parts of Speech (POS) Tagging

In [19]:
# Import the word tokenizer and the POS-tagging entry point, then make sure the
# 'averaged_perceptron_tagger' data package is installed.
# download() returns True (echoed as the cell output) when the package is present.
from nltk.tokenize import word_tokenize 
from nltk import pos_tag 
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [20]:
def pos_tagg(text):
    """Tokenize ``text`` into words and return a list of (token, tag) pairs."""
    return pos_tag(word_tokenize(text))


# Example: tag a short question; the result is displayed as the cell output.
pos_tagg('How are you?')

[('How', 'WRB'), ('are', 'VBP'), ('you', 'PRP'), ('?', '.')]

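The tag abbreviations shown above (e.g. `WRB`, `VBP`, `PRP`) come from the Penn Treebank tag set; see the full list at https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html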