In [1]:
# Installation of nltk
#In Jupyter, the console commands can be executed by the ‘!’ sign before the command within the cell
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
Collecting regex>=2021.8.3
  Downloading regex-2023.10.3-cp311-cp311-macosx_11_0_arm64.whl (291 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m291.0/291.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2023.10.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### Text Preprocessing 
Following code can be used for text preprocessing useful for various NLP applications.

First we need to import nltk

For a given text, we can do sentence tokenization and word tokenization using nltk library functions.
We can remove the punctuations using string library.

We can then remove stop words in English to get the important words in the text.

We also perform stemming and lemmatization. Stemming and Lemmatization are two different techniques that help reduce our data space. We don’t need to check every single form of a word for reducing the size of the big data corpus.

In [4]:
#import nltk library for using its different functions
import nltk
import string
import re

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/bupadhy/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bupadhy/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/bupadhy/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bupadhy/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [5]:
#  Sentence Tokenization  - Tokenizes sentences from text
from nltk.tokenize import sent_tokenize

In [6]:
# Word Tokenization  - Tokenizes words in sentences
from nltk.tokenize import word_tokenize

In [7]:
statement = "Hello all, I am Dr. Chetana. Welcome to the lab session of Natural Language Processing(NLP). NLP is a very interesting area."

In [8]:
sentences = sent_tokenize(statement)
print(sentences)
words = word_tokenize(statement)
print(words)

['Hello all, I am Dr. Chetana.', 'Welcome to the lab session of Natural Language Processing(NLP).', 'NLP is a very interesting area.']
['Hello', 'all', ',', 'I', 'am', 'Dr.', 'Chetana', '.', 'Welcome', 'to', 'the', 'lab', 'session', 'of', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', '.', 'NLP', 'is', 'a', 'very', 'interesting', 'area', '.']


In [9]:
for sentence in sentences:
    print(sentence)

Hello all, I am Dr. Chetana.
Welcome to the lab session of Natural Language Processing(NLP).
NLP is a very interesting area.


In [10]:
for word in words:
    print(word)

Hello
all
,
I
am
Dr.
Chetana
.
Welcome
to
the
lab
session
of
Natural
Language
Processing
(
NLP
)
.
NLP
is
a
very
interesting
area
.


In [9]:
 # Remove punctuations
for word in words:
    if word not in string.punctuation:
        print(word)

Hello
all
I
am
Dr.
Chetana
Welcome
to
the
lab
session
of
Natural
Language
Processing
NLP
NLP
is
a
very
interesting
area


In [12]:
only_words=[w for w in words if not w in string.punctuation]
print(only_words)

['Hello', 'all', 'I', 'am', 'Dr.', 'Chetana', 'Welcome', 'to', 'the', 'lab', 'session', 'of', 'Natural', 'Language', 'Processing', 'NLP', 'NLP', 'is', 'a', 'very', 'interesting', 'area']


In [13]:
#Removal of stop words from the text
from nltk.corpus import stopwords

In [14]:
# List of English stop words 
english_stop_words=set(stopwords.words("english"))
english_stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [13]:
# Removal of stop words from the text
keywords=[w for w in only_words if not w in english_stop_words]
print(keywords)

['Hello', 'I', 'Dr.', 'Chetana', 'Welcome', 'lab', 'session', 'Natural', 'Language', 'Processing', 'NLP', 'NLP', 'interesting', 'area']


### Lemmatization

Lemmatization in NLP is the process through which several different forms of the same word are mapped to one single form, which we can call the root form or the base form. In more technical terms, the root form is called a lemma. By reducing the number of forms a word can take, we make sure that we reduce our data space and that we don’t have to check every single form of a word. It helps us ignore morphological variations on a single word. Lemmatization brings context to the words.So it goes a steps further by linking words with similar meaning to one word. For example if a paragraph has words like cars, trains and automobile, then it will link all of them to automobile. In the below program we use the WordNet lexical database for lemmatization.

In [21]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
keywords=[w for w in only_words if w == wordnet_lemmatizer.lemmatize(w)]
print(keywords)


['Hello', 'all', 'I', 'am', 'Dr.', 'Chetana', 'Welcome', 'to', 'the', 'lab', 'session', 'of', 'Natural', 'Language', 'Processing', 'NLP', 'NLP', 'is', 'a', 'very', 'interesting', 'area']


In [18]:
wordnet_lemmatizer.lemmatize('creating')

'creating'

In [19]:
#Next find the roots of the word
for w in keywords:
    lemmatized_words=wordnet_lemmatizer.lemmatize(w)
    print(wordnet_lemmatizer.lemmatize(w))

Hello
all
I
am
Dr.
Chetana
Welcome
to
the
lab
session
of
Natural
Language
Processing
NLP
NLP
is
a
very
interesting
area


### Stemming

Stemming in NLP is the process of removing prefixes and suffixes from words so that they are reduced to simpler forms which are called stems. The purpose of stemming is to reduce our vocabulary and dimensionality for NLP tasks and to improve speed and efficiency in information retrieval and information processing tasks. Stemming is a simpler, faster process than lemmatization. The difference is that stemming is usually only rule-based approach. And, as we've showed with our earlier example, rule-based approaches can fail very quickly on more complex examples. But for most problems, it works well enough. Many search engines use stemming to improve their search results.


In [22]:
# Stemming
from nltk.stem import PorterStemmer

In [23]:
porter_stemmer = PorterStemmer()
# First Word tokenization
nltk_tokens = nltk.word_tokenize(statement)
#Next find the roots of the word
for w in keywords:
       print(porter_stemmer.stem(w))

hello
all
i
am
dr.
chetana
welcom
to
the
lab
session
of
natur
languag
process
nlp
nlp
is
a
veri
interest
area


In [24]:
from nltk.stem import SnowballStemmer
snowball = SnowballStemmer(language='english')
words = ['generous','generate','generously','generation']
for word in words:
    print(word,"--->",snowball.stem(word))

generous ---> generous
generate ---> generat
generously ---> generous
generation ---> generat


In [25]:
from nltk.stem import LancasterStemmer
lancaster = LancasterStemmer()
words = ['eating','eats','eaten','puts','putting']
for word in words:
    print(word,"--->",lancaster.stem(word))

eating ---> eat
eats ---> eat
eaten ---> eat
puts ---> put
putting ---> put


In [26]:
from nltk.stem import RegexpStemmer
regexp = RegexpStemmer('ing$|s$|e$|able$', min=4)
words = ['mass','was','bee','computer','advisable']
for word in words:
    print(word,"--->",regexp.stem(word))

mass ---> mas
was ---> was
bee ---> bee
computer ---> computer
advisable ---> advis


In [27]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, RegexpStemmer
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer(language='english')
regexp = RegexpStemmer('ing$|s$|e$|able$', min=4)
word_list = ["friend", "friendship", "friends", "friendships"]
print("{0:20}{1:20}{2:20}{3:30}{4:40}".format("Word","Porter Stemmer","Snowball Stemmer","Lancaster Stemmer",'Regexp Stemmer'))
for word in word_list:
    print("{0:20}{1:20}{2:20}{3:30}{4:40}".format(word,porter.stem(word),snowball.stem(word),lancaster.stem(word),regexp.stem(word)))

Word                Porter Stemmer      Snowball Stemmer    Lancaster Stemmer             Regexp Stemmer                          
friend              friend              friend              friend                        friend                                  
friendship          friendship          friendship          friend                        friendship                              
friends             friend              friend              friend                        friend                                  
friendships         friendship          friendship          friend                        friendship                              


In [17]:
# POS Tagging

In [29]:
#print(nltk.pos_tag(keywords))
nltk.pos_tag(keywords)

[('Hello', 'NNP'),
 ('all', 'DT'),
 ('I', 'PRP'),
 ('am', 'VBP'),
 ('Dr.', 'NNP'),
 ('Chetana', 'NNP'),
 ('Welcome', 'NNP'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('lab', 'NN'),
 ('session', 'NN'),
 ('of', 'IN'),
 ('Natural', 'NNP'),
 ('Language', 'NNP'),
 ('Processing', 'NNP'),
 ('NLP', 'NNP'),
 ('NLP', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('very', 'RB'),
 ('interesting', 'JJ'),
 ('area', 'NN')]