# **Text Preprocessing**
# **Stemming and Lemmatization in NLP**

# **Stemming**

In [1]:
!pip install nltk



In [2]:
# NLTK - Natural Language Toolkit

In [3]:
from nltk.stem import PorterStemmer

In [4]:
# Single Porter stemmer instance, reused by every stemming cell in this notebook.
port = PorterStemmer()

In [5]:
# Several forms built on "improve"; the same list is fed to the stemmer and,
# further down, to the lemmatizer so the two results can be compared.
words = ['improve','improving','improvements','improved','improver']
words

['improve', 'improving', 'improvements', 'improved', 'improver']

In [6]:
# Print each sample word followed by the stem the Porter stemmer returns for it.
for word in words:
  print('Before = ', word)
  stemmed = port.stem(word)
  print('After = ', stemmed)

Before =  improve
After =  improv
Before =  improving
After =  improv
Before =  improvements
After =  improv
Before =  improved
After =  improv
Before =  improver
After =  improv


In [7]:
# The returned stem is lowercased: 'Moon' comes back as 'moon' (see output).
port.stem('Moon')

'moon'

In [8]:
# The trailing 's' is dropped here: 'physics' comes back as 'physic' (see output).
port.stem('physics')

'physic'

# **Lemmatization**

In [9]:
from nltk.stem import WordNetLemmatizer
import nltk

# Download the WordNet corpus (presumably the data WordNetLemmatizer reads from)
nltk.download('wordnet')

# POS-tagger model: optional, recommended when POS tagging is used.
# NOTE(review): no cell visible in this part of the notebook calls a POS tagger.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [10]:
# Single WordNet lemmatizer instance, reused by every lemmatization cell in this notebook.
lem = WordNetLemmatizer()

In [11]:
# Run the same sample words through the WordNet lemmatizer. No pos argument is
# passed; in the output below only the plural 'improvements' is changed.
for word in words:
    print('Before =', word)
    lemma = lem.lemmatize(word)
    print('After  =', lemma)

Before = improve
After  = improve
Before = improving
After  = improving
Before = improvements
After  = improvement
Before = improved
After  = improved
Before = improver
After  = improver


In [12]:
# Called without a pos argument, 'running' is returned unchanged (see output).
# NOTE(review): passing pos='v' presumably yields 'run' — confirm against the NLTK docs.
lem.lemmatize('running')

'running'

# Separate the words from the sentence

In [13]:
# Sample sentence that the following cells tokenize, lemmatize and stem.
sentence = "Stemming and Lemmatization are Text Normalization (or sometimes called Word Normalization) techniques in  the field of Natural Language Processing that are used to prepare text, words, and documents for further  processing."


In [14]:
import nltk
# Fetch the 'punkt_tab' tokenizer data (unzipped under tokenizers/ in the log below);
# presumably this is the resource word_tokenize relies on.
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [15]:
# Split the sentence into a list of tokens. As the output shows, punctuation
# such as '(', ')', ',' and '.' becomes a token of its own.
sen = word_tokenize(sentence)
sen

['Stemming',
 'and',
 'Lemmatization',
 'are',
 'Text',
 'Normalization',
 '(',
 'or',
 'sometimes',
 'called',
 'Word',
 'Normalization',
 ')',
 'techniques',
 'in',
 'the',
 'field',
 'of',
 'Natural',
 'Language',
 'Processing',
 'that',
 'are',
 'used',
 'to',
 'prepare',
 'text',
 ',',
 'words',
 ',',
 'and',
 'documents',
 'for',
 'further',
 'processing',
 '.']

In [16]:
# Lemmatize every token of the sentence and print one result per line.
for word in sen:
    lemma = lem.lemmatize(word)
    print(lemma)

Stemming
and
Lemmatization
are
Text
Normalization
(
or
sometimes
called
Word
Normalization
)
technique
in
the
field
of
Natural
Language
Processing
that
are
used
to
prepare
text
,
word
,
and
document
for
further
processing
.


In [17]:
# Stem every token of the sentence and print one result per line.
for word in sen:
    stemmed = port.stem(word)
    print(stemmed)

stem
and
lemmat
are
text
normal
(
or
sometim
call
word
normal
)
techniqu
in
the
field
of
natur
languag
process
that
are
use
to
prepar
text
,
word
,
and
document
for
further
process
.


In [18]:
# NOTE(review): importing `words` here rebinds the name `words`, which an
# earlier cell assigned to the list of "improve" variants. Re-running the
# earlier `for word in words` cells after this one will no longer see that
# list. The `words` corpus is not used by any cell visible here — consider
# dropping or aliasing this import if it is not needed further down.
from nltk.corpus import stopwords,words
import string

In [19]:
# Punctuation characters defined by the standard-library `string` module.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [20]:
import nltk
# Fetch the stop-word lists, then display the English one.
nltk.download('stopwords')
stopwords.words('english')  # stop words: the words that will be removed from the text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [21]:
# Number of entries in the English stop-word list.
len(stopwords.words('english'))

198

In [22]:
# The same corpus also provides a stop-word list for Bengali.
stopwords.words('bengali')

['অতএব',
 'অথচ',
 'অথবা',
 'অনুযায়ী',
 'অনেক',
 'অনেকে',
 'অনেকেই',
 'অন্তত',
 'অন্য',
 'অবধি',
 'অবশ্য',
 'অর্থাত',
 'আই',
 'আগামী',
 'আগে',
 'আগেই',
 'আছে',
 'আজ',
 'আদ্যভাগে',
 'আপনার',
 'আপনি',
 'আবার',
 'আমরা',
 'আমাকে',
 'আমাদের',
 'আমার',
 'আমি',
 'আর',
 'আরও',
 'ই',
 'ইত্যাদি',
 'ইহা',
 'উচিত',
 'উত্তর',
 'উনি',
 'উপর',
 'উপরে',
 'এ',
 'এঁদের',
 'এঁরা',
 'এই',
 'একই',
 'একটি',
 'একবার',
 'একে',
 'এক্',
 'এখন',
 'এখনও',
 'এখানে',
 'এখানেই',
 'এটা',
 'এটাই',
 'এটি',
 'এত',
 'এতটাই',
 'এতে',
 'এদের',
 'এব',
 'এবং',
 'এবার',
 'এমন',
 'এমনকী',
 'এমনি',
 'এর',
 'এরা',
 'এল',
 'এস',
 'এসে',
 'ঐ',
 'ও',
 'ওঁদের',
 'ওঁর',
 'ওঁরা',
 'ওই',
 'ওকে',
 'ওখানে',
 'ওদের',
 'ওর',
 'ওরা',
 'কখনও',
 'কত',
 'কবে',
 'কমনে',
 'কয়েক',
 'কয়েকটি',
 'করছে',
 'করছেন',
 'করতে',
 'করবে',
 'করবেন',
 'করলে',
 'করলেন',
 'করা',
 'করাই',
 'করায়',
 'করার',
 'করি',
 'করিতে',
 'করিয়া',
 'করিয়ে',
 'করে',
 'করেই',
 'করেছিলেন',
 'করেছে',
 'করেছেন',
 'করেন',
 'কাউকে',
 'কাছ',
 'কাছে',
 'কাজ',
 'কাজে',
 'কারও',
 '

In [23]:
# Number of entries in the Bengali stop-word list.
len(stopwords.words('bengali'))

398

In [24]:
# List the file ids of the stop-word corpus; each one is a language name
# that can be passed to stopwords.words().
stopwords.fileids()

['albanian',
 'arabic',
 'azerbaijani',
 'basque',
 'belarusian',
 'bengali',
 'catalan',
 'chinese',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hebrew',
 'hinglish',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'tamil',
 'turkish']