In [1]:
import nltk

In [2]:
# Print the package-level help for nltk (name, description, package contents).
help(nltk)

Help on package nltk:

NAME
    nltk

DESCRIPTION
    The Natural Language Toolkit (NLTK) is an open source Python library
    for Natural Language Processing.  A free online book is available.
    (If you use the library for academic research, please cite the book.)

    Steven Bird, Ewan Klein, and Edward Loper (2009).
    Natural Language Processing with Python.  O'Reilly Media Inc.
    https://www.nltk.org/book/

    isort:skip_file

    @version: 3.9.2

PACKAGE CONTENTS
    app (package)
    book
    ccg (package)
    chat (package)
    chunk (package)
    classify (package)
    cli
    cluster (package)
    collections
    collocations
    compat
    corpus (package)
    data
    decorators
    downloader
    draw (package)
    featstruct
    grammar
    help
    inference (package)
    internals
    jsontags
    langnames
    lazyimport
    lm (package)
    metrics (package)
    misc (package)
    parse (package)
    probability
    sem (package)
    sentiment (package)
    stem

In [3]:
# Make sure the stop-word corpus is installed before it is loaded in the
# next cell. nltk.data.find raises LookupError when the resource is missing,
# so the download only runs on a machine that does not have it yet and the
# notebook still works after "Restart Kernel -> Run All" on a fresh setup.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

Find The Stop Words
===

In [4]:
# Load NLTK's English stop-word list; `stop` is reused by later cells.
from nltk.corpus import stopwords

stop = stopwords.words('english')

# Display the list as the cell output.
stop

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [5]:
# Number of entries in the English stop-word list loaded into `stop`.
len(stop)

198

In [6]:
# Make sure the 'punkt_tab' tokenizer data is installed before word_tokenize
# is used in the cells that follow. nltk.data.find raises LookupError when
# the resource is missing, so the download only runs when it is needed.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

Tokenization with N-Grams
===

In [7]:
# Tokenize a sentence and print its bigrams (n-grams with n=2).
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

sent = "Luminar technolab is a software training institute"

# Call the word_tokenize imported above instead of reaching through the
# nltk package, so the cell uses one consistent entry point for tokenizing.
ng = ngrams(word_tokenize(sent), 2)

# Each item is a tuple of two consecutive tokens.
for bigram in ng:
    print(bigram)


('Luminar', 'technolab')
('technolab', 'is')
('is', 'a')
('a', 'software')
('software', 'training')
('training', 'institute')


In [8]:
# Two-sentence sample text; the line break and the indentation inside the
# literal are part of the string.
review = '''The Natural Language Toolkit (NLTK) is an open source Python library
    for Natural Language Processing.  A free online book is available.'''

# Split the text into tokens; `rev_tok` is filtered in the next cell.
rev_tok = word_tokenize(text=review)

In [9]:
# Lower-case every token and drop the ones that appear in the stop-word list.
# The stop words are copied into a set so each membership test is a hash
# lookup rather than a scan of the whole list, and each token is lower-cased
# once instead of twice. The resulting list is the same as before.
stop_lookup = set(stop)
final = [tok for tok in map(str.lower, rev_tok) if tok not in stop_lookup]


In [10]:
# Show the filtered, lower-cased tokens. Punctuation tokens such as '(' and
# '.' are still present because only stop words were removed.
final

['natural',
 'language',
 'toolkit',
 '(',
 'nltk',
 ')',
 'open',
 'source',
 'python',
 'library',
 'natural',
 'language',
 'processing',
 '.',
 'free',
 'online',
 'book',
 'available',
 '.']

Stemming
===
Two Stemmers are used
---
* PorterStemmer
* SnowballStemmer

PorterStemmer
---

In [11]:
from nltk.stem import PorterStemmer

# Sample inflected words; the SnowballStemmer cell further down reuses `lst`.
lst = ['programs', 'reached', 'running', 'eating',
       'catching', 'throwing', 'jumping', 'bought']

data = PorterStemmer()

# Print each word next to its Porter stem.
for word in lst:
    print(f"{word} : {data.stem(word)}")

programs : program
reached : reach
running : run
eating : eat
catching : catch
throwing : throw
jumping : jump
bought : bought


SnowballStemmer
---

In [12]:
from nltk.stem import SnowballStemmer

# Rebind `data` to an English Snowball stemmer; `lst` comes from the
# PorterStemmer cell above.
data = SnowballStemmer('english')

# Print each word next to its Snowball stem.
for word in lst:
    print(f"{word} : {data.stem(word)}")

programs : program
reached : reach
running : run
eating : eat
catching : catch
throwing : throw
jumping : jump
bought : bought


Lemmatization
===

In [13]:
from nltk.stem import WordNetLemmatizer

# One-time setup if the WordNet data is not installed yet:
# nltk.download('wordnet')
# nltk.download('omw-1.4')

lem = WordNetLemmatizer()

# lemmatize() is called without a part-of-speech argument. NLTK documents the
# default as noun, which presumably is why "running" comes back unchanged
# while the plural forms are reduced -- confirm against the NLTK docs.
for word in ("rocks", "reaches", "running"):
    print(word.capitalize(), ':', lem.lemmatize(word))

Rocks : rock
Reaches : reach
Running : running


Removing Special Characters
===


In [14]:
import re

str1 = 'Luminar technolab @# is IT finishing school 12345 ! located @ kakkanad!'

# Replace every character that is not an ASCII letter or digit with a single
# space. Existing spaces are matched too, so runs of symbols and spaces turn
# into runs of spaces of the same length.
non_alnum = re.compile('[^a-zA-Z0-9]')
str2 = non_alnum.sub(' ', str1)
str2

'Luminar technolab    is IT finishing school 12345   located   kakkanad '

In [15]:
# Sample review text used by the following cells. It contains typographic
# apostrophes (’), em dashes (—) and symbols ($, &), and the literal starts
# and ends with blank lines.
sentence = '''

Great product! I’ve been using it for 3 weeks now, and it’s definitely worth the $25—totally exceeded my expectations.  
However, the packaging wasn’t great; there were scratches on the lid & a few dents—maybe improve quality control next time.


'''



In [16]:
# English stop-word list for filtering the review tokens. This loads the same
# corpus as the earlier `stop` variable, under a second name.
from nltk.corpus import stopwords

stopword = stopwords.words('english')

In [17]:
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# Split the review into tokens.
tokens = word_tokenize(sentence)

# Keep each token in its original casing, but compare it lower-cased against
# the stop-word list.
# NOTE: this rebinds `lst`, which earlier held the sample words for the
# stemmer cells, so re-running those cells afterwards will see these tokens.
lst = list(filter(lambda token: token.lower() not in stopword, tokens))
lst

['Great',
 'product',
 '!',
 '’',
 'using',
 '3',
 'weeks',
 ',',
 '’',
 'definitely',
 'worth',
 '$',
 '25—totally',
 'exceeded',
 'expectations',
 '.',
 'However',
 ',',
 'packaging',
 '’',
 'great',
 ';',
 'scratches',
 'lid',
 '&',
 'dents—maybe',
 'improve',
 'quality',
 'control',
 'next',
 'time',
 '.']

In [18]:
from nltk.stem import PorterStemmer

data = PorterStemmer()

# Overwrite the contents of `lst` in place with the Porter stem of each token.
# NOTE: because the same list object is mutated, re-running this cell stems
# the already-stemmed tokens again.
lst[:] = [data.stem(token) for token in lst]



In [19]:
# Show the tokens after stemming. The stems are lower-cased and punctuation
# tokens are still present.
lst

['great',
 'product',
 '!',
 '’',
 'use',
 '3',
 'week',
 ',',
 '’',
 'definit',
 'worth',
 '$',
 '25—total',
 'exceed',
 'expect',
 '.',
 'howev',
 ',',
 'packag',
 '’',
 'great',
 ';',
 'scratch',
 'lid',
 '&',
 'dents—mayb',
 'improv',
 'qualiti',
 'control',
 'next',
 'time',
 '.']

In [20]:
# Remove every character that is not an ASCII letter or digit from each
# stemmed token. Tokens made up only of such characters become empty strings.
# A new list is built, so `lst` itself is not modified.
lst2 = [re.sub('[^a-zA-Z0-9]', '', token) for token in lst]

In [21]:
# Show the cleaned tokens. Punctuation-only tokens appear as '' and tokens
# that were joined by an em dash are merged (e.g. '25total').
lst2

['great',
 'product',
 '',
 '',
 'use',
 '3',
 'week',
 '',
 '',
 'definit',
 'worth',
 '',
 '25total',
 'exceed',
 'expect',
 '',
 'howev',
 '',
 'packag',
 '',
 'great',
 '',
 'scratch',
 'lid',
 '',
 'dentsmayb',
 'improv',
 'qualiti',
 'control',
 'next',
 'time',
 '']

In [22]:
# Show `lst` again: it still holds the stemmed tokens with their punctuation,
# since the cleanup above wrote its result to `lst2`.
lst

['great',
 'product',
 '!',
 '’',
 'use',
 '3',
 'week',
 ',',
 '’',
 'definit',
 'worth',
 '$',
 '25—total',
 'exceed',
 'expect',
 '.',
 'howev',
 ',',
 'packag',
 '’',
 'great',
 ';',
 'scratch',
 'lid',
 '&',
 'dents—mayb',
 'improv',
 'qualiti',
 'control',
 'next',
 'time',
 '.']