In [84]:
import nltk

# Tokenizer data used by sent_tokenize / word_tokenize below.
# Older NLTK releases load the pickled 'punkt' models, while newer releases
# look for the 'punkt_tab' resource instead; fetch both so the notebook runs
# on either. download() returns True on success.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [85]:
pip install nltk



In [86]:
# Sample text shared by the tokenizer examples below. It mixes '.' and '!'
# sentence endings, possessives ("cat's", "Children's") and embedded newlines.
corpus = """ the cat's purred softly on the warm lap. Birds chirped merrily outside the window! as though.
 Colorful flowers bloomed in the garden.
 Children's laughter echoed through the park."""



In [87]:
# print() renders the newlines embedded in the string.
print(corpus)

 the cat's purred softly on the warm lap. Birds chirped merrily outside the window! as though.
 Colorful flowers bloomed in the garden. 
 Children's laughter echoed through the park.


### ***Sentence Tokenizer***

In [88]:
### sent_tokenize splits the corpus into sentences. It relies on NLTK's
### pre-trained Punkt sentence model (the resource downloaded at the top of the
### notebook) rather than a plain split on '.' and '!'; for this corpus the
### detected boundaries happen to fall after each '.' and '!'.

from nltk.tokenize import sent_tokenize

document = sent_tokenize(corpus)

In [89]:
# Each list element is one detected sentence.
document

[" the cat's purred softly on the warm lap.",
 'Birds chirped merrily outside the window!',
 'as though.',
 'Colorful flowers bloomed in the garden.',
 "Children's laughter echoed through the park."]

### ***Word Tokenizer***

In [90]:
from nltk.tokenize import word_tokenize
# Tokenize the whole corpus into one flat list of tokens.
# NOTE: `words` is rebound to a hand-written word list in the Stemming section.
words = word_tokenize(corpus)

In [91]:
### Punctuation marks ('.', '!') and the possessive "'s" are returned as
### separate tokens, alongside the words themselves.

print(words)

['the', 'cat', "'s", 'purred', 'softly', 'on', 'the', 'warm', 'lap', '.', 'Birds', 'chirped', 'merrily', 'outside', 'the', 'window', '!', 'as', 'though', '.', 'Colorful', 'flowers', 'bloomed', 'in', 'the', 'garden', '.', 'Children', "'s", 'laughter', 'echoed', 'through', 'the', 'park', '.']


In [92]:
# Same token list, shown via the notebook's display (one token per line).
words

['the',
 'cat',
 "'s",
 'purred',
 'softly',
 'on',
 'the',
 'warm',
 'lap',
 '.',
 'Birds',
 'chirped',
 'merrily',
 'outside',
 'the',
 'window',
 '!',
 'as',
 'though',
 '.',
 'Colorful',
 'flowers',
 'bloomed',
 'in',
 'the',
 'garden',
 '.',
 'Children',
 "'s",
 'laughter',
 'echoed',
 'through',
 'the',
 'park',
 '.']

In [93]:
# Tokenize one sentence at a time so that each sentence's tokens are printed
# as their own list.
for sentence in document:
    sentence_tokens = word_tokenize(sentence)
    print(sentence_tokens)

['the', 'cat', "'s", 'purred', 'softly', 'on', 'the', 'warm', 'lap', '.']
['Birds', 'chirped', 'merrily', 'outside', 'the', 'window', '!']
['as', 'though', '.']
['Colorful', 'flowers', 'bloomed', 'in', 'the', 'garden', '.']
['Children', "'s", 'laughter', 'echoed', 'through', 'the', 'park', '.']


In [94]:
### wordpunct_tokenize splits the corpus into words and punctuation, with every
### punctuation mark becoming its own token: "cat's" yields 'cat', "'", 's',
### whereas word_tokenize above kept "'s" together as a single token.

from nltk.tokenize import wordpunct_tokenize

wordpunct_tokenize(corpus)

['the',
 'cat',
 "'",
 's',
 'purred',
 'softly',
 'on',
 'the',
 'warm',
 'lap',
 '.',
 'Birds',
 'chirped',
 'merrily',
 'outside',
 'the',
 'window',
 '!',
 'as',
 'though',
 '.',
 'Colorful',
 'flowers',
 'bloomed',
 'in',
 'the',
 'garden',
 '.',
 'Children',
 "'",
 's',
 'laughter',
 'echoed',
 'through',
 'the',
 'park',
 '.']

### ***Stemming***

In [95]:
# Word list used to compare the stemmers below. This rebinds `words`, which
# previously held the tokens produced by word_tokenize.
words = [
    "careful",
    "careless",
    "carefully",
    "caretaker",
    "dislike",
    "likely",
    "likeness",
    "unlikely",
    "fairly",
    "sportingly",
]

### ***Porter Stemmer***

In [96]:

from nltk.stem import PorterStemmer

# `stemmer` is rebound to other stemmers in later cells.
stemmer = PorterStemmer()

# Show every word next to its Porter stem. The stems are not necessarily
# dictionary words (see e.g. 'caretak' and 'fairli' in the output).
for word in words:
    porter_stem = stemmer.stem(word)
    print(word, "---------->", porter_stem)


careful ----------> care
careless ----------> careless
carefully ----------> care
caretaker ----------> caretak
dislike ----------> dislik
likely ----------> like
likeness ----------> like
unlikely ----------> unlik
fairly ----------> fairli
sportingly ----------> sportingli


In [97]:
# Still the Porter stemmer: the stem it returns is a truncated form, not a real word.
stemmer.stem("congratulations")

'congratul'

In [98]:
from nltk.stem import RegexpStemmer

### Whitespace inside the pattern is matched literally, so do not put spaces in
### the regular expression unless the words themselves contain that space.
### ('s $' below only matches an 's' followed by a space at the end of the string.)

### An alternative without '$' (such as 'ing') is removed wherever it occurs in
### the word, not only at the end.

### min=4: strings shorter than 4 characters are returned unchanged (per the NLTK docs).
stemmer = RegexpStemmer('ing|s $|e$|able$', min=4)

In [99]:
# No trailing space, so the 's $' alternative does not match and nothing is removed.
stemmer.stem('Cars')

'Cars'

In [100]:
# With a trailing space 's $' matches, so both the 's' and the space are removed.
stemmer.stem('Cars ')

'Car'

In [101]:
# The unanchored 'ing' alternative matches at the end of the word and is removed.
stemmer.stem('calling')

'call'

In [102]:
# 'ing' is still removed, but the trailing space stays: no alternative matches a space on its own.
stemmer.stem('calling ')

'call '

In [103]:
# 'ing' is removed after the inner space; the space itself is kept.
stemmer.stem('call ing')

'call '

In [104]:
# Because 'ing' has no '$' anchor, it is removed even at the start of the word.
stemmer.stem('ingcall')

'call'

### ***Snowball Stemmer***

In [105]:
from nltk.stem import SnowballStemmer

# Rebind `stemmer` to the English Snowball stemmer for the comparison below.
stemmer = SnowballStemmer('english')

In [106]:
# Print each word alongside its Snowball stem, in the same layout as the
# Porter stemmer output, so the two can be compared.
for word in words:
    snowball_stem = stemmer.stem(word)
    print(word, '------------>', snowball_stem)

careful ------------> care
careless ------------> careless
carefully ------------> care
caretaker ------------> caretak
dislike ------------> dislik
likely ------------> like
likeness ------------> like
unlikely ------------> unlik
fairly ------------> fair
sportingly ------------> sport


In [113]:
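#### Notice 'fairly' and 'sportingly': Snowball stems them to 'fair' and 'sport', whereas the Porter stemmer above produced 'fairli' and 'sportingli'.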