### Tokenizing text into sentences

In [1]:
# Sentence tokenization using sent_tokenize
from nltk.tokenize import sent_tokenize
text="""Tokenization is the first step in text analytics. 
        The process of breaking down a text paragraph into smaller chunks such as words or sentence is called Tokenization.
        Token is a single entity that is building blocks for sentence or paragraph. 
        Does sentence tokenizer break text paragraph into sentences?
        What is fact?"""
sentences=sent_tokenize(text)
print(sentences)

['Tokenization is the first step in text analytics.', 'The process of breaking down a text paragraph into smaller chunks such as words or sentence is called Tokenization.', 'Token is a single entity that is building blocks for sentence or paragraph.', 'Does sentence tokenizer break text paragraph into sentences?', 'What is fact?']


In [6]:
# Sentence tokenization using Regular expression

import re
sentences = re.compile('[.!?] ').split(text)
print(sentences)

['Tokenization is the first step in text analytics', '\n        The process of breaking down a text paragraph into smaller chunks such as words or sentence is called Tokenization.\n        Token is a single entity that is building blocks for sentence or paragraph', '\n        Does sentence tokenizer break text paragraph into sentences?\n        What is fact?']


### Tokenizing sentences into words

In [3]:
txt="""The process of breaking down a text paragraph into smaller chunks such as 
        words or sentence is called Tokenization.Token is a single entity that 
        is building blocks for sentence or paragraph."""
from nltk.tokenize import word_tokenize
from nltk.tokenize import TreebankWordTokenizer
token=word_tokenize(txt)
print(token)

['The', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentence', 'is', 'called', 'Tokenization.Token', 'is', 'a', 'single', 'entity', 'that', 'is', 'building', 'blocks', 'for', 'sentence', 'or', 'paragraph', '.']


In [4]:
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import WordPunctTokenizer


txt1 = "Can't is a contraction."
tokenizer1 = TreebankWordTokenizer()
token1 = word_tokenize(txt1)
token2 = tokenizer1.tokenize(txt1)
token3=wordpunct_tokenize(txt1)

print("Using word_tokenize : ",token1)
print("======================================")

print("Using TreebankWordTokenizer : ",token2)
print("======================================")

print("Using wordpunct_tokenize : ",token3)

Using word_tokenize :  ['Ca', "n't", 'is', 'a', 'contraction', '.']
Using TreebankWordTokenizer :  ['Ca', "n't", 'is', 'a', 'contraction', '.']
Using wordpunct_tokenize :  ['Can', "'", 't', 'is', 'a', 'contraction', '.']


### Tokenizing sentences into words using regular expressions

In [5]:
import re
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import RegexpTokenizer

text="Hi Everyone ! Its gr8 time."
text1=regexp_tokenize(text, pattern='\w+')
print(text1)
text2=regexp_tokenize(text, pattern='\d+')
print(text2)

text3 = re.findall("[\w]+", text)
print(text3)

['Hi', 'Everyone', 'Its', 'gr8', 'time']
['8']
['Hi', 'Everyone', 'Its', 'gr8', 'time']


In [6]:
txt1 = "Can't is a contraction."
text3 = re.findall("[\w]+", txt1)
print(text3)
text3 = re.findall("[\w']+", txt1)
print(text3)

##  Simple whitespace tokenizer
tokenizer = RegexpTokenizer('\s+', gaps=True)
list=tokenizer.tokenize("Can't is a contraction. Tokenizing sentences using regular expressions.")
print(list)

['Can', 't', 'is', 'a', 'contraction']
["Can't", 'is', 'a', 'contraction']
["Can't", 'is', 'a', 'contraction.', 'Tokenizing', 'sentences', 'using', 'regular', 'expressions.']


In [8]:
from nltk.probability import FreqDist

txt="""The process of breaking down a text paragraph into smaller chunks such as 
        words or sentence is called Tokenization.Token is a single entity that 
        is is is is building blocks for sentence or paragraph."""
token=word_tokenize(txt)

fdist = FreqDist(token)
print(fdist)
fdist.most_common(2)

<FreqDist with 26 samples and 35 outcomes>


[('is', 6), ('a', 2)]

### Stemming

In [10]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print(stemmer.stem('cooking'))
print(stemmer.stem('cookery'))
print(stemmer.stem('eats'))
print(stemmer.stem('eaten'))
print(stemmer.stem('shopping'))
print(stemmer.stem('best'))

cook
cookeri
eat
eaten
shop
best


In [11]:
txt="""The process of breaking down a text paragraph into smaller chunks such as 
        words or sentence is called Tokenization. Token is a single entity that 
        is building blocks for sentence or paragraph. 
        I like fishing. I eat fish. There are many fishes in pound."""

stemmed_words=[]
for w in txt.split():
    stemmed_words.append(stemmer.stem(w))

print("Stemmed Sentence:",stemmed_words)

Stemmed Sentence: ['the', 'process', 'of', 'break', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunk', 'such', 'as', 'word', 'or', 'sentenc', 'is', 'call', 'tokenization.', 'token', 'is', 'a', 'singl', 'entiti', 'that', 'is', 'build', 'block', 'for', 'sentenc', 'or', 'paragraph.', 'i', 'like', 'fishing.', 'i', 'eat', 'fish.', 'there', 'are', 'mani', 'fish', 'in', 'pound.']


In [12]:
from nltk.stem import PorterStemmer
stemmer  = PorterStemmer()

text = "studies studying cries cry"
tokenization = word_tokenize(text)
for w in tokenization:
    print("Stemming for {} ========> {}".format(w,stemmer.stem(w)))  



In [13]:
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()
print(stemmer.stem('cooking'))
print(stemmer.stem('cookery'))
print(stemmer.stem('eats'))
print(stemmer.stem('eaten'))
print(stemmer.stem('shopping'))

cook
cookery
eat
eat
shop


In [14]:
txt="""The process of breaking down a text paragraph into smaller chunks such as 
        words or sentence is called Tokenization. Token is a single entity that 
        is building blocks for sentence or paragraph. 
        I like fishing. I eat fish. There are many fishes in pound."""

stemmed_words=[]
for w in txt.split():
    stemmed_words.append(stemmer.stem(w))

print("Stemmed Sentence:",stemmed_words)

Stemmed Sentence: ['the', 'process', 'of', 'break', 'down', 'a', 'text', 'paragraph', 'into', 'smal', 'chunk', 'such', 'as', 'word', 'or', 'sent', 'is', 'cal', 'tokenization.', 'tok', 'is', 'a', 'singl', 'ent', 'that', 'is', 'build', 'block', 'for', 'sent', 'or', 'paragraph.', 'i', 'lik', 'fishing.', 'i', 'eat', 'fish.', 'ther', 'ar', 'many', 'fish', 'in', 'pound.']


In [16]:
from nltk.stem import RegexpStemmer
stemmer = RegexpStemmer('ing|ed')

print(stemmer.stem('cooking'))
print(stemmer.stem('cookery'))
print(stemmer.stem('eats'))
print(stemmer.stem('played'))
print(stemmer.stem('shopping'))
print(stemmer.stem('ingleside'))

cook
cookery
eats
play
shopp
leside


In [18]:
# nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()
print("rocks  ====>", lem.lemmatize("rocks"))
print("corpora  ====>", lem.lemmatize("corpora"))
# a denotes adjective in "pos"
print("better  ====>", lem.lemmatize("better", pos ="a"))
print("cooking  ====>",lem.lemmatize('cooking'))
print("cooking  ====>",lem.lemmatize('cooking', pos='v'))
print("cooking  ====>",lem.lemmatize('cooking', pos='v'))
print("flying  ====>",lem.lemmatize('flying', pos='v'))
print("flying  ====>",lem.lemmatize('flying', pos='n'))

rocks  ====> rock
corpora  ====> corpus
better  ====> good
cooking  ====> cooking
cooking  ====> cook
cooking  ====> cook
flying  ====> fly
flying  ====> flying


In [57]:
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

text = "studies studying cries cry"
tokenization = word_tokenize(text)
for w in tokenization:
	print("Lemma for {} ========> {}".format(w, lem.lemmatize(w)))



In [19]:
text= """I like fishing. I eat fish. There are many fishes in pound leaves and leaf."""
tokenization = word_tokenize(text)
for w in tokenization:
	print("Lemma for {} ====> {}".format(w, lem.lemmatize(w)))

Lemma for I ====> I
Lemma for like ====> like
Lemma for fishing ====> fishing
Lemma for . ====> .
Lemma for I ====> I
Lemma for eat ====> eat
Lemma for fish ====> fish
Lemma for . ====> .
Lemma for There ====> There
Lemma for are ====> are
Lemma for many ====> many
Lemma for fishes ====> fish
Lemma for in ====> in
Lemma for pound ====> pound
Lemma for leaves ====> leaf
Lemma for and ====> and
Lemma for leaf ====> leaf
Lemma for . ====> .


### Lemmatize with POS Tag

In [47]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\balkr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [20]:
import nltk
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

lem = WordNetLemmatizer()

text = "The striped bats are hanging on their feet for best.It is better then the best."
# POS tag for a given sentence
print(nltk.pos_tag(nltk.word_tokenize(text)))
print("===============================================")
# Lemmatize a Sentence with the appropriate POS tag
print([lem.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(text)])

[('The', 'DT'), ('striped', 'JJ'), ('bats', 'NNS'), ('are', 'VBP'), ('hanging', 'VBG'), ('on', 'IN'), ('their', 'PRP$'), ('feet', 'NNS'), ('for', 'IN'), ('best.It', 'NN'), ('is', 'VBZ'), ('better', 'RBR'), ('then', 'RB'), ('the', 'DT'), ('best', 'JJS'), ('.', '.')]
['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best.It', 'be', 'well', 'then', 'the', 'best', '.']


### Stop Words

In [21]:
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))
print(stop_words)

{'or', 'for', 'nor', "wasn't", 'weren', 'its', 'from', 'mustn', "needn't", 've', 'didn', 'should', 'hers', 'are', 'before', "don't", 'myself', 'her', 'on', 'very', "doesn't", 'too', 'both', 'been', 'when', "you're", 'herself', 'won', 'each', "couldn't", 'shouldn', 'll', "aren't", 'above', 'had', 'then', 'because', 'again', 'below', "hadn't", 'down', 'such', 'and', 'do', "won't", "didn't", "it's", "weren't", "you'll", 'be', "isn't", 'of', 'those', 'most', "you'd", 'here', 'why', 'yourself', "shan't", 'so', 'between', 'other', 'this', 'not', 'an', 'just', 'all', 'who', 'am', 'as', 'a', 'me', 'than', 'wasn', 'how', 't', "mustn't", 'these', 's', 'o', 'his', 'our', 'by', 'ain', 're', 'hasn', 'about', 'while', 'were', "she's", 'm', 'themselves', 'but', 'in', 'ours', 'no', "you've", 'yourselves', 'they', 'over', 'some', 'ourselves', 'having', "wouldn't", 'wouldn', 'if', 'aren', 'y', 'where', "hasn't", "shouldn't", 'which', 'mightn', 'against', 'shan', 'him', 'the', 'did', 'there', 'will', 'ha

In [66]:
text = "This is a sample sentence. For example, How to develop chatbot using python?"
wordlist = [word for word in text.split() if word not in stop_words]
wordlist

['This',
 'sample',
 'sentence.',
 'For',
 'example,',
 'How',
 'develop',
 'chatbot',
 'using',
 'python?']

In [23]:
text="""The process of breaking down a text paragraph into smaller chunks such as 
        words or sentence is called Tokenization. Token is a single entity that 
        is building blocks for sentence or paragraph. 
        I like fishing. I eat fish. There are many fishes in pound. Is the on of inside a an 
        under are over there i."""

# Removing Stopwords
tokenized_sent=word_tokenize(text)

filtered_sent=[]
for w in tokenized_sent:
    if w not in stop_words:
        filtered_sent.append(w)
print("After removing Stop Words:",filtered_sent)
print("\nTokenized Sentence:",tokenized_sent)

After removing Stop Words: ['The', 'process', 'breaking', 'text', 'paragraph', 'smaller', 'chunks', 'words', 'sentence', 'called', 'Tokenization', '.', 'Token', 'single', 'entity', 'building', 'blocks', 'sentence', 'paragraph', '.', 'I', 'like', 'fishing', '.', 'I', 'eat', 'fish', '.', 'There', 'many', 'fishes', 'pound', '.', 'Is', 'inside', '.']

Tokenized Sentence: ['The', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentence', 'is', 'called', 'Tokenization', '.', 'Token', 'is', 'a', 'single', 'entity', 'that', 'is', 'building', 'blocks', 'for', 'sentence', 'or', 'paragraph', '.', 'I', 'like', 'fishing', '.', 'I', 'eat', 'fish', '.', 'There', 'are', 'many', 'fishes', 'in', 'pound', '.', 'Is', 'the', 'on', 'of', 'inside', 'a', 'an', 'under', 'are', 'over', 'there', 'i', '.']


### Spell Correction using NLTK 

In [70]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\balkr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [24]:
import nltk
# importing jaccard distance and ngrams from nltk.util
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams
from nltk.corpus import words

correct_words = words.words()
#correct_words

In [27]:
# list of incorrect spellings that need to be corrected 
incorrect_words=['happpppy', 'azmaing', 'intelliengt','apqd']
  
# loop for finding correct spellings based on jaccard distance and printing the correct word
for word in incorrect_words:
    temp = [(jaccard_distance(set(ngrams(word, 2)), set(ngrams(w, 2))),w) for w in correct_words if w[0]==word[0]]
    print(sorted(temp, key = lambda val:val[0])[0][1])

happy
amazing
intelligent
apa


In [28]:
# importing edit distance  
from nltk.metrics.distance  import edit_distance
from nltk.corpus import words

correct_words = words.words()

incorrect_words=['happpy', 'azmaing', 'intelliengt']
for word in incorrect_words:
    temp = [(edit_distance(word, w),w) for w in correct_words if w[0]==word[0]]
    print(sorted(temp, key = lambda val:val[0])[0][1])

happy
aiming
intelligent


In [32]:
edit_distance("rainkf","shine")

4

In [30]:
edit_distance('language', 'languege')

1