In [15]:
import nltk

# Fetch every NLTK resource the cells below rely on, so the notebook also runs
# on a fresh kernel / fresh machine instead of failing with LookupError:
#   punkt                      -> sent_tokenize / word_tokenize
#   stopwords                  -> nltk.corpus.stopwords
#   wordnet                    -> nltk.corpus.wordnet and nltk.wsd.lesk
#   averaged_perceptron_tagger -> nltk.pos_tag
# nltk.download('all') would also work, but pulls far more data than needed.
# NOTE(review): newer NLTK releases may ask for differently named resources;
# if a LookupError names another package, add it to the tuple below.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource, quiet=True)
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [10]:
# Two-sentence sample text used by the tokenizing, stop-word and bigram cells.
text = (
    "Mary and Samantha arrived at the bus station early "
    "but waited until noon for the bus. "
    "I looked for Mary and Samantha at the bus station."
)

# Tokenizing the text
## Sentence tokenization
## Word tokenization

In [11]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Split the sample text into its individual sentences.
sents = sent_tokenize(text)
print(sents)

['Mary and Samantha arrived at the bus station early but waited until noon for the bus.', 'I looked for Mary and Samantha at the bus station.']


In [39]:
# Tokenize each sentence on its own: one list of word tokens per sentence.
words = list(map(word_tokenize, sents))
print(words)

[['Mary', 'and', 'Samantha', 'arrived', 'at', 'the', 'bus', 'station', 'early', 'but', 'waited', 'until', 'noon', 'for', 'the', 'bus', '.'], ['I', 'looked', 'for', 'Mary', 'and', 'Samantha', 'at', 'the', 'bus', 'station', '.']]


# Removing the stopwords

In [40]:
from string import punctuation

from nltk.corpus import stopwords

# Everything to filter out of the token stream: NLTK's English stop words
# plus every single character of string.punctuation.
customStopwords = set(stopwords.words('english')) | set(punctuation)

In [41]:
# Keep only the tokens of `text` that are neither stop words nor punctuation.
# NOTE: the membership test is case-sensitive, so a capitalised token such as
# 'I' does not match the stop-word set and stays in the result.
wordsWOstopped = [
    word for word in word_tokenize(text) if word not in customStopwords
]
print(wordsWOstopped)

['Mary', 'Samantha', 'arrived', 'bus', 'station', 'early', 'waited', 'noon', 'bus', 'I', 'looked', 'Mary', 'Samantha', 'bus', 'station']


# Bigrams

In [44]:
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

# Association measures for scoring bigrams; created here but not used by the
# plain frequency listing below.
bigram_measure = BigramAssocMeasures()

# Count adjacent token pairs in the stop-word-filtered token list.
finder = BigramCollocationFinder.from_words(wordsWOstopped)

# Show every bigram with its frequency, ordered by the bigram tuple.
sorted(finder.ngram_fd.items())


[(('I', 'looked'), 1),
 (('Mary', 'Samantha'), 2),
 (('Samantha', 'arrived'), 1),
 (('Samantha', 'bus'), 1),
 (('arrived', 'bus'), 1),
 (('bus', 'I'), 1),
 (('bus', 'station'), 2),
 (('early', 'waited'), 1),
 (('looked', 'Mary'), 1),
 (('noon', 'bus'), 1),
 (('station', 'early'), 1),
 (('waited', 'noon'), 1)]

# Stemming

In [13]:
from nltk.stem.lancaster import LancasterStemmer

# Sample sentence containing several inflections of "close".
text2 = "Marry closed on closing night when she was mood to close."

# Reduce every token of text2 to its Lancaster stem.
st = LancasterStemmer()
stemmedWords = list(map(st.stem, word_tokenize(text2)))
print(stemmedWords)

['marry', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'mood', 'to', 'clos', '.']


# POS tagging

In [17]:
# Pair every token of text2 with its part-of-speech tag.
text2_tokens = word_tokenize(text2)
nltk.pos_tag(text2_tokens)

[('Marry', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB'),
 ('.', '.')]

# Disambiguating Word Meanings

In [19]:
from nltk.corpus import wordnet as wn

# Print every WordNet sense of "bass" together with its definition.
for synset in wn.synsets('bass'):
    print(synset, synset.definition())

Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range


In [21]:
from nltk.wsd import lesk  # specific algorithm to detect the meaning of a word from its sentence

# Ask Lesk which sense of "bass" fits a music-related sentence.
music_tokens = word_tokenize("Sing in lower tone, along with the bass")
sensel = lesk(music_tokens, 'bass')
print(sensel, sensel.definition())


Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)


In [22]:
# Ask Lesk which sense of "bass" fits a fishing-related sentence.
fish_tokens = word_tokenize("This sea bass is hard to catch")
sensel = lesk(fish_tokens, 'bass')
print(sensel, sensel.definition())

Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
