# Tokenization

In [3]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [4]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords 

In [5]:
ecg_text = """An electrocardiogram is used to record the electrical 
              conduction through a person\'s heart. The readings can 
              be used to diagnose cardiac arrhythmias."""

tokenized_by_word = word_tokenize(ecg_text)
tokenized_by_sentence = sent_tokenize(ecg_text)

print('Word Tokenization:')
print(tokenized_by_word)

print('Sentence Tokenization:')
print(tokenized_by_sentence)

Word Tokenization:
['An', 'electrocardiogram', 'is', 'used', 'to', 'record', 'the', 'electrical', 'conduction', 'through', 'a', 'person', "'s", 'heart', '.', 'The', 'readings', 'can', 'be', 'used', 'to', 'diagnose', 'cardiac', 'arrhythmias', '.']
Sentence Tokenization:
["An electrocardiogram is used to record the electrical \n              conduction through a person's heart.", 'The readings can \n              be used to diagnose cardiac arrhythmias.']


# Normalization



In [6]:
brands = 'Salvation Army, YMCA, Boys & Girls Club of America'

brands_lower = brands.lower()
brands_upper = brands.upper()

print(f'Lowercased brands: {brands_lower}')
print(f'Uppercased brands: {brands_upper}')

Lowercased brands: salvation army, ymca, boys & girls club of america
Uppercased brands: SALVATION ARMY, YMCA, BOYS & GIRLS CLUB OF AMERICA


# Stopword Removal

In [7]:
# Save all English stopwords to the variable ``stop_words``
stop_words = set(stopwords.words('english')) 

survey_text = 'A YouGov study found that American\'s like Italian food more than any other country\'s cuisine.'

tokenized_survey = word_tokenize(survey_text)

text_no_stops = [word for word in tokenized_survey if word not in stop_words]

print(f'Words Tokenized: {tokenized_survey}')
print('\n')
print(f'Text without Stops: {text_no_stops}')


Words Tokenized: ['A', 'YouGov', 'study', 'found', 'that', 'American', "'s", 'like', 'Italian', 'food', 'more', 'than', 'any', 'other', 'country', "'s", 'cuisine', '.']


Text without Stops: ['A', 'YouGov', 'study', 'found', 'American', "'s", 'like', 'Italian', 'food', 'country', "'s", 'cuisine', '.']


# Stemming

In natural language processing, stemming is the text preprocessing normalization task concerned with bluntly removing word affixes (prefixes and suffixes). For example, stemming would cast the word “going” to “go”. This is a common method used by search engines to improve matching between user input and website hits.

NLTK has a built-in stemmer called PorterStemmer. You can use it with a list comprehension to stem each word in a tokenized list of words.

In [8]:
# First, you must import and initialize the stemmer:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [9]:
populated_island = """Java is an Indonesian island in the Pacific Ocean. 
                      It is the most populated island in the world,
                      with over 140 million people."""

island_tokenized = word_tokenize(populated_island)
stemmed = [stemmer.stem(word) for word in island_tokenized]

In [10]:
print('Words Tokenized:')
print(island_tokenized)

Words Tokenized:
['Java', 'is', 'an', 'Indonesian', 'island', 'in', 'the', 'Pacific', 'Ocean', '.', 'It', 'is', 'the', 'most', 'populated', 'island', 'in', 'the', 'world', ',', 'with', 'over', '140', 'million', 'people', '.']


In [11]:
print('Stemmed Words:')
print(stemmed)

Stemmed Words:
['java', 'is', 'an', 'indonesian', 'island', 'in', 'the', 'pacif', 'ocean', '.', 'it', 'is', 'the', 'most', 'popul', 'island', 'in', 'the', 'world', ',', 'with', 'over', '140', 'million', 'peopl', '.']


# Lemmatization

Lemmatization is a method for casting words to their root forms. This is a more involved process than stemming, because it requires the method to know the part-of-speech for each word. Since lemmatization requires the part of speech, it is a less efficient approach than stemming.

In the next exercise, we will consider how to tag each word with a part of speech. In the meantime, let’s see how to use NLTK’s lemmatize operation.

In [12]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [13]:
populated_island = 'Indonesia was founded in 1945. It contains the most populated island in the world, Java, with over 140 million people.'
tokenized_string = word_tokenize(populated_island)
lemmatized_words = [lemmatizer.lemmatize(word) for word in  tokenized_string]

In [14]:
print(f'Words Tokenized: {tokenized_string}')
print(f'Lemmatized Words: {lemmatized_words}')

Words Tokenized: ['Indonesia', 'was', 'founded', 'in', '1945', '.', 'It', 'contains', 'the', 'most', 'populated', 'island', 'in', 'the', 'world', ',', 'Java', ',', 'with', 'over', '140', 'million', 'people', '.']
Lemmatized Words: ['Indonesia', 'wa', 'founded', 'in', '1945', '.', 'It', 'contains', 'the', 'most', 'populated', 'island', 'in', 'the', 'world', ',', 'Java', ',', 'with', 'over', '140', 'million', 'people', '.']


# Part-of-Speech Tagging

To improve the performance of lemmatization, we need to find the part of speech for each word in our string. In script.py, to the right, we created a part-of-speech tagging function. The function accepts a word, then returns the most common part of speech for that word. Let’s break down the steps:

In [16]:
import nltk
from nltk.corpus import wordnet
from collections import Counter
from nltk.stem import WordNetLemmatizer

def get_part_of_speech(word):
    probable_part_of_speech = wordnet.synsets(word)
  
    pos_counts = Counter()

    pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
    pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
    pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
    pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  
    most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
    return most_likely_part_of_speech, pos_counts

# Just to see how this works:
text = "Joshua is a sexy beast and can dance like a feind."
tokenized = word_tokenize(text)
for word in tokenized:
    pos, pos_counts = get_part_of_speech(word)
    print(word + '   :   ' + pos) 
# So it interprets "can" and "dance" as nouns, but they are not nouns in this context

Joshua   :   n
is   :   v
a   :   n
sexy   :   a
beast   :   n
and   :   n
can   :   n
dance   :   n
like   :   v
a   :   n
feind   :   n
.   :   n


In [26]:
lemmatizer = WordNetLemmatizer()
lemmatized_pos = [lemmatizer.lemmatize(word, get_part_of_speech(word)) for word in  tokenized_string]

In [30]:
print(f'The lemmatized words are: {lemmatized_pos}')

The lemmatized words are: ['Indonesia', 'be', 'found', 'in', '1945', '.', 'It', 'contain', 'the', 'most', 'populate', 'island', 'in', 'the', 'world', ',', 'Java', ',', 'with', 'over', '140', 'million', 'people', '.']


In [20]:
word = 'dance'
probable_part_of_speech = wordnet.synsets(word)
pos, pos_counts = get_part_of_speech(word)

In [18]:
pos

'n'

In [23]:
pos_counts.most_common()

[('n', 4), ('v', 3), ('a', 0), ('r', 0)]

In [21]:
probable_part_of_speech

[Synset('dance.n.01'),
 Synset('dance.n.02'),
 Synset('dancing.n.01'),
 Synset('dance.n.04'),
 Synset('dance.v.01'),
 Synset('dance.v.02'),
 Synset('dance.v.03')]

# SpaCy

In [39]:
import spacy
# nlp = spacy.load("en_core_web_lg", disable=['parser', 'ner'])
nlp = spacy.load("en_core_web_lg")

In [47]:
sentence = """Following mice attacks, caring farmers weren't marching to Delhi for better living conditions. 
Delhi police on Tuesday fired water cannons and teargas shells at protesting farmers as they tried to 
break barricades with their cars, automobiles and tractors."""
# sentence = "The striped bats are hanging on their feet for best"

In [57]:
doc = nlp(sentence.replace('\n', ''))

In [58]:
" ".join([token.lemma_ for token in doc])

"follow mice attack , care farmer be n't march to Delhi for well living condition . Delhi police on Tuesday fire water cannon and teargas shell at protest farmer as they try to break barricade with their car , automobile and tractor ."

In [59]:
for token in doc:
    print(token.text + ' : ' + token.lemma_)

Following : follow
mice : mice
attacks : attack
, : ,
caring : care
farmers : farmer
were : be
n't : n't
marching : march
to : to
Delhi : Delhi
for : for
better : well
living : living
conditions : condition
. : .
Delhi : Delhi
police : police
on : on
Tuesday : Tuesday
fired : fire
water : water
cannons : cannon
and : and
teargas : teargas
shells : shell
at : at
protesting : protest
farmers : farmer
as : as
they : they
tried : try
to : to
break : break
barricades : barricade
with : with
their : their
cars : car
, : ,
automobiles : automobile
and : and
tractors : tractor
. : .


In [117]:
import string
s = """Following mice attacks, caring farmers weren't marching to Delhi for better living conditions. 
Delhi police on Tuesday fired water cannons and teargas shells at protesting farmers as they tried to 
break barricades with their cars, automobiles and tractors."""
# s = s.translate(str.maketrans('', '', string.punctuation)).replace('\n', '')
print(s)

Following mice attacks, caring farmers weren't marching to Delhi for better living conditions. 
Delhi police on Tuesday fired water cannons and teargas shells at protesting farmers as they tried to 
break barricades with their cars, automobiles and tractors.


In [118]:
doc = nlp(s)

In [103]:
for token in doc:
    print(token.text + ' : ' + token.lemma_)

Following : follow
mice : mice
attacks : attack
caring : care
farmers : farmer
were : be
nt : nt
marching : march
to : to
Delhi : Delhi
for : for
better : well
living : living
conditions : condition
Delhi : Delhi
police : police
on : on
Tuesday : Tuesday
fired : fire
water : water
cannons : cannon
and : and
teargas : teargas
shells : shell
at : at
protesting : protest
farmers : farmer
as : as
they : they
tried : try
to : to
break : break
barricades : barricade
with : with
their : their
cars : car
automobiles : automobile
and : and
tractors : tractor


In [81]:
len(doc)

39

In [85]:
for i in range(len(doc) - 4):
    window = range(i, i+4)
    print(window)

range(0, 4)
range(1, 5)
range(2, 6)
range(3, 7)
range(4, 8)
range(5, 9)
range(6, 10)
range(7, 11)
range(8, 12)
range(9, 13)
range(10, 14)
range(11, 15)
range(12, 16)
range(13, 17)
range(14, 18)
range(15, 19)
range(16, 20)
range(17, 21)
range(18, 22)
range(19, 23)
range(20, 24)
range(21, 25)
range(22, 26)
range(23, 27)
range(24, 28)
range(25, 29)
range(26, 30)
range(27, 31)
range(28, 32)
range(29, 33)
range(30, 34)
range(31, 35)
range(32, 36)
range(33, 37)
range(34, 38)


In [119]:
doc

Following mice attacks, caring farmers weren't marching to Delhi for better living conditions. 
Delhi police on Tuesday fired water cannons and teargas shells at protesting farmers as they tried to 
break barricades with their cars, automobiles and tractors.

In [123]:
doc_lemmas = [token.lemma_ for token in doc]
print(doc_lemmas)
anchor = nlp('living conditions')
anchor_lemmas = [token.lemma_ for token in anchor]
print(anchor_lemmas)

['follow', 'mice', 'attack', ',', 'care', 'farmer', 'be', "n't", 'march', 'to', 'Delhi', 'for', 'well', 'living', 'condition', '.', '\n', 'Delhi', 'police', 'on', 'Tuesday', 'fire', 'water', 'cannon', 'and', 'teargas', 'shell', 'at', 'protest', 'farmer', 'as', 'they', 'try', 'to', '\n', 'break', 'barricade', 'with', 'their', 'car', ',', 'automobile', 'and', 'tractor', '.']
['living', 'condition']


In [124]:
for i in range(len(doc) - 4):
    window = (i, i+4)
    test = [(l in doc_lemmas[window[0]:window[1]]) for l in anchor_lemmas]
    if all(test):
        print(doc_lemmas[window[0]:window[1]])

['for', 'well', 'living', 'condition']
['well', 'living', 'condition', '.']
['living', 'condition', '.', '\n']


In [122]:
nlp('following mice attacks')[1].lemma_

'mice'

In [136]:
sentences = [sent for sent in doc.sents]

In [142]:
sentences

[Following mice attacks, caring farmers weren't marching to Delhi for better living conditions.,
 
 Delhi police on Tuesday fired water cannons and teargas shells at protesting farmers as they tried to 
 break barricades with their cars, automobiles and tractors.]

In [149]:
for sent in doc.sents:
    print([token.lemma_ for token in sent])
    print(sent.text)

['follow', 'mice', 'attack', ',', 'care', 'farmer', 'be', "n't", 'march', 'to', 'Delhi', 'for', 'well', 'living', 'condition', '.']
Following mice attacks, caring farmers weren't marching to Delhi for better living conditions.
['\n', 'Delhi', 'police', 'on', 'Tuesday', 'fire', 'water', 'cannon', 'and', 'teargas', 'shell', 'at', 'protest', 'farmer', 'as', 'they', 'try', 'to', '\n', 'break', 'barricade', 'with', 'their', 'car', ',', 'automobile', 'and', 'tractor', '.']

Delhi police on Tuesday fired water cannons and teargas shells at protesting farmers as they tried to 
break barricades with their cars, automobiles and tractors.


['Following',
 'mice',
 'attacks',
 ',',
 'caring',
 'farmers',
 'were',
 "n't",
 'marching',
 'to',
 'Delhi',
 'for',
 'better',
 'living',
 'conditions',
 '.']

In [144]:
a = []
a.append((1,2))
a.append((3,4))
print(a)

[(1, 2), (3, 4)]
