# Tokenization

In [6]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [12]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords 

In [10]:
ecg_text = """An electrocardiogram is used to record the electrical 
              conduction through a person\'s heart. The readings can 
              be used to diagnose cardiac arrhythmias."""

tokenized_by_word = word_tokenize(ecg_text)
tokenized_by_sentence = sent_tokenize(ecg_text)

print('Word Tokenization:')
print(tokenized_by_word)

print('Sentence Tokenization:')
print(tokenized_by_sentence)

Word Tokenization:
['An', 'electrocardiogram', 'is', 'used', 'to', 'record', 'the', 'electrical', 'conduction', 'through', 'a', 'person', "'s", 'heart', '.', 'The', 'readings', 'can', 'be', 'used', 'to', 'diagnose', 'cardiac', 'arrhythmias', '.']
Sentence Tokenization:
["An electrocardiogram is used to record the electrical \n              conduction through a person's heart.", 'The readings can \n              be used to diagnose cardiac arrhythmias.']


# Normalization



In [11]:
brands = 'Salvation Army, YMCA, Boys & Girls Club of America'

brands_lower = brands.lower()
brands_upper = brands.upper()

print(f'Lowercased brands: {brands_lower}')
print(f'Uppercased brands: {brands_upper}')

Lowercased brands: salvation army, ymca, boys & girls club of america
Uppercased brands: SALVATION ARMY, YMCA, BOYS & GIRLS CLUB OF AMERICA


# Stopword Removal

In [14]:
# Save all English stopwords to the variable ``stop_words``
stop_words = set(stopwords.words('english')) 

survey_text = 'A YouGov study found that American\'s like Italian food more than any other country\'s cuisine.'

tokenized_survey = word_tokenize(survey_text)

text_no_stops = [word for word in tokenized_survey if word not in stop_words]

print(f'Words Tokenized: {tokenized_survey}')
print('\n')
print(f'Text without Stops: {text_no_stops}')


Words Tokenized: ['A', 'YouGov', 'study', 'found', 'that', 'American', "'s", 'like', 'Italian', 'food', 'more', 'than', 'any', 'other', 'country', "'s", 'cuisine', '.']


Text without Stops: ['A', 'YouGov', 'study', 'found', 'American', "'s", 'like', 'Italian', 'food', 'country', "'s", 'cuisine', '.']


# Stemming

In natural language processing, stemming is the text preprocessing normalization task concerned with bluntly removing word affixes (prefixes and suffixes). For example, stemming would cast the word “going” to “go”. This is a common method used by search engines to improve matching between user input and website hits.

NLTK has a built-in stemmer called PorterStemmer. You can use it with a list comprehension to stem each word in a tokenized list of words.

In [16]:
# First, you must import and initialize the stemmer:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [19]:
populated_island = """Java is an Indonesian island in the Pacific Ocean. 
                      It is the most populated island in the world,
                      with over 140 million people."""

island_tokenized = word_tokenize(populated_island)
stemmed = [stemmer.stem(word) for word in island_tokenized]

In [20]:
print('Words Tokenized:')
print(island_tokenized)

Words Tokenized:
['Java', 'is', 'an', 'Indonesian', 'island', 'in', 'the', 'Pacific', 'Ocean', '.', 'It', 'is', 'the', 'most', 'populated', 'island', 'in', 'the', 'world', ',', 'with', 'over', '140', 'million', 'people', '.']


In [21]:
print('Stemmed Words:')
print(stemmed)

Stemmed Words:
['java', 'is', 'an', 'indonesian', 'island', 'in', 'the', 'pacif', 'ocean', '.', 'it', 'is', 'the', 'most', 'popul', 'island', 'in', 'the', 'world', ',', 'with', 'over', '140', 'million', 'peopl', '.']


# Lemmatization

Lemmatization is a method for casting words to their root forms. This is a more involved process than stemming, because it requires the method to know the part-of-speech for each word. Since lemmatization requires the part of speech, it is a less efficient approach than stemming.

In the next exercise, we will consider how to tag each word with a part of speech. In the meantime, let’s see how to use NLTK’s lemmatize operation.

In [22]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [23]:
populated_island = 'Indonesia was founded in 1945. It contains the most populated island in the world, Java, with over 140 million people.'
tokenized_string = word_tokenize(populated_island)
lemmatized_words = [lemmatizer.lemmatize(word) for word in  tokenized_string]

In [24]:
print(f'Words Tokenized: {tokenized_string}')
print(f'Lemmatized Words: {lemmatized_words}')

Words Tokenized: ['Indonesia', 'was', 'founded', 'in', '1945', '.', 'It', 'contains', 'the', 'most', 'populated', 'island', 'in', 'the', 'world', ',', 'Java', ',', 'with', 'over', '140', 'million', 'people', '.']
Lemmatized Words: ['Indonesia', 'wa', 'founded', 'in', '1945', '.', 'It', 'contains', 'the', 'most', 'populated', 'island', 'in', 'the', 'world', ',', 'Java', ',', 'with', 'over', '140', 'million', 'people', '.']


# Part-of-Speech Tagging

To improve the performance of lemmatization, we need to find the part of speech for each word in our string. In script.py, to the right, we created a part-of-speech tagging function. The function accepts a word, then returns the most common part of speech for that word. Let’s break down the steps:

In [29]:
import nltk
from nltk.corpus import wordnet
from collections import Counter
from nltk.stem import WordNetLemmatizer

def get_part_of_speech(word):
    probable_part_of_speech = wordnet.synsets(word)
  
    pos_counts = Counter()

    pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
    pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
    pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
    pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  
    most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
    return most_likely_part_of_speech

# Just to see how this works:
text = "Joshua is a sexy beast and can dance like a feind."
tokenized = word_tokenize(text)
for word in tokenized:
    print(word + '   :   ' + get_part_of_speech(word)) 
# So it interprets "can" and "dance" as nouns, but they are not nouns in this context

Joshua   :   n
is   :   v
a   :   n
sexy   :   a
beast   :   n
and   :   n
can   :   n
dance   :   n
like   :   v
a   :   n
feind   :   n
.   :   n


In [26]:
lemmatizer = WordNetLemmatizer()
lemmatized_pos = [lemmatizer.lemmatize(word, get_part_of_speech(word)) for word in  tokenized_string]

In [30]:
print(f'The lemmatized words are: {lemmatized_pos}')

The lemmatized words are: ['Indonesia', 'be', 'found', 'in', '1945', '.', 'It', 'contain', 'the', 'most', 'populate', 'island', 'in', 'the', 'world', ',', 'Java', ',', 'with', 'over', '140', 'million', 'people', '.']
