### Parts-of-Speech Tagging - First Steps: Working with text files, Creating a Vocabulary and Handling Unknown Words

In [10]:
import string
from collections import defaultdict

In [3]:
# Load the tagged corpus into memory; each list entry is one raw line of the
# file, still carrying its trailing newline.
with open("WSJ_02-21.pos", mode='r') as corpus_file:
    lines = list(corpus_file)

In [5]:
# Column header for the preview below
print("\t\tWord", "\tTag\n")

# Show the first five records of the dataset, numbered from 1
for line_number in range(1, 6):
    print(f'line number {line_number}: {lines[line_number - 1]}')


		Word 	Tag

line number 1: In	IN

line number 2: an	DT

line number 3: Oct.	NNP

line number 4: 19	CD

line number 5: review	NN



In [6]:
# Raw form of the first record: word and tag separated by a tab, trailing newline kept.
lines[0]

'In\tIN\n'

#### Creating a vocabulary
A vocabulary is made up of every word that appeared at least 2 times in the dataset.
Follow these steps:
- Get only the words from the dataset
- Use the defaultdict to count the number of times each word appears
- Filter the dict to only include words that appeared at least 2 times.
- Create a list out of the filtered dict
- Sort the list

In [8]:
# Keep only the text before the first tab of every line (the word column).
# Blank separator lines contain no tab, so they come through unchanged as '\n'.
words = [raw_line.partition('\t')[0] for raw_line in lines]
words

['In',
 'an',
 'Oct.',
 '19',
 'review',
 'of',
 '``',
 'The',
 'Misanthrope',
 "''",
 'at',
 'Chicago',
 "'s",
 'Goodman',
 'Theatre',
 '(',
 '``',
 'Revitalized',
 'Classics',
 'Take',
 'the',
 'Stage',
 'in',
 'Windy',
 'City',
 ',',
 "''",
 'Leisure',
 '&',
 'Arts',
 ')',
 ',',
 'the',
 'role',
 'of',
 'Celimene',
 ',',
 'played',
 'by',
 'Kim',
 'Cattrall',
 ',',
 'was',
 'mistakenly',
 'attributed',
 'to',
 'Christina',
 'Haag',
 '.',
 '\n',
 'Ms.',
 'Haag',
 'plays',
 'Elianti',
 '.',
 '\n',
 'Rolls-Royce',
 'Motor',
 'Cars',
 'Inc.',
 'said',
 'it',
 'expects',
 'its',
 'U.S.',
 'sales',
 'to',
 'remain',
 'steady',
 'at',
 'about',
 '1,200',
 'cars',
 'in',
 '1990',
 '.',
 '\n',
 'The',
 'luxury',
 'auto',
 'maker',
 'last',
 'year',
 'sold',
 '1,214',
 'cars',
 'in',
 'the',
 'U.S.',
 '\n',
 'Howard',
 'Mosher',
 ',',
 'president',
 'and',
 'chief',
 'executive',
 'officer',
 ',',
 'said',
 'he',
 'anticipates',
 'growth',
 'for',
 'the',
 'luxury',
 'auto',
 'maker',
 'in',


In [12]:
# defaultdict(int) supplies 0 for a key that has not been seen yet, so each
# word can be counted without first checking whether it is already present.
freq = defaultdict(int)
for token in words:
    freq[token] = freq[token] + 1

In [14]:
# Vocabulary = every word seen at least twice; the bare newline entry that
# stands for a blank line is excluded.
vocab = [token for token, count in freq.items() if count >= 2 and token != '\n']
vocab

['In',
 'an',
 'Oct.',
 '19',
 'review',
 'of',
 '``',
 'The',
 'Misanthrope',
 "''",
 'at',
 'Chicago',
 "'s",
 'Goodman',
 'Theatre',
 '(',
 'Take',
 'the',
 'Stage',
 'in',
 'City',
 ',',
 'Leisure',
 '&',
 'Arts',
 ')',
 'role',
 'Celimene',
 'played',
 'by',
 'Kim',
 'was',
 'mistakenly',
 'attributed',
 'to',
 'Christina',
 'Haag',
 '.',
 'Ms.',
 'plays',
 'Rolls-Royce',
 'Motor',
 'Cars',
 'Inc.',
 'said',
 'it',
 'expects',
 'its',
 'U.S.',
 'sales',
 'remain',
 'steady',
 'about',
 '1,200',
 'cars',
 '1990',
 'luxury',
 'auto',
 'maker',
 'last',
 'year',
 'sold',
 'Howard',
 'president',
 'and',
 'chief',
 'executive',
 'officer',
 'he',
 'anticipates',
 'growth',
 'for',
 'Britain',
 'Europe',
 'Far',
 'Eastern',
 'markets',
 'INDUSTRIES',
 'increased',
 'quarterly',
 '10',
 'cents',
 'from',
 'seven',
 'a',
 'share',
 'new',
 'rate',
 'will',
 'be',
 'payable',
 'Feb.',
 '15',
 'A',
 'record',
 'date',
 'has',
 "n't",
 'been',
 'set',
 'Bell',
 'based',
 'Los',
 'Angeles',


In [15]:
# Sort the vocabulary in place using the default string ordering
vocab.sort()
# Print a fixed sample of five consecutive entries (indices 4000-4004)
for offset in range(5):
    print(vocab[4000 + offset])

Early
Earnings
Earth
Earthquake
East


### Processing new text sources
#### Dealing with unknown words
- Create a function that tries to classify the type of each unknown word and assign it a corresponding 'unknown token'
- This function will do the following checks and return an appropriate token:
    + Check if the unknown word contains any character that is a digit 
    -> return --unk_digit--
    + Check if the unknown word contains any punctuation character
    -> return --unk_punct--
    + Check if the unknown word contains any upper-case character
    -> return --unk_upper--
    + Check if the unknown word ends with a suffix that could indicate it is a noun, verb, adjective or adverb
    -> return --unk_noun--, --unk_verb--, --unk_adj--, --unk_adv-- respectively
- Else, return --unk--

In [16]:
# Punctuation characters, built once rather than on every call.
# Try printing them out in a new cell!
_UNK_PUNCT = frozenset(string.punctuation)

# (token, suffixes) pairs in the order they are tried; the first match wins.
# The adverb group is tried before the verb group because every word ending
# in "wise" also ends in the verb suffix "ise"; with the verb group first,
# the "wise" adverb suffix could never be reached.
_UNK_SUFFIX_RULES = (
    ("--unk_noun--", ("action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty")),
    ("--unk_adv--", ("ward", "wards", "wise")),
    ("--unk_verb--", ("ate", "ify", "ise", "ize")),
    ("--unk_adj--", ("able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous")),
)


def assign_unk(word):
    """
    Assign tokens to unknown words

    The checks are applied in order and the first one that matches decides
    the result:

    1. any digit character           -> "--unk_digit--"
    2. any punctuation character     -> "--unk_punct--"
    3. any upper-case character      -> "--unk_upper--"
    4. a known noun / adverb / verb / adjective suffix
       -> "--unk_noun--" / "--unk_adv--" / "--unk_verb--" / "--unk_adj--"
    5. otherwise                     -> "--unk--"

    Args:
        word: the word (a string) to classify.

    Returns:
        One of the unknown-word token strings listed above. An empty string
        matches none of the checks and yields "--unk--".
    """
    # Character-level checks take priority over the suffix checks
    if any(char.isdigit() for char in word):
        return "--unk_digit--"

    if any(char in _UNK_PUNCT for char in word):
        return "--unk_punct--"

    if any(char.isupper() for char in word):
        return "--unk_upper--"

    # str.endswith accepts a tuple and is true if any of its suffixes matches
    for token, suffixes in _UNK_SUFFIX_RULES:
        if word.endswith(suffixes):
            return token

    # If none of the previous criteria is met, return plain unknown
    return "--unk--"


A POS tagger will always encounter words that are not within the vocabulary that is being used. By augmenting the dataset to include these unknown word tokens you are helping the tagger to have a better idea of the appropriate tag for these words.

### Getting the correct tag of a word
This function should check if a line is empty and if so, it should return a placeholder word and tag, --n-- and --s-- respectively.

If not, it should process the line and return the word and tag pair. If the word is not in the vocabulary, the function assign_unk() should be used to replace it with the appropriate unknown-word token.

The function is implemented next. Notice that the split() method can be used without specifying the delimiter, in which case it will default to any whitespace.

In [17]:
def get_word_tag(line, vocab):
    """
    Turn one raw dataset line into a (word, tag) pair.

    Args:
        line: a raw line; the word and tag are separated by whitespace.
        vocab: container of known words, queried with the `in` operator
            (a set or dict gives constant-time lookups; a list is scanned).

    Returns:
        ("--n--", "--s--") when the line holds only whitespace. Otherwise
        (word, tag), where a word missing from `vocab` is replaced by the
        token returned by assign_unk(word).

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            whitespace-separated fields.
    """
    fields = line.split()

    # Blank line: hand back the placeholder word and tag
    if len(fields) == 0:
        return "--n--", "--s--"

    # Exactly two fields are expected: the word, then its tag
    word, tag = fields

    # Known words pass through; unknown ones become an unknown-word token
    return (word if word in vocab else assign_unk(word)), tag


In [22]:
# Show the (word, tag) pair produced for each of the first ten corpus lines
for raw_line in lines[:10]:
    word_tag_pair = get_word_tag(raw_line, vocab)
    print(word_tag_pair)

('In', 'IN')
('an', 'DT')
('Oct.', 'NNP')
('19', 'CD')
('review', 'NN')
('of', 'IN')
('``', '``')
('The', 'DT')
('Misanthrope', 'NN')
("''", "''")


In [23]:
# Example of a word that is replaced by the verb unknown-token (it ends in "ize")
get_word_tag('scrutinize\tVB\n', vocab)

('--unk_verb--', 'VB')

In [24]:
# Example of a word that matches none of the specific checks and gets the generic unknown-token
get_word_tag('tardigrade\tNN\n', vocab)

('--unk--', 'NN')