### Part-of-Speech Tagging - First steps: Working with text files, Creating a vocabulary and Handling unknown words

In [6]:
import string
from collections import defaultdict
import numpy as np

In [2]:
# Read text data
# A tagged dataset taken from the Wall Street Journal is provided in the file WSJ_02-21.pos
# Read lines from 'WSJ_02-21.pos' file and save them into the 'lines' variable
with open("WSJ_02-21.pos", 'r') as f:
    lines = f.readlines()

In [5]:
lines[:10]

['In\tIN\n',
 'an\tDT\n',
 'Oct.\tNNP\n',
 '19\tCD\n',
 'review\tNN\n',
 'of\tIN\n',
 '``\t``\n',
 'The\tDT\n',
 'Misanthrope\tNN\n',
 "''\t''\n"]

In [3]:
# We only use 3 tags. In real word application, there are many more tags.
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
tags = ['RB', 'NN', 'TO']

In [4]:
# transition_counts: Count the number of times a particular tag happened next
# to another. The keys of this dictionary have the form (previous_tag, tag) and
# the values are the frequency of occurences.
# Define 'transition_counts' dictionary
# Note: values are the same as the ones in the assignment
transition_counts = {
    ('NN', 'NN'): 16241,
    ('RB', 'RB'): 2263,
    ('TO', 'TO'): 2,
    ('NN', 'TO'): 5256,
    ('RB', 'TO'): 855,
    ('TO', 'NN'): 734,
    ('NN', 'RB'): 2431,
    ('RB', 'NN'): 358,
    ('TO', 'RB'): 200
}


In [7]:
num_tags = len(tags)

transition_matrix = np.zeros((num_tags, num_tags))
transition_matrix

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [9]:
sorted_tags = sorted(tags)
sorted_tags

['NN', 'RB', 'TO']

In [10]:
for i in range(num_tags):
    for j in range(num_tags):
        tag_tuple = (sorted_tags[i], sorted_tags[j])
        # Get frequency from transition_counts dict and assign to (i, j) position in the matrix
        transition_matrix[i, j] = transition_counts.get(tag_tuple)
transition_matrix

array([[1.6241e+04, 2.4310e+03, 5.2560e+03],
       [3.5800e+02, 2.2630e+03, 8.5500e+02],
       [7.3400e+02, 2.0000e+02, 2.0000e+00]])

In [11]:
import pandas as pd
def print_matrix(matrix):
    print(pd.DataFrame(matrix, index = sorted_tags, columns=sorted_tags))

print_matrix(transition_matrix)

         NN      RB      TO
NN  16241.0  2431.0  5256.0
RB    358.0  2263.0   855.0
TO    734.0   200.0     2.0


In [12]:
# Working with Numpy for matrix manipulation
# Sacle transition matrix
transition_matrix = transition_matrix / 10

print_matrix(transition_matrix)

        NN     RB     TO
NN  1624.1  243.1  525.6
RB    35.8  226.3   85.5
TO    73.4   20.0    0.2


In [13]:
# axis = 1 -> sum for each row
rows_sum = transition_matrix.sum(axis = 1, keepdims=True)
rows_sum

array([[2392.8],
       [ 347.6],
       [  93.6]])

In [14]:
transition_matrix = transition_matrix / rows_sum
print_matrix(transition_matrix)

          NN        RB        TO
NN  0.678745  0.101596  0.219659
RB  0.102992  0.651036  0.245972
TO  0.784188  0.213675  0.002137


In [15]:
transition_matrix.sum(axis = 1, keepdims=True)

array([[1.],
       [1.],
       [1.]])

In [16]:
# You are asked to modify each value of the diagonal of the matrix so that 
# they are equal to the log of the sum of the current row plus the current value. 
import math

# Copy transition matrix for for-loop example
t_matrix_for = np.copy(transition_matrix)

# Copy transition matrix for numpy function example
t_matrix_np = np.copy(transition_matrix)

In [17]:
for i in range(num_tags):
    t_matrix_for[i, i] = transition_matrix[i, i] + math.log(rows_sum[i])

print_matrix(t_matrix_for)

          NN        RB        TO
NN  8.458964  0.101596  0.219659
RB  0.102992  6.502088  0.245972
TO  0.784188  0.213675  4.541167


In [19]:
# Using vectorization
d = np.diagonal(t_matrix_np)
d.shape

(3,)

In [None]:
t_matrix_np, d # Lấy các phần tử đường chéo

(array([[0.67874457, 0.10159646, 0.21965898],
        [0.10299194, 0.65103567, 0.24597238],
        [0.78418803, 0.21367521, 0.00213675]]),
 array([0.67874457, 0.65103567, 0.00213675]))

In [21]:
d = d + np.vectorize(math.log)(rows_sum)
np.fill_diagonal(t_matrix_np, d)

print_matrix(t_matrix_np)

          NN        RB        TO
NN  8.458964  0.101596  0.219659
RB  0.102992  8.431255  0.245972
TO  0.784188  0.213675  7.782356
