# POS Tagging Using CRF

## Import libraries

In [1]:
import pandas as pd       
import nltk
import sklearn
import sklearn_crfsuite
import scipy.stats
import math, string, re

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from itertools import chain
from sklearn.preprocessing import MultiLabelBinarizer

## Load and format data

In [17]:
def read_txt(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
    return lines


def format_data(text_l):
    sent_l = []
    for sent in text_l:
        tuple_l = []
        tuple_words = sent.strip('\n').split()
        for tup in tuple_words:
            split = tup.split('/')
            if len(split) < 2:
                continue
            split_word = '/'.join(split[:-1])
            split_tag = split[-1]
            
            tuple_l.append((split_word, split_tag))
        sent_l.append(tuple_l)
    return sent_l

In [18]:
train_text_l = read_txt('../../processed_data/train.txt')
test_text_l = read_txt('../../processed_data/test.txt')

train_sent_l = format_data(train_text_l)
test_sent_l = format_data(test_text_l)

In [19]:
train_sent_l[2]

[('Con', 'Nc'),
 ('đường', 'N'),
 ('thoát', 'V'),
 ('nghèo', 'A'),
 ('từ', 'E'),
 ('biển', 'N')]

## Feature define
1. The word
2. The word in lowercase
3. Length of the word
4. Prefixes and suffixes of the word of varying lengths
5. If the word is a digit
6. If the word is a punctuation mark
7. If the word is at the beginning of the sentence (BOS) or the end of the sentence (EOS) or neither
8. Features for the previous words, the following words

In [20]:
def word2features(sent, i):
    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word': word,
        'len(word)': len(word),
        'word[:4]': word[:4],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-4:]': word[-4:],
        'word.lower()': word.lower(),
        'word.ispunctuation': (word in string.punctuation),
        'word.isdigit()': word.isdigit(),
    }
    
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word': word1,
            '-1:len(word)': len(word1),
            '-1:word.lower()': word1.lower(),
            '-1:word[:3]': word1[:3],
            '-1:word[:2]': word1[:2],
            '-1:word[-3:]': word1[-3:],
            '-1:word[-2:]': word1[-2:],
            '-1:word.isdigit()': word1.isdigit(),
            '-1:word.ispunctuation': (word1 in string.punctuation),
        })
    else:
        features['SOS'] = True
    
    if i > 1:
        word2 = sent[i-2][0]
        features.update({
            '-2:word': word2,
            '-2:len(word)': len(word2),
            '-2:word.lower()': word2.lower(),
            '-2:word[:3]': word2[:3],
            '-2:word[:2]': word2[:2],
            '-2:word[-3:]': word2[-3:],
            '-2:word[-2:]': word2[-2:],
            '-2:word.isdigit()': word2.isdigit(),
            '-2:word.ispunctuation': (word2 in string.punctuation),
        })

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word': word1,
            '+1:len(word)': len(word1),
            '+1:word.lower()': word1.lower(),
            '+1:word[:3]': word1[:3],
            '+1:word[:2]': word1[:2],
            '+1:word[-3:]': word1[-3:],
            '+1:word[-2:]': word1[-2:],
            '+1:word.isdigit()': word1.isdigit(),
            '+1:word.ispunctuation': (word1 in string.punctuation),
        })
    else:
        features['EOS'] = True
    
    if i < len(sent) - 2:
        word2 = sent[i+2][0]
        features.update({
            '+2:word': word2,
            '+2:len(word)': len(word2),
            '+2:word.lower()': word2.lower(),
            '+2:word[:3]': word2[:3],
            '+2:word[:2]': word2[:2],
            '+2:word[-3:]': word2[-3:],
            '+2:word[-2:]': word2[-2:],
            '+2:word.isdigit()': word2.isdigit(),
            '+2:word.ispunctuation': (word2 in string.punctuation),
        })
    
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [word[1] for word in sent]

def sent2words(sent):
    return [word[0] for word in sent]

In [21]:
# Extracting features
X_train = [sent2features(s) for s in train_sent_l]
y_train = [sent2labels(s) for s in train_sent_l]

X_test = [sent2features(s) for s in test_sent_l]
y_test = [sent2labels(s) for s in test_sent_l]

## Model and train

In [22]:
# Define model
crf = sklearn_crfsuite.CRF(
    algorithm = 'lbfgs',
    c1 = 0.25,
    c2 = 0.3,
    max_iterations = 100,
    all_possible_transitions=True
)

In [23]:
# Train model
crf.fit(X_train, y_train)

2281


CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.25, c2=0.3, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [24]:
labels = list(crf.classes_)
labels

['M',
 ':',
 'N',
 '(',
 ')',
 'Nc',
 'V',
 'A',
 'E',
 'Np',
 ',',
 'R',
 'P',
 'X',
 '.',
 '-',
 '&',
 '...',
 'C',
 'L',
 '',
 'Nu',
 'T',
 'Ny',
 'Y',
 'm',
 '?',
 '!',
 '*',
 ';',
 'Nb',
 '------',
 'S',
 '"',
 'Vb',
 'I',
 'B',
 'v',
 '>',
 '--------',
 '.)',
 '~',
 '?]',
 ']',
 '):',
 'oOo',
 '+',
 'p',
 '-8',
 '----------',
 '----------------',
 'Ap',
 '---------',
 'Vy',
 'Ab']

In [25]:
y_pred = crf.predict(X_train)
print('F1 score on the train set = {}\n'.format(metrics.flat_f1_score(y_train, y_pred, average='weighted', labels=labels)))
print('Accuracy on the train set = {}\n'.format(metrics.flat_accuracy_score(y_train, y_pred)))

print(metrics.flat_classification_report(y_train, y_pred))

F1 score on the train set = 0.9511929983581714

Accuracy on the train set = 0.939408614402917

                  precision    recall  f1-score   support

                       0.97      0.96      0.96        92
               !       1.00      1.00      1.00       454
               "       1.00      1.00      1.00       194
               &       0.97      1.00      0.98        28
               (       1.00      1.00      1.00       578
               )       1.00      1.00      1.00       577
              ):       1.00      1.00      1.00         1
               *       0.91      1.00      0.95        20
               +       1.00      1.00      1.00         1
               ,       1.00      1.00      1.00      9457
               -       1.00      1.00      1.00      1260
          ------       0.00      0.00      0.00         1
        --------       0.00      0.00      0.00         1
       ---------       0.00      0.00      0.00         1
      ----------       0.00      0

In [26]:
y_pred = crf.predict(X_test)
print('F1 score on the test set = {}\n'.format(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
print('Accuracy on the test set = {}\n'.format(metrics.flat_accuracy_score(y_test, y_pred)))

print(metrics.flat_classification_report(y_test, y_pred, labels=labels))

F1 score on the test set = 0.8922032552016578

Accuracy on the test set = 0.8840114947315814

                  precision    recall  f1-score   support

               M       0.98      0.95      0.96       869
               :       1.00      1.00      1.00       138
               N       0.88      0.91      0.89      5704
               (       1.00      1.00      1.00        81
               )       1.00      1.00      1.00        82
              Nc       0.78      0.78      0.78       560
               V       0.83      0.90      0.87      4836
               A       0.81      0.78      0.80      1391
               E       0.90      0.94      0.92      1501
              Np       0.90      0.84      0.87       924
               ,       1.00      1.00      1.00      1285
               R       0.89      0.89      0.89      1792
               P       0.95      0.98      0.96       963
               X       0.72      0.61      0.66        82
               .       1.00      1.