# Part-of-speech Tagger
### Pierre Nugues

## Reading the corpus

Functions to read the sentences and split the rows of the annotated data set

In [None]:
import regex as re

def read_sentences(file):
    """
    Creates a list of sentences from the corpus
    Each sentence is a string holding the rows of that sentence,
    separated by newlines
    :param file: path of the corpus file to read
    :return: the list of sentence strings, in file order
    """
    # The context manager closes the file even if reading fails
    with open(file) as f:
        text = f.read().strip()
    # Sentences are separated by one or more blank (or whitespace-only) lines.
    # The raw string keeps \s from being read as an invalid string escape.
    sentences = re.split(r'\n\s*\n', text)
    return sentences


def split_rows(sentences, column_names):
    """
    Turns each sentence string into a list of rows
    Each row is a dictionary pairing the column names with the
    whitespace-separated fields of that row; surplus names or fields
    are dropped by zip
    :param sentences: list of sentence strings, one row per line
    :param column_names: names to give to the columns, in order
    :return: a list of sentences, each a list of row dictionaries
    """
    return [
        [dict(zip(column_names, line.split())) for line in block.split('\n')]
        for block in sentences
    ]

We read the corpus

In [None]:
# Relative paths of the CoNLL 2009 English training and test files
train_file = '../../../corpus/conll2009/en/CoNLL2009-ST-English-train-pos.txt'
test_file = '../../../corpus/conll2009/en/CoNLL2009-ST-test-words-pos.txt'

# Names given, in order, to the whitespace-separated columns of each row
column_names = ['id', 'form', 'lemma', 'plemma', 'pos', 'ppos']

# Read the training set as sentence strings, then as lists of row dictionaries
train_sentences = read_sentences(train_file)
formatted_corpus = split_rows(train_sentences, column_names)
# Display the first formatted sentence as a sanity check
formatted_corpus[0]

## Extracting the features

Functions to extract the features: A 5-word window centered on the current word. We pad the beginning and the end with dummy symbols.

In [None]:
def extract_features(sentences, w_size, feature_names, test=False):
    """
    Builds the X matrix and, unless test is set, the y vector
    for a whole list of sentences
    :param sentences: list of sentence strings
    :param w_size: number of context words on each side of the current word
    :param feature_names: names of the window positions, left to right
    :param test: when true, only the features are extracted
    :return: X (a list of dictionaries) if test, otherwise the pair
    X, y where y is the list of classes
    """
    feature_rows = []
    if test:
        # No classes available: collect the features only
        for sent in sentences:
            feature_rows += extract_features_sent(sent, w_size, feature_names, test)
        return feature_rows

    labels = []
    for sent in sentences:
        sent_features, sent_labels = extract_features_sent(
            sent, w_size, feature_names, test)
        feature_rows += sent_features
        labels += sent_labels
    return feature_rows, labels


def extract_features_sent(sentence, w_size, feature_names, test=False):
    """
    Extracts the features from one sentence
    Each feature vector holds the lower-cased word forms (second column)
    of a window of 2 * w_size + 1 rows centered on the current word
    :param sentence: the sentence as a string, one row per line
    :param w_size: number of context words on each side of the current word
    :param feature_names: names of the window positions, left to right
    :param test: when true, the classes are not extracted
    :return: X if test, otherwise X and y, where X is a list of
    dictionaries and y is the list of classes (fifth column)
    """
    # Pad with w_size dummy rows on each side so that every real word
    # has a full context window
    bos_rows = "BOS BOS BOS BOS BOS BOS\n" * w_size
    eos_rows = "\nEOS EOS EOS EOS EOS EOS" * w_size
    rows = [line.split() for line in (bos_rows + sentence + eos_rows).splitlines()]

    window_width = 2 * w_size + 1
    features = []
    labels = []
    # One window per original (unpadded) row
    for first in range(len(rows) - 2 * w_size):
        words = [rows[first + offset][1].lower() for offset in range(window_width)]
        features.append(dict(zip(feature_names, words)))
        if not test:
            # The class of the word at the center of the window
            labels.append(rows[first + w_size][4])

    if test:
        return features
    return features, labels

We extract the features from a partial data set to have a shorter training time

In [None]:
# Names of the five window positions: two words before, the current word,
# and two words after
feature_names = ['word_n2', 'word_n1', 'word', 'word_p1', 'word_p2']

print("Extracting the features...")
# Number of context words on each side of the current word
w_size = 2
# We reduce the data set for the demonstration
# NOTE(review): this rebinds train_sentences to its first fifth, so running
# this cell again shrinks the data further
train_sentences = train_sentences[:int(len(train_sentences)/5)]

# X_dict is the list of feature dictionaries, y_symbols the list of classes
X_dict, y_symbols = extract_features(train_sentences, w_size, feature_names)

We print the features to check that they match Table 8.1 in my book (second edition)

In [None]:
# Show a slice of the feature dictionaries, then the classes at the same positions
print(X_dict[48759:48790])
y_symbols[48759:48790]

## Vectorizing

Vectorize the feature matrix and carry out a one-hot encoding

In [None]:
from sklearn.feature_extraction import DictVectorizer

# sparse=True asks the vectorizer for a sparse output matrix
vec = DictVectorizer(sparse=True)
# Fit the vectorizer on the training feature dictionaries and transform them
X = vec.fit_transform(X_dict)

Function to encode the classes

In [None]:
def encode_classes(y_symbols):
    """
    Encodes the classes as numbers
    The distinct class names are sorted and numbered from 0
    :param y_symbols: list of class names
    :return: the y vector of class numbers, the number-to-name dictionary,
    and the name-to-number dictionary
    """
    # The distinct part-of-speech names, in sorted order
    tagset = sorted(set(y_symbols))

    # We fill the two lookup tables in a single pass
    dict_classes = {}
    inv_dict_classes = {}
    for number, tag in enumerate(tagset):
        dict_classes[number] = tag
        inv_dict_classes[tag] = number

    # We convert y_symbols into a numerical vector
    y = [inv_dict_classes[tag] for tag in y_symbols]
    return y, dict_classes, inv_dict_classes

We encode the classes as numbers

In [None]:
# y holds the class numbers; dict_classes maps numbers to names and
# inv_dict_classes maps names to numbers
y, dict_classes, inv_dict_classes = encode_classes(y_symbols)
print(dict_classes)
print(inv_dict_classes)
# Display the first ten encoded classes
y[:10]

## Training a Model

We create a classifier

In [None]:
from sklearn import linear_model
# Logistic regression with an L2 penalty, using the liblinear solver with dual=True
classifier = linear_model.LogisticRegression(penalty='l2', dual=True, solver='liblinear')

And we train a model

In [None]:
# Fit the classifier on the vectorized training features and encoded classes
model = classifier.fit(X, y)
# Display the fitted model
model

## Testing the model

We read the test corpus

In [None]:
# Read the test set as sentence strings, then as lists of row dictionaries
test_sentences = read_sentences(test_file)
formatted_test_corpus = split_rows(test_sentences, column_names)
# Display the first formatted test sentence as a sanity check
formatted_test_corpus[0]

Encode the features of the test corpus and vectorize them

In [None]:
# Here we carry out a pos tag prediction and we report the per tag error
# This is done for the whole corpus without regard for the sentence structure

X_test_dict, y_test_symbols = extract_features(test_sentences, w_size, feature_names)
# Vectorize the test set with the vectorizer fitted on the training set
X_test = vec.transform(X_test_dict)
# Encode the test classes with the training lookup table. A dictionary lookup
# avoids scanning the whole training class list for every test token.
# Classes not seen in training fall back to 0, which is also the number of
# the first training class.
y_test = [inv_dict_classes.get(tag, 0) for tag in y_test_symbols]

We then predict the tags of the test set and measure the performance

In [None]:
from sklearn import metrics

print("Predicting the POS in the test set...")
# Predicted class numbers for every token of the test set
y_test_predicted = classifier.predict(X_test)
# Report comparing the encoded test classes with the predictions
print("Classification report for classifier %s:\n%s\n"
          % (classifier, metrics.classification_report(y_test, y_test_predicted)))

## Predicting a Sentence

In [None]:
# Example sentences in a two-column format: index and word form, tab-separated,
# one token per line
sentence1 = """1\tthat
2\tround
3\ttable
4\tmight
5\tcollapse
6\t."""

sentence2 = """1\tthe
2\tman
3\tcan
4\tswim
5\t."""

my_sentences = [sentence1, sentence2]

for sentence in my_sentences:
    print(sentence)
    # test=True: only the features are extracted, as there is no class column
    X_s_dict= extract_features([sentence], w_size, feature_names, True)
    # Vectorize with the vectorizer fitted on the training set
    X_s = vec.transform(X_s_dict)
    y_s = classifier.predict(X_s)
    # Map the predicted class numbers back to their names
    y_symb = [dict_classes[y] for y in y_s]
    print(y_symb)