# Assignment 3 / POS-Tagger 

In [1]:
import nltk;
import numpy as np
from nltk.corpus import brown
from nltk.tag import map_tag
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras
from keras import backend as K
from tensorflow.keras import backend
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam

Using TensorFlow backend.


## Preprocess Data

In [2]:
train = open('POS_GERMAN_train.txt').read()
train = train.replace("\n", "")
train = train.replace(" ", "")
train_tagged_list = train.split(";")

test = open('POS_GERMAN_minitest.txt').read()
test = test.replace("\n", "")
test = test.replace(" ", "")
test_tagged_list = test.split(";")

In [3]:
for i in range(0,len(train_tagged_list)):
    train_tagged_list[i]=train_tagged_list[i].rsplit("/", 1)
    
for i in range(0,len(test_tagged_list)):
    test_tagged_list[i]=test_tagged_list[i].rsplit("/", 1)

In [4]:
print(test_tagged_list[:100])

[['BONN', 'NE'], [',', '$,'], ['10.', 'ADJA'], ['Maerz', 'NN'], ['(', '$('], ['dpa', 'NE'], [')', '$('], ['.', '$.'], ['Qualifikation', 'NN'], ['und', 'KON'], ['Ausbildung', 'NN'], ['von', 'APPR'], ['Mitarbeitern', 'NN'], ['privater', 'ADJA'], ['Wachdienste', 'NN'], ['sind', 'VAFIN'], ['nach', 'APPR'], ['Expertenmeinung', 'NN'], ['derzeit', 'ADV'], ['unzureichend', 'ADJD'], ['.', '$.'], ['An', 'APPR'], ['sie', 'PPER'], [',', '$,'], ['einschliesslich', 'APPR'], ['Zuverlaessigkeit', 'NN'], ['der', 'ART'], ['Bediensteten', 'NN'], [',', '$,'], ['muessten', 'VMFIN'], ['hoehere', 'ADJA'], ['Anforderungen', 'NN'], ['gestellt', 'VVPP'], ['werden', 'VAINF'], [',', '$,'], ['verlangten', 'VVFIN'], ['zehn', 'CARD'], ['Sachverstaendige', 'NN'], ['bei', 'APPR'], ['einer', 'ART'], ['von', 'APPR'], ['der', 'ART'], ['SPD', 'NE'], ['beantragten', 'ADJA'], ['Anhoerung', 'NN'], ['des', 'ART'], ['Bundestags-Innenausschusses', 'NN'], ['am', 'APPRART'], ['Montag', 'NN'], ['in', 'APPR'], ['Bonn', 'NE'], ['.',

## Convert STTS to Universal

In [5]:
#https://universaldependencies.org/tagset-conversion/de-stts-uposf.html
tagset_conversion = {
    'ADJA':'ADJ',
    'ADJD':'ADJ',
    'APPR':'ADP',
    'APPRART':'ADP',
    'APPO':'ADP',
    'APZR':'ADP',
    'ART':'DET',
    'CARD':'NUM',
    'FM':'X',
    'ITJ':'INTJ',
    'KOUI':'SCONJ',
    'KOUS':'SCONJ',
    'KON':'CCONJ',
    'KOKOM':'CCONJ',
    'NN':'NOUN',
    'NE':'PROPN',
    'PDS':'PRON',
    'PDAT':'DET',
    'PIS':'PRON',
    'PIAT':'DET',
    'PIDAT':'DET',
    'PPER':'PRON',
    'PPOSS':'PRON',
    'PPOSAT':'DET',
    'PRELS':'PRON',
    'PRELAT':'DET',
    'PRF':'PRON',
    'PWS':'PRON',
    'PWAT':'DET',
    'PWAV':'ADV',
    'PAV':'ADV',
    'PTKZU':'PART',
    'PTKNEG':'PART',
    'PTKVZ':'ADP',
    'PTKANT':'PART',
    'PTKA':'PART',
    'SGML': 'X',
    'SPELL': 'X',
    'TRUNC':'X',
    'VVFIN':'VERB',
    'VVIMP':'VERB',
    'VVINF':'VERB',
    'VVIZU':'VERB',
    'VVPP':'VERB',
    'VAFIN':'AUX',
    'VAIMP':'AUX',
    'VAINF':'AUX',
    'VAPP':'AUX',
    'VMFIN':'VERB',
    'VMINF':'VERB',
    'VMPP':'VERB',
    'XY':'X',
    '$,':'PUNCT',
    '$.':'PUNCT',
    '$(':'PUNCT',
}

STTS_tagset = [*tagset_conversion]
STTS_tagset.insert(2, 'ADV') #This is required, else translating with dataframe doesn't work
print(STTS_tagset)
print('ADJA' not in STTS_tagset)

['ADJA', 'ADJD', 'ADV', 'APPR', 'APPRART', 'APPO', 'APZR', 'ART', 'CARD', 'FM', 'ITJ', 'KOUI', 'KOUS', 'KON', 'KOKOM', 'NN', 'NE', 'PDS', 'PDAT', 'PIS', 'PIAT', 'PIDAT', 'PPER', 'PPOSS', 'PPOSAT', 'PRELS', 'PRELAT', 'PRF', 'PWS', 'PWAT', 'PWAV', 'PAV', 'PTKZU', 'PTKNEG', 'PTKVZ', 'PTKANT', 'PTKA', 'SGML', 'SPELL', 'TRUNC', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP', 'VAFIN', 'VAIMP', 'VAINF', 'VAPP', 'VMFIN', 'VMINF', 'VMPP', 'XY', '$,', '$.', '$(']
False


In [6]:
train_data = pd.DataFrame(train_tagged_list, columns = ['word', 'tag'])
train_data = train_data.drop(train_data[(~train_data.tag.isin(STTS_tagset))].index)
print(train_data)

test_data = pd.DataFrame(test_tagged_list, columns = ['word', 'tag'])
test_data = test_data.drop(test_data[(~test_data.tag.isin(STTS_tagset))].index)


                      word      tag
0                       ``       $(
1                     Ross       NE
2                    Perot       NE
3                    waere    VAFIN
4               vielleicht      ADV
5                      ein      ART
6              praechtiger     ADJA
7                 Diktator       NN
8                       ''       $(
9             Konzernchefs       NN
10                  lehnen    VVFIN
11                     den      ART
12             Milliardaer       NN
13                     als     APPR
14         US-Praesidenten       NN
15                      ab    PTKVZ
16                       /       $(
17                 Texaner       NN
18                    gibt    VVFIN
19                     nur      ADV
20                    vage     ADJA
21                Auskunft       NN
22                   ueber     APPR
23                   seine   PPOSAT
24      Wirtschaftspolitik       NN
25                     Der      ART
26              texanische  

In [7]:
train_data = train_data.replace({"tag": tagset_conversion})
test_data = test_data.replace({"tag": tagset_conversion})

In [8]:
train_set, dev_set = train_test_split(train_data, test_size=0.2)
test_set = test_data

## Keras

In [9]:
words, tags = set([]), set([])
 
for w in train_set.word:
    words.add(w.lower())


for t in train_set.tag:
    tags.add(t)

    
word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
 
tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0  # The special value used to padding

In [10]:
train_words_X, dev_words_X, train_tags_y, dev_tags_y = [], [], [], []
test_words_X, test_tags_Y = [], []
 
for w in train_set.word:
    w_int = []
    try:
        w_int.append(word2index[w.lower()])
    except KeyError:
        w_int.append(word2index['-OOV-'])
 
    train_words_X.append(w_int)

for w in dev_set.word:
    w_int = []
    try:
        w_int.append(word2index[w.lower()])
    except KeyError:
        w_int.append(word2index['-OOV-'])
 
    dev_words_X.append(w_int)

    
for t in train_set.tag:
    train_tags_y.append([tag2index[t]])

    
for t in dev_set.tag:
    dev_tags_y.append([tag2index[t]])
    

for w in test_set.word:
    w_int = []
    try:
        w_int.append(word2index[w.lower()])
    except KeyError:
        w_int.append(word2index['-OOV-'])
        
    test_words_X.append(w_int)
    
for t in test_set.tag:
    test_tags_Y.append([tag2index[t]])
    
print(tag2index.items())

dict_items([('SCONJ', 1), ('ADP', 2), ('AUX', 3), ('ADV', 4), ('PART', 5), ('VERB', 6), ('CCONJ', 7), ('INTJ', 8), ('NUM', 9), ('PUNCT', 10), ('PRON', 11), ('PROPN', 12), ('DET', 13), ('ADJ', 14), ('NOUN', 15), ('X', 16), ('-PAD-', 0)])


In [11]:
MAX_LENGTH = 1
train_words_X = pad_sequences(train_words_X, maxlen=MAX_LENGTH, padding='post')
dev_words_X = pad_sequences(dev_words_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
dev_tags_y = pad_sequences(dev_tags_y, maxlen=MAX_LENGTH, padding='post')

test_words_X = pad_sequences(test_words_X, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_Y, maxlen=MAX_LENGTH, padding='post')

In [12]:
def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [13]:
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy', ignore_class_accuracy(0)])
 
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 128)            8195584   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1, 512)            788480    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 1, 17)             8721      
_________________________________________________________________
activation_1 (Activation)    (None, 1, 17)             0         
Total params: 8,992,785
Trainable params: 8,992,785
Non-trainable params: 0
_________________________________________________________________


In [14]:
def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))
print(cat_train_tags_y[0])

[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


## Training the model

In [15]:
model.fit(train_words_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=4, validation_split=0.2)
 

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 455713 samples, validate on 113929 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x21ec84aa2e8>

In [16]:
scores_dev = model.evaluate(dev_words_X, to_categorical(dev_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]} dev: {scores_dev[1] * 100}")

acc dev: 92.65295517898224


In [17]:
scores_test = model.evaluate(test_words_X, to_categorical(test_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]} test: {scores_test[1] * 100}")

acc test: 91.64168780262855
