# Assignment 3 / POS-Tagger 

Basic Structure from https://nlpforhackers.io/lstm-pos-tagger-keras/

In [1]:
import nltk;
import numpy as np
from nltk.corpus import brown
from nltk.tag import map_tag
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras
from keras import backend as K
from tensorflow.keras import backend
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
from sklearn.metrics import accuracy_score

Using TensorFlow backend.


## Preprocess Data

In [2]:
train = open('POS_GERMAN_train.txt').read()
train = train.replace("\n", "")
train = train.replace(" ", "")
train_tagged_list = train.split(";")

#----Test Data-----------------------------------------------------------

test = open('POS_GERMAN_minitest.txt').read()
test = test.replace("\n", "")
test = test.replace(" ", "")
test_tagged_list = test.split(";")

In [3]:
for i in range(0,len(train_tagged_list)):
    train_tagged_list[i]=train_tagged_list[i].rsplit("/", 1)

#----Test Data-----------------------------------------------------------
    
for i in range(0,len(test_tagged_list)):
    test_tagged_list[i]=test_tagged_list[i].rsplit("/", 1)

In [4]:
print(test_tagged_list[:100])

[['BONN', 'NE'], [',', '$,'], ['10.', 'ADJA'], ['Maerz', 'NN'], ['(', '$('], ['dpa', 'NE'], [')', '$('], ['.', '$.'], ['Qualifikation', 'NN'], ['und', 'KON'], ['Ausbildung', 'NN'], ['von', 'APPR'], ['Mitarbeitern', 'NN'], ['privater', 'ADJA'], ['Wachdienste', 'NN'], ['sind', 'VAFIN'], ['nach', 'APPR'], ['Expertenmeinung', 'NN'], ['derzeit', 'ADV'], ['unzureichend', 'ADJD'], ['.', '$.'], ['An', 'APPR'], ['sie', 'PPER'], [',', '$,'], ['einschliesslich', 'APPR'], ['Zuverlaessigkeit', 'NN'], ['der', 'ART'], ['Bediensteten', 'NN'], [',', '$,'], ['muessten', 'VMFIN'], ['hoehere', 'ADJA'], ['Anforderungen', 'NN'], ['gestellt', 'VVPP'], ['werden', 'VAINF'], [',', '$,'], ['verlangten', 'VVFIN'], ['zehn', 'CARD'], ['Sachverstaendige', 'NN'], ['bei', 'APPR'], ['einer', 'ART'], ['von', 'APPR'], ['der', 'ART'], ['SPD', 'NE'], ['beantragten', 'ADJA'], ['Anhoerung', 'NN'], ['des', 'ART'], ['Bundestags-Innenausschusses', 'NN'], ['am', 'APPRART'], ['Montag', 'NN'], ['in', 'APPR'], ['Bonn', 'NE'], ['.',

## Convert STTS to Universal
Die Konvertierung ist mit Hilfe von folgender Tabelle vorgenommen worden: 
https://universaldependencies.org/tagset-conversion/de-stts-uposf.html

Da hier aber vermerkt wird, dass die Tabelle fehlerhaft sein kann, habe ich die einzelnen Einträge Stück für Stück auf mögliche schwerwiegende Fehler überprüft (z.B. dass Artikel als Verben abgebildet sind). Zum Beispiel hatte ich Zweifel bei attributierenden Possessiv- und Relativpronomen, da diese nicht als Pronomen abgebildet sind. DET ist allerdigs, nachdem ich dessen Erläuterung gelesen hatte zutreffender als Pronomen.
Zum Abschluss musste ich noch die Liste ergänzen mit SGML (markup) und SPELL (Buchstabierreihenfolge) ergänzen, da diese fehlend waren. Beide werden auf X abgebildet, da dies nicht wirklich Wortarten sind und kein anderer Tag zutreffend ist.

In [5]:
#https://universaldependencies.org/tagset-conversion/de-stts-uposf.html
tagset_conversion = {
    'ADJA':'ADJ',
    'ADJD':'ADJ',
    'APPR':'ADP',
    'APPRART':'ADP',
    'APPO':'ADP',
    'APZR':'ADP',
    'ART':'DET',
    'CARD':'NUM',
    'FM':'X',
    'ITJ':'INTJ',
    'KOUI':'SCONJ',
    'KOUS':'SCONJ',
    'KON':'CCONJ',
    'KOKOM':'CCONJ',
    'NN':'NOUN',
    'NE':'PROPN',
    'PDS':'PRON',
    'PDAT':'DET',
    'PIS':'PRON',
    'PIAT':'DET',
    'PIDAT':'DET',
    'PPER':'PRON',
    'PPOSS':'PRON',
    'PPOSAT':'DET',
    'PRELS':'PRON',
    'PRELAT':'DET',
    'PRF':'PRON',
    'PWS':'PRON',
    'PWAT':'DET',
    'PWAV':'ADV',
    'PAV':'ADV',
    'PTKZU':'PART',
    'PTKNEG':'PART',
    'PTKVZ':'ADP',
    'PTKANT':'PART',
    'PTKA':'PART',
    'SGML': 'X',
    'SPELL': 'X',
    'TRUNC':'X',
    'VVFIN':'VERB',
    'VVIMP':'VERB',
    'VVINF':'VERB',
    'VVIZU':'VERB',
    'VVPP':'VERB',
    'VAFIN':'AUX',
    'VAIMP':'AUX',
    'VAINF':'AUX',
    'VAPP':'AUX',
    'VMFIN':'VERB',
    'VMINF':'VERB',
    'VMPP':'VERB',
    'XY':'X',
    '$,':'PUNCT',
    '$.':'PUNCT',
    '$(':'PUNCT',
}

STTS_tagset = [*tagset_conversion]
STTS_tagset.insert(2, 'ADV') #This is required, else translating with dataframe doesn't work
print(STTS_tagset)
print('ADJA' not in STTS_tagset)

['ADJA', 'ADJD', 'ADV', 'APPR', 'APPRART', 'APPO', 'APZR', 'ART', 'CARD', 'FM', 'ITJ', 'KOUI', 'KOUS', 'KON', 'KOKOM', 'NN', 'NE', 'PDS', 'PDAT', 'PIS', 'PIAT', 'PIDAT', 'PPER', 'PPOSS', 'PPOSAT', 'PRELS', 'PRELAT', 'PRF', 'PWS', 'PWAT', 'PWAV', 'PAV', 'PTKZU', 'PTKNEG', 'PTKVZ', 'PTKANT', 'PTKA', 'SGML', 'SPELL', 'TRUNC', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP', 'VAFIN', 'VAIMP', 'VAINF', 'VAPP', 'VMFIN', 'VMINF', 'VMPP', 'XY', '$,', '$.', '$(']
False


## Restructure Data

In [6]:
train_data = pd.DataFrame(train_tagged_list, columns = ['word', 'tag'])
train_data = train_data.drop(train_data[(~train_data.tag.isin(STTS_tagset))].index)

#----Test Data-----------------------------------------------------------

test_data = pd.DataFrame(test_tagged_list, columns = ['word', 'tag'])
test_data = test_data.drop(test_data[(~test_data.tag.isin(STTS_tagset))].index)


In [7]:
train_data_list = train_data.values.tolist()
train_sentence_endings = []

for i in range(0,len(train_data_list)):
    if(train_data_list[i][1] == "$."):
        train_sentence_endings.append(i+1)
        
tagged_train_sentences = [train_data_list[i : j] for i, j in zip([0] + 
          train_sentence_endings, train_sentence_endings + [None])] 
print(tagged_train_sentences[0])

#----Test Data-----------------------------------------------------------

test_data_list = test_data.values.tolist()
test_sentence_endings = []

for i in range(0,len(test_data_list)):
    if(test_data_list[i][1] == "$."):
        test_sentence_endings.append(i+1)
        
tagged_test_sentences = [test_data_list[i : j] for i, j in zip([0] + 
          test_sentence_endings, test_sentence_endings + [None])] 
print(tagged_test_sentences[0])

[['``', '$('], ['Ross', 'NE'], ['Perot', 'NE'], ['waere', 'VAFIN'], ['vielleicht', 'ADV'], ['ein', 'ART'], ['praechtiger', 'ADJA'], ['Diktator', 'NN'], ["''", '$('], ['Konzernchefs', 'NN'], ['lehnen', 'VVFIN'], ['den', 'ART'], ['Milliardaer', 'NN'], ['als', 'APPR'], ['US-Praesidenten', 'NN'], ['ab', 'PTKVZ'], ['/', '$('], ['Texaner', 'NN'], ['gibt', 'VVFIN'], ['nur', 'ADV'], ['vage', 'ADJA'], ['Auskunft', 'NN'], ['ueber', 'APPR'], ['seine', 'PPOSAT'], ['Wirtschaftspolitik', 'NN'], ['Der', 'ART'], ['texanische', 'ADJA'], ['Milliardaer', 'NN'], ['Ross', 'NE'], ['Perot', 'NE'], ['hat', 'VAFIN'], ['das', 'ART'], ['politische', 'ADJA'], ['Establishment', 'NN'], ['in', 'APPR'], ['Washington', 'NE'], ['aufgeschreckt', 'VVPP'], ['.', '$.']]
[['BONN', 'NE'], [',', '$,'], ['10.', 'ADJA'], ['Maerz', 'NN'], ['(', '$('], ['dpa', 'NE'], [')', '$('], ['.', '$.']]


## Splitting tagged sentences to sentences and tags

In [8]:
train_sentences, train_tags = [],[]
for i in range(len(tagged_train_sentences)):
    list_sentences, list_tags = [],[]
    for k in range(len(tagged_train_sentences[i])):
        list_sentences.append(tagged_train_sentences[i][k][0])
        list_tags.append(tagged_train_sentences[i][k][1])
    train_sentences.append(list_sentences)
    train_tags.append(list_tags)
        
print(train_sentences[0])
print(train_tags[0])

#----Test Data-----------------------------------------------------------

test_sentences, test_tags = [],[]
for i in range(len(tagged_test_sentences)):
    list_sentences, list_tags = [],[]
    for k in range(len(tagged_test_sentences[i])):
        list_sentences.append(tagged_test_sentences[i][k][0])
        list_tags.append(tagged_test_sentences[i][k][1])
    test_sentences.append(list_sentences)
    test_tags.append(list_tags)
        
print(test_sentences[0])
print(test_tags[0])

['``', 'Ross', 'Perot', 'waere', 'vielleicht', 'ein', 'praechtiger', 'Diktator', "''", 'Konzernchefs', 'lehnen', 'den', 'Milliardaer', 'als', 'US-Praesidenten', 'ab', '/', 'Texaner', 'gibt', 'nur', 'vage', 'Auskunft', 'ueber', 'seine', 'Wirtschaftspolitik', 'Der', 'texanische', 'Milliardaer', 'Ross', 'Perot', 'hat', 'das', 'politische', 'Establishment', 'in', 'Washington', 'aufgeschreckt', '.']
['$(', 'NE', 'NE', 'VAFIN', 'ADV', 'ART', 'ADJA', 'NN', '$(', 'NN', 'VVFIN', 'ART', 'NN', 'APPR', 'NN', 'PTKVZ', '$(', 'NN', 'VVFIN', 'ADV', 'ADJA', 'NN', 'APPR', 'PPOSAT', 'NN', 'ART', 'ADJA', 'NN', 'NE', 'NE', 'VAFIN', 'ART', 'ADJA', 'NN', 'APPR', 'NE', 'VVPP', '$.']
['BONN', ',', '10.', 'Maerz', '(', 'dpa', ')', '.']
['NE', '$,', 'ADJA', 'NN', '$(', 'NE', '$(', '$.']


## Translating the STTS tags to Universal tags
This takes a while due to inefficient processing :)

In [9]:
for i in range(len(train_tags)):
    tags_dataframe = pd.DataFrame(train_tags[i], columns = ["tag"])
    tags_dataframe = tags_dataframe.replace({"tag": tagset_conversion})
    train_tags[i] = tags_dataframe.values.tolist()
    for k in range(len(train_tags[i])):
        train_tags[i][k] =  train_tags[i][k][0]
print(train_tags[0])

#----Test Data-----------------------------------------------------------

for i in range(len(test_tags)):
    tags_dataframe = pd.DataFrame(test_tags[i], columns = ["tag"])
    tags_dataframe = tags_dataframe.replace({"tag": tagset_conversion})
    test_tags[i] = tags_dataframe.values.tolist()
    for k in range(len(test_tags[i])):
        test_tags[i][k] =  test_tags[i][k][0]
print(test_tags[0])

['PUNCT', 'PROPN', 'PROPN', 'AUX', 'ADV', 'DET', 'ADJ', 'NOUN', 'PUNCT', 'NOUN', 'VERB', 'DET', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PUNCT', 'NOUN', 'VERB', 'ADV', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'DET', 'ADJ', 'NOUN', 'PROPN', 'PROPN', 'AUX', 'DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'VERB', 'PUNCT']
['PROPN', 'PUNCT', 'ADJ', 'NOUN', 'PUNCT', 'PROPN', 'PUNCT', 'PUNCT']


In [10]:
(train_sentences, 
 dev_sentences, 
 train_tags, 
 dev_tags) = train_test_split(train_sentences, train_tags, test_size=0.2)

## Transforming tags and words to indexes

In [11]:
words, tags = set([]), set([])
 
for s in train_sentences:
    for w in s:
        words.add(w.lower())
        
for ts in train_tags:
    for t in ts:
        tags.add(t)
        
word2index = {w: i + 2 for i, w in enumerate(words)}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
 
tag2index = {t: i + 1 for i, t in enumerate(tags)}
tag2index['-PAD-'] = 0  # The special value used to padding

In [12]:
train_sentences_X, dev_sentences_X, train_tags_y, dev_tags_y = [], [], [], []
 
for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    train_sentences_X.append(s_int)
    
for s in dev_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    dev_sentences_X.append(s_int)
    
for s in train_tags:
    train_tags_y.append([tag2index[t] for t in s])
    
for s in dev_tags:
    dev_tags_y.append([tag2index[t] for t in s])
    
print(train_sentences_X[0])
print(dev_sentences_X[0])
print(train_tags_y[0])
print(dev_tags_y[0])

#----Test Data-----------------------------------------------------------

test_sentences_X, test_tags_y = [],[]

for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    test_sentences_X.append(s_int)
    
for s in test_tags:
    test_tags_y.append([tag2index[t] for t in s])

[3052, 25344, 17532, 32044, 25563, 7166, 31501, 1088, 5246, 7943, 16661, 47790, 48836, 49027]
[7943, 8137, 21444, 18326, 63087, 4398, 1, 5886, 2163, 40590, 57842, 56150, 18072, 56970, 29707, 21764, 36434, 45867, 4621, 25344, 48009, 29707, 37151, 31964, 4991, 1, 46388, 38124, 112, 40590, 32554, 4991, 33450, 25344, 5272, 30165, 17559, 40078, 36235, 49027]
[7, 14, 12, 4, 6, 7, 12, 12, 4, 7, 1, 1, 12, 7]
[7, 14, 15, 4, 6, 8, 8, 16, 7, 14, 4, 4, 14, 4, 11, 14, 4, 2, 4, 14, 9, 11, 4, 14, 15, 4, 15, 12, 4, 14, 4, 15, 4, 14, 12, 4, 7, 8, 7, 7]


## Padding the sequences

In [13]:
MAX_LENGTH = len(max(train_sentences_X, key=len))
print(MAX_LENGTH)

130


In [14]:
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
dev_sentences_X = pad_sequences(dev_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
dev_tags_y = pad_sequences(dev_tags_y, maxlen=MAX_LENGTH, padding='post')

#----Test Data-----------------------------------------------------------

test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')

In [15]:
def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

## Network Architecture

In [16]:
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=[ignore_class_accuracy(0)])
 
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 130, 128)          8196480   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 130, 512)          788480    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 130, 17)           8721      
_________________________________________________________________
activation_1 (Activation)    (None, 130, 17)           0         
Total params: 8,993,681
Trainable params: 8,993,681
Non-trainable params: 0
_________________________________________________________________


In [17]:
def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [18]:
cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))
print(cat_train_tags_y[0])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


## Training the model
Using 5 epochs at approx. 80-90s time estimated per epoch

In [22]:
model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=5, validation_split=0.2)

Train on 23196 samples, validate on 5799 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1bab2e91cc0>

## Scores on dev set

In [23]:
dev_scores = model.evaluate(dev_sentences_X, to_categorical(dev_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]} dev: {dev_scores[1] * 100}")

ignore_accuracy dev: 95.71502806761694


## Scores on test set

In [24]:
test_scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]} test: {test_scores[1] * 100}")

ignore_accuracy test: 94.76046433120463


## Scores on minitest
Auf dem minitest wurden folgende Werte erzielt:

acc test: 99.15750963347298

ignore_accuracy test: 94.50607793944664