# Trabalho Prático 2
## Processamento de Linguagem Natural - 2018/2

### Bernardo de Almeida Abreu - 2018718155

## Introdução

O *Part-of-speech tagging* (POS tagging) é uma tarefa clássica da área de processamento de linguagem natural. Ela consiste em atribuir uma classe gramatical a cada token de um texto [1]. Alguns exemplos de classes gramaticais que podem ser atribuídas aos tokens de um texto são "substantivo", "adjetivo" e "pontuação". Muitas vezes uma mesma palavra assume papéis e significados diferentes dependendo do contexto em que se encontra, de modo que essa tarefa não é trivial.

As melhores soluções para esse problema se baseiam em técnicas de aprendizado de máquina supervisionado e, por esse motivo, é necessário que exista uma base de dados anotada na língua correta.

## Implementação

O objetivo deste trabalho é implementar a tarefa de POS tagging para uma base de dados em português. A base de dados utilizada é o corpus Mac-Morpho (versão 3), já dividido em conjuntos de treino, validação e teste.

In [None]:
%matplotlib notebook
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
import numpy as np
import pandas as pd
import gensim
import keras
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Locations of the corpus splits, the pretrained word vectors and the
# serialized model files; all paths are relative.
paths = dict(
    train='../macmorpho-v3/macmorpho-train.txt',
    test='../macmorpho-v3/macmorpho-test.txt',
    dev='../macmorpho-v3/macmorpho-dev.txt',
    word2vec='../data/skip_s100.txt',
    model='../src/model.json',
    model_weights='../src/model.h5',
)

## Leitura do texto

In [None]:
def read_text(filename):
    """Read a text file and return its lines.

    The file is decoded explicitly as UTF-8 so that accented Portuguese
    characters are read the same way regardless of the platform's default
    locale encoding.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one string per line; line terminators are kept.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return f.readlines()

In [None]:
# Read the raw lines of the train, test and dev corpus files, in that order.
train_text, test_text, dev_text = (
    read_text(paths[split]) for split in ('train', 'test', 'dev')
)

# Show the first raw line of each split as a quick sanity check.
print(train_text[0], test_text[0], dev_text[0], sep='\n')

### Separação de palavras e tags

In [None]:
def split_word_tags(text):
    """Split lines of ``word_TAG`` tokens into parallel word and tag lists.

    Each token is split at its *last* underscore, so words that themselves
    contain an underscore keep it. Lines without any token (blank lines) are
    skipped instead of crashing the unpacking.

    Args:
        text: Iterable of strings, each holding whitespace-separated
            ``word_TAG`` tokens.

    Returns:
        A tuple ``(word_lines, tag_lines)``: two lists of lists aligned
        sentence by sentence and token by token. Words are lower-cased;
        tags are kept as written.

    Raises:
        ValueError: If a token contains no underscore separator.
    """
    word_lines = []
    tag_lines = []
    for line in text:
        tokens = line.split()
        if not tokens:
            # Blank line: nothing to tag.
            continue

        words = []
        tags = []
        for tagged_word in tokens:
            parts = tagged_word.rsplit('_', 1)
            if len(parts) != 2:
                raise ValueError(
                    "token %r is not in 'word_TAG' format" % tagged_word
                )
            word, tag = parts
            words.append(word.lower())
            tags.append(tag)

        word_lines.append(words)
        tag_lines.append(tags)
    return word_lines, tag_lines

def flat_list(l):
    """Concatenate the items of every sub-sequence of ``l`` into one list."""
    flattened = []
    for sublist in l:
        flattened.extend(sublist)
    return flattened

In [None]:
# Separate words from tags for every corpus split.
train_words, train_tags = split_word_tags(train_text)
test_words, test_tags = split_word_tags(test_text)
dev_words, dev_tags = split_word_tags(dev_text)

# Show the first training sentence next to its tag sequence.
print(train_words[0], train_tags[0], sep='\n')

In [None]:
# Tag inventory over all three splits. The tags are sorted so that the
# tag -> id mapping is the same on every run; iterating a plain set of
# strings gives an order that can change between interpreter sessions, which
# would silently mismatch the output units of a model saved earlier.
all_tags = (
    set(flat_list(train_tags))
    | set(flat_list(test_tags))
    | set(flat_list(dev_tags))
)

# Id 0 is reserved for the padding tag.
id2tag = ['<PAD>'] + sorted(all_tags)
tag2id = {tag: i for i, tag in enumerate(id2tag)}
print(tag2id)
print(id2tag)

## Padding das sentenças

### Análise da distribuição dos tamanhos das sentenças

É necessário que todas as sentenças do texto possuam o mesmo tamanho para que possam ser fornecidas como entrada para a rede neural.

In [None]:
def _sentence_frame(words, tags):
    """Build a two-column frame with one sentence and its tags per row."""
    frame = pd.DataFrame(columns=['words', 'tags'])
    frame['words'] = words
    frame['tags'] = tags
    return frame


df_train = _sentence_frame(train_words, train_tags)
df_test = _sentence_frame(test_words, test_tags)
df_dev = _sentence_frame(dev_words, dev_tags)

# Stack every split so the length statistics cover the whole corpus.
df_sentences = pd.concat([df_train, df_test, df_dev], axis=0)

In [None]:
# Summary statistics of the number of tokens per sentence over all splits.
df_sentences['words'].map(len).describe()

In [None]:
# Histogram of the number of tokens per sentence over all splits.
df_sentences['words'].map(len).hist()
plt.show()

In [None]:
# Fixed sentence length used for padding: the 75th percentile of the observed
# sentence lengths, truncated to an integer. Sentences longer than this are
# cut to this length by fill_sentence.
MAX_SENTENCE_LENGTH = int(df_sentences['words'].map(len).describe()['75%'])
# Display the chosen length as the cell output.
MAX_SENTENCE_LENGTH

In [None]:
def fill_sentence(sentence, max_length=None):
    """Return a copy of ``sentence`` padded or truncated to a fixed length.

    Short sentences are right-padded with the ``'<PAD>'`` token and long
    ones are cut. The input list is never modified, so lists shared with
    other structures keep their original content.

    Args:
        sentence: List of tokens (words or tags).
        max_length: Target length. Defaults to the module-level
            ``MAX_SENTENCE_LENGTH``.

    Returns:
        A new list with exactly ``max_length`` items.
    """
    if max_length is None:
        max_length = MAX_SENTENCE_LENGTH
    max_length = int(max_length)

    # Slicing copies, so extending the result leaves the caller's list intact.
    filled = list(sentence[:max_length])
    filled.extend(['<PAD>'] * (max_length - len(filled)))
    return filled

In [None]:
# Pad or truncate the word and tag sequences of every split to the fixed length.
for frame in (df_train, df_test, df_dev):
    for column in ("words", "tags"):
        frame[column] = frame[column].map(fill_sentence)

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Length statistics of the dev sentences; after the padding step every length
# is expected to equal MAX_SENTENCE_LENGTH.
df_dev['words'].map(len).describe()

## Embedding - Word2Vec

In [None]:
# Load the pretrained word vectors from the word2vec-format file at
# paths['word2vec'].
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(paths['word2vec'])

In [None]:
# Sanity check of the loaded embeddings: nearest neighbours of a known word.
# The query is a word, not a vector, so the word-based lookup is used.
w2v_model.similar_by_word('hemocentro')

### Adiciona vetores extras

In [None]:
# Register the padding and out-of-vocabulary tokens with constant vectors.
# The vector width is taken from the loaded model instead of being hard-coded,
# so the cell keeps working with embeddings of another dimensionality.
w2v_model.add(
    ['<PAD>', '<OOV>'],
    [[0.1] * w2v_model.vector_size, [0.2] * w2v_model.vector_size],
)

In [None]:
# Check whether the padding token is present in the embedding vocabulary.
print('<PAD>' in w2v_model)

In [None]:
# Diagnostics: vocabulary size, padded sentence length, number of training
# sentences and the vocabulary index of the out-of-vocabulary token.
print(len(w2v_model.vocab))
print(MAX_SENTENCE_LENGTH)
print(len(df_train))
# The original bare expression sat in mid-cell and was never shown; print it.
print(w2v_model.vocab['<OOV>'].index)
print(len(df_train['words']))

In [None]:
# Embedding matrix (one row per vocabulary entry) used to initialise the
# network's embedding layer.
pretrained_weights = w2v_model.vectors
vocab_size, embedding_size = pretrained_weights.shape
# The misspelled name is kept as an alias because other cells still read it.
emdedding_size = embedding_size
print('Result embedding shape:', pretrained_weights.shape)

def word2idx(word):
    """Return the embedding-vocabulary index of ``word``.

    Raises:
        KeyError: If ``word`` is not in the vocabulary.
    """
    vocab_entry = w2v_model.vocab[word]
    return vocab_entry.index
def idx2word(idx):
    """Return the vocabulary word stored at position ``idx``."""
    index_to_word = w2v_model.index2word
    return index_to_word[idx]


def prepare_words(sentences, max_length=None, lookup=None):
    """Encode tokenised sentences as a matrix of vocabulary indices.

    Words the lookup does not know are mapped to the index of ``'<OOV>'``.
    Sentences longer than ``max_length`` are truncated instead of raising an
    ``IndexError``; positions past the end of a shorter sentence stay 0.

    Args:
        sentences: Sized iterable of token lists.
        max_length: Number of columns of the result. Defaults to the
            module-level ``MAX_SENTENCE_LENGTH``.
        lookup: Callable mapping a word to its integer index and raising
            ``KeyError`` for unknown words. Defaults to ``word2idx``.

    Returns:
        An ``int32`` array of shape ``(len(sentences), max_length)``.
    """
    if max_length is None:
        max_length = MAX_SENTENCE_LENGTH
    if lookup is None:
        lookup = word2idx

    oov_index = lookup('<OOV>')
    sentences_x = np.zeros([len(sentences), max_length], dtype=np.int32)
    for i, sentence in enumerate(sentences):
        # zip() stops after max_length items, truncating over-long sentences.
        for t, word in zip(range(max_length), sentence):
            try:
                index = lookup(word)
            except KeyError:
                index = oov_index
            sentences_x[i, t] = index
    return sentences_x

def prepare_tags(tag_sentences, tag2index, max_length=None):
    """Encode tag sequences as a matrix of integer tag ids.

    Sequences longer than ``max_length`` are truncated instead of raising an
    ``IndexError``; positions past the end of a shorter sequence stay 0.

    Args:
        tag_sentences: Sized iterable of tag lists.
        tag2index: Mapping from tag string to integer id.
        max_length: Number of columns of the result. Defaults to the
            module-level ``MAX_SENTENCE_LENGTH``.

    Returns:
        An ``int32`` array of shape ``(len(tag_sentences), max_length)``.

    Raises:
        KeyError: If a tag is missing from ``tag2index``.
    """
    if max_length is None:
        max_length = MAX_SENTENCE_LENGTH

    tags_y = np.zeros([len(tag_sentences), max_length], dtype=np.int32)
    for i, sentence in enumerate(tag_sentences):
        # zip() stops after max_length items, truncating over-long sequences.
        for t, tag in zip(range(max_length), sentence):
            tags_y[i, t] = tag2index[tag]
    return tags_y


In [None]:
# Encode the padded word sequences of each split as index matrices and
# report the shape of every result.
print('\nPreparing the train data for LSTM...')
train_sentences_X = prepare_words(df_train['words'])
print('train_x shape:', np.shape(train_sentences_X))

print('\nPreparing the test data for LSTM...')
test_sentences_X = prepare_words(df_test['words'])
print('test_x shape:', np.shape(test_sentences_X))

print('\nPreparing the validation data for LSTM...')
dev_sentences_X = prepare_words(df_dev['words'])
print('dev_x shape:', np.shape(dev_sentences_X))


# Encode the padded tag sequences of each split as integer id matrices.
print('\nPreparing the train tags for LSTM...')
train_tags_y = prepare_tags(df_train['tags'], tag2id)
print('train_y shape:', train_tags_y.shape)

# The progress messages below said "data" (copied from the word-encoding
# step); they now name what is actually being prepared.
print('\nPreparing the test tags for LSTM...')
test_tags_y = prepare_tags(df_test['tags'], tag2id)
print('test_y shape:', test_tags_y.shape)

print('\nPreparing the validation tags for LSTM...')
dev_tags_y = prepare_tags(df_dev['tags'], tag2id)
print('dev_y shape:', dev_tags_y.shape)

# Blank separator line, then the first encoded sentence and the first encoded
# tag sequence of every split.
print()
print(
    train_sentences_X[0],
    test_sentences_X[0],
    dev_sentences_X[0],
    train_tags_y[0],
    test_tags_y[0],
    dev_tags_y[0],
    sep='\n',
)

def _one_hot(tag_ids):
    """One-hot encode a matrix of tag ids over the full tag inventory."""
    return keras.utils.to_categorical(
        tag_ids, num_classes=len(id2tag), dtype='int32'
    )


cat_train_tags_y = _one_hot(train_tags_y)
cat_test_tags_y = _one_hot(test_tags_y)
cat_dev_tags_y = _one_hot(dev_tags_y)

print(cat_test_tags_y.shape)

## Arquitetura do modelo

In [None]:
# Start from an empty sequential model; layers are added in the next cells.
model = keras.models.Sequential()
# Display the model object as the cell output.
model

### Adiciona camada de embedding

In [None]:
# Embedding layer initialised with the pretrained word vectors: one row per
# vocabulary entry, one input position per token of a padded sentence.
embedding_layer = keras.layers.Embedding(
    len(w2v_model.vocab),
    emdedding_size,
    weights=[pretrained_weights],
    input_length=MAX_SENTENCE_LENGTH,
)
model.add(embedding_layer)

In [None]:
# Number of hidden units per LSTM direction. The original call omitted this
# required argument, so the layer could not be constructed.
# NOTE(review): 256 is a placeholder; confirm the size used to obtain the
# reported results.
LSTM_UNITS = 256

# Bidirectional LSTM emitting one output per token, followed by a per-token
# dense layer with one unit per tag and a softmax over the tags.
model.add(keras.layers.Bidirectional(keras.layers.LSTM(LSTM_UNITS, return_sequences=True)))
model.add(keras.layers.TimeDistributed(keras.layers.Dense(len(tag2id))))
model.add(keras.layers.Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adam(0.001),
              metrics=['accuracy'])

model.summary()


In [None]:
# Log the per-epoch metrics to 'training.log' while fitting on the training
# split and validating on the dev split.
csv_logger = keras.callbacks.CSVLogger('training.log')
model.fit(
    x=train_sentences_X,
    y=cat_train_tags_y,
    batch_size=128,
    epochs=40,
    validation_data=(dev_sentences_X, cat_dev_tags_y),
    callbacks=[csv_logger],
)

In [None]:
# Evaluate the trained model on the test split and print its second metric
# (the one registered as 'accuracy' at compile time) as a percentage.
# NOTE(review): the targets include the '<PAD>' positions and no mask or
# sample weighting is visible here, so padded positions appear to count
# towards this accuracy - confirm before reporting the figure.
scores = model.evaluate(test_sentences_X, cat_test_tags_y)
print(f"{model.metrics_names[1]}: {scores[1] * 100}")   # acc: 99.09751977804825

## Save model

### serialize model to JSON

In [None]:
# Serialize the architecture to the same path the "Load model" cells read
# from (paths['model']), so that saving and loading stay consistent instead
# of writing to the current directory and reading from another one.
model_json = model.to_json()
with open(paths['model'], "w") as json_file:
    json_file.write(model_json)

### serialize weights to HDF5

In [None]:
# Store the weights at the path the "Load model" cells read from
# (paths['model_weights']) rather than in the current directory.
model.save_weights(paths['model_weights'])
print("Saved model to disk")

## Load model

### load json and create model

In [None]:
# Read the serialized architecture; the context manager closes the file even
# if reading fails.
with open(paths['model'], 'r') as json_file:
    loaded_model_json = json_file.read()

# model_from_json is reached through the keras package imported by this
# notebook; the bare name was never imported and raised NameError.
loaded_model = keras.models.model_from_json(loaded_model_json)

### load weights into new model

In [None]:
# Restore the trained weights from paths['model_weights'] into the model
# rebuilt from JSON.
loaded_model.load_weights(paths['model_weights'])
print("Loaded model from disk")

### evaluate loaded model on test data

In [None]:
# Compile the reloaded model with the same loss, optimizer settings and
# metric that the original model was compiled with, so it can be evaluated.
loaded_model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adam(0.001),
              metrics=['accuracy'])

In [None]:
# Evaluate the reloaded model on the held-out test split, as the section
# heading states; the original call scored the training split instead.
score = loaded_model.evaluate(test_sentences_X, cat_test_tags_y, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

In [None]:
# Reload the previously exported index matrices (inputs and tag ids) of
# every split from the text files under ../src.
train_sentences_X, train_tags_y = (
    np.loadtxt('../src/train_x'),
    np.loadtxt('../src/train_y'),
)
test_sentences_X, test_tags_y = (
    np.loadtxt('../src/test_x'),
    np.loadtxt('../src/test_y'),
)
dev_sentences_X, dev_tags_y = (
    np.loadtxt('../src/dev_x'),
    np.loadtxt('../src/dev_y'),
)

In [None]:
# Use one class count for every split. Without an explicit num_classes,
# to_categorical sizes each result from the largest id present in that array,
# so a split lacking the highest tag id would get a narrower one-hot encoding
# than the others.
num_tag_classes = int(max(train_tags_y.max(), test_tags_y.max(), dev_tags_y.max())) + 1

cat_train_tags_y = keras.utils.to_categorical(train_tags_y, num_classes=num_tag_classes)
cat_test_tags_y = keras.utils.to_categorical(test_tags_y, num_classes=num_tag_classes)
cat_dev_tags_y = keras.utils.to_categorical(dev_tags_y, num_classes=num_tag_classes)

In [None]:
# Replace the in-memory model with the one stored in the HDF5 file under
# ../saved_model.
model = keras.models.load_model('../saved_model/keras_model.hdf5')

In [None]:
# Restore variables from the checkpoint '../saved_model/keras_model' into the
# session used by the Keras backend, then print the model summary.
# NOTE(review): tf.train.Saver and keras.backend.get_session look like
# TensorFlow 1.x-style APIs - confirm they exist in the installed versions.
# NOTE(review): load_model in the previous cell presumably already restores
# the weights; verify whether this extra restore is needed.
saver = tf.train.Saver()
sess = keras.backend.get_session()
saver.restore(sess, '../saved_model/keras_model')
model.summary()

In [None]:
# Score the reloaded model on the test, train and dev splits and print the
# second metric of each evaluation as a percentage.
# NOTE(review): these calls pass the integer tag matrices (test_tags_y, ...)
# rather than the one-hot cat_*_tags_y arrays computed earlier in the
# notebook; confirm which target format the loaded model's loss expects.
print('Evaluating model:')
scores = model.evaluate(test_sentences_X, test_tags_y)
print("Test model %s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

scores = model.evaluate(train_sentences_X, train_tags_y)
print("Train model %s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

scores = model.evaluate(dev_sentences_X, dev_tags_y)
print("Dev model %s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

## Referências

1. https://arxiv.org/pdf/1508.01991.pdf
2. http://www.aclweb.org/anthology/Y/Y09/Y09-1013.pdf
3. https://nlpforhackers.io/lstm-pos-tagger-keras/