# Trabalho Prático 2
## Processamento de Linguagem Natural - 2018/2

### Bernardo de Almeida Abreu - 2018718155

In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import re
import gensim
import nltk
import keras
import matplotlib.pyplot as plt


Using TensorFlow backend.


In [2]:
paths = {
    'train': '../macmorpho-v3/macmorpho-train.txt',
    'test': '../macmorpho-v3/macmorpho-test.txt',
    'dev': '../macmorpho-v3/macmorpho-dev.txt',
    'word2vec': '../data/skip_s100.txt'
}

## Embedding - Word2Vec

In [3]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(paths['word2vec'])

In [36]:
w2v_model.similar_by_vector('hemocentro')

IndexError: index 929606 is out of bounds for axis 0 with size 929606

### Adiciona vetores extras

In [35]:
w2v_model.add(['<PAD>','<OOV>'], [[0.1]*100,[0.2]*100])

In [37]:
print('<PAD>' in w2v_model)

True


## Leitura do texto

In [6]:
def read_text(filename):
    with open(filename, 'r') as f:
        return f.readlines()

In [66]:
train_text = read_text(paths['train'])
test_text = read_text(paths['test'])
dev_text = read_text(paths['dev'])

print(train_text[0])
print(test_text[0])
print(dev_text[0])

Jersei_N atinge_V média_N de_PREP Cr$_CUR 1,4_NUM milhão_N na_PREP+ART venda_N da_PREP+ART Pinhal_NPROP em_PREP São_NPROP Paulo_NPROP ._PU

Salto_N sete_ADJ

Ainda_ADV em_PREP dezembro_N de_PREP 1990_N ,_PU foi_V editada_PCP a_ART famosa_ADJ 289_N ,_PU que_PRO-KS modificava_V a_ART sistemática_N da_PREP+ART arrecadação_N do_PREP+ART ITR_NPROP e_KC alterava_V suas_PROADJ alíquotas_N ._PU



### Separação de palavras e tags

In [65]:
def split_word_tags(text):
    word_lines = []
    tag_lines = []
    for line in text:
        words, tags = zip(*[tagged_word.split('_') for tagged_word in line.split()])
        word_lines.append([w.lower() if w.lower() in w2v_model.vocab else '<OOV>' for w in words])
        tag_lines.append(list(tags))
    return word_lines, tag_lines

In [67]:
train_words, train_tags = split_word_tags(train_text)
print(train_words[0])
print(train_tags[0])

test_words, test_tags = split_word_tags(test_text)
dev_words, dev_tags = split_word_tags(dev_text)


['jersei', 'atinge', 'média', 'de', 'cr$', '<OOV>', 'milhão', 'na', 'venda', 'da', 'pinhal', 'em', 'são', 'paulo', '.']
['N', 'V', 'N', 'PREP', 'CUR', 'NUM', 'N', 'PREP+ART', 'N', 'PREP+ART', 'NPROP', 'PREP', 'NPROP', 'NPROP', 'PU']


In [68]:
def flat_list(l):
    return [item for sublist in l for item in sublist]
len(set(flat_list(train_words)))

id2tag = list(set(flat_list(train_tags)).union(set(flat_list(test_tags))).union(set(flat_list(dev_tags))))
tag2id = {}
for i, tag in enumerate(all_tags):
    tag2id[tag] = i
print(tag2id)
print(id2tag)

{'PU': 0, '<PAD>': 1, 'PDEN': 2, 'KS': 3, 'ADJ': 4, 'ADV': 5, 'NUM': 6, 'PREP+PROPESS': 7, 'KC': 8, 'PREP+PRO-KS': 9, 'PREP+PROSUB': 10, 'ADV-KS': 11, 'IN': 12, 'CUR': 13, 'PCP': 14, 'PREP+PROADJ': 15, 'PROPESS': 16, 'V': 17, 'N': 18, 'PRO-KS': 19, 'PREP+ART': 20, 'ART': 21, 'PREP+ADV': 22, 'PROSUB': 23, 'NPROP': 24, 'PREP': 25, 'PROADJ': 26}
['PU', 'PDEN', 'KS', 'ADJ', 'ADV', 'NUM', 'PREP+PROPESS', 'KC', 'PREP+PRO-KS', 'PREP+PROSUB', 'ADV-KS', 'IN', 'CUR', 'PCP', 'PREP+PROADJ', 'PROPESS', 'V', 'N', 'PRO-KS', 'PREP+ART', 'ART', 'PREP+ADV', 'PROSUB', 'NPROP', 'PREP', 'PROADJ']


## Pad the words

### Analyse sentence size distribution

In [11]:
df_train = pd.DataFrame(columns=['words', 'tags'])
df_test = pd.DataFrame(columns=['words', 'tags'])
df_dev = pd.DataFrame(columns=['words', 'tags'])

df_train['words'] = train_words
df_train['tags'] = train_tags

df_test['words'] = test_words
df_test['tags'] = test_tags

df_dev['words'] = dev_words
df_dev['tags'] = dev_tags



df_sentences = pd.concat([df_train, df_test, df_dev], axis=0)

df_sentences.shape

(49932, 2)

In [12]:
df_sentences['words'].map(len).describe()

count    49932.000000
mean        18.940779
std         12.070051
min          1.000000
25%         10.000000
50%         17.000000
75%         25.000000
max        248.000000
Name: words, dtype: float64

In [38]:
df_sentences['words'].map(len).hist()
plt.show()

<IPython.core.display.Javascript object>

In [25]:
MAX_SENTENCE_LENGTH = int(df_sentences['words'].map(len).describe()['75%'])
MAX_SENTENCE_LENGTH

25

In [27]:
def fill_sentence(sentence):
    tokens_to_fill = int(MAX_SENTENCE_LENGTH - len(sentence))
    
#     sentence.append('<END>')
    sentence.extend(['<PAD>']*tokens_to_fill)
    
    return sentence[:MAX_SENTENCE_LENGTH]

In [28]:
df_train["words"] = df_train["words"].map(fill_sentence)
df_train["tags"] = df_train["tags"].map(fill_sentence)

df_test["words"] = df_test["words"].map(fill_sentence)
df_test["tags"] = df_test["tags"].map(fill_sentence)

df_dev["words"] = df_dev["words"].map(fill_sentence)
df_dev["tags"] = df_dev["tags"].map(fill_sentence)

In [29]:
df_test.head()

Unnamed: 0,words,tags
0,"[salto, sete, <PAD>, <PAD>, <PAD>, <PAD>, <PAD...","[N, ADJ, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>, <P..."
1,"[o, grande, assunto, da, semana, em, nova, yor...","[ART, ADJ, N, PREP+ART, N, PREP, NPROP, NPROP,..."
2,"[número, duplo, especial, ,, é, inteirinho, de...","[N, ADJ, ADJ, PU, V, ADJ, PCP, PREP, N, PREP, ..."
3,"[a, endiabrada, editora, tina, brown, ex, da, ...","[ART, PCP, N, NPROP, NPROP, N, PREP+ART, PU, N..."
4,"[além, das, fotos, de, richard, avedon, ., <PA...","[PREP, PREP+ART, N, PREP, NPROP, NPROP, PU, <P..."


In [43]:
df_dev['words'].map(len).describe()

count    1997.0
mean       25.0
std         0.0
min        25.0
25%        25.0
50%        25.0
75%        25.0
max        25.0
Name: words, dtype: float64

## Arquitetura do modelo

In [31]:
model = keras.models.Sequential()
model

<keras.engine.sequential.Sequential at 0x7f0ab4738ac8>

### Adiciona camada de embedding

In [39]:
print(len(w2v_model.vocab))
print(MAX_SENTENCE_LENGTH)
print(len(df_train))

929608
25
37948


In [46]:
pretrained_weights = w2v_model.vectors
vocab_size, emdedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)
print('Checking similar words:')
for word in ['modelo', 'rede', 'treino', 'aprendizado']:
    most_similar = ', '.join('%s (%.2f)' % (similar, dist) for similar, dist in w2v_model.most_similar(word)[:8])
    print('  %s -> %s' % (word, most_similar))

def word2idx(word):
    return w2v_model.vocab[word].index
def idx2word(idx):
    return w2v_model.index2word[idx]

print('\nPreparing the data for LSTM...')
train_x = np.zeros([len(df_train), max_sentence_length], dtype=np.int32)
train_y = np.zeros([len(df_train)], dtype=np.int32)

for i, sentence in enumerate(df_train['words']):
    for t, word in enumerate(sentence):
        train_x[i, t] = word2idx(word)
    train_y[i] = word2idx(sentence[-1])
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)

Result embedding shape: (929608, 100)
Checking similar words:
  modelo -> protótipo (0.78), conceito (0.75), padrão (0.75), propulsor (0.74), paradigma (0.73), layout (0.73), prototipo (0.72), monovolume (0.72)
  rede -> emissora (0.73), repetidora (0.72), tv (0.72), retransmissora (0.70), operadora (0.70), rádio (0.70), televisão (0.67), transmundial (0.67)
  treino -> treinamento (0.75), técnico-tático (0.68), técnico/tático (0.67), treinos (0.67), voo (0.66), vôo (0.63), rachão (0.62), trenamento (0.60)
  aprendizado -> aprimoramento (0.77), auto-conhecimento (0.74), autoconhecimento (0.71), auto-aperfeiçoamento (0.70), refinamento (0.68), amadurecimento (0.68), autocontrole (0.67), letramento (0.66)

Preparing the data for LSTM...
train_x shape: (37948, 25)
train_y shape: (37948,)


In [45]:
# model.add(keras.layers.InputLayer(input_shape=(max_sentence_length, )))

In [47]:
model.add(
    keras.layers.Embedding(
        input_dim=len(w2v_model.vocab),
        output_dim=emdedding_size,
        input_length=max_sentence_length,
        weights=[pretrained_weights]
    )
)

In [48]:
model.add(keras.layers.Bidirectional(keras.layers.LSTM(256, return_sequences=True)))
model.add(keras.layers.TimeDistributed(keras.layers.Dense(len(tag2index))))
model.add(keras.layers.Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])
 
model.summary()


NameError: name 'tag2index' is not defined