# Tutorial Part-of-Speech tagging  Con Deep Learning

### En este tutorial, veremos cómo puede usar un modelo simple en Keras, para entrenar y evaluar una red neuronal artificial  BLSTM para problemas de clasificación de múltiples clases.

## PARTE 1  -  Pre-Procesamiento

In [1]:
# Asegurar reproducibilidad

import numpy as np

CUSTOM_SEED = 42
np.random.seed(CUSTOM_SEED)

### Descargamos el Corpus Ancora - Cess_esp del nltk

In [2]:
import nltk
nltk.download('cess_esp')

from nltk.corpus import brown

nltk.download('brown')



[nltk_data] Downloading package cess_esp to /home/daniel/nltk_data...
[nltk_data]   Package cess_esp is already up-to-date!
[nltk_data] Downloading package brown to /home/daniel/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

### Extraemos las oraciones tageadas del Corpus

In [3]:
import random
from nltk.corpus import cess_esp

tagged_sentences1 = brown.tagged_sents()#[:6030]
tagged_sentences = cess_esp.tagged_sents()
#print('a random sentence: \n-> {}'.format(random.choice(sentences)))

### Extraemos los datos de la cantidad de oraciones a ser usadas y un ejemplo de una oracion presente en el corpus

In [4]:
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(cess_esp.tagged_words()))

print(tagged_sentences1[0])
print("Tagged sentences: ", len(tagged_sentences1))
print("Tagged words:", len(brown.tagged_words()))

[('El', 'da0ms0'), ('grupo', 'ncms000'), ('estatal', 'aq0cs0'), ('Electricité_de_France', 'np00000'), ('-Fpa-', 'Fpa'), ('EDF', 'np00000'), ('-Fpt-', 'Fpt'), ('anunció', 'vmis3s0'), ('hoy', 'rg'), (',', 'Fc'), ('jueves', 'W'), (',', 'Fc'), ('la', 'da0fs0'), ('compra', 'ncfs000'), ('del', 'spcms'), ('51_por_ciento', 'Zp'), ('de', 'sps00'), ('la', 'da0fs0'), ('empresa', 'ncfs000'), ('mexicana', 'aq0fs0'), ('Electricidad_Águila_de_Altamira', 'np00000'), ('-Fpa-', 'Fpa'), ('EAA', 'np00000'), ('-Fpt-', 'Fpt'), (',', 'Fc'), ('creada', 'aq0fsp'), ('por', 'sps00'), ('el', 'da0ms0'), ('japonés', 'aq0ms0'), ('Mitsubishi_Corporation', 'np00000'), ('para', 'sps00'), ('poner_en_marcha', 'vmn0000'), ('una', 'di0fs0'), ('central', 'ncfs000'), ('de', 'sps00'), ('gas', 'ncms000'), ('de', 'sps00'), ('495', 'Z'), ('megavatios', 'ncmp000'), ('.', 'Fp')]
Tagged sentences:  6030
Tagged words: 192686
[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VB

### Se procede a Dividir en una lista de Oraciones dividida en lista de palabras y cada palabra con un correspondiente tag en un alista diferente

In [5]:
import numpy as np
import pickle

sentences, tagss = [], []
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    tagss.append(np.array(tags))
    
with open("sentences.txt", "wb") as fp:
    pickle.dump(sentences, fp)

with open("tags.txt", "wb") as fp:
    pickle.dump(tagss, fp)


### Imprimimos una posicion de la lista como ejemplo

In [6]:
print(len(sentences))
print(sentences[6])
print(tagss[6])

6030
['La' 'alcaldesa' 'de' 'Málaga' 'y' 'cabeza' 'de' 'lista' 'del' 'PP' 'al'
 'Congreso' 'por' 'esta' 'provincia' ',' 'Celia_Villalobos' ',' 'pidió'
 'hoy' 'a' 'los' 'militantes' 'de' 'esta' 'formación' 'que' '*0*' 'sepan'
 '"' 'administrar' 'la' 'victoria' '"' ',' 'porque' '"' 'no' 'vale' 'la'
 'revancha' ',' 'el' 'insulto' 'o' 'el' 'ataque' ',' 'eso' 'es' 'para'
 'ellos' '"' '.']
['da0fs0' 'ncfs000' 'sps00' 'np0000l' 'cc' 'ncfs000' 'sps00' 'ncfs000'
 'spcms' 'np0000o' 'spcms' 'np0000o' 'sps00' 'dd0fs0' 'ncfs000' 'Fc'
 'np0000p' 'Fc' 'vmis3s0' 'rg' 'sps00' 'da0mp0' 'nccp000' 'sps00' 'dd0fs0'
 'ncfs000' 'cs' 'sn.e-SUJ' 'vmsp3p0' 'Fe' 'vmn0000' 'da0fs0' 'ncfs000'
 'Fe' 'Fc' 'cs' 'Fe' 'rn' 'vmip3s0' 'da0fs0' 'ncfs000' 'Fc' 'da0ms0'
 'ncms000' 'cc' 'da0ms0' 'ncms000' 'Fc' 'pd0ns000' 'vsip3s0' 'sps00'
 'pp3mp000' 'Fe' 'Fp']


### Dividimos el corpus de la siguiente manera, Utilizamos aproximadamente el 60% de las oraciones etiquetadas para el entrenamiento, el 20% como conjunto de validación y el 20% para evaluar nuestro modelo. Con esto se asegura que nunca  habrá oraciones repetidas.

In [7]:
from sklearn.model_selection import train_test_split
 
(training_sentences, 
 test_sentences, 
 training_tags, 
 test_tags) = train_test_split(sentences, tagss, test_size=0.2)

(train_sentences, 
 eval_sentences, 
 train_tags, 
 eval_tags) = train_test_split(training_sentences, training_tags, test_size=0.25)


with open("train_sentences.txt", "wb") as fp:
    pickle.dump(train_sentences, fp)

with open("eval_sentences.txt", "wb") as fp:
    pickle.dump(tagss, fp)

with open("test_sentences.txt", "wb") as fp:
    pickle.dump(eval_sentences, fp)

with open("train_tags.txt", "wb") as fp:
    pickle.dump(train_tags, fp)

with open("eval_tags.txt", "wb") as fp:
    pickle.dump(eval_tags, fp)

with open("test_tags.txt", "wb") as fp:
    pickle.dump(test_tags, fp)
    

### Imprimimos los tamaños de las listas que nos indicaran el tamaño de filas de las matrices con las que estaremos trabajando

In [8]:
print("training_sentences:" + str(len(training_sentences)))
print("train_sentences: " + str(len(train_sentences)))
print("test_sentences: " + str(len(test_sentences)))
print("eval_sentences: " + str(len(eval_sentences)) + "\n")

print(train_sentences[0])
print(test_sentences[0])
print(eval_sentences[0])

print("\ntraining_tags:" + str(len(training_sentences)))
print("train_tags: " + str(len(train_tags)))
print("test_tags: " + str(len(test_tags)))
print("eval_tags: " + str(len(eval_tags)) + "\n")

print(train_tags[0])
print(test_tags[0])
print(eval_tags[0])

training_sentences:4824
train_sentences: 3618
test_sentences: 1206
eval_sentences: 1206

['*' 'El' 'Madrid' 'precisa' 'que' 'el' 'Deportivo' 'gane' 'la' 'Liga' ','
 'porque' 'los' 'gallegos' 'no' 'son' 'considerados' 'unos' 'herederos'
 ',' 'sino' 'unos' 'entrometidos' 'que' 'se' 'supone' 'temporales' ','
 'que' 'pertenecen' 'a' 'la' 'actualidad' 'más' 'rabiosa' 'y' 'no' 'a'
 'la' 'historia' 'más' 'enrabietada' '.']
['El' 'técnico' 'barcelonista' 'ha' 'asegurado' 'que' 'la' 'visita' 'de'
 'Gaspart' 'ha' 'contribuido' 'a' '"' 'sumar' '"' ',' 'y' '*0*' 'ha'
 'argumentado' 'que' 'el' 'encuentro' 'con' 'el' 'presidente' 'significa'
 'que' 'en' 'el' 'Barcelona' '"' 'todos' 'van' 'en' 'la' 'misma'
 'dirección' '"' '.']
['Lo_suyo' ',' 'lo' 'de' 'las' 'ratas' ',' 'no' 'es' 'la' 'carroña' 'pura'
 'y' 'dura' 'sino' 'la' 'vida' 'regalada' ',' 'el' 'eterno' 'banquete'
 'de' 'sobras' 'y' 'residuos' ',' 'el' 'festín' 'organizado' 'a' 'la'
 'sobra' 'de' 'la' 'abundancia' 'y' 'el' 'hartazgo' '.']

tra

### Ahora creamos una array con todas las palabras y los tags presentes en el corpus, adicionalmente se crea un diccionario que contiene las palabras unicas y los tags unicos de tal forma que no se repitan y que contienen un indice o llave

In [9]:
import numpy as np

words, tagsss = set([]), set([])
 
for s in (train_sentences + eval_sentences + test_sentences):
    for w in s:
        words.add(w.lower())

for ts in (train_tags + eval_tags + test_tags):
    for t in ts:
        tagsss.add(t)

word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
 
tag2index = {t: i + 2 for i, t in enumerate(list(tagsss))}
tag2index['-PAD-'] = 0  # The special value used to padding
tag2index['-OOV-'] = 1  # The special value used to padding

print (len(word2index))
print (len(tag2index))

np.save('word2index.npy', word2index)
np.save('tag2index.npy', tag2index)

24499
291


### Ahora procedemos a transformar cada uno de los conjuntos de oraciones y tags en vectores numericos, modificando la palabra o tag en un Valor numerico que corresponde a una llave en el diccionario de palabras o tags

In [10]:
train_sentences_X, eval_sentences_X, test_sentences_X, train_tags_y, eval_tags_y, test_tags_y = [], [], [], [], [], []

for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    train_sentences_X.append(s_int)

for s in eval_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    eval_sentences_X.append(s_int)

for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    test_sentences_X.append(s_int)

for s in train_tags:
    s_int = []
    for w in s:
        try:
            s_int.append(tag2index[w])
        except KeyError:
            s_int.append(tag2index['-OOV-'])
            
    train_tags_y.append(s_int)

for s in eval_tags:
    s_int = []
    for w in s:
        try:
            s_int.append(tag2index[w])
        except KeyError:
            s_int.append(tag2index['-OOV-'])
            
    eval_tags_y.append(s_int)

for s in test_tags:
    s_int = []
    for w in s:
        try:
            s_int.append(tag2index[w])
        except KeyError:
            s_int.append(tag2index['-OOV-'])
            
    test_tags_y.append(s_int)


### Se imprime la longitud de las matrices y una muestra de cada una de las matrices creadas

In [11]:
print("Longitudes de las Matrices:")
print(len(train_sentences_X))
print(len(eval_sentences_X))
print(len(test_sentences_X))
print(len(train_tags_y))
print(len(eval_tags_y))
print(len(test_tags_y))

print("\nMuestra de Datos presentes en las Matrices con las transformaciones:\n")

print(train_sentences_X[0])
print(eval_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(eval_tags_y[0])
print(test_tags_y[0])

Longitudes de las Matrices:
3618
1206
1206
3618
1206
1206

Muestra de Datos presentes en las Matrices con las transformaciones:

[10717, 19905, 17168, 14314, 6299, 19905, 42, 10497, 8116, 23263, 11555, 7762, 4562, 7609, 22154, 14754, 12636, 9076, 15585, 11555, 22375, 9076, 5187, 6299, 881, 6919, 23979, 11555, 6299, 14141, 4120, 8116, 13902, 9639, 427, 14263, 22154, 4120, 8116, 11579, 9639, 21769, 239]
[16211, 11555, 1828, 14251, 16815, 6244, 11555, 22154, 11342, 8116, 6106, 20916, 14263, 15977, 22375, 8116, 9544, 8545, 11555, 19905, 19146, 664, 14251, 21977, 14263, 6308, 11555, 19905, 4569, 18814, 4120, 8116, 6298, 14251, 8116, 14196, 14263, 19905, 19098, 239]
[19905, 6032, 7161, 19717, 22553, 6299, 8116, 5859, 14251, 10876, 19717, 21111, 4120, 15330, 16891, 15330, 11555, 14263, 8139, 19717, 7101, 6299, 19905, 1305, 8235, 19905, 15731, 6414, 6299, 8835, 19905, 22788, 15330, 250, 11452, 8835, 8116, 1463, 13313, 15330, 239]
[31, 153, 93, 91, 86, 153, 95, 276, 238, 242, 76, 86, 278, 73, 1

### Se calcula cual es la oracion que mayor cantidad de Palabras contiene

In [12]:
MAX_LENGTH1 = len(max(train_sentences_X, key=len))
MAX_LENGTH2 = len(max(eval_sentences_X, key=len))
MAX_LENGTH3 = len(max(test_sentences_X, key=len))

l = [MAX_LENGTH1, MAX_LENGTH2, MAX_LENGTH3]
MAX_LENGTH = max(l)

print(MAX_LENGTH)

149


### Se procede a Normalizar las matrices para que todas contengan el mismo numero de columans, con la longitud maxima de palabras encontradas anteriormente, esto se logra agregando ceros a la derecha en las posiciones que hacen falta en el vector

In [13]:
from keras.preprocessing.sequence import pad_sequences
 
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
eval_sentences_X = pad_sequences(eval_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
eval_tags_y = pad_sequences(eval_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')
 
print(train_sentences_X[0])
#print(eval_sentences_X[0])
#print(test_sentences_X[0])
print(train_tags_y[0])
print(eval_tags_y[0])
print(test_tags_y[0])

np.save('train_sentences_X.npy', train_sentences_X)
np.save('eval_sentences_X.npy', eval_sentences_X)
np.save('test_sentences_X.npy', test_sentences_X)
np.save('train_tags_y.npy', train_tags_y)
np.save('eval_tags_y.npy', eval_tags_y)
np.save('test_tags_y.npy', test_tags_y)

Using TensorFlow backend.


[10717 19905 17168 14314  6299 19905    42 10497  8116 23263 11555  7762
  4562  7609 22154 14754 12636  9076 15585 11555 22375  9076  5187  6299
   881  6919 23979 11555  6299 14141  4120  8116 13902  9639   427 14263
 22154  4120  8116 11579  9639 21769   239     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0]
[ 31 153  93  91  86 153  95 276 238 242  76  86 278  73 125  96 250  97
  73  76 162  97  7

### Definimos la funcion con la cual categorizaremos los tags y los covertiremos un vector One-hot

In [14]:
def to_categoricals(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [15]:
import numpy as np
from keras.utils import to_categorical

def encode(data):
    print('Shape of data (BEFORE encode): %s' % str(data.shape))
    encoded = to_categorical(data)
    print('Shape of data (AFTER  encode): %s\n' % str(encoded.shape))
    return encoded

### Desarrollamos una prueba de la categorización de los tags

In [18]:
cat_train_tags_y = to_categoricals(train_tags_y, len(tag2index))
cat_eval_tags_y  = to_categoricals(eval_tags_y, len(tag2index))
cat_test_tags_y  = to_categoricals(test_tags_y, len(tag2index))

print(cat_train_tags_y[0])
print(len(cat_train_tags_y))
print(len(cat_test_tags_y))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
3618
1206


In [19]:
#FEATURES DEL SEGUNDO MODELO
def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }
 

In [20]:
#DIVISION DEL SEGUNDO CORPUS
from nltk.tag.util import untag


cutoff1 = int(.001 * len(tagged_sentences1))
training_sentences1 = tagged_sentences1[:cutoff1]
test_sentences1 = tagged_sentences1[cutoff1:]
 
def transform_to_dataset(tagged_sentences1):
    X, y = [], []
 
    for tagged in tagged_sentences1:
        X.append([features(untag(tagged), index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
 
    return X, y

X_train1, y_train1 = transform_to_dataset(training_sentences1)
X_test1, y_test1 = transform_to_dataset(test_sentences1)
 
print(len(X_train1))     
print(len(X_test1))         
print(X_train1[0])
print(y_train1[0])


57
57283
[{'word': 'The', 'is_first': True, 'is_last': False, 'is_capitalized': True, 'is_all_caps': False, 'is_all_lower': False, 'prefix-1': 'T', 'prefix-2': 'Th', 'prefix-3': 'The', 'suffix-1': 'e', 'suffix-2': 'he', 'suffix-3': 'The', 'prev_word': '', 'next_word': 'Fulton', 'has_hyphen': False, 'is_numeric': False, 'capitals_inside': False}, {'word': 'Fulton', 'is_first': False, 'is_last': False, 'is_capitalized': True, 'is_all_caps': False, 'is_all_lower': False, 'prefix-1': 'F', 'prefix-2': 'Fu', 'prefix-3': 'Ful', 'suffix-1': 'n', 'suffix-2': 'on', 'suffix-3': 'ton', 'prev_word': 'The', 'next_word': 'County', 'has_hyphen': False, 'is_numeric': False, 'capitals_inside': False}, {'word': 'County', 'is_first': False, 'is_last': False, 'is_capitalized': True, 'is_all_caps': False, 'is_all_lower': False, 'prefix-1': 'C', 'prefix-2': 'Co', 'prefix-3': 'Cou', 'suffix-1': 'y', 'suffix-2': 'ty', 'suffix-3': 'nty', 'prev_word': 'Fulton', 'next_word': 'Grand', 'has_hyphen': False, 'is_nume

In [21]:
pip install sklearn-crfsuite

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


## PARTE 2  -  Entrenamiento

In [22]:
## Funcion que permite forzar el uso de GPU cuando estan presentes

import tensorflow as tf
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

import tensorflow as tf
strategy = tf.distribute.MirroredStrategy()

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6980955582265169704
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 3991473579146994551
physical_device_desc: "device: XLA_CPU device"
]
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


### Definimos el Modelo Base con el cual se procedera a desarrollar la fase de Entrenamiento

El modelo base tiene como entrada una oración de límite máximo de 149, la matriz de embedding  que inicia con el entrenamiento de la red se comporta como una matriz tridimensional de 24500 X 149 X 128, los 128 porque esa fue la ventana de entrenamiento definida. En cada EPOCH  la matriz de embedding se llena con los pesos que se propagan después de calculado el error. Viene una capa que se considera oculta, aplicando la función de máximos, RELU. Después viene la distribución de probabilidad  asignada a los tags que se asignan a la oración de entrada de las 3618 del corpus de entrenamiento. Finalmente, la salida de la distibución se discretiza o se comprime con la función softmax de manera que permita que los valores de distibución quden en el rango [0,1]. la función de pérdida (categorical_crossentropy) se calcula la actualización de los pesos.

In [23]:

!pip3 install git+https://www.github.com/keras-team/keras-contrib.git
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import keras as k
from keras_contrib.layers import CRF

input = Input(shape=(MAX_LENGTH,))
word_embedding_size = 128

# Embedding Layer
model = Embedding(input_dim=len(word2index), output_dim=word_embedding_size, input_length=MAX_LENGTH)(input)

# BI-LSTM Layer
model = Bidirectional(LSTM(units=word_embedding_size, 
                           return_sequences=True, 
                           dropout=0.5, 
                           recurrent_dropout=0.5, 
                           kernel_initializer=k.initializers.he_normal()))(model)
model = LSTM(units=word_embedding_size * 2, 
             return_sequences=True, 
             dropout=0.5, 
             recurrent_dropout=0.5, 
             kernel_initializer=k.initializers.he_normal())(model)

# TimeDistributed Layer
model = TimeDistributed(Dense(len(tag2index), activation="relu"))(model)  

# CRF Layer
crf = CRF(len(tag2index))

out = crf(model)  # output
model = Model(input, out)


#Optimiser 
adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

# Compile model
model.compile(optimizer=adam, loss=crf.loss_function, metrics=[crf.accuracy, 'accuracy'])
model.summary()

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-9pt02igz
  Running command git clone -q https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-9pt02igz
Building wheels for collected packages: keras-contrib
  Building wheel for keras-contrib (setup.py) ... [?25ldone
[?25h  Created wheel for keras-contrib: filename=keras_contrib-2.0.8-py3-none-any.whl size=101065 sha256=ae5395cbc35078b740f2dba31ad103fc47c26371f05c9f80efb5eafbfae3dd37
  Stored in directory: /tmp/pip-ephem-wheel-cache-xww75sal/wheels/bb/1f/f2/b57495012683b6b20bbae94a3915ec79753111452d79886abc
Successfully built keras-contrib
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m




Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 149)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 149, 128)          3135872   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 149, 256)          263168    
_________________________________________________________________
lstm_2 (LSTM)                (None, 149, 256)          525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 149, 291)          74787     
_________________________________________________________________
crf_1 (CRF)                  (None, 149, 291)          170235    
Total params: 4,169,374
Trainable params: 4,169,374
Non-trainable params: 0
_________________________________________________

### Se desarrolla el entrenamiento del modelo, este es el alimentador de la red neuronal, aquíne

In [24]:
#sudo pip install h5py

import os
model_hist = model.fit(train_sentences_X, cat_train_tags_y, batch_size=128, epochs=20, 
          validation_data=(eval_sentences_X, cat_eval_tags_y))

# serialize model to JSON
model_json = model.to_json()
with open("mb-00.json", "w") as json_file:
    json_file.write(model_json)

# serialize weights to HDF5
model.save_weights("mb-00.h5")
print("Saved model to disk")



'\nimport os\nmodel_hist = model.fit(train_sentences_X, cat_train_tags_y, batch_size=128, epochs=20, \n          validation_data=(eval_sentences_X, cat_eval_tags_y))\n\n# serialize model to JSON\nmodel_json = model.to_json()\nwith open("mb-00.json", "w") as json_file:\n    json_file.write(model_json)\n\n# serialize weights to HDF5\nmodel.save_weights("mb-00.h5")\nprint("Saved model to disk")\n\n'

Se carga el modelo que se entreno en el paso anterior

In [25]:
"""
from keras.models import model_from_json
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy


custom_objects={'CRF': CRF,'crf_loss': crf.loss_function,'crf_viterbi_accuracy':crf.accuracy}
# load json and create model
json_file = open('mb-00.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json, custom_objects = custom_objects)
# load weights into new model
model.load_weights("mb-00.h5")
print("Loaded model from disk")
 
# Compile model
model.compile(optimizer=adam, loss=crf.loss_function, metrics=[crf.accuracy, 'accuracy'])
model.summary()
"""

Loaded model from disk
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 149)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 149, 128)          3135872   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 149, 256)          263168    
_________________________________________________________________
lstm_2 (LSTM)                (None, 149, 256)          525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 149, 291)          74787     
_________________________________________________________________
crf_1 (CRF)                  (None, 149, 291)          170235    
Total params: 4,169,374
Trainable params: 4,169,374
Non-trainable params: 0
__________________________

In [26]:
#DEFINICION DEL SEGUNDO MODELO
from sklearn_crfsuite import CRF

model1 = CRF()

In [27]:
#ENTRENAMIENTO DEL SEGUNDO MODELO
model1.fit(X_train1, y_train1)



CRF(keep_tempfiles=None)

In [28]:
#ENTRENAMIENTO DEL SEGUNDO MODELO
def pos_tag1(sentence):
    sentence_features = [features(sentence, index) for index in range(len(sentence))]
    return list(zip(sentence, model1.predict([sentence_features])[0]))

## PARTE 3  -  Evaluación de los Modelos

In [29]:
#EVALUACION DEL SEGUNDO MODELO
from sklearn_crfsuite import metrics
 
y_pred1 = model1.predict(X_test1)
print(metrics.flat_accuracy_score(y_test1, y_pred1))

0.6912120976989197


### Evaluamos el modelo y calculamos el valor de precision con respecto a los datos de prueba

In [30]:
scores = model.evaluate(test_sentences_X, cat_test_tags_y)
print(f"{model.metrics_names[1]}: {scores[1] * 100}")   # acc: 97.66269326210022

crf_viterbi_accuracy: 77.75596380233765


### Definimos la funcion que nos servira para graficar el comportamiento del modelo en cada epoca del entrenamiento

In [31]:
import matplotlib.pyplot as plt

def plot_model_performance(train_loss, train_acc, train_val_loss, train_val_acc):
    """ Plot model loss and accuracy through epochs. """
    blue= '#34495E'
    green = '#2ECC71'
    orange = '#E23B13'
    
    # plot model loss
    fig, (ax1, ax2) = plt.subplots(2, figsize=(10, 8))
    ax1.plot(range(1, len(train_loss) + 1), train_loss, blue, linewidth=5, label='training')
    ax1.plot(range(1, len(train_val_loss) + 1), train_val_loss, green, linewidth=5, label='validation')
    ax1.set_xlabel('# epoch')
    ax1.set_ylabel('loss')
    ax1.tick_params('y')
    ax1.legend(loc='upper right', shadow=False)
    ax1.set_title('Model loss through #epochs', color=orange, fontweight='bold')
    
    # plot model accuracy
    ax2.plot(range(1, len(train_acc) + 1), train_acc, blue, linewidth=5, label='training')
    ax2.plot(range(1, len(train_val_acc) + 1), train_val_acc, green, linewidth=5, label='validation')
    ax2.set_xlabel('# epoch')
    ax2.set_ylabel('accuracy')
    ax2.tick_params('y')
    ax2.legend(loc='lower right', shadow=False)
    ax2.set_title('Model accuracy through #epochs', color=orange, fontweight='bold')
    
    fig.savefig('training-mb-00.png', bbox_inches='tight')

### Procedemos a Graficar el comportamiento del Entrenamiento, tanto del conjunto de entrenamiento como el de validación con respecto a la cantidad de epocas

In [32]:

plot_model_performance(
    train_loss=model_hist.history.get('loss', []),
    train_acc=model_hist.history.get('acc', []),
    train_val_loss=model_hist.history.get('val_loss', []),
    train_val_acc=model_hist.history.get('val_acc', [])
)


"\nplot_model_performance(\n    train_loss=model_hist.history.get('loss', []),\n    train_acc=model_hist.history.get('acc', []),\n    train_val_loss=model_hist.history.get('val_loss', []),\n    train_val_acc=model_hist.history.get('val_acc', [])\n)\n"

### Función que Permite convertir Indices en Tags

In [33]:
def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])
 
        token_sequences.append(token_sequence)
 
    return token_sequences

### Hacemos la prediccion sobre el conjunto de pruebas. De la distribución probabilítica a etiquetas. 

In [34]:
import pandas as pd

prediction = model.predict(test_sentences_X)
log_tokens = logits_to_tokens(prediction, {i: t for t, i in tag2index.items()})

print(log_tokens[0])

['vaip2s0', 'aq0ms0', 'sn.e-SUJ', 'sn.e-CD', 'pi0ms000', 'vmp00sf', 'sn.e-CD', 'sn.e-CD', 'sn.e-CD', 'sn.e-CD', 'sn.e-CD', 'vmp00sf', 'dd0cs0', 'vsn0000', 'vmif3p0', 'vmif3p0', 'vmif3p0', 'pt0mp000', 'sn.e-CD', 'sn.e-CD', 'vmif3p0', 'pp3mpa00', 'vmp00sf', 'sn.e-CD', 'nccp000', 'vmp00sf', 'aq0ms0', 'dp1css', 'vmii2s0', 'vmif3p0', 'vaip2s0', 'vmic3s0', 'vmif3p0', 'pp3mpa00', 'vmii2s0', 'vmif3p0', 'sn.e-CD', 'sn.e-CD', 'dd0cs0', 'vmif3p0', 'sn.e-SUJ', 'pp3fsa00', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PA

### Hallamos los valores de F1 score, recall, precision

In [35]:
from sklearn.metrics import classification_report, confusion_matrix

results = pd.DataFrame(columns=['Expected', 'Predicted'])
k = 0
for i, lista_etiquetas_oracion in enumerate(test_tags):
    for j, etiquetas in enumerate(lista_etiquetas_oracion):
        k = k + 1
        results.loc[k, 'Expected'] = etiquetas
        results.loc[k, 'Predicted'] = log_tokens[i][j]

# print(results)


print('\nclassification_report:\n', classification_report(results['Expected'], results['Predicted']))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



classification_report:
               precision    recall  f1-score   support

         Faa       0.00      0.00      0.00         2
         Fat       0.00      0.00      0.00         5
          Fc       0.00      0.00      0.00      2291
          Fd       0.00      0.00      0.00        87
          Fe       0.00      0.00      0.00       631
          Fg       0.00      0.00      0.00       226
          Fh       0.00      0.00      0.00         3
         Fia       0.00      0.00      0.00         6
         Fit       0.00      0.00      0.00        19
          Fp       0.00      0.00      0.00      1178
         Fpa       0.00      0.00      0.00       156
         Fpt       0.00      0.00      0.00       160
          Fs       0.00      0.00      0.00        13
          Fx       0.00      0.00      0.00        41
          Fz       0.00      0.00      0.00         2
           W       0.00      0.00      0.00       194
           Z       0.00      0.00      0.00       320
  

## PARTE 4  -  Testing


### Se crea una funcion que convierte el texto en una entrada para el Modelo, se genera vectores de enteros de la oracion y  ejecuta la predicion con la Entrada del modelo entrenado y el modelo de la red neuronal predice un matriz de 149 X 291 por cada oración. El shape de a predicción es (2, 149,291)

In [36]:
def postagFun(test_sample):
    test_samples_X = []
    for s in test_sample:
        s_int = []
        for w in s:
            try:
                s_int.append(word2index[w.lower()])
            except KeyError:
                s_int.append(word2index['-OOV-'])
        test_samples_X.append(s_int)

    test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')
    
    
    predictions = model.predict(test_samples_X)
    #print(predictions, predictions.shape)
    
    log_tokens = logits_to_tokens(predictions, {i: t for t, i in tag2index.items()})
    return log_tokens

### Presentación de los Resultados

In [37]:
#######################MODULES######################
import sentencepiece as spm

from tkinter import *
from tkinter.ttk import *

#######################MODELS#######################
# train sentencepiece model from `botchan.txt` and makes `m.model` and `m.vocab`
# `m.vocab` is just a reference. not used in the segmentation.
spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --user_defined_symbols=<sep>,<cls> --vocab_size=2000')

spm.SentencePieceTrainer.train('--input=tweets_clean.txt --model_prefix=m_word --model_type=word --user_defined_symbols=<sep>,<cls> --vocab_size=2000')

True

In [42]:
###################################################
def tokenizer(post=False):
    select = combo.get()
    text = inputText.get("1.0",END)
    sp = spm.SentencePieceProcessor()  
    language = ''

    if(select=="Palabras (Español)"):
        sp.load('m_word.model')
        language = 'es'
    else:
        sp.load('m.model')
        
        sp.piece_to_id('<sep>')
        sp.piece_to_id('<cls>')
        language = 'en'

        
    res = sp.encode_as_pieces(text)
    
    if(post):return res,language
    else:
        res_postaggin=""

        for i in range(0,len(res)):
            res_postaggin = res_postaggin + res[i] + "\n"

        outputText.configure(state='normal')
        outputText.delete('1.0', END)
        outputText.insert("insert", res_postaggin)
        outputText.configure(state='disabled')

###################################################
def postagging():
        
    res,language = tokenizer(post=True)
    res1 = []    
        
    for i in range (0,len(res)):
        if(res[i][0]=='▁' and len(res[i])>1):res1.append(res[i][1:])
        else: res1.append(res[i])
        
    if(language=='es'):
        postag = postagFun([res1])
    else: postag = pos_tag1(res1)
    res_postaggin=""
    
    
    for i in range(0,len(res)):
        if(language=='es'):res_postaggin = res_postaggin + res1[i] +" | "+ postag[0][i] + "\n"
        else: res_postaggin = res_postaggin + postag[i][0] +" | "+ postag[i][1] + "\n"
    
    outputText.configure(state='normal')
    outputText.delete('1.0', END)
    outputText.insert("insert", res_postaggin)
    outputText.configure(state='disabled')


In [44]:
#######################GUI#########################
window = Tk()

window.configure(background='black')

window.title("Procesamiento del Lenguaje Natural")

window.geometry('500x500')

label= Label( text = 'Postagging-Procesamiento del Lenguaje Natural',  background = "white",
            font = "Helvetica 16 bold italic")
label.grid(column=0, row=0,pady=(30, 10))

btn = Button(window, text="Tokenizer",command=tokenizer)
btn1 = Button(window, text="Postagging",command=postagging)
btn.grid(column=0, row=3,pady=(10, 10),padx=(0,130))
btn1.grid(column=0, row=3,padx=(130,0))

combo = Combobox(window)
combo['values']= ("Personalizado (Inglés) ","Palabras (Español)")
combo.current(1) #set the selected item
combo.grid(column=0, row=1)

inputText = Text(window,height=9, width=61)
inputText.grid(column=0, row=5,padx=(2,0))

outputText = Text(window,height=10, width=61, state='disabled')
outputText.grid(column=0, row=6,padx=(2,0))

window.mainloop()

[['aq0ms0', 'vmp00sf', 'vmii2s0', 'pp3fsa00', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', 