# Neural Network for Spanish Named Entity Recognition  

Jupyter Notebook based on: **Kamal Raj** NER with Bidirectional LSTM-CNNs implementation available on Github. https://github.com/kamalkraj/Named-Entity-Recognition-with-Bidirectional-LSTM-CNNs.


**Versión: -v_2.2-**

Notas de version:
    
    
    - Se entrena ulilizando el set milenio noticias.
   
 

Entrenamiento realizado en:

    DESKTOP-0UQLV13
    Processor: Intel Core i7-6700HQ CPU 2.6GHz 
    RAM: 16GB
    OS: Windows 10 Home Single x64
    Tipo de memoria: SSD

    

Requiere:

    unidecode
    numpy (pip install --upgrade numpy)
    nltk (pip install --upgrade nltk)
    * Descargar nltk punkt y nltk stopwords:
    * >> import nltk 
    * >> nltk.download('stopwords')
    * >> nltk.download('punkt')
    * Para más información: https://www.nltk.org/data.html 
    random
    tensorflow 1.13.1 (pip install --upgrade tensorflow) *Actualmente (10/abril/19) no funciona con python 3.7.
    * Para más información: https://github.com/jeffheaton/t81_558_deep_learning/blob/master/t81_558_class01_intro_python.ipynb 
    keras (pip install --upgrade keras) 








NER task can be formulated as: 

_Given a sequence of tokens (words, and may be punctuation symbols) provide a tag from predefined set of tags for each token in the sequence._

For NER task there are some common types of entities which essentially are tags:
- Persons
- Locations
- Organizations
- Expressions of time
- Quantities
- Monetary values 

Furthermore, to distinguish consequent entities with the same tags BIO tagging scheme is used. "B" stands for beginning, 
"I" stands for the continuation of an entity and "O" means the absence of entity. Example with dropped punctuation:

    Bernhard        B-PER
    Riemann         I-PER
    Carl            B-PER
    Friedrich       I-PER
    Gauss           I-PER
    and             O
    Leonhard        B-PER
    Euler           I-PER

In the example above PER means person tag, and "B-" and "I-" are prefixes identifying beginnings and continuations of the entities. Without such prefixes, it is impossible to separate Bernhard Riemann from Carl Friedrich Gauss.




In [1]:
# Np for math 
# Keras for models, layers 4 NN layers 
import numpy as np
from keras.models import Model
from keras.layers import TimeDistributed,Conv1D,Dense,Embedding,Input,Dropout,LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate
from keras.utils import Progbar
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform
import unidecode
import string 

Using TensorFlow backend.


In [2]:
# Read file (txt) and divide the sentences into character bins (word, tag).
def readfile(filename):
    '''
    read file
    return format :
    [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ]
    '''
    f = open(filename, encoding='utf-8-sig') # open the file. Update to fix 'ï»¿'
    sentences = []
    sentence = []
    for line in f:
        if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\n":
            if len(sentence) > 0:     
                sentences.append(sentence)
                sentence = []
            continue
        splits = line.split('\t')
        #splits[0] = unidecode.unidecode(splits[0]) # Remove special characters from spanish
        #splits[0] = splits[0].lower() # Lowercase the words 
        #splits[0] = splits[0].translate(str.maketrans('', '', string.punctuation)) # remove puntuation 
        splits[-1] = splits[-1].replace('\n', '').replace('\r', '') #Remove all line breaks from a long string of text
        if splits[0] != '':
            sentence.append([splits[0],splits[-1]])

    if len(sentence) >0: 
        sentences.append(sentence)
        sentence = []
    return sentences

In [3]:
# Read the 3 sets ************************************************* PATH ************************************************
# Dataset Milenio noticias, basado en el repositorio de noticias, disponible en el share drive de Rich It
trainSentences = readfile("tidy_data/milenio_train.txt")
devSentences = readfile("tidy_data/milenio_dev.txt")
testSentences = readfile("tidy_data/milenio_test.txt")

In [4]:
#print(len(devSentences))

In [5]:
#devSentences[0]

In [6]:
#devSentences[0]

In [7]:
#testSentences[0]

In [4]:
# Create new attribute in the character bins for padding
def addCharInformatioin(Sentences):
    for i,sentence in enumerate(Sentences):
        for j,data in enumerate(sentence):
            chars = [c for c in data[0]]
            Sentences[i][j] = [data[0],chars,data[1]]
    return Sentences

In [5]:
trainSentences = addCharInformatioin(trainSentences)
devSentences = addCharInformatioin(devSentences)
testSentences = addCharInformatioin(testSentences)

In [6]:
trainSentences[0]

[['Durante', ['D', 'u', 'r', 'a', 'n', 't', 'e'], 'O'],
 ['la', ['l', 'a'], 'O'],
 ['jornada', ['j', 'o', 'r', 'n', 'a', 'd', 'a'], 'O'],
 ['de', ['d', 'e'], 'O'],
 ['la', ['l', 'a'], 'O'],
 ['Liga', ['L', 'i', 'g', 'a'], 'B-MISC'],
 ['MX', ['M', 'X'], 'I-MISC'],
 [',', [','], 'O'],
 ['León', ['L', 'e', 'ó', 'n'], 'B-PER'],
 ['goleó', ['g', 'o', 'l', 'e', 'ó'], 'O'],
 ['3', ['3'], 'O'],
 ['-', ['-'], 'O'],
 ['0', ['0'], 'O'],
 ['a', ['a'], 'O'],
 ['Dorados', ['D', 'o', 'r', 'a', 'd', 'o', 's'], 'B-PER'],
 ['de', ['d', 'e'], 'I-PER'],
 ['Sinaloa', ['S', 'i', 'n', 'a', 'l', 'o', 'a'], 'I-PER'],
 ['.', ['.'], 'O'],
 ['Con', ['C', 'o', 'n'], 'O'],
 ['doblete', ['d', 'o', 'b', 'l', 'e', 't', 'e'], 'O'],
 ['de', ['d', 'e'], 'O'],
 ['y', ['y'], 'O'],
 ['un', ['u', 'n'], 'O'],
 ['tanto', ['t', 'a', 'n', 't', 'o'], 'O'],
 ['de', ['d', 'e'], 'O'],
 ['Carlos', ['C', 'a', 'r', 'l', 'o', 's'], 'B-PER'],
 ['Peña', ['P', 'e', 'ñ', 'a'], 'I-PER'],
 ['los', ['l', 'o', 's'], 'O'],
 ['felinos', ['f', 'e'

In [7]:
devSentences[0]

[['Mauro', ['M', 'a', 'u', 'r', 'o'], 'B-PER'],
 ['llegaron', ['l', 'l', 'e', 'g', 'a', 'r', 'o', 'n'], 'O'],
 ['el', ['e', 'l'], 'O'],
 ['.', ['.'], 'O'],
 ['el', ['e', 'l'], 'O'],
 ['de', ['d', 'e'], 'O'],
 ['el', ['e', 'l'], 'O'],
 ['en', ['e', 'n'], 'O'],
 [',', [','], 'O'],
 ['de', ['d', 'e'], 'O'],
 ['mandarlo', ['m', 'a', 'n', 'd', 'a', 'r', 'l', 'o'], 'O'],
 ['Luis', ['L', 'u', 'i', 's'], 'B-PER'],
 ['local', ['l', 'o', 'c', 'a', 'l'], 'O'],
 ['el', ['e', 'l'], 'O'],
 ['el', ['e', 'l'], 'O'],
 ['esférico', ['e', 's', 'f', 'é', 'r', 'i', 'c', 'o'], 'O'],
 ['Minutos', ['M', 'i', 'n', 'u', 't', 'o', 's'], 'O'],
 ['que', ['q', 'u', 'e'], 'O'],
 ['un', ['u', 'n'], 'O'],
 ['cobertura', ['c', 'o', 'b', 'e', 'r', 't', 'u', 'r', 'a'], 'O'],
 ['parte', ['p', 'a', 'r', 't', 'e'], 'O'],
 ['.', ['.'], 'O'],
 ['tiempo', ['t', 'i', 'e', 'm', 'p', 'o'], 'O'],
 ['notar', ['n', 'o', 't', 'a', 'r'], 'O'],
 ['culichis', ['c', 'u', 'l', 'i', 'c', 'h', 'i', 's'], 'O'],
 ['la', ['l', 'a'], 'O'],
 ['e

In [8]:
testSentences[0]

[['última', ['ú', 'l', 't', 'i', 'm', 'a'], 'O'],
 ['Boselli', ['B', 'o', 's', 'e', 'l', 'l', 'i'], 'I-PER'],
 ['momento', ['m', 'o', 'm', 'e', 'n', 't', 'o'], 'O'],
 [',', [','], 'O'],
 ['solo', ['s', 'o', 'l', 'o'], 'O'],
 ['fondo', ['f', 'o', 'n', 'd', 'o'], 'O'],
 [',', [','], 'O'],
 [',', [','], 'O'],
 ['con', ['c', 'o', 'n'], 'O'],
 ['ya', ['y', 'a'], 'O'],
 [',', [','], 'O'],
 ['haría', ['h', 'a', 'r', 'í', 'a'], 'O'],
 ['para', ['p', 'a', 'r', 'a'], 'O'],
 ['de', ['d', 'e'], 'O'],
 ['la', ['l', 'a'], 'O'],
 ['para', ['p', 'a', 'r', 'a'], 'O'],
 ['Antonio', ['A', 'n', 't', 'o', 'n', 'i', 'o'], 'I-PER'],
 ['.', ['.'], 'O'],
 ['su', ['s', 'u'], 'O'],
 ['pues', ['p', 'u', 'e', 's'], 'O'],
 ['los', ['l', 'o', 's'], 'O'],
 ['podían', ['p', 'o', 'd', 'í', 'a', 'n'], 'O'],
 ['de', ['d', 'e'], 'O'],
 ['área', ['á', 'r', 'e', 'a'], 'O'],
 ['por', ['p', 'o', 'r'], 'O'],
 ['de', ['d', 'e'], 'O'],
 ['Boselli', ['B', 'o', 's', 'e', 'l', 'l', 'i'], 'I-PER'],
 ['donde', ['d', 'o', 'n', 'd', 'e

In [9]:
# 1.Creates the label set ( tag's set)
# 2.Creates a set with the lowercased words contained in the train,dev,test sets 
labelSet = set()
words = {}

for dataset in [trainSentences, devSentences, testSentences]:
    for sentence in dataset:
        for token,char,label in sentence:
            labelSet.add(label)
            words[token.lower()] = True

In [10]:
labelSet

{'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'}

In [11]:
words

{'durante': True,
 'la': True,
 'jornada': True,
 'de': True,
 'liga': True,
 'mx': True,
 ',': True,
 'león': True,
 'goleó': True,
 '3': True,
 '-': True,
 '0': True,
 'a': True,
 'dorados': True,
 'sinaloa': True,
 '.': True,
 'con': True,
 'doblete': True,
 'y': True,
 'un': True,
 'tanto': True,
 'carlos': True,
 'peña': True,
 'los': True,
 'felinos': True,
 '30': True,
 'puntos': True,
 'colocándose': True,
 'en': True,
 'segunda': True,
 'posición': True,
 'hasta': True,
 'transcurrían': True,
 'primeros': True,
 'minutos': True,
 'del': True,
 'encuentro': True,
 'al': True,
 'minuto': True,
 '10': True,
 'caería': True,
 'primer': True,
 'donde': True,
 'tiro': True,
 'gol': True,
 'que': True,
 'fue': True,
 'rechazado': True,
 'por': True,
 'portero': True,
 'las': True,
 'piernas': True,
 'boselli': True,
 'quien': True,
 'remataría': True,
 'pierna': True,
 'derecha': True,
 'para': True,
 'redes': True,
 'michel': True,
 'presión': True,
 'equipo': True,
 'se': True,
 'h

In [12]:
# Gives the labels a numerical id.
# :: Create a mapping for the labels ::
label2Idx = {}
for label in labelSet:
    label2Idx[label] = len(label2Idx)

In [13]:
label2Idx

{'B-LOC': 0,
 'I-PER': 1,
 'I-MISC': 2,
 'B-ORG': 3,
 'B-MISC': 4,
 'I-ORG': 5,
 'I-LOC': 6,
 'O': 7,
 'B-PER': 8}

In [14]:
# Look up table
# :: Hard coded case lookup ::
case2Idx = {'numeric': 0, 'allLower':1, 'allUpper':2, 'initialUpper':3, 'other':4, 'mainly_numeric':5, 'contains_digit': 6, 'PADDING_TOKEN':7}
caseEmbeddings = np.identity(len(case2Idx), dtype='float32')

In [15]:
case2Idx

{'numeric': 0,
 'allLower': 1,
 'allUpper': 2,
 'initialUpper': 3,
 'other': 4,
 'mainly_numeric': 5,
 'contains_digit': 6,
 'PADDING_TOKEN': 7}

In [16]:
caseEmbeddings

array([[1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)

In [17]:
# :: Read in word embeddings ::
word2Idx = {}
wordEmbeddings = []
# *********************************************************************************************** PATH *************************************************
# GloVe embeddings from SBWC
# https://github.com/uchile-nlp/spanish-word-embeddings

In [18]:
#* Hace los wordEmbedings en base a la lista de embedings + revisa si la palabra en embeddings esta contenido en la lista 
# de palabras ** Nota: Remember that the words are seen as vectors.
with open("word_embeddings/SBW-vectors-300-min5.txt", encoding="utf-8") as fEmbeddings:  ## change to skip first line (headings)
    next(fEmbeddings)
    for line in fEmbeddings:
        split = line.strip().split(' ')
        word = split[0]

        if len(word2Idx) == 0: #Add padding+unknown
            word2Idx["PADDING_TOKEN"] = len(word2Idx)
            vector = np.zeros(len(split)-1) #Zero vector vor 'PADDING' word
            wordEmbeddings.append(vector)

            word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
            vector = np.random.uniform(-0.25, 0.25, len(split)-1)
            wordEmbeddings.append(vector)

        if split[0].lower() in words:
            vector = np.array([float(num) for num in split[1:]])
            wordEmbeddings.append(vector)
            word2Idx[split[0]] = len(word2Idx)

    wordEmbeddings = np.array(wordEmbeddings)

In [19]:
wordEmbeddings

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.1001221 ,  0.0425167 , -0.01172925, ...,  0.11235716,
         0.01272849, -0.24537758],
       [-0.029648  ,  0.011336  ,  0.019949  , ..., -0.128057  ,
        -0.004917  ,  0.062628  ],
       ...,
       [ 0.095675  , -0.068076  , -0.067965  , ..., -0.012334  ,
         0.012556  ,  0.001487  ],
       [-0.002106  , -0.017091  , -0.083709  , ..., -0.057852  ,
        -0.021364  , -0.018894  ],
       [ 0.065187  , -0.057803  , -0.010219  , ..., -0.006056  ,
        -0.022656  ,  0.072673  ]])

In [20]:
wordEmbeddings.shape[0]

136789

In [21]:
wordEmbeddings.shape[1]

300

In [22]:
char2Idx = {"PADDING":0, "UNKNOWN":1}
for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZáéíóúñäëïöüÁÉÍÓÚÄËÏÖÜÃÂñÑàèìòùÀÈÌÒÙßÇç•æÿêğ₂ōôª'̃μ¬|¬'ø®'́¨ŠšÎńåãă><∆'.…,-–—î“”°’_()[]{}¡!¿?:;#'\"/\\%$`&=*+@^~|‘´·»³©\xadº±¼¾\xa0 ":
    char2Idx[c] = len(char2Idx)

In [None]:
# characters and position (val)
#char2Idx 

In [None]:
# words and possition (val)
#word2Idx

In [23]:
### Padding the sentences 
def padding(Sentences):
    maxlen = 52
    for sentence in Sentences:
        char = sentence[2]
        for x in char:
            maxlen = max(maxlen,len(x))
    for i,sentence in enumerate(Sentences):
        Sentences[i][2] = pad_sequences(Sentences[i][2],52,padding='post')
    return Sentences

In [24]:
# classify the word according to the caseLookup ( numeric, mainly_numeric, allLower, allUpper, initialUpper, contains_digit) 
def getCasing(word, caseLookup):   
    casing = 'other'
    #Get number of digits in word
    numDigits = 0
    for char in word:
        if char.isdigit():
            numDigits += 1
            
    digitFraction = numDigits / float(len(word))
    
    if word.isdigit(): #Is a digit
        casing = 'numeric'
    elif digitFraction > 0.5:
        casing = 'mainly_numeric'
    elif word.islower(): #All lower case
        casing = 'allLower'
    elif word.isupper(): #All upper case
        casing = 'allUpper'
    elif word[0].isupper(): #is a title, initial char upper, then all lower
        casing = 'initialUpper'
    elif numDigits > 0:
        casing = 'contains_digit'
    
   
    return caseLookup[casing]

In [25]:
# Create words embeding matrices to padding
def createMatrices(sentences, word2Idx, label2Idx, case2Idx,char2Idx):
    unknownIdx = word2Idx['UNKNOWN_TOKEN']
    paddingIdx = word2Idx['PADDING_TOKEN']    
        
    dataset = []
    
    wordCount = 0
    unknownWordCount = 0
    
    for sentence in sentences:
        wordIndices = []    
        caseIndices = []
        charIndices = []
        labelIndices = []
        
        for word,char,label in sentence:  
            wordCount += 1
            # if the word is in the list of words to index, then index it (verify with the lower cased word)
            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]                 
            else: # else tag it as unknown
                wordIdx = unknownIdx
                unknownWordCount += 1
            charIdx = []
            for x in char:
                charIdx.append(char2Idx[x])
            #Get the label and map to int            
            wordIndices.append(wordIdx)
            caseIndices.append(getCasing(word, case2Idx)) #Call getCasing
            charIndices.append(charIdx)
            labelIndices.append(label2Idx[label])
           
        dataset.append([wordIndices, caseIndices, charIndices, labelIndices]) 
        
    return dataset

In [26]:
# Padding the train/dev/test set and convert them to embedings

train_set = padding(createMatrices(trainSentences,word2Idx,  label2Idx, case2Idx,char2Idx))
dev_set = padding(createMatrices(devSentences,word2Idx, label2Idx, case2Idx,char2Idx))
test_set = padding(createMatrices(testSentences, word2Idx, label2Idx, case2Idx,char2Idx))


In [27]:
trainSentences[0]

[['Durante', ['D', 'u', 'r', 'a', 'n', 't', 'e'], 'O'],
 ['la', ['l', 'a'], 'O'],
 ['jornada', ['j', 'o', 'r', 'n', 'a', 'd', 'a'], 'O'],
 ['de', ['d', 'e'], 'O'],
 ['la', ['l', 'a'], 'O'],
 ['Liga', ['L', 'i', 'g', 'a'], 'B-MISC'],
 ['MX', ['M', 'X'], 'I-MISC'],
 [',', [','], 'O'],
 ['León', ['L', 'e', 'ó', 'n'], 'B-PER'],
 ['goleó', ['g', 'o', 'l', 'e', 'ó'], 'O'],
 ['3', ['3'], 'O'],
 ['-', ['-'], 'O'],
 ['0', ['0'], 'O'],
 ['a', ['a'], 'O'],
 ['Dorados', ['D', 'o', 'r', 'a', 'd', 'o', 's'], 'B-PER'],
 ['de', ['d', 'e'], 'I-PER'],
 ['Sinaloa', ['S', 'i', 'n', 'a', 'l', 'o', 'a'], 'I-PER'],
 ['.', ['.'], 'O'],
 ['Con', ['C', 'o', 'n'], 'O'],
 ['doblete', ['d', 'o', 'b', 'l', 'e', 't', 'e'], 'O'],
 ['de', ['d', 'e'], 'O'],
 ['y', ['y'], 'O'],
 ['un', ['u', 'n'], 'O'],
 ['tanto', ['t', 'a', 'n', 't', 'o'], 'O'],
 ['de', ['d', 'e'], 'O'],
 ['Carlos', ['C', 'a', 'r', 'l', 'o', 's'], 'B-PER'],
 ['Peña', ['P', 'e', 'ñ', 'a'], 'I-PER'],
 ['los', ['l', 'o', 's'], 'O'],
 ['felinos', ['f', 'e'

In [28]:
train_set[0]

[[419,
  3,
  1587,
  2,
  3,
  916,
  22974,
  1,
  903,
  30883,
  1,
  1,
  1,
  8,
  20941,
  2,
  2396,
  1,
  218,
  16246,
  2,
  6,
  14,
  105,
  2,
  504,
  2860,
  9,
  23228,
  8,
  1,
  408,
  1,
  35686,
  4,
  3,
  374,
  626,
  51,
  1,
  109272,
  9,
  551,
  748,
  10,
  1192,
  6,
  19,
  2762,
  1,
  33019,
  101,
  105,
  9,
  23228,
  61,
  4,
  14,
  5220,
  8,
  2040,
  7,
  32,
  6996,
  13,
  6112,
  2,
  20941,
  33019,
  11,
  7554,
  2,
  75498,
  139,
  89463,
  7676,
  1810,
  16,
  19,
  2,
  11,
  1573,
  2,
  7228,
  1,
  24,
  1512,
  10,
  158,
  12,
  2071,
  5266,
  4,
  2052,
  903,
  39,
  19,
  2762,
  1,
  3,
  7711,
  15481,
  10844,
  17,
  99,
  8115,
  7,
  12,
  11419,
  324,
  4,
  392,
  1,
  7,
  85184,
  111568,
  1,
  1,
  1,
  25,
  421,
  1,
  9,
  70296,
  11108,
  66845,
  1,
  49,
  4,
  5220,
  687,
  6,
  17,
  2773,
  13,
  2,
  9,
  10092,
  10,
  1257,
  6438,
  1,
  2860,
  5,
  277,
  2040,
  9,
  405,
  1,
  1,
  88199,
 

In [29]:
# save the words and labels to index as dict types
idx2Label = {v: k for k, v in label2Idx.items()}
#***************************************PATH***********************
np.save("model_data/idx2Label.npy",idx2Label)
np.save("model_data/word2Idx.npy",word2Idx)

In [30]:
# Create a batch for each set (later we will create mini-batch)
def createBatches(data):
    l = []
    for i in data:
        l.append(len(i[0]))
    l = set(l)
    batches = []
    batch_len = []
    z = 0
    for i in l:
        for batch in data:
            if len(batch[0]) == i:
                batches.append(batch)
                z += 1
        batch_len.append(z)
    return batches,batch_len

In [31]:
train_batch,train_batch_len = createBatches(train_set)
dev_batch,dev_batch_len = createBatches(dev_set)
test_batch,test_batch_len = createBatches(test_set)

In [32]:
#train_batch_len

Start with Tensorflow. Remember that tf first construct a graph, and then run it. tf automatically determines the best contruction taking into consideration each node requirements.   

In [33]:
# Create a tensor for the inputs
words_input = Input(shape=(None,),dtype='int32',name='words_input')

In [None]:
# Create a tensor of the embeddings using the words embeddings and feeding with the words_input tensor
words = Embedding(input_dim=wordEmbeddings.shape[0], output_dim=wordEmbeddings.shape[1],  weights=[wordEmbeddings], trainable=False)(words_input)

Instructions for updating:
Colocations handled automatically by placer.


In [None]:
# Create a tensor of casing input
casing_input = Input(shape=(None,), dtype='int32', name='casing_input')

In [None]:
#Create a tensor of the casing using the words embeddings and feeding with the casing_input tensor
casing = Embedding(output_dim=caseEmbeddings.shape[1], input_dim=caseEmbeddings.shape[0], weights=[caseEmbeddings], trainable=False)(casing_input)

More tensors for the model....

In [None]:
character_input=Input(shape=(None,52,),name='char_input')

In [None]:
embed_char_out=TimeDistributed(Embedding(len(char2Idx),30,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name='char_embedding')(character_input)

In [None]:
# Establish the dropout (neurons?)
dropout= Dropout(0.5)(embed_char_out)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [None]:
conv1d_out= TimeDistributed(Conv1D(kernel_size=3, filters=30, padding='same',activation='tanh', strides=1))(dropout)

In [None]:
# max pool of the convolutional
maxpool_out=TimeDistributed(MaxPooling1D(52))(conv1d_out)

In [None]:
# Flattern  layer for the CNN, it is requered to be flattern for the CNN
char = TimeDistributed(Flatten())(maxpool_out)
char = Dropout(0.5)(char)

In [None]:
output = concatenate([words, casing,char])
output = Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25))(output)
output = TimeDistributed(Dense(len(label2Idx), activation='softmax'))(output)

Model. Inlcudes a Summary of the Model.

In [None]:
model = Model(inputs=[words_input, casing_input,character_input], outputs=[output])
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, 52)     0                                            
__________________________________________________________________________________________________
char_embedding (TimeDistributed (None, None, 52, 30) 5400        char_input[0][0]                 
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, None, 52, 30) 0           char_embedding[0][0]             
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, None, 52, 30) 2730        dropout_1[0][0]                  
__________________________________________________________________________________________________
time_distr

In [None]:
#Number of epochs
epochs = 50

In [None]:
# Minibatches
def iterate_minibatches(dataset,batch_len): 
    start = 0
    for i in batch_len:
        tokens = []
        caseing = []
        char = []
        labels = []
        data = dataset[start:i]
        start = i
        for dt in data:
            t,c,ch,l = dt
            l = np.expand_dims(l,-1)
            tokens.append(t)
            caseing.append(c)
            char.append(ch)
            labels.append(l)
        yield np.asarray(labels),np.asarray(tokens),np.asarray(caseing),np.asarray(char)

In [None]:
#Training of n epochs 
for epoch in range(epochs):    
    print("Epoch %d/%d"%(epoch,epochs))
    a = Progbar(len(train_batch_len))
    for i,batch in enumerate(iterate_minibatches(train_batch,train_batch_len)):
        labels, tokens, casing,char = batch       
        model.train_on_batch([tokens, casing,char], labels)
        a.update(i)
    a.update(i+1)
    print(' ')

Epoch 0/50


In [None]:
# Saving the model
model.save("model_data/model.h5")

Evaluating model accurracy. Using F1, precision  and recal for Dev and Test sets.

In [None]:
def tag_dataset(dataset):
    correctLabels = []
    predLabels = []
    b = Progbar(len(dataset))
    for i,data in enumerate(dataset):    
        tokens, casing,char, labels = data
        tokens = np.asarray([tokens])     
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = model.predict([tokens, casing,char], verbose=False)[0]   
        pred = pred.argmax(axis=-1) #Predict the classes            
        correctLabels.append(labels)
        predLabels.append(pred)
        b.update(i)
    b.update(i+1)
    return predLabels, correctLabels

In [None]:
#Method to compute the accruarcy. Call predict_labels to get the labels for the dataset
def compute_f1(predictions, correct, idx2Label): 
    label_pred = []    
    for sentence in predictions:
        label_pred.append([idx2Label[element] for element in sentence])
        
    label_correct = []    
    for sentence in correct:
        label_correct.append([idx2Label[element] for element in sentence])
            
    
    #print label_pred
    #print label_correct
    
    prec = compute_precision(label_pred, label_correct)
    rec = compute_precision(label_correct, label_pred)
    
    f1 = 0
    if (rec+prec) > 0:
        f1 = 2.0 * prec * rec / (prec + rec);
        
    return prec, rec, f1

In [None]:
def compute_precision(guessed_sentences, correct_sentences):
    assert(len(guessed_sentences) == len(correct_sentences))
    correctCount = 0
    count = 0
    
    
    for sentenceIdx in range(len(guessed_sentences)):
        guessed = guessed_sentences[sentenceIdx]
        correct = correct_sentences[sentenceIdx]
        assert(len(guessed) == len(correct))
        idx = 0
        while idx < len(guessed):
            if guessed[idx][0] == 'B': #A new chunk starts
                count += 1
                
                if guessed[idx] == correct[idx]:
                    idx += 1
                    correctlyFound = True
                    
                    while idx < len(guessed) and guessed[idx][0] == 'I': #Scan until it no longer starts with I
                        if guessed[idx] != correct[idx]:
                            correctlyFound = False
                        
                        idx += 1
                    
                    if idx < len(guessed):
                        if correct[idx][0] == 'I': #The chunk in correct was longer
                            correctlyFound = False
                        
                    
                    if correctlyFound:
                        correctCount += 1
                else:
                    idx += 1
            else:  
                idx += 1
    
    precision = 0
    if count > 0:    
        precision = float(correctCount) / count
        
    return precision

In [None]:
#   Performance on dev dataset        
predLabels, correctLabels = tag_dataset(dev_batch)        
pre_dev, rec_dev, f1_dev = compute_f1(predLabels, correctLabels, idx2Label)
print("Dev-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" % (pre_dev, rec_dev, f1_dev))

In [None]:
#   Performance on test dataset       
predLabels, correctLabels = tag_dataset(test_batch)        
pre_test, rec_test, f1_test= compute_f1(predLabels, correctLabels, idx2Label)
print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" % (pre_test, rec_test, f1_test))

Test with data

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

Defining class for testing.

In [None]:
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from nltk import word_tokenize

class Parser:

    def __init__(self):
        # ::Hard coded char lookup ::
        self.char2Idx = {"PADDING":0, "UNKNOWN":1}
        for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
            self.char2Idx[c] = len(self.char2Idx)
        # :: Hard coded case lookup ::
        self.case2Idx = {'numeric': 0, 'allLower':1, 'allUpper':2, 'initialUpper':3, 'other':4, 'mainly_numeric':5, 'contains_digit': 6, 'PADDING_TOKEN':7}

    def load_models(self, loc=None):
        if not loc:
            loc = os.path.join(os.path.expanduser('~'), '.ner_model')
        self.model = load_model(os.path.join(loc,"model.h5"))
        # loading word2Idx
        self.word2Idx = np.load(os.path.join(loc,"word2Idx.npy")).item()
        # loading idx2Label
        self.idx2Label = np.load(os.path.join(loc,"idx2Label.npy")).item()

    def getCasing(self,word, caseLookup):   
        casing = 'other'
        
        numDigits = 0
        for char in word:
            if char.isdigit():
                numDigits += 1
                
        digitFraction = numDigits / float(len(word))
        
        if word.isdigit(): #Is a digit
            casing = 'numeric'
        elif digitFraction > 0.5:
            casing = 'mainly_numeric'
        elif word.islower(): #All lower case
            casing = 'allLower'
        elif word.isupper(): #All upper case
            casing = 'allUpper'
        elif word[0].isupper(): #is a title, initial char upper, then all lower
            casing = 'initialUpper'
        elif numDigits > 0:
            casing = 'contains_digit'  
        return caseLookup[casing]

    def createTensor(self,sentence, word2Idx,case2Idx,char2Idx):
        unknownIdx = word2Idx['UNKNOWN_TOKEN']
    
        wordIndices = []    
        caseIndices = []
        charIndices = []
            
        for word,char in sentence:  
            word = str(word)
            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]                 
            else:
                wordIdx = unknownIdx
            charIdx = []
            for x in char:
                if x in char2Idx.keys():
                    charIdx.append(char2Idx[x])
                else:
                    charIdx.append(char2Idx['UNKNOWN'])   
            wordIndices.append(wordIdx)
            caseIndices.append(self.getCasing(word, case2Idx))
            charIndices.append(charIdx)
            
        return [wordIndices, caseIndices, charIndices]

    def addCharInformation(self, sentence):
        return [[word, list(str(word))] for word in sentence]

    def padding(self,Sentence):
        Sentence[2] = pad_sequences(Sentence[2],52,padding='post')
        return Sentence

    def predict(self,Sentence):
        Sentence = words =  word_tokenize(Sentence)
        Sentence = self.addCharInformation(Sentence)
        Sentence = self.padding(self.createTensor(Sentence,self.word2Idx,self.case2Idx,self.char2Idx))
        tokens, casing,char = Sentence
        tokens = np.asarray([tokens])     
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = self.model.predict([tokens, casing,char], verbose=False)[0]   
        pred = pred.argmax(axis=-1)
        pred = [self.idx2Label[x].strip() for x in pred]

        return  list(zip(words,pred))
       

In [None]:
p = Parser()
p.load_models("model_data/")

In [None]:
from nltk import sent_tokenize
text_file = open("Input.txt").read()
token_sent = sent_tokenize(text_file)

In [None]:
print(token_sent)

Input: 

In [None]:
print(text_file)

In [None]:
outlist =[]
for t in token_sent:
    t= unidecode.unidecode(t)
    outlist.append(p.predict(t))

to_out=[]
for s in outlist:
    for w in s:
        if ('O') not in w:
            print(w)
            to_out.append(w)
            
with open('Output_sample.txt', 'w') as f:
    for item in to_out:
        f.write("\n")
        for x in item:
            f.write("%s " %x)