In [314]:
# util.py

import numpy as np
import pandas as pd
import re, json
import re, os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

def read_data(filepath):
    df = pd.DataFrame(columns=['word', 'ne'])  # syntatic chunck, named entity
    char = set()
    nrow = 0
    with open(filepath, 'r', encoding='utf-8') as file:
        
        word, ne = [], []
        for row in file:
            row = row.strip()
            if '-DOCSTART-' in row:
                next(file)
                continue
                
            if row:
                row = row.split(' ')
                word.append(row[0].lower())
                ne.append(row[3])
            else:
                df.loc[nrow] = [word, ne]
                word, ne = [], []
                nrow += 1
    
    char2index = {}
    chars = set(''.join([w for word in df['word'] for w in word] + ['#']))
    for i, char in enumerate(sorted(chars)):
        char2index[char] = i
    char2index.update({'<unk>': len(char2index), '<pad>': len(char2index) + 1})
    return df, char2index


def read_wordvec(wordvec_file):
    
    vocab = set()
    wordvec = []
    with open(wordvec_file, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file):
            line = line.strip().split()
            vocab.add(line[0])
            wordvec.append(line[1:])
    
    vocab.add('<UNK>')
    vocab.add('<PAD>')
    wordvec.extend([[0] * len(wordvec[0])] * 2)
    word2index = {}
    index2word = []
    for i, word in enumerate(sorted(vocab)):
        word2index[word] = i
        index2word.append(word)
    return np.array(wordvec, dtype=np.float32), word2index, index2word
        

def normalize_number(words):
    return [re.sub(r'[0-9]+[\+|\-|,|.|/]?[0-9]+', '0', word) for word in words]


def tag_to_index(tags):
    tag2index = {'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7, 'O': 8}
    return [tag2index[tag] for tag in tags]


def index_to_tag(tags):
    index2tag = ['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']    
    return [index2tag[tag] for tag in tags]


def word_to_index(words, word2index):
    unk_word = word2index['<UNK>']
    return [word2index.get(word, unk_word) for word in words]


def char_to_index(words, char2index):
    unk_char = char2index['<unk>']
    return [[char2index.get(char, unk_char) for char in word] for word in words]


def preprocess(train_df, word2index, char2index):
    
    train_df['word'] = train_df['word'].map(normalize_number)
    xxx = train_df.copy()
    train_df['char'] = train_df['word'].map(lambda words: [list(word) for word in words])
    train_df['word'] = train_df['word'].map(lambda words: word_to_index(words, word2index))
    train_df['char'] = train_df['char'].map(lambda words: char_to_index(words, char2index))
    train_df['ne']   = train_df['ne'].map(tag_to_index)    
    
    word_len = np.array([len(x) for x in train_df['word']])
    char_en = np.array([[len(word) for word in words] for words in train_df['char'] ])
    
    return train_df['word'].values, train_df['char'].values, train_df['ne'].values, word_len, char_en, xxx


def load_train_data():
    word_sent = np.load('data/word_sent.npy')
    char_sent = np.load('data/char_sent.npy')
    tag = np.load('data/tag.npy')
    word_len = np.load('data/word_len.npy')
    char_len = np.load('data/char_len.npy')
    wordvec = np.load('data/wordvec.npy')
    
    with open('data/word2index.json', 'r', encoding='utf-8') as file:
        word2index = json.load(file)
        
    with open('data/index2word.json', 'r', encoding='utf-8') as file:
        index2word = json.load(file)

    with open('data/char2index.json', 'r', encoding='utf-8') as file:
        char2index = json.load(file)
    
    return word_sent, char_sent, tag, word_len, char_len, wordvec, word2index, index2word, char2index


def load_test_data():
    word_sent = np.load('data/test_word_sent.npy')
    char_sent = np.load('data/test_char_sent.npy')
    tag = np.load('data/test_tag.npy')
    word_len = np.load('data/test_word_len.npy')
    char_len = np.load('data/test_char_len.npy')
    wordvec = np.load('data/wordvec.npy')
    
    with open('data/word2index.json', 'r', encoding='utf-8') as file:
        word2index = json.load(file)
        
    with open('data/index2word.json', 'r', encoding='utf-8') as file:
        index2word = json.load(file)

    with open('data/char2index.json', 'r', encoding='utf-8') as file:
        char2index = json.load(file)
    
    return word_sent, char_sent, tag, word_len, char_len, wordvec, word2index, index2word, char2index


def train_val_split(x_train, x_char_train, y_train, seq_len, word_len, train_ratio=.7):
    train_len  = int(len(x_train) * train_ratio)
    train_data = [x_train[: train_len], x_char_train[: train_len], y_train[: train_len], seq_len[: train_len], word_len[: train_len]]
    val_data   = [x_train[train_len: ], x_char_train[train_len: ], y_train[train_len: ], seq_len[train_len: ], word_len[train_len: ]] 
    
    return train_data, val_data


def shuffle_data(data):
    indice = np.arange(len(data[0]))
    np.random.shuffle(indice)
    
    return [d[indice] for d in data]


def next_batch(data, batch_size, word2index, char2index):
    def pad(sequence, max_wlen, pad_token):
        return np.array([seq + [pad_token] * (max_wlen - len(seq)) for seq in sequence])
    
    def char_pad(sequence, max_wlen, max_clen, pad_token):
        pad_seq = []
        for words in sequence:
            pad_words = words + [[pad_token]] * (max_wlen - len(words))            
            pad_seq.append([word + [pad_token] * (max_clen - len(word)) for word in pad_words])
        return np.array(pad_seq)
            

    word_sent, char_sent, tag, word_len, char_len = data[0], data[1], data[2], data[3], data[4] 
    n_batch = len(word_sent) // batch_size
    for i in range(n_batch):
        offset = i * batch_size
        indice = np.arange(offset, offset + batch_size)
        batch_wlen = word_len[indice]
        batch_clen = pad(char_len[indice], max(batch_wlen), 0)
        batch_word = pad(word_sent[indice], max(batch_wlen), word2index['<PAD>'])
        batch_char = char_pad(char_sent[indice], max(batch_wlen), max([max(clen) for clen in batch_clen]), char2index['<pad>'])
        batch_tag  = pad(tag[indice], max(batch_wlen), 8) if tag.any() else []
        
        yield batch_word, batch_char, batch_tag, batch_wlen, batch_clen
    
    
    
    offset = n_batch * batch_size
    if offset == len(word_sent):
        return
    
    batch_wlen = word_len[offset: ]
    batch_clen = pad(char_len[offset: ], max(batch_wlen), 0)
    batch_word = pad(word_sent[offset: ], max(batch_wlen), word2index['<PAD>'])
    batch_char = char_pad(char_sent[offset: ], max(batch_wlen), max([max(clen) for clen in batch_clen]), char2index['<pad>'])
    batch_tag  = pad(tag[offset: ], max(batch_wlen), 8) if tag.any() else []

    yield batch_word, batch_char, batch_tag, batch_wlen, batch_clen


def get_entities(sequence_tag):
    
#     entity = {
#         'begin': 0,
#         'end': 0,
#         'type': '' 
#     }
    entities = []
    is_ne = False
    ne_type = ''
    for i, tag in enumerate(sequence_tag):
        if is_ne and ('B-' in tag or 'O' in tag):
            entities.append({'begin': begin, 'end': i, 'type': ne_type}) 
            is_ne = False
        if 'B-' in tag:
            begin = i
            is_ne = True
            ne_type = tag.split('-')[1]
        elif 'I-' in tag and ne_type != tag.split('-')[1]:
            is_ne = False
    if is_ne:
        entities.append({'begin': begin, 'end': i, 'type': ne_type})
        
    return entities     
        

In [315]:
# metrics.py

def accuracy(predictions, tags):
    def equal(pred_ne, tag_ne):
        return pred_ne
    total, correct = 0, 0
    for prediction, tag in zip(predictions, tags):
        pred_entities = get_entities(prediction)
        tag_entities =  get_entities(tag)
        t_len, p_len, offset = len(tag_entities), len(pred_entities), 0
        for tag_ne in tag_entities:
            for i in range(offset, p_len):
                if tag_ne == pred_entities[i]:
                    correct += 1
                    offset += 1
        total += t_len
        
    return correct / total, total, correct
                                        

   
            
            
    
    

### Load Word Vector

In [292]:
%%time
wordvec_file = 'wordvectors/glove.6B.100d.txt'
wordvec, word2index, index2word = read_wordvec(wordvec_file)

CPU times: user 10.1 s, sys: 860 ms, total: 11 s
Wall time: 11 s


### Data Preprocess

In [300]:
data_file = 'data/train.txt'
test_file = 'data/test.txt'
data_df, char2index = read_data(data_file)
test_df, _ = read_data(test_file)

In [303]:
data_df.head()

Unnamed: 0,word,ne
0,"[eu, rejects, german, call, to, boycott, briti...","[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]"
1,"[peter, blackburn]","[B-PER, I-PER]"
2,"[brussels, 1996-08-22]","[B-LOC, O]"
3,"[the, european, commission, said, on, thursday...","[O, B-ORG, I-ORG, O, O, O, O, O, O, B-MISC, O,..."
4,"[germany, 's, representative, to, the, europea...","[B-LOC, O, O, O, O, B-ORG, I-ORG, O, O, O, B-P..."


In [302]:
word_sent, char_sent, tag, word_len, char_len, data = preprocess(data_df.copy(), word2index, char2index)

In [305]:
data.head(100)

Unnamed: 0,word,ne
0,"[eu, rejects, german, call, to, boycott, briti...","[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]"
1,"[peter, blackburn]","[B-PER, I-PER]"
2,"[brussels, 0]","[B-LOC, O]"
3,"[the, european, commission, said, on, thursday...","[O, B-ORG, I-ORG, O, O, O, O, O, O, B-MISC, O,..."
4,"[germany, 's, representative, to, the, europea...","[B-LOC, O, O, O, O, B-ORG, I-ORG, O, O, O, B-P..."
5,"["", we, do, n't, support, any, such, recommend...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6,"[he, said, further, scientific, study, was, re...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
7,"[he, said, a, proposal, last, month, by, eu, f...","[O, O, O, O, O, O, O, B-ORG, O, O, B-PER, I-PE..."
8,"[fischler, proposed, eu0wide, measures, after,...","[B-PER, O, B-MISC, O, O, O, O, B-LOC, O, B-LOC..."
9,"[but, fischler, agreed, to, review, his, propo...","[O, B-PER, O, O, O, O, O, O, O, B-ORG, O, O, O..."


In [24]:
np.save('data/word_sent.npy', word_sent)
np.save('data/char_sent.npy', char_sent)
np.save('data/tag.npy', tag)
np.save('data/word_len.npy', word_len)
np.save('data/char_len.npy', char_len)
np.save('data/wordvec.npy', wordvec)

with open('data/word2index.json', 'w', encoding='utf-8') as file:
    json.dump(word2index, file)
    
with open('data/index2word.json', 'w', encoding='utf-8') as file:
    json.dump(index2word, file)
    
with open('data/char2index.json', 'w', encoding='utf-8') as file:
    json.dump(char2index, file)

In [25]:
test_word_sent, test_char_sent, test_tag, test_word_len, test_char_len = preprocess(test_df.copy(), word2index, char2index)

In [26]:
np.save('data/test_word_sent.npy', test_word_sent)
np.save('data/test_char_sent.npy', test_char_sent)
np.save('data/test_tag.npy', test_tag)
np.save('data/test_word_len.npy', test_word_len)
np.save('data/test_char_len.npy', test_char_len)

### Build Graph

In [138]:
import tensorflow as tf
from tensorflow.nn import embedding_lookup, bidirectional_dynamic_rnn, dropout
from tensorflow.nn.rnn_cell import LSTMCell, DropoutWrapper
from tensorflow.contrib.crf import crf_log_likelihood, viterbi_decode
from tqdm import tqdm_notebook
from pprint import pprint
import os

In [316]:
# layers.py
def BiLSTM(sequence, seq_len, unit, name):
    cell_fw = LSTMCell(unit, name='fw' + name)
    cell_bw = LSTMCell(unit, name='bw' + name)
#     cell_fw = DropoutWrapper(cell_fw,  output_keep_prob=0.5)
#     cell_bw = DropoutWrapper(cell_bw,  output_keep_prob=0.5)
    
    (outputs, states) = bidirectional_dynamic_rnn(cell_fw, cell_bw, sequence, seq_len, dtype=tf.float32)
    # outputs: (output_fw, output_bw), both with shape (batch_size, max_len, unit)
    # states:  ((cell_fw, state_fw), (cell_bw, state_bw)), fw & bw final state with shape (batch_size, unit)
    return outputs, states





In [317]:
class NERTagger:
    def __init__(self, wordvec, config):
        '''
        ##### placeholder #####
        word_sent: word-level sentence, (batch_size, max_word_len)
        char_sent: char-level sentence, (batch_size, max_word_len, max_char_len)
        word_len:  sentence length, (batch_size, )
        char_len:  word length, (batch_size, max_word_len)
        tag: answer tag, (batch_size, max_word_len)
        '''
        tf.reset_default_graph()
        self.config = config
        
        self.hidden_unit = config['hidden_unit']
        self.context_lstm_unit = config['context_lstm_unit']
        self.char_vocab_size = config['char_vocab_size']
        self.char_emb_dim = config['char_emb_dim']
        self.char_lstm_unit = config['char_lstm_unit']
        self.num_class = config['num_class']
        self.learning_rate = config['learning_rate']
        
        self.word_sent = tf.placeholder(tf.int32, (None, None)) 
        self.char_sent = tf.placeholder(tf.int32, (None, None, None))
        self.word_len  = tf.placeholder(tf.int32, (None, ))
        self.char_len  = tf.placeholder(tf.int32, (None, None))
        self.tag = tf.placeholder(tf.int32, (None, None))
        
        
        self.w = tf.get_variable('hidden_weight', (self.context_lstm_unit * 2, self.hidden_unit)) 
        self.b = tf.get_variable('hidden_bias', (self.hidden_unit, ))
        
        self.word_embedding = tf.get_variable(name='word_embedding', 
                                              shape=wordvec.shape, 
                                              initializer=tf.constant_initializer(wordvec, dtype=tf.float32),
                                              dtype=tf.float32,
                                              trainable=config['word_emb_trainable'])
        
        
        self.char_embedding = tf.get_variable(name='char_embedding', 
                                              shape=(self.char_vocab_size, self.char_emb_dim), 
                                              initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0),
                                              dtype=tf.float32,
                                              trainable=True)
        
    def embedding_layer(self, word_sent, char_sent, char_len):
        '''
        word_sent: word-level sentence, (batch_size, max_word_len)
        char_sent: char-level sentence, (batch_size, max_word_len, max_char_len)
        word_len:  sentence length, (batch_size, )
        char_len:  word length, (batch_size, max_word_len)
        word_embedding: word embedding matrix, (word_vocab_size, word_emb_dim)
        '''
        max_word_len = tf.shape(word_sent)[1]
        max_char_len = tf.shape(char_sent)[2]
        
        word_emb = embedding_lookup(self.word_embedding, word_sent)  # (batch_size, max_word_len, word_emb_dim)
        char_emb = embedding_lookup(self.char_embedding, char_sent)  # (batch_size, max_word_len, max_char_len, char_emb_dim)

        
        # Reshape char_emb to 3D tensor for BiLSTM input
        char_emb = tf.reshape(char_emb, (-1, max_char_len, self.char_emb_dim))  # (batch_size * max_word_len, max_char_len, char_emb_dim)
        char_len = tf.reshape(char_len, (-1, ))   # (batch_size * max_word_len, )
        
        # Get final states which represent char-level representation
        _, states = BiLSTM(char_emb, char_len, self.char_lstm_unit, 'char_embedding')
        final_h = [states[0][1], states[1][1]]  # [forward final state, backward final state]
        char_emb = tf.concat(final_h, axis=1)  # (batch_size * max_word_len, char_lstm_unit * 2)
        
        # Reshape char_emb to match word_emb's shape for concatenating both tensors
        char_emb = tf.reshape(char_emb, (-1, max_word_len, self.char_lstm_unit * 2))  # (batch_size, max_word_len, char_lstm_unit * 2)
        
        # Token Representation
        emb = tf.concat([word_emb, char_emb], axis=2)  # (batch_size, max_word_len, word_emb_dim + char_lstm_unit * 2)
        emb = dropout(emb, keep_prob=0.5)
        return emb
    
    
    def build(self):
        ##### Context Representation #####
        # emb_dim = word_emb_dim + char_lstm_unit * 2
        word_rep   = self.embedding_layer(self.word_sent, self.char_sent, self.char_len)   # (batch_size, max_word_len, emb_dim)
        outputs, _ = BiLSTM(word_rep, self.word_len, self.context_lstm_unit, 'context_representation') 
        context    = tf.concat(outputs, axis=2)  # (batch_size, max_word_len, context_lstm_unit * 2)
        
        ##### Hidden Layer #####
        max_word_len = tf.shape(context)[1]
        context      = tf.reshape(context, (-1, self.context_lstm_unit * 2))   # (batch_size * max_word_len, context_lstm_unit * 2)
        dense        = tf.matmul(context, self.w) + self.b                     # (batch_size * max_word_len, hidden_unit)
        self.scores  = tf.reshape(dense, (-1, max_word_len, 100))   # (batch_size, max_word_len, hidden_unit)
        
        ##### CRF #####
        log_likelihood, self.transition_params = crf_log_likelihood(self.scores, self.tag, self.word_len)
        self.loss = tf.reduce_mean(-log_likelihood)
        
#         self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
        self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.loss)

                
    
    def fit(self, train_data, val_data, epoch_size, batch_size, word2index, model_name):
        def learn(data, epoch, mode):
            tn = tqdm_notebook(total=len(data[0]))
            nbatch, epoch_loss, epoch_acc = 0, 0, 0 
            for batch_word, batch_char, batch_tag, batch_wlen, batch_clen in next_batch(data, batch_size, word2index, char2index):
                feed_dict = {
                    self.word_sent: batch_word,
                    self.char_sent: batch_char, 
                    self.word_len: batch_wlen,
                    self.char_len: batch_clen,
                    self.tag: batch_tag
                }
                if mode == 'train':
                    fetches = [self.loss, self.optimizer]
                    loss, _ = self.sess.run(fetches, feed_dict)
                    tn.set_description('Epoch: {}/{}'.format(epoch, epoch_size))
                elif mode == 'validate':                    
                    fetches = [self.loss]
                    loss = self.sess.run(fetches, feed_dict)[0]
                
#                 acc = accuracy(output, label)
                tn.set_postfix(loss=loss, mode=mode)
                tn.update(n=len(batch_word))
                
                epoch_loss += loss
#                 epoch_acc += acc
                nbatch += 1
            
            tn.set_postfix(loss=epoch_loss/nbatch, mode=mode)
            return [epoch_loss/nbatch]
                
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
                
        train_log, val_log = [], []
        print('Train on {} samples, validate on {} samples'.format(len(train_data[0]), len(val_data[0])))
        for epoch in range(1, epoch_size + 1):       
            train_data = shuffle_data(train_data)
            # train
            train_log.append(learn(train_data, epoch, 'train'))

            # validate
            if len(val_data[0]) > 0:
                val_log.append(learn(val_data, epoch, 'validate')) 
        
        self.save(model_name, train_log, val_log)
    
    
    def predict(self, data, word_to_index, char2index):
        
        tn = tqdm_notebook(total=len(data[0]))
        batch_size = 100
        transition_params = self.transition_params.eval(session=self.sess)
        prediction = []
        for batch_word, batch_char, batch_tag, batch_wlen, batch_clen in next_batch(data, batch_size, word2index, char2index):
            fetches = [self.scores]
            feed_dict = {
                self.word_sent: batch_word,
                self.char_sent: batch_char, 
                self.word_len: batch_wlen,
                self.char_len: batch_clen
            }
            scores = self.sess.run(fetches, feed_dict)[0]
            scores = [score[:wlen] for score, wlen in zip(scores, batch_wlen)]
            prediction.extend([viterbi_decode(score, transition_params)[0] for score in scores])
            
            tn.set_postfix(mode='predict')
            tn.update(n=len(batch_word))
        
        return np.array(prediction)
    
    
    def save(self, model_name, train_log, val_log):
        model_dir = 'models/{}'.format(model_name)
        if not os.path.isdir(model_dir):
            os.mkdir(model_dir)
            os.mkdir('{}/result'.format(model_dir))
        
        # save model
        saver = tf.train.Saver()
        save_path = saver.save(self.sess, '{}/{}.ckpt'.format(model_dir, model_name))
        
        # save config
        with open('{}/config.json'.format(model_dir), 'w', encoding='utf-8') as file:
            json.dump(self.config, file)
            
        # save log
        with open('{}/log'.format(model_dir), 'w', encoding='utf-8') as file:
            for i in range(len(train_log)):
                tlog = train_log[i]
                vlog = val_log[i] if len(val_log) > 0 else []
                log_str = 'Epoch {}: train_loss={}'.format(i+1, tlog[0])
                log_str += ', val_loss={}'.format(vlog[0]) if vlog else ''
                file.write(log_str + '\n')
            
        print('Model was saved in {}'.format(save_path))
    
    
    def restore(self, model_path):
        saver = tf.train.Saver()
        self.sess = tf.Session()
        saver.restore(self.sess, model_path)


In [318]:
word_sent, char_sent, tag, word_len, char_len, wordvec, word2index, index2word, char2index = load_train_data()

In [319]:
train_data, val_data = train_val_split(word_sent, char_sent, tag, word_len, char_len, train_ratio=.8)

In [244]:
# n = 0

In [245]:

epoch_size = 30
batch_size = 1
config = {
    'num_class': 9,
    'char_lstm_unit': 25,
    'context_lstm_unit': 100,
    'hidden_unit': 100,
    'word_emb_dim': len(wordvec[0]),
    'char_emb_dim': 25,
    'word_vocab_size': len(wordvec),
    'char_vocab_size': len(char2index),
    'learning_rate': 1e-2,
    'wordvec': wordvec_file,
    'word_emb_trainable': True,
    'epoch_size': epoch_size,
    'batch_size': batch_size
}



In [246]:
tagger = NERTagger(wordvec, config)

In [247]:
tagger.build()

In [248]:
n += 1
model_name = 'model-{}'.format(n)
print(model_name)
tagger.fit(train_data, val_data, epoch_size, batch_size, word2index, model_name)                     

model-18
Train on 13832 samples, validate on 3459 samples


HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

Model was saved in models/model-18/model-18.ckpt


In [253]:
test_word_sent, test_char_sent, test_tag, test_word_len, test_char_len, _, _, _, _ = load_test_data()
test_data = [test_word_sent, test_char_sent, test_tag, test_word_len, test_char_len]

In [108]:
tagger.restore('models/{}/{}.ckpt'.format(model_name, model_name))

INFO:tensorflow:Restoring parameters from models/model-3/model-3.ckpt


In [283]:
prediction = tagger.predict(test_data, word_to_index, char2index)
accuracy(pd.Series(prediction).map(index_to_tag), pd.Series(test_data[2]).map(index_to_tag))

HBox(children=(IntProgress(value=0, max=3453), HTML(value='')))

(0.722556657223796, 5648, 4081)

In [284]:
prediction = tagger.predict(train_data, word_to_index, char2index)
accuracy(pd.Series(prediction).map(index_to_tag), pd.Series(train_data[2]).map(index_to_tag))

HBox(children=(IntProgress(value=0, max=13832), HTML(value='')))

(0.9398798461338981, 23137, 21746)

In [260]:
np.mean(np.equal(prediction, test_data[2]))

0.6379959455545902

In [282]:
prediction = tagger.predict(val_data, word_to_index, char2index)
accuracy(pd.Series(prediction).map(index_to_tag), pd.Series(val_data[2]).map(index_to_tag))

HBox(children=(IntProgress(value=0, max=3459), HTML(value='')))

(0.7869606598984772, 6304, 4961)

(0.7193696883852692, 5648, 4063)

In [262]:
pd.Series(prediction[:5]).map(index_to_tag)

0     [O, O, B-LOC, O, B-ORG, O, O, B-LOC, O, O, O, O]
1                                       [B-PER, I-PER]
2                   [B-ORG, O, B-ORG, I-ORG, I-ORG, O]
3    [B-LOC, O, O, O, O, O, B-MISC, I-MISC, O, O, O...
4    [O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O,...
dtype: object

In [261]:
pd.Series(test_data[2][:5]).map(index_to_tag)

0         [O, O, B-LOC, O, O, O, O, B-PER, O, O, O, O]
1                                       [B-PER, I-PER]
2                   [B-LOC, O, B-LOC, I-LOC, I-LOC, O]
3    [B-LOC, O, O, O, O, O, B-MISC, I-MISC, O, O, O...
4    [O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O,...
dtype: object

In [929]:
x = np.array([
             [['i', '#', '#', '#', '#', '#', '#'], 
              ['a', 'm', '#', '#', '#', '#', '#'],
              ['n', 'o', 't', '#', '#', '#', '#'],
              ['a', '#', '#', '#', '#', '#', '#'],
              ['s', 't', 'u', 'd', 'e', 'n', 't']],
              
             [['y', 'o', 'u', '#', '#', '#', '#'], 
              ['a', 'r', 'e', '#', '#', '#', '#'],
              ['a', '#', '#', '#', '#', '#', '#'],
              ['t', 'e', 'a', 'c', 'h', 'e', 'r'],
              ['#', '#', '#', '#', '#', '#', '#']]])
print(x.shape)
pprint(x)

(2, 5, 7)
array([[['i', '#', '#', '#', '#', '#', '#'],
        ['a', 'm', '#', '#', '#', '#', '#'],
        ['n', 'o', 't', '#', '#', '#', '#'],
        ['a', '#', '#', '#', '#', '#', '#'],
        ['s', 't', 'u', 'd', 'e', 'n', 't']],

       [['y', 'o', 'u', '#', '#', '#', '#'],
        ['a', 'r', 'e', '#', '#', '#', '#'],
        ['a', '#', '#', '#', '#', '#', '#'],
        ['t', 'e', 'a', 'c', 'h', 'e', 'r'],
        ['#', '#', '#', '#', '#', '#', '#']]], dtype='<U1')


In [930]:
y = x.reshape((-1, 7))
print(y.shape)
print(y)

(10, 7)
[['i' '#' '#' '#' '#' '#' '#']
 ['a' 'm' '#' '#' '#' '#' '#']
 ['n' 'o' 't' '#' '#' '#' '#']
 ['a' '#' '#' '#' '#' '#' '#']
 ['s' 't' 'u' 'd' 'e' 'n' 't']
 ['y' 'o' 'u' '#' '#' '#' '#']
 ['a' 'r' 'e' '#' '#' '#' '#']
 ['a' '#' '#' '#' '#' '#' '#']
 ['t' 'e' 'a' 'c' 'h' 'e' 'r']
 ['#' '#' '#' '#' '#' '#' '#']]


In [None]:
config

In [None]:
y[0]

In [None]:
char = set('aaaafdafkf,djfpe')

In [None]:
char.add('aaaafdafkf,djfpe')

In [584]:
x = list(char)

In [493]:
x = [w for word in data_df['word'] for w in word ] 

In [497]:
char = set(''.join(x))

In [238]:
for i in range(10, 1):
    print(i)

In [1014]:
x = []
batch_size = 2
for batch_word, batch_char, batch_tag, batch_wlen, batch_clen in next_batch(train_data, batch_size, word2index, char2index):

    print(batch_char.shape)  
    for c in batch_char:
        x.append(c)
    break
    
print(np.array(x).shape)

(2, 47, 9)
(2, 47, 9)


In [991]:
pprint(batch_char)

array([list([[37, 53, 60, 60, 60, 60, 60, 60, 60], [50, 37, 42, 37, 35, 52, 51, 60, 60], [39, 37, 50, 45, 33, 46, 60, 60, 60], [35, 33, 44, 44, 60, 60, 60, 60, 60], [52, 47, 60, 60, 60, 60, 60, 60, 60], [34, 47, 57, 35, 47, 52, 52, 60, 60], [34, 50, 41, 52, 41, 51, 40, 60, 60], [44, 33, 45, 34, 60, 60, 60, 60, 60], [15, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60], [60, 60, 60, 60, 60, 60, 60, 60, 60],

In [983]:
pprint(x)

[[[37, 53, 60, 60, 60, 60, 60, 60, 60],
  [50, 37, 42, 37, 35, 52, 51, 60, 60],
  [39, 37, 50, 45, 33, 46, 60, 60, 60],
  [35, 33, 44, 44, 60, 60, 60, 60, 60],
  [52, 47, 60, 60, 60, 60, 60, 60, 60],
  [34, 47, 57, 35, 47, 52, 52, 60, 60],
  [34, 50, 41, 52, 41, 51, 40, 60, 60],
  [44, 33, 45, 34, 60, 60, 60, 60, 60],
  [15, 60, 60, 60, 60, 60, 60, 60, 60],
  [60, 60, 60, 60, 60, 60, 60, 60, 60],
  [60, 60, 60, 60, 60, 60, 60, 60, 60],
  [60, 60, 60, 60, 60, 60, 60, 60, 60],
  [60, 60, 60, 60, 60, 60, 60, 60, 60],
  [60, 60, 60, 60, 60, 60, 60, 60, 60],
  [60, 60, 60, 60, 60, 60, 60, 60, 60],
  [60, 60, 60, 60, 60, 60, 60, 60, 60],
  [60, 60, 60, 60, 60, 60, 60, 60, 60],
  [60, 60, 60, 60, 60, 60, 60, 60, 60],
  [60, 60, 60, 60, 60, 60, 60, 60, 60],
  [60, 60, 60, 60, 60, 60, 60, 60, 60],
  [60, 60, 60, 60, 60, 60, 60, 60, 60],
  [60, 60, 60, 60, 60, 60, 60, 60, 60],
  [60, 60, 60, 60, 60, 60, 60, 60, 60],
  [60, 60, 60, 60, 60, 60, 60, 60, 60],
  [60, 60, 60, 60, 60, 60, 60, 60, 60],


In [312]:
re.sub(r'[0-9]+[\-|/][0-9]+[\-|/][0-9]+', '<DATE>', '102-0-00')

'<DATE>'

In [None]:
re.sub(r'[0-9]+[\+|\-|,|.|/]+', '0', '102-0-0-0')