## Data

In [1]:
!pip install keras-crf
!pip install tensorflow==1.15.0 keras==2.2.4
!pip install sklearn_crfsuite
!pip install git+https://www.github.com/keras-team/keras-contrib.git

Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-79sdwgcl
  Running command git clone -q https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-79sdwgcl
Building wheels for collected packages: keras-contrib
  Building wheel for keras-contrib (setup.py) ... [?25l[?25hdone
  Created wheel for keras-contrib: filename=keras_contrib-2.0.8-cp37-none-any.whl size=101065 sha256=28ff89d55f8d9ce70a233c2595c8c9a405746f1bd2423dc22af75231a8a544f1
  Stored in directory: /tmp/pip-ephem-wheel-cache-x0gh4fle/wheels/11/27/c8/4ed56de7b55f4f61244e2dc6ef3cdbaff2692527a2ce6502ba
Successfully built keras-contrib


In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import *
from keras.optimizers import Adam
from sklearn_crfsuite import metrics
from keras_contrib.layers import CRF
from keras.utils import to_categorical
from keras_contrib.losses import crf_loss
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras_contrib.metrics import crf_viterbi_accuracy, crf_marginal_accuracy
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Input

Using TensorFlow backend.


In [3]:
!gdown --id 1cCXjHX9FAgouF0DWuWPO5qyBHVZDkXTQ
!gdown --id 1RFNBRcly96omdpNtYlEK6O1EuQs3ux7t
!gdown --id 1AnLqXtSyJNBK7YmwuW6L6z2OYhE4RPTV

Downloading...
From: https://drive.google.com/uc?id=1cCXjHX9FAgouF0DWuWPO5qyBHVZDkXTQ
To: /content/NER_PAD_agg_test.csv
2.92MB [00:00, 94.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1RFNBRcly96omdpNtYlEK6O1EuQs3ux7t
To: /content/NER_PAD_agg_train.csv
11.7MB [00:00, 44.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1AnLqXtSyJNBK7YmwuW6L6z2OYhE4RPTV
To: /content/NER_PAD_agg.csv
14.6MB [00:00, 55.2MB/s]


In [4]:
df = pd.read_csv('NER_PAD_agg.csv', converters={'tokens': eval, 'ner_tags': eval})
train = pd.read_csv('NER_PAD_agg_train.csv', converters={'tokens': eval, 'ner_tags': eval})
test = pd.read_csv('NER_PAD_agg_test.csv', converters={'tokens': eval, 'ner_tags': eval})

all_dfs = [df, train, test]

In [5]:
all_tags = set()
word_to_ix = {}
for _, row in df.iterrows():
    for tag in row.ner_tags:
        all_tags.add(tag)
    for word in row.tokens:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
all_tags = sorted(list(all_tags))
tag_to_idx = {t: i for i, t in enumerate(all_tags)}
label_list = {i: t for i, t in enumerate(all_tags)}
labels = list(tag_to_idx.keys())
labels.remove('O')
labels = sorted(labels, key=lambda name: (name[1:], name[0]))

In [6]:
def find_token(x):
    res = []
    for w in x['tokens']:
        if w in word_to_ix.keys():
            res.append(word_to_ix[w])
        else:
            res.append(0)
    return res

for d in all_dfs:
    d['encoded_ner_tags'] = d.apply(lambda x: [tag_to_idx[t] for t in x['ner_tags']], axis=1)
    d['int_tokens'] = d.apply(find_token, axis=1)

In [7]:
df

Unnamed: 0,id,tokens,ner_tags,encoded_ner_tags,int_tokens
0,SV990125172_ОРГ__INM_18'09'58_1.pdf,"[l, *, универсальный, приложение, №, 1, постан...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-D...","[15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 4, 12, ..."
1,SS528581_ОРГ__INM_17'05'39_A.pdf,"[l, лист, согласования, стр., 1, 1, 2, 9, *, 0...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1...","[0, 242, 243, 244, 5, 5, 94, 102, 1, 180, 97, ..."
2,SF435712_ОРГ__INM_15'25'41_A.pdf,"[l, приложение, №, постановлению, правительств...","[O, O, O, O, O, O, O, O, O, B-DOCNUM, B-DOCDAT...","[15, 15, 15, 15, 15, 15, 15, 15, 15, 7, 6, 13,...","[0, 3, 4, 6, 7, 294, 295, 11, 4, 296, 13, 297,..."
3,SV990124943_ОРГ__INM_19'00'43_A.pdf,"[l, акционерное, общество, регистраторское, об...","[O, O, O, O, O, B-DOCCPTY, O, O, O, O, O, O, O...","[15, 15, 15, 15, 15, 3, 15, 15, 15, 15, 15, 15...","[0, 364, 365, 366, 365, 367, 368, 53, 35, 369,..."
4,S29013419_ОРИГИНАЛ__IND_1_22.05.12'21'36.pdf,"[l, *, 000стандарт, безопасности, адрес:, 1973...","[O, O, O, B-DOCCPTY, O, O, O, O, O, O, O, O, O...","[15, 15, 15, 3, 15, 15, 15, 15, 15, 15, 15, 15...","[0, 1, 462, 463, 464, 465, 466, 467, 468, 469,..."
...,...,...,...,...,...
3995,SV990121575_ОРГ__INM_18'45'08_3.pdf,"[l, экземпляр, аостс, стс, отчет, роялти, 2, к...","[O, O, O, O, O, O, O, O, O, O, O, B-DOCAGRNUM,...","[15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1...","[0, 2091, 32885, 975, 3682, 19158, 94, 398, 26..."
3996,SV990121572_ОРГ__INM_18'45'08_3.pdf,"[l, ао, стс, отчет, роялти, 3, квартал, лиценз...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1...","[0, 476, 975, 3682, 19158, 96, 398, 26324, 481..."
3997,SV990121520_ОРГ__INM_16'31'06_3.pdf,"[l, экземпляр, ао, стс, ао, стс, отчет, роялти...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-D...","[15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1...","[0, 2091, 476, 975, 476, 975, 3682, 19158, 96,..."
3998,SV990121573_ОРГ__INM_18'45'08_3.pdf,"[l, экземпляр, аостс, стс, отчет, п, роялти, 4...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1...","[0, 2091, 32885, 975, 3682, 163, 19158, 97, 66..."


In [8]:
def focal_loss(gamma=2., alpha=1.):

    gamma = float(gamma)
    alpha = float(alpha)

    def focal_loss_fixed(y_true, y_pred):
        """Focal loss for multi-classification
        FL(p_t)=-alpha(1-p_t)^{gamma}ln(p_t)
        Notice: y_pred is probability after softmax
        gradient is d(Fl)/d(p_t) not d(Fl)/d(x) as described in paper
        d(Fl)/d(p_t) * [p_t(1-p_t)] = d(Fl)/d(x)
        Focal Loss for Dense Object Detection
        https://arxiv.org/abs/1708.02002

        Arguments:
            y_true {tensor} -- ground truth labels, shape of [batch_size, num_cls]
            y_pred {tensor} -- model's output, shape of [batch_size, num_cls]

        Keyword Arguments:
            gamma {float} -- (default: {2.0})
            alpha {float} -- (default: {4.0})

        Returns:
            [tensor] -- loss.
        """
        epsilon = 1.e-9
        y_true = tf.convert_to_tensor(y_true, tf.float32)
        y_pred = tf.convert_to_tensor(y_pred, tf.float32)

        model_out = tf.add(y_pred, epsilon)
        ce = tf.multiply(y_true, -tf.log(model_out))
        weight = tf.multiply(y_true, tf.pow(tf.subtract(1., model_out), gamma))
        fl = tf.multiply(alpha, tf.multiply(weight, ce))
        reduced_fl = tf.reduce_max(fl, axis=1)
        return tf.reduce_mean(reduced_fl)
    return focal_loss_fixed

##LSTM

In [32]:
MAX_WORDS = len(word_to_ix)
EMBEDDING_LENGTH = 1024
MAX_SEQUENCE_LENGTH = np.max(df['int_tokens'].apply(len))
HIDDEN_SIZE = 100

model = Sequential()
model.add(Embedding(MAX_WORDS, EMBEDDING_LENGTH, input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(HIDDEN_SIZE, return_sequences=True))
# model.add(Dense(50))
model.add(TimeDistributed(Dense(len(label_list), activation="softmax")))


model.compile(Adam(lr=0.005), loss=focal_loss(), metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 943, 1024)         68114432  
_________________________________________________________________
lstm_12 (LSTM)               (None, 943, 100)          450000    
_________________________________________________________________
time_distributed_13 (TimeDis (None, 943, 16)           1616      
Total params: 68,566,048
Trainable params: 68,566,048
Non-trainable params: 0
_________________________________________________________________


In [33]:
callbacks = [EarlyStopping(monitor='val_acc', patience=3),
         ModelCheckpoint(filepath='best_model.h5', monitor='val_acc', save_best_only=True)]

x_train = pad_sequences(train['int_tokens'], padding='post', maxlen=MAX_SEQUENCE_LENGTH)
y_train = pad_sequences(train['encoded_ner_tags'], padding='post', maxlen=MAX_SEQUENCE_LENGTH, value=tag_to_idx['O'])
y_train = to_categorical(y_train, num_classes=len(label_list))
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=30,
    batch_size=200,
    callbacks=callbacks,
    validation_split=0.1,
)

model.load_weights('best_model.h5')

Train on 2880 samples, validate on 320 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30


In [34]:
preds = model.predict(pad_sequences(test['int_tokens'], padding='post', maxlen=MAX_SEQUENCE_LENGTH))
tag_preds = [np.argmax(pred, axis=1) for pred in preds]
truncated_preds = []
for i in range(len(tag_preds)):
    truncated_preds.append(tag_preds[i][:len(test['encoded_ner_tags'].iloc[i])])
named_preds = []
for i in truncated_preds:
    named_preds.append([])
    for j in i:
        named_preds[-1].append(label_list[j])
print(metrics.flat_classification_report(test['ner_tags'], named_preds, digits=4, labels=labels))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

 B-DOCAGRDATE     0.8041    0.5200    0.6316       150
 I-DOCAGRDATE     0.7083    0.5795    0.6375        88
  B-DOCAGRNUM     0.7700    0.4302    0.5520       179
  I-DOCAGRNUM     1.0000    0.5714    0.7273         7
  B-DOCAMOUNT     0.9103    0.4570    0.6085      1243
  I-DOCAMOUNT     0.8013    0.3903    0.5249       620
    B-DOCCPTY     0.9331    0.7731    0.8456       595
    I-DOCCPTY     0.9351    0.7926    0.8580       564
 B-DOCCPTYINN     0.9845    0.6738    0.8000       282
B-DOCCUSTOMER     0.9329    0.9360    0.9344       609
I-DOCCUSTOMER     0.9403    0.9199    0.9300       462
    B-DOCDATE     0.9027    0.7767    0.8350       645
    I-DOCDATE     0.9185    0.8942    0.9062       605
     B-DOCNUM     0.7925    0.5223    0.6296       651
     I-DOCNUM     0.0000    0.0000    0.0000         0

    micro avg     0.8990    0.6707    0.7683      6700
    macro avg     0.8222    0.6158    0.6947      6700
 weighte

##LSTM CRF

In [12]:
MAX_WORDS = len(word_to_ix)
EMBEDDING_LENGTH = 1024
MAX_SEQUENCE_LENGTH = np.max(df['int_tokens'].apply(len))
HIDDEN_SIZE = 100

model = Sequential()
model.add(Embedding(MAX_WORDS, EMBEDDING_LENGTH, input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(HIDDEN_SIZE, return_sequences=True))
# model.add(Dense(50))
model.add(TimeDistributed(Dense(len(label_list), activation="relu")))
crf = CRF(len(label_list), learn_mode='marginal')
model.add(crf)

model.compile(Adam(lr=0.005), loss=focal_loss(), metrics=[crf_marginal_accuracy])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 943, 1024)         68114432  
_________________________________________________________________
lstm_2 (LSTM)                (None, 943, 100)          450000    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 943, 16)           1616      
_________________________________________________________________
crf_2 (CRF)                  (None, 943, 16)           560       
Total params: 68,566,608
Trainable params: 68,566,608
Non-trainable params: 0
_________________________________________________________________


In [13]:
callbacks = [EarlyStopping(monitor='val_crf_marginal_accuracy', patience=3),
         ModelCheckpoint(filepath='best_model.h5', monitor='val_crf_marginal_accuracy', save_best_only=True)]

x_train = pad_sequences(train['int_tokens'], padding='post', maxlen=MAX_SEQUENCE_LENGTH)
y_train = pad_sequences(train['encoded_ner_tags'], padding='post', maxlen=MAX_SEQUENCE_LENGTH, value=tag_to_idx['O'])
y_train = to_categorical(y_train, num_classes=len(label_list))
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=30,
    batch_size=200,
    callbacks=callbacks,
    validation_split=0.1,
)

model.load_weights('best_model.h5')

Train on 2880 samples, validate on 320 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30


In [14]:
preds = model.predict(pad_sequences(test['int_tokens'], padding='post', maxlen=MAX_SEQUENCE_LENGTH))
tag_preds = [np.argmax(pred, axis=1) for pred in preds]
truncated_preds = []
for i in range(len(tag_preds)):
    truncated_preds.append(tag_preds[i][:len(test['encoded_ner_tags'].iloc[i])])
named_preds = []
for i in truncated_preds:
    named_preds.append([])
    for j in i:
        named_preds[-1].append(label_list[j])
print(metrics.flat_classification_report(test['ner_tags'], named_preds, digits=4, labels=labels))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

 B-DOCAGRDATE     0.7700    0.5133    0.6160       150
 I-DOCAGRDATE     0.7656    0.5568    0.6447        88
  B-DOCAGRNUM     0.8706    0.4134    0.5606       179
  I-DOCAGRNUM     0.0000    0.0000    0.0000         7
  B-DOCAMOUNT     0.8308    0.4425    0.5774      1243
  I-DOCAMOUNT     0.7986    0.3774    0.5126       620
    B-DOCCPTY     0.9378    0.7597    0.8394       595
    I-DOCCPTY     0.9112    0.8191    0.8627       564
 B-DOCCPTYINN     0.9949    0.6915    0.8159       282
B-DOCCUSTOMER     0.9356    0.9064    0.9208       609
I-DOCCUSTOMER     0.9446    0.9221    0.9332       462
    B-DOCDATE     0.8998    0.7659    0.8275       645
    I-DOCDATE     0.9100    0.8860    0.8978       605
     B-DOCNUM     0.8925    0.5484    0.6794       651
     I-DOCNUM     0.0000    0.0000    0.0000         0

    micro avg     0.8973    0.6654    0.7641      6700
    macro avg     0.7641    0.5735    0.6459      6700
 weighte

##BiLSTM

In [23]:
MAX_WORDS = len(word_to_ix)
EMBEDDING_LENGTH = 1024
MAX_SEQUENCE_LENGTH = np.max(df['int_tokens'].apply(len))
HIDDEN_SIZE = 100

model = Sequential()
model.add(Embedding(MAX_WORDS, EMBEDDING_LENGTH, input_length=MAX_SEQUENCE_LENGTH))
model.add(Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True)))
# model.add(Dense(50))
model.add(TimeDistributed(Dense(len(label_list), activation="softmax")))


model.compile(Adam(lr=0.005), loss=focal_loss(), metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 943, 1024)         68114432  
_________________________________________________________________
bidirectional_7 (Bidirection (None, 943, 200)          900000    
_________________________________________________________________
time_distributed_9 (TimeDist (None, 943, 16)           3216      
Total params: 69,017,648
Trainable params: 69,017,648
Non-trainable params: 0
_________________________________________________________________


In [24]:
callbacks = [EarlyStopping(monitor='val_acc', patience=3),
         ModelCheckpoint(filepath='best_model.h5', monitor='val_acc', save_best_only=True)]

x_train = pad_sequences(train['int_tokens'], padding='post', maxlen=MAX_SEQUENCE_LENGTH)
y_train = pad_sequences(train['encoded_ner_tags'], padding='post', maxlen=MAX_SEQUENCE_LENGTH, value=tag_to_idx['O'])
y_train = to_categorical(y_train, num_classes=len(label_list))
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=30,
    batch_size=200,
    callbacks=callbacks,
    validation_split=0.1,
)

model.load_weights('best_model.h5')

Train on 2880 samples, validate on 320 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30


In [25]:
preds = model.predict(pad_sequences(test['int_tokens'], padding='post', maxlen=MAX_SEQUENCE_LENGTH))
tag_preds = [np.argmax(pred, axis=1) for pred in preds]
truncated_preds = []
for i in range(len(tag_preds)):
    truncated_preds.append(tag_preds[i][:len(test['encoded_ner_tags'].iloc[i])])
named_preds = []
for i in truncated_preds:
    named_preds.append([])
    for j in i:
        named_preds[-1].append(label_list[j])
print(metrics.flat_classification_report(test['ner_tags'], named_preds, digits=4, labels=labels))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

 B-DOCAGRDATE     0.8317    0.5600    0.6693       150
 I-DOCAGRDATE     0.7241    0.7159    0.7200        88
  B-DOCAGRNUM     0.8929    0.4190    0.5703       179
  I-DOCAGRNUM     1.0000    0.8571    0.9231         7
  B-DOCAMOUNT     0.8679    0.5921    0.7040      1243
  I-DOCAMOUNT     0.7757    0.4742    0.5886       620
    B-DOCCPTY     0.9365    0.7933    0.8590       595
    I-DOCCPTY     0.9432    0.8245    0.8798       564
 B-DOCCPTYINN     0.9896    0.6773    0.8042       282
B-DOCCUSTOMER     0.9609    0.9688    0.9648       609
I-DOCCUSTOMER     0.9540    0.9437    0.9489       462
    B-DOCDATE     0.9287    0.8078    0.8640       645
    I-DOCDATE     0.9191    0.9207    0.9199       605
     B-DOCNUM     0.8938    0.5561    0.6856       651
     I-DOCNUM     0.0000    0.0000    0.0000         0

    micro avg     0.9090    0.7242    0.8061      6700
    macro avg     0.8412    0.6740    0.7401      6700
 weighte

##BiLSTM CRF

In [26]:
MAX_WORDS = len(word_to_ix)
EMBEDDING_LENGTH = 1024
MAX_SEQUENCE_LENGTH = np.max(df['int_tokens'].apply(len))
HIDDEN_SIZE = 100

model = Sequential()
model.add(Embedding(MAX_WORDS, EMBEDDING_LENGTH, input_length=MAX_SEQUENCE_LENGTH))
model.add(Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True)))
# model.add(Dense(50))
model.add(TimeDistributed(Dense(len(label_list), activation="relu")))
crf = CRF(len(label_list), learn_mode='marginal')
model.add(crf)

model.compile(Adam(lr=0.005), loss=focal_loss(), metrics=[crf_marginal_accuracy])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 943, 1024)         68114432  
_________________________________________________________________
bidirectional_8 (Bidirection (None, 943, 200)          900000    
_________________________________________________________________
time_distributed_10 (TimeDis (None, 943, 16)           3216      
_________________________________________________________________
crf_5 (CRF)                  (None, 943, 16)           560       
Total params: 69,018,208
Trainable params: 69,018,208
Non-trainable params: 0
_________________________________________________________________


In [27]:
callbacks = [EarlyStopping(monitor='val_crf_marginal_accuracy', patience=3),
         ModelCheckpoint(filepath='best_model.h5', monitor='val_crf_marginal_accuracy', save_best_only=True)]

x_train = pad_sequences(train['int_tokens'], padding='post', maxlen=MAX_SEQUENCE_LENGTH)
y_train = pad_sequences(train['encoded_ner_tags'], padding='post', maxlen=MAX_SEQUENCE_LENGTH, value=tag_to_idx['O'])
y_train = to_categorical(y_train, num_classes=len(label_list))
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=30,
    batch_size=200,
    callbacks=callbacks,
    validation_split=0.1,
)

model.load_weights('best_model.h5')

Train on 2880 samples, validate on 320 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30


In [28]:
preds = model.predict(pad_sequences(test['int_tokens'], padding='post', maxlen=MAX_SEQUENCE_LENGTH))
tag_preds = [np.argmax(pred, axis=1) for pred in preds]
truncated_preds = []
for i in range(len(tag_preds)):
    truncated_preds.append(tag_preds[i][:len(test['encoded_ner_tags'].iloc[i])])
named_preds = []
for i in truncated_preds:
    named_preds.append([])
    for j in i:
        named_preds[-1].append(label_list[j])
print(metrics.flat_classification_report(test['ner_tags'], named_preds, digits=4, labels=labels))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

 B-DOCAGRDATE     0.8532    0.6200    0.7181       150
 I-DOCAGRDATE     0.7442    0.7273    0.7356        88
  B-DOCAGRNUM     0.9211    0.3911    0.5490       179
  I-DOCAGRNUM     1.0000    0.8571    0.9231         7
  B-DOCAMOUNT     0.8497    0.6002    0.7034      1243
  I-DOCAMOUNT     0.8320    0.5032    0.6271       620
    B-DOCCPTY     0.9316    0.8017    0.8618       595
    I-DOCCPTY     0.9222    0.8404    0.8794       564
 B-DOCCPTYINN     1.0000    0.7021    0.8250       282
B-DOCCUSTOMER     0.9639    0.9655    0.9647       609
I-DOCCUSTOMER     0.9586    0.9524    0.9555       462
    B-DOCDATE     0.9156    0.8078    0.8583       645
    I-DOCDATE     0.9262    0.9124    0.9192       605
     B-DOCNUM     0.8959    0.5684    0.6955       651
     I-DOCNUM     0.0000    0.0000    0.0000         0

    micro avg     0.9093    0.7330    0.8117      6700
    macro avg     0.8476    0.6833    0.7477      6700
 weighte

Binary Models

In [72]:
labels[8:9]

['B-DOCCPTYINN']

In [12]:
models = []
MAX_WORDS = len(word_to_ix)
EMBEDDING_LENGTH = 512
MAX_SEQUENCE_LENGTH = np.max(df['int_tokens'].apply(len))
HIDDEN_SIZE = 100

for label in labels[8:9]:
    model = Sequential()
    model.add(Embedding(MAX_WORDS, EMBEDDING_LENGTH, input_length=MAX_SEQUENCE_LENGTH))
    model.add(Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True)))
    # model.add(Dense(50))
    model.add(TimeDistributed(Dense(2, activation="relu")))
    crf = CRF(2, learn_mode='marginal')
    model.add(crf)
    # model.add(Dense(1, activation='sigmoid'))
    # model.add(Dense(1, activation='tanh'))

    model.compile(Adam(lr=0.2), loss=focal_loss(), metrics=[crf_marginal_accuracy])
    
    callbacks = [EarlyStopping(monitor='val_crf_marginal_accuracy', patience=20),
         ModelCheckpoint(filepath='best_model.h5', monitor='val_crf_marginal_accuracy', save_best_only=True)]

    id = tag_to_idx[label]
    mask = train['ner_tags'].apply(lambda x: True if label in x else False)
    x_train = pad_sequences(train['int_tokens'][mask], padding='post', maxlen=MAX_SEQUENCE_LENGTH)
    y_tmp = train['encoded_ner_tags'][mask].apply(lambda x: [1 if t == id else 0 for t in x])
    y_train = pad_sequences(y_tmp, padding='post', maxlen=MAX_SEQUENCE_LENGTH, value=0)
    y_train = to_categorical(y_train, num_classes=2)
    class_weight={0: 1, 1: 50}
    history = model.fit(
        x=x_train,
        y=y_train,
        epochs=20,
        batch_size=200,
        callbacks=callbacks,
        validation_split=0.1,
    )

    model.load_weights('best_model.h5')
    # model.save_weights(label + '.h5')
    models.append(model)

Train on 862 samples, validate on 96 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
200/862 [=====>........................] - ETA: 12s - loss: 0.1741 - crf_marginal_accuracy: 0.9988

KeyboardInterrupt: ignored

In [24]:
labels[:1]

['B-DOCAGRDATE']

In [None]:
# for i in range(len(models)):
#     model = models[i]
#     label = labels[i]
#     cur_list = {0: 'O', 1: label}
#     preds = model.predict(pad_sequences(train['int_tokens'], padding='post', maxlen=MAX_SEQUENCE_LENGTH))
#     tag_preds = [np.argmax(pred, axis=1) for pred in preds]
#     truncated_preds = []
#     for i in range(len(tag_preds)):
#         truncated_preds.append(tag_preds[i][:len(train['encoded_ner_tags'].iloc[i])])
#     named_preds = []
#     for i in truncated_preds:
#         named_preds.append([])
#         for j in i:
#             named_preds[-1].append(cur_list[j])
#     test_tmp = train['ner_tags'].apply(lambda x: [t if t == label else 'O' for t in x])
#     print(metrics.flat_classification_report(test_tmp, named_preds, digits=4, labels=[label, 'O']))

In [None]:
for label in labels[9:10]:
    model = Sequential()
    model.add(Embedding(MAX_WORDS, EMBEDDING_LENGTH, input_length=MAX_SEQUENCE_LENGTH))
    model.add(Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True)))
    # model.add(Dense(50))
    model.add(TimeDistributed(Dense(2, activation="relu")))
    crf = CRF(2, learn_mode='marginal')
    model.add(crf)
    # model.add(Dense(1, activation='sigmoid'))

    model.compile(Adam(lr=0.005), loss='binary_crossentropy', metrics=[crf_marginal_accuracy])
    
    callbacks = [EarlyStopping(monitor='val_crf_marginal_accuracy', patience=2),
         ModelCheckpoint(filepath='best_model.h5', monitor='val_crf_marginal_accuracy', save_best_only=True)]

    id = tag_to_idx[label]

    x_train = pad_sequences(train['int_tokens'], padding='post', maxlen=MAX_SEQUENCE_LENGTH)
    y_tmp = train['encoded_ner_tags'].apply(lambda x: [1 if t == id else 0 for t in x])
    y_train = pad_sequences(y_tmp, padding='post', maxlen=MAX_SEQUENCE_LENGTH, value=0)
    y_train = to_categorical(y_train, num_classes=2)
    history = model.fit(
        x=x_train,
        y=y_train,
        epochs=15,
        batch_size=200,
        callbacks=callbacks,
        validation_split=0.1,
    )

    model.load_weights('best_model.h5')
    model.save_weights(label + '.h5')
    # models.append(model)

Train on 2880 samples, validate on 320 samples
Epoch 1/15
 200/2880 [=>............................] - ETA: 3:21 - loss: 1.8294 - crf_marginal_accuracy: 6.4687e-04

ResourceExhaustedError: ignored