In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

from keras import layers
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential, Input
import keras

from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy

import matplotlib.pyplot as plt
import seaborn as sns

import json

# Switch every visible GPU to memory-growth mode so TensorFlow allocates
# device memory incrementally rather than reserving it all at start-up.
gpus = tf.config.experimental.list_physical_devices('GPU')
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)

Using TensorFlow backend.


In [2]:
# Model hyper-parameters shared by the cells below.
WORD_NUM = 60          # length of the model's input sequence (Input shape / pad length)
embedding_len = 1559   # first argument of the Embedding layer (vocabulary rows)
embedding_size = 200   # embedding width; also reused as the LSTM unit count
n_tag = 9              # number of tag classes fed to the Dense and CRF layers

# Lookup from predicted tag index to its label name.
targ_idx = dict(enumerate((
    'O',
    'B_LOC', 'I_LOC',
    'B_ORG', 'I_ORG',
    'B_PRO', 'I_PRO',
    'B_T', 'I_T',
)))

In [3]:
# Tagger architecture: token ids -> embedding -> bidirectional LSTM ->
# per-timestep dense projection to n_tag -> CRF layer.
# The layers are created in the same order as before so that Keras assigns
# the same automatic layer names and the weight file still lines up.
inp = layers.Input(shape=(WORD_NUM, ))

embedded = layers.Embedding(
    embedding_len, embedding_size, input_length=WORD_NUM
)(inp)
encoded = layers.Bidirectional(
    layers.LSTM(embedding_size, return_sequences=True)
)(embedded)
projected = layers.TimeDistributed(
    layers.Dense(n_tag, activation="relu")
)(encoded)
crf_out = CRF(n_tag, sparse_target=True)(projected)

model = Model(inputs=inp, outputs=crf_out)
model.compile(loss=crf_loss, optimizer='adam', metrics=[crf_accuracy])

model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 60)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 60, 200)           311800    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 60, 400)           641600    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 60, 9)             3609      
_________________________________________________________________
crf_1 (CRF)                  (None, 60, 9)             189       
Total params: 957,198
Trainable params: 957,198
Non-trainable params: 0
_________________________________________________________________


In [4]:
# Restore saved weights from disk into the network built above.
weights_path = 'model.h5'
model.load_weights(weights_path)

In [5]:
# Read the JSON dictionary used to turn input characters into ids
# (it is indexed by character, and by '<UNK>', in the encoding cell).
with open('dictionary.json', encoding='utf8') as dict_file:
    corpus = json.loads(dict_file.read())

In [6]:
# Sample sentence to tag; adjacent literals are concatenated into one string.
new_string4 = (
    '产业投资有限公司为项目业主，申报设立桂林荔浦保税物流中心（B型），选址位于荔浦市北部。'
    '四至范围：东以321国道为界，西接工业园区大道，南依金牛工业园区，北临长水岭工业园区。'
    '规划用地面积258.88亩。'
)

In [7]:
# Encode the sample one character at a time: a character found in `corpus`
# maps to its stored id, anything else maps to the '<UNK>' id.
# An explicit membership test replaces the previous bare `except:`, which
# also trapped KeyboardInterrupt/SystemExit, and removes the redundant
# throw-away `corpus[i]` lookup that only existed to trigger the exception.
p = [
    corpus[ch] if ch in corpus else corpus['<UNK>']
    for ch in new_string4
]

In [8]:
# Display the encoded id sequence (one integer per character of the sample).
p

[213,
 48,
 376,
 377,
 76,
 283,
 81,
 284,
 102,
 241,
 242,
 48,
 285,
 78,
 273,
 143,
 201,
 123,
 36,
 18,
 1423,
 314,
 21,
 274,
 275,
 276,
 166,
 220,
 52,
 277,
 278,
 55,
 78,
 286,
 287,
 85,
 14,
 1423,
 314,
 113,
 261,
 301,
 60,
 87,
 88,
 98,
 99,
 50,
 256,
 66,
 45,
 39,
 41,
 75,
 334,
 102,
 26,
 78,
 3,
 616,
 125,
 48,
 292,
 8,
 313,
 334,
 78,
 296,
 703,
 361,
 868,
 125,
 48,
 292,
 8,
 78,
 261,
 823,
 326,
 202,
 820,
 125,
 48,
 292,
 8,
 60,
 140,
 146,
 298,
 83,
 23,
 24,
 39,
 44,
 272,
 267,
 272,
 272,
 299,
 60]

In [9]:
# Pad (or cut) the id sequence to exactly WORD_NUM positions and give it a
# leading batch axis.
# np.atleast_2d makes this cell safe to re-run: `p` is rebound here from a
# flat list to a (1, WORD_NUM) array, and wrapping that array in another list
# on a second execution would yield a wrongly-shaped 3-D batch.
# NOTE: truncating='post' discards everything past WORD_NUM, so the
# 100-id sample encoded above is only tagged for its first 60 characters.
p = pad_sequences(np.atleast_2d(p), maxlen=WORD_NUM, padding='post', truncating='post', value=0)

In [10]:
# Run the model on the padded batch, then take the index of the largest
# entry along the last axis to get one tag index per position.
crf_output = model.predict(p)
result = np.argmax(crf_output, axis=-1)

In [11]:
# Display the predicted tag indices (argmax over the model output's last axis).
result

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0]], dtype=int64)

In [12]:
# Translate each predicted tag index to its label name, one label per line.
for tag_id in result.ravel():
    print(targ_idx[tag_id])

O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
B_ORG
I_ORG
I_ORG
I_ORG
I_ORG
I_ORG
I_ORG
I_ORG
I_ORG
I_ORG
O
O
O
O
O
O
O
O
O
B_LOC
I_LOC
I_LOC
I_LOC
I_LOC
O
O
O
O
O
O
O
O
O
O
O
I_LOC
I_LOC
O
O
O
O
O
