In [8]:
#! -*- coding: utf-8 -*-
# 2019年百度的实体链指比赛（ ccks2019，https://biendata.com/competition/ccks_2019_el/ ），一个baseline

import json
from tqdm import tqdm
import os
import numpy as np
from random import choice
from itertools import groupby
import sys  


mode = 0
min_count = 2
char_size = 128


id2kb = {}
with open('../ccks2019_el/kb_data') as f:
    for l in tqdm(f):
        _ = json.loads(l)
        subject_id = _['subject_id']
        subject_alias = list(set([_['subject']] + _.get('alias', [])))
        subject_alias = [alias.lower() for alias in subject_alias]
        subject_desc = '\n'.join(u'%s：%s' % (i['predicate'], i['object']) for i in _['data'])
        subject_desc = subject_desc.lower()
        if subject_desc:
            id2kb[subject_id] = {'subject_alias': subject_alias, 'subject_desc': subject_desc}


kb2id = {}
for i,j in id2kb.items():
    for k in j['subject_alias']:
        if k not in kb2id:
            kb2id[k] = []
        kb2id[k].append(i)


train_data = []
with open('../ccks2019_el/train.json') as f:
    for l in tqdm(f):
        _ = json.loads(l)
        train_data.append({
            'text': _['text'].lower(),
            'mention_data': [(x['mention'].lower(), int(x['offset']), x['kb_id'])
                for x in _['mention_data'] if x['kb_id'] != 'NIL'
            ]
        })


if not os.path.exists('../all_chars_me.json'):
    chars = {}
    for d in tqdm(iter(id2kb.values())):
        for c in d['subject_desc']:
            chars[c] = chars.get(c, 0) + 1  #exist +1 else 1
    for d in tqdm(iter(train_data)):
        for c in d['text']:
            chars[c] = chars.get(c, 0) + 1
    chars = {i:j for i,j in chars.items() if j >= min_count}
    id2char = {i+2:j for i,j in enumerate(chars)} # 0: mask, 1: padding   +2???
    char2id = {j:i for i,j in id2char.items()}
    json.dump([id2char, char2id], open('../all_chars_me.json', 'w', encoding='utf-8'))
else:
    id2char, char2id = json.load(open('../all_chars_me.json'))

if not os.path.exists('../random_order_train.json'):
    random_order = list(range(len(train_data)))
    np.random.shuffle(random_order)
    json.dump(
        random_order,
        open('../random_order_train.json', 'w', encoding='utf-8'),
        indent=4
    )
else:
    random_order = json.load(open('../random_order_train.json'))


dev_data = [train_data[j] for i, j in enumerate(random_order) if i % 9 == mode]
train_data = [train_data[j] for i, j in enumerate(random_order) if i % 9 != mode]


def seq_padding(X, padding=0):
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
    ])


class data_generator:
    def __init__(self, data, batch_size=64):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1
    def __len__(self):
        return self.steps
    def __iter__(self):
        while True:                #?????
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            X1, X2, S1, S2, Y, T = [], [], [], [], [], []
            for i in idxs:
                d = self.data[i]
                text = d['text']
                x1 = [char2id.get(c, 1) for c in text]
                s1, s2 = np.zeros(len(text)), np.zeros(len(text))
                mds = {}
                for md in d['mention_data']:
                    if md[0] in kb2id:
                        j1 = md[1]
                        j2 = j1 + len(md[0])
                        s1[j1] = 1
                        s2[j2 - 1] = 1
                        mds[(j1, j2)] = (md[0], md[2])
                if mds:
                    j1, j2 = choice(list(mds.keys()))
                    y = np.zeros(len(text))
                    y[j1: j2] = 1
                    x2 = choice(kb2id[mds[(j1, j2)][0]])
                    if x2 == mds[(j1, j2)][1]:
                        t = [1]
                    else:
                        t = [0]
                    x2 = id2kb[x2]['subject_desc']
                    x2 = [char2id.get(c, 1) for c in x2]
                    X1.append(x1)
                    X2.append(x2)
                    S1.append(s1)
                    S2.append(s2)
                    Y.append(y)
                    T.append(t)
                    if len(X1) == self.batch_size or i == idxs[-1]:
                        X1 = seq_padding(X1)
                        X2 = seq_padding(X2)
                        S1 = seq_padding(S1)
                        S2 = seq_padding(S2)
                        Y = seq_padding(Y)
                        T = seq_padding(T)
                        yield [X1, X2, S1, S2, Y, T], None
                        X1, X2, S1, S2, Y, T = [], [], [], [], [], []


from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.callbacks import Callback
from keras.optimizers import Adam


def seq_maxpool(x):
    """seq是[None, seq_len, s_size]的格式，
    mask是[None, seq_len, 1]的格式，先除去mask部分，
    然后再做maxpooling。
    """
    seq, mask = x
    seq -= (1 - mask) * 1e10
    return K.max(seq, 1)


x1_in = Input(shape=(None,)) # 待识别句子输入
x2_in = Input(shape=(None,)) # 实体语义表达输入
s1_in = Input(shape=(None,)) # 实体左边界（标签）
s2_in = Input(shape=(None,)) # 实体右边界（标签）
y_in = Input(shape=(None,)) # 实体标记
t_in = Input(shape=(1,)) # 是否有关联（标签）


x1, x2, s1, s2, y, t = x1_in, x2_in, s1_in, s2_in, y_in, t_in
x1_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(x1)
x2_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(x2)

embedding = Embedding(len(id2char)+2, char_size)


x1 = embedding(x1)
x1 = Dropout(0.2)(x1)
x1 = Lambda(lambda x: x[0] * x[1])([x1, x1_mask])
x1 = Bidirectional(CuDNNLSTM(char_size//2, return_sequences=True))(x1)
x1 = Lambda(lambda x: x[0] * x[1])([x1, x1_mask])
x1 = Bidirectional(CuDNNLSTM(char_size//2, return_sequences=True))(x1)
x1 = Lambda(lambda x: x[0] * x[1])([x1, x1_mask])

h = Conv1D(char_size, 3, activation='relu', padding='same')(x1)
ps1 = Dense(1, activation='sigmoid')(h)
ps2 = Dense(1, activation='sigmoid')(h)

s_model = Model(x1_in, [ps1, ps2])


y = Lambda(lambda x: K.expand_dims(x, 2))(y)
x1 = Concatenate()([x1, y])
x1 = Conv1D(char_size, 3, padding='same')(x1)

x2 = embedding(x2)
x2 = Dropout(0.2)(x2)
x2 = Lambda(lambda x: x[0] * x[1])([x2, x2_mask])
x2 = Bidirectional(CuDNNLSTM(char_size//2, return_sequences=True))(x2)
x2 = Lambda(lambda x: x[0] * x[1])([x2, x2_mask])
x2 = Bidirectional(CuDNNLSTM(char_size//2, return_sequences=True))(x2)
x2 = Lambda(lambda x: x[0] * x[1])([x2, x2_mask])

x1 = Lambda(seq_maxpool)([x1, x1_mask])
x2 = Lambda(seq_maxpool)([x2, x2_mask])
x12 = Multiply()([x1, x2])
x = Concatenate()([x1, x2, x12])
x = Dense(char_size, activation='relu')(x)
pt = Dense(1, activation='sigmoid')(x)

t_model = Model([x1_in, x2_in, y_in], pt)


train_model = Model([x1_in, x2_in, s1_in, s2_in, y_in, t_in],
                    [ps1, ps2, pt])

s1 = K.expand_dims(s1, 2)
s2 = K.expand_dims(s2, 2)

s1_loss = K.binary_crossentropy(s1, ps1)
s1_loss = K.sum(s1_loss * x1_mask) / K.sum(x1_mask)
s2_loss = K.binary_crossentropy(s2, ps2)
s2_loss = K.sum(s2_loss * x1_mask) / K.sum(x1_mask)
pt_loss = K.mean(K.binary_crossentropy(t, pt))

loss = s1_loss + s2_loss + pt_loss

train_model.add_loss(loss)
train_model.compile(optimizer=Adam(1e-3))
train_model.summary()


def extract_items(text_in):
    _x1 = [char2id.get(c, 1) for c in text_in]
    _x1 = np.array([_x1])
    _k1, _k2 = s_model.predict(_x1)
    _k1, _k2 = _k1[0, :, 0], _k2[0, :, 0]
    _k1, _k2 = np.where(_k1 > 0.5)[0], np.where(_k2 > 0.5)[0]
    _subjects = []
    for i in _k1:
        j = _k2[_k2 >= i]
        if len(j) > 0:
            j = j[0]
            _subject = text_in[i: j+1]
            _subjects.append((_subject, i, j))
    if _subjects:
        R = []
        _X2, _Y = [], []
        _S, _IDXS = [], {}
        for _s in _subjects:
            _y = np.zeros(len(text_in))
            _y[_s[1]: _s[2]] = 1
            _IDXS[_s] = kb2id.get(_s[0], [])
            for i in _IDXS[_s]:
                _x2 = id2kb[i]['subject_desc']
                _x2 = [char2id.get(c, 1) for c in _x2]
                _X2.append(_x2)
                _Y.append(_y)
                _S.append(_s)
        if _X2:
            _X2 = seq_padding(_X2)
            _Y = seq_padding(_Y)
            _X1 = np.repeat(_x1, len(_X2), 0)
            scores = t_model.predict([_X1, _X2, _Y])[:, 0]
            for k, v in groupby(zip(_S, scores), key=lambda s: s[0]):
                v = np.array([j[1] for j in v])
                kbid = _IDXS[k][np.argmax(v)]
                R.append((k[0], k[1], kbid))
        return R
    else:
        return []


class Evaluate(Callback):
    def __init__(self):
        self.F1 = []
        self.best = 0.
    def on_epoch_end(self, epoch, logs=None):
        f1, precision, recall = self.evaluate()
        self.F1.append(f1)
        if f1 > self.best:
            self.best = f1
            train_model.save_weights('best_model.weights')
        print('f1: %.4f, precision: %.4f, recall: %.4f, best f1: %.4f\n' % (f1, precision, recall, self.best))
    def evaluate(self):
        A, B, C = 1e-10, 1e-10, 1e-10
        for d in tqdm(iter(dev_data)):
            R = set(extract_items(d['text']))
            T = set(d['mention_data'])
            A += len(R & T)
            B += len(R)
            C += len(T)
        return 2 * A / (B + C), A / B, A / C


#evaluator = Evaluate()
train_D = data_generator(train_data)

'''train_model.fit_generator(train_D.__iter__(),
                          steps_per_epoch=len(train_D),
                          epochs=1,
                          callbacks=[evaluator]
                         )
'''
train_model.load_weights('./best_model.weights')
for i in range(10):
    d = dev_data[-i-1]
    print(d)
    print(u'预测结果：', extract_items(d['text']))
    print('\n')



399252it [00:09, 43851.66it/s]
90000it [00:01, 70123.15it/s]
Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 128)    1391104     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, None, 128)    0           embedding_1[0][0]                
_____________________

{'text': '封面人物·杨秀国-新闻爱好者2011年18期', 'mention_data': [('杨秀国', 5, '202107'), ('新闻爱好者', 9, '282818')]}
预测结果： [('人物', 2, '67464'), ('杨秀', 5, '36922'), ('爱好', 11, '338501'), ('01', 15, '264600')]


{'text': '《裸体午餐》完整版-电影-在线观看', 'mention_data': [('裸体午餐', 1, '202906'), ('电影', 10, '148097')]}
预测结果： [('午餐', 3, '402197'), ('观看', 15, '108366')]


{'text': '《追剿》第11集 - 高清正版在线观看', 'mention_data': [('追剿', 1, '34952')]}
预测结果： []


{'text': '小雪花 幼儿儿歌舞蹈教学-音乐-高清视频', 'mention_data': [('小雪花', 0, '75308'), ('幼儿', 4, '298131'), ('儿歌', 6, '140447'), ('舞蹈教学', 8, '16451'), ('音乐', 13, '130934'), ('高清视频', 16, '338774')]}
预测结果： [('小雪', 0, '307388'), ('教学', 10, '18151')]


{'text': 'death》,纪实一般的非洲丧尸片。中文《尸地余生》', 'mention_data': [('纪实', 7, '334261'), ('非洲', 12, '77501'), ('丧尸片', 14, '141394'), ('尸地余生', 21, '54842')]}
预测结果： [('纪实', 7, '201167'), ('一', 9, '264754')]


{'text': '自驾从许昌去舞钢二郎山怎么走?_舞钢吧', 'mention_data': [('自驾', 0, '154039'), ('许昌', 3, '333215'), ('舞钢二郎山', 6, '177284'), ('舞钢吧', 16, '81994')]}
预测结果： []


{'tex

In [27]:
submit_raw

{'text': '王杰一首《原来的我》超级好听!', 'text_id': '1300'}


1291it [00:36, 48.81it/s][A

In [28]:
r

[('在', 14, '57377')]

In [30]:
submit_result = []
with open('../ccks2019_el/develop.json','r') as f:
    for l in tqdm(f):
        submit_raw = json.loads(l)
        r = extract_items(submit_raw["text"])
        result["text_id"] = submit_raw["text_id"]
        result["text"] = submit_raw["text"]
        result["mention_data"] = []
        for o in r:
            a_result = {}
            a_result["kb_id"] = o[2]
            a_result["mention"] = o[0]
            a_result["offset"] = o[1]
            result["mention_data"].append(a_result)
        submit_result.append(result)
    with open("./submit0504.json", 'w+') as j:
        j.write(json.dumps(submit_result, ensure_ascii=False))



0it [00:00, ?it/s][A[A

5it [00:00, 41.94it/s][A[A

12it [00:00, 44.48it/s][A[A

17it [00:00, 45.62it/s][A[A

25it [00:00, 52.26it/s][A[A

41it [00:00, 60.18it/s][A[A

48it [00:00, 55.76it/s][A[A

56it [00:00, 60.14it/s][A[A

63it [00:01, 48.72it/s][A[A

70it [00:01, 50.36it/s][A[A

76it [00:01, 43.83it/s][A[A

83it [00:01, 34.31it/s][A[A

91it [00:01, 34.24it/s][A[A

98it [00:02, 40.24it/s][A[A

105it [00:02, 45.73it/s][A[A

119it [00:02, 56.41it/s][A[A

127it [00:02, 57.39it/s][A[A

136it [00:02, 64.25it/s][A[A

144it [00:02, 66.39it/s][A[A

152it [00:02, 68.21it/s][A[A

161it [00:02, 72.75it/s][A[A

176it [00:02, 77.30it/s][A[A

185it [00:03, 62.56it/s][A[A

195it [00:03, 70.04it/s][A[A

203it [00:03, 65.97it/s][A[A

215it [00:03, 74.22it/s][A[A

227it [00:03, 78.52it/s][A[A

236it [00:03, 66.87it/s][A[A

247it [00:03, 75.51it/s][A[A

261it [00:04, 84.57it/s][A[A

271it [00:04, 64.08it/s][A[A

279it [00:04, 65.83it/s][

2496it [00:40, 59.94it/s][A[A

2505it [00:40, 57.00it/s][A[A

2516it [00:40, 65.22it/s][A[A

2524it [00:40, 58.52it/s][A[A

2533it [00:40, 65.29it/s][A[A

2543it [00:40, 67.65it/s][A[A

2551it [00:41, 67.32it/s][A[A

2562it [00:41, 71.71it/s][A[A

2574it [00:41, 80.37it/s][A[A

2583it [00:41, 81.84it/s][A[A

2592it [00:41, 67.95it/s][A[A

2600it [00:41, 53.95it/s][A[A

2611it [00:41, 63.54it/s][A[A

2622it [00:42, 68.20it/s][A[A

2634it [00:42, 77.85it/s][A[A

2643it [00:42, 63.39it/s][A[A

2651it [00:42, 66.11it/s][A[A

2662it [00:42, 49.89it/s][A[A

2669it [00:43, 35.38it/s][A[A

2679it [00:43, 43.13it/s][A[A

2690it [00:43, 51.95it/s][A[A

2698it [00:43, 47.86it/s][A[A

2705it [00:43, 44.83it/s][A[A

2711it [00:43, 45.48it/s][A[A

2717it [00:44, 42.20it/s][A[A

2733it [00:44, 53.60it/s][A[A

2741it [00:44, 47.57it/s][A[A

2756it [00:44, 59.12it/s][A[A

2765it [00:44, 65.56it/s][A[A

2782it [00:44, 79.19it/s][A[A

2793it [00

5077it [01:18, 65.97it/s][A[A

5088it [01:18, 72.23it/s][A[A

5098it [01:19, 63.74it/s][A[A

5107it [01:19, 69.66it/s][A[A

5115it [01:19, 69.95it/s][A[A

5123it [01:19, 66.63it/s][A[A

5131it [01:19, 64.81it/s][A[A

5142it [01:19, 72.25it/s][A[A

5151it [01:19, 75.60it/s][A[A

5159it [01:19, 61.20it/s][A[A

5169it [01:20, 64.81it/s][A[A

5187it [01:20, 79.24it/s][A[A

5201it [01:20, 90.06it/s][A[A

5212it [01:20, 86.81it/s][A[A

5226it [01:20, 97.77it/s][A[A

5238it [01:20, 83.46it/s][A[A

5248it [01:20, 74.50it/s][A[A

5257it [01:21, 65.49it/s][A[A

5265it [01:21, 52.58it/s][A[A

5272it [01:21, 44.03it/s][A[A

5280it [01:21, 50.12it/s][A[A

5287it [01:21, 52.85it/s][A[A

5299it [01:21, 60.05it/s][A[A

5309it [01:21, 67.91it/s][A[A

5323it [01:22, 79.07it/s][A[A

5338it [01:22, 91.86it/s][A[A

5349it [01:22, 95.37it/s][A[A

5360it [01:22, 74.52it/s][A[A

5370it [01:22, 60.19it/s][A[A

5383it [01:22, 71.30it/s][A[A

5398it [01

7647it [01:58, 90.55it/s][A[A

7661it [01:58, 98.12it/s][A[A

7673it [01:58, 91.12it/s][A[A

7684it [01:58, 72.73it/s][A[A

7693it [01:58, 44.52it/s][A[A

7700it [01:59, 38.23it/s][A[A

7706it [01:59, 41.26it/s][A[A

7714it [01:59, 47.67it/s][A[A

7723it [01:59, 53.78it/s][A[A

7732it [01:59, 60.07it/s][A[A

7740it [01:59, 62.57it/s][A[A

7748it [01:59, 65.25it/s][A[A

7756it [02:00, 56.47it/s][A[A

7763it [02:00, 55.62it/s][A[A

7772it [02:00, 62.65it/s][A[A

7779it [02:00, 54.02it/s][A[A

7787it [02:00, 59.03it/s][A[A

7795it [02:00, 63.96it/s][A[A

7804it [02:00, 69.77it/s][A[A

7814it [02:00, 73.71it/s][A[A

7822it [02:01, 70.30it/s][A[A

7830it [02:01, 68.52it/s][A[A

7838it [02:01, 25.82it/s][A[A

7847it [02:02, 32.82it/s][A[A

7858it [02:02, 41.07it/s][A[A

7871it [02:02, 49.39it/s][A[A

7883it [02:02, 59.33it/s][A[A

7899it [02:02, 68.40it/s][A[A

7909it [02:02, 63.49it/s][A[A

7919it [02:02, 70.72it/s][A[A

7928it [02

TypeError: 3 is not JSON serializable