In [6]:

import json
import numpy as np
import sklearn.model_selection
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# 读取label2id字典
with open("../vocab/sent_vocab.json", "r", encoding="utf-8") as f:
    entry = json.load(f)
word2idx = entry['word2id']

with open("../vocab/tag_vocab.json", "r", encoding="utf-8") as f:
    entry = json.load(f)
tag2idx = entry['tag2id']

MAX_SEQ_LEN = 128


# 将原始样本数据做初步处理
def read_data(file_path):
    # 读取数据集
    with open(file_path, "r", encoding="utf-8") as f:
        content = [line.strip() for line in f.readlines()]

    #根据空行判断每个句子的分割点，找出分割点存在index列表里
    index = [-1]
    #空行代表是一个句子的分割，一般行都是"相 O"形式，里面有一个空格；而空行只是一个""，没有空格
    index.extend([i for i, _ in enumerate(content) if ' ' not in _])
    index.append(len(content))
  
    # 根据上述找好的每句话分割点的index位置，一句一句找出word和对应的tag
    sentences, tags = [], []
    for j in range(len(index)-1):
        word, tag = [], []
        #一个sentence就代表一句话
        sentence = content[index[j]+1: index[j+1]]
        for line in sentence:
            word.append(line.split()[0])
            tag.append(line.split()[-1])
        #句子原文用字符串形式连在一起
        sentences.append(''.join(word))
        #一句话的所有tag组成一个list，最后再统一放到大的tags列表里
        tags.append(tag)

    # 去除空的句子及标注序列，一般放在末尾
    sentences = [s for s in sentences if s]
    tags = [t for t in tags if t]

    return sentences, tags

sentences, tags = read_data('./example.test')


# count = word2idx[list(word2idx)[-1]]
# print(count)

# with open('entity.txt', 'w', encoding='utf-8') as file:
#     for sentence in sentences:
#         for word in sentence:
#             if word not in word2idx.keys():
#                 count = count+1
#                 file.write(f"\"{word}\": {count},\n")







X = [[word2idx[w] for w in sentence] for sentence in sentences]
y = [[tag2idx[t] for t in tag] for tag in tags]

# 最大长度为特征维度
MAX_SEQ_LEN = max([len(x) for x in X])

X = pad_sequences(maxlen=MAX_SEQ_LEN, sequences=X, padding="post", value=0)
y = pad_sequences(maxlen=MAX_SEQ_LEN, sequences=y, padding="post", value=tag2idx["O"])

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.1)
# print(X)
# print(y)
# print(len(X), len(X[0]))
# print(len(y), len(y[0]))
# print(X_train, X_test, y_train, y_test)
print(len(X_train), len(X_test), len(y_train), len(y_test))
X_train = X_train[:100]
X_test = X_test[:10]
y_test = y_test[:10]

4172 464 4172 464


In [8]:
import tensorflow as tf
import time
from keras import Input, Model
from keras_contrib.layers import CRF
from keras.layers import Dense, Activation, Dropout, LSTM, GRU, Embedding, Bidirectional, TimeDistributed, Reshape
from keras.models import Sequential, load_model

def build_model(max_features, max_sentence_len, num_tag, embedding_size, hidden_size):
    print(max_features, max_sentence_len, num_tag, embedding_size, hidden_size)
    model = Sequential()
    model.add(Embedding(max_features, embedding_size, input_length=max_sentence_len))
    model.add(Bidirectional(LSTM(hidden_size, return_sequences=True, recurrent_dropout=0.1)))
    model.add(TimeDistributed(Dense(hidden_size, activation="relu")))
    # crf_layer = CRF(num_tag, sparse_target=True)
    # model.add(crf_layer)
    # model.compile(loss=crf_layer.loss_function, optimizer='adam', metrics=[crf_layer.accuracy])
    model.add(Reshape((-1,)))
    model.add(Dense(num_tag, activation="softmax"))  # 输出层大小等于标签的数量
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model
# Build model


def train():
    embedding_size = 24
    hidden_size = 24
    model = build_model(len(word2idx), MAX_SEQ_LEN, len(tag2idx), embedding_size, hidden_size)
    # model.summary()
    BATCH_SIZE=10 # 训练很慢，所以提高批次大小
    EPOCHS=10 # 训练代数
    history = model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.05)
    model.save(f"model.keras")

train()

4554 577 13 24 24
Epoch 1/10


ValueError: Argument `output` must have rank (ndim) `target.ndim - 1`. Received: target.shape=(None, 577), output.shape=(None, 13)

In [7]:
# model = load_model('model/model20240514-153307.keras')
model = load_model('model.keras')
results = model.evaluate(X_test, y_test)
print("test loss, test acc:", results)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0000e+00 - loss: 71861.1094
test loss, test acc: [71861.109375, 0.0]
