In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.utils import class_weight

In [3]:
from keras.preprocessing import text, sequence
from keras.callbacks import ModelCheckpoint
from keras import layers, models, optimizers
from keras.layers import LSTM
from keras.layers.merge import concatenate
from keras.utils import to_categorical
from keras.utils import np_utils

Using TensorFlow backend.


# 准备文本

In [4]:
filename = "/home/hxjiang/Pythonworkspace/patent/sample3_G-06-F-17/textual/after_process.xlsx"
data = pd.read_excel(filename, encoding='utf-8')

In [5]:
data.head(1)

Unnamed: 0,abstract,application_id,claims,claims_add1,claims_add2,location,title,abstract_sen_count,claims_sen_count,abstract_final,abstract_count,claims_final,claims_count
0,"['A', 'method', 'of', 'providing', 'a', 'secur...",11041610,"['1', '.', 'A', 'method', 'of', 'providing', '...",,,2010/ipa100107/US20100001069A1.xml,METHOD OF PRINTING SECURITY DOCUMENTS,4,19,"['A', 'method', 'provide', 'security', 'docume...",46,"['A', 'method', 'provide', 'security', 'docume...",622


In [7]:
result_data = pd.read_excel(r"/home/hxjiang/Pythonworkspace/patent/sample3_G-06-F-17/textual/2010_result.xlsx", encoding='utf-8')

In [8]:
result_data.head(1)

Unnamed: 0,application_id,inventor_num,inventor_patent_num,assignee_num,assignee_patent_num,claims_num,famliy,cpc_class,pa_country,pa_state,pa_city,result,filed_time,published_time,location
37,11041610,2.0,2684.0,1.0,48.0,19,0,G-06-F-17,AU,,BALMAIN,1,2005-01-25,2010-01-07,2010/ipa100107/US20100001069A1.xml


# 划分训练集和测试集

## abstract

In [6]:
x_abs_train = data['abstract_final'].iloc[:6215]
x_abs_valid = data['abstract_final'].iloc[6215:6992]
x_abs_test = data['abstract_final'].iloc[6992:]

## claims

In [8]:
x_claims_train = data['claims_final'].iloc[:6215]
x_claims_valid = data['claims_final'].iloc[6215:6992]
x_claims_test = data['claims_final'].iloc[6992:]

## 不平衡数据权重调整

In [9]:
train_target = np_utils.to_categorical(result_data[['result']], 2)
y_ints = [y.argmax() for y in train_target]
cw = class_weight.compute_class_weight('balanced', np.unique(y_ints), y_ints)
cw

array([1.49346405, 0.75164474])

In [10]:
train_target.shape

(7769, 2)

In [11]:
y_binary_train = result_data['result'][:6215]
y_binary_valid = result_data['result'][6215:6992]
y_binary_test = result_data['result'][6992:]

y_category_train = train_target[:6215]
y_category_valid = train_target[6215:6992]
y_category_test = train_target[6992:]

In [12]:
sum(y_category_train[:, 1])/sum(y_category_train[:, 0])

1.9765325670498084

# 以词向量为特征
词向量是一种利用稠密向量表示词或者文档的形式，词在向量空间中的位置从文本中学习得到并且以该词附近出现的词为学习依据。
词向量可以由输入语料自身学习得到或者可以利用预训练好的词向量生成，例如Glove，FastText和word2Vec。
预训练词向量有四个必要的步骤：
1. 加载预训练的词向量
2. 创建标记器对象
3. 将文本文档转换为词条序列并对其进行填补。
4. 创建词条与其对应的词向量之间的映射。

In [17]:
embeddings_index = {}
for line in tqdm(open('/home/hxjiang/Pythonworkspace/patent/sample3_G-06-F-17/textual/glove.6B.300d.word2vec.txt', 'r', encoding='UTF-8')):
    values = line.split()
    embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

400001it [00:36, 10816.36it/s]


## abstract

In [14]:
abstract_token = text.Tokenizer()
abstract_token.fit_on_texts(data['abstract'])
abstract_word_index = abstract_token.word_index
len(abstract_word_index)

17403

In [15]:
x_abs_train_seq = sequence.pad_sequences(abstract_token.texts_to_sequences(x_abs_train), maxlen=140)
x_abs_valid_seq = sequence.pad_sequences(abstract_token.texts_to_sequences(x_abs_valid), maxlen=140)
x_abs_test_seq = sequence.pad_sequences(abstract_token.texts_to_sequences(x_abs_test), maxlen=140)

In [None]:
embedding_matrix_abstract.shape

## claims

In [10]:
claims_token = text.Tokenizer()
claims_token.fit_on_texts(data['claims_final'])
claims_word_index = claims_token.word_index
len(claims_word_index)

18594

In [11]:
x_claims_train_seq = sequence.pad_sequences(claims_token.texts_to_sequences(x_claims_train), maxlen=1400)
x_claims_valid_seq = sequence.pad_sequences(claims_token.texts_to_sequences(x_claims_valid), maxlen=1400)
x_claims_test_seq = sequence.pad_sequences(claims_token.texts_to_sequences(x_claims_test), maxlen=1400)

In [None]:
embedding_matrix_claims = np.zeros((len(claims_word_index) + 1, 300)) # 50是词向量的维度,+1 is because the matrix indices start with 0
for word, i in tqdm(claims_word_index.items(), ncols=70):
    word = word.strip('\'')
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_claims[i] = embedding_vector

In [None]:
embedding_matrix_claims.shape

# BiLSTM捕捉语序信息

In [None]:
def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()

In [None]:
def fusion_network():
    # for abstract
    input_layer_abstract = layers.Input((SEQ_LEN_abstract, ))
    embedding_layer_abstract = layers.Embedding(input_dim=len(VOCAB_SIZE_abstract) + 1,
                                       output_dim=EMBEDDING_DIM,
                                       weights=[embedding_matrix_abstract],
                                       mask_zero=True,
                                       trainable=False)(input_layer_abstract)    
    # embedding_layer_abstract = layers.SpatialDropout1D(dropout)(embedding_layer_abstract)
    lstm_layer_abstract = layers.Bidirectional(LSTM(abs_lstm_num, return_sequences=False))(embedding_layer_abstract)
    dense_abstract_1 = layers.Dense(abs_dense_num, activation=abs_dense_active)(lstm_layer_abstract)
    dropout_abstract_1 = layers.Dropout(dropout)(dense_abstract_1)
    output_layer_abstract = layers.Dense(2, activation=abs_output_active)(dropout_abstract_1)
    abstract_model = models.Model(inputs=input_layer_abstract, outputs=output_layer_abstract)
    abstract_model.compile(optimizer=abs_optimizer, loss=abs_loss, metrics=['accuracy'])

    # for claims
    input_layer_claims = layers.Input((SEQ_LEN_claims, ))
    embedding_layer_claims = layers.Embedding(input_dim=len(VOCAB_SIZE_claims) + 1,
                                       output_dim=EMBEDDING_DIM,
                                       weights=[embedding_matrix_claims],
                                       mask_zero=True,
                                       trainable=False)(input_layer_claims)
    # embedding_layer_claims = layers.SpatialDropout1D(0.3)(embedding_layer_claims)
    lstm_layer_claims = layers.Bidirectional(LSTM(claims_lstm_num, dropout=0.3, return_sequences=False))(embedding_layer_claims)
    # model1.add(Bidirectional(LSTM(64,dropout=0.4, recurrent_dropout=0.4),merge_mode='concat'))
    dense_claims_1 = layers.Dense(64, activation="relu")(lstm_layer_claims)
    dropout_claims_1 = layers.Dropout(dropout)(dense_claims_1)
    output_layer_claims = layers.Dense(2, activation="softmax")(dropout_claims_1)
    claims_model = models.Model(inputs=input_layer_claims, outputs=output_layer_claims)
    claims_model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])

    # for fusion
    fusion_dense_layer = layers.Dense(256, activation='relu')(concatenate([lstm_layer_abstract, lstm_layer_claims]))
    fusion_dropout_1 = layers.Dropout(dropout)(fusion_dense_layer)
    # flatten_1 = Flatten(name='flatten_1')(dropout_1)
    fusion_dense = layers.Dense(64, activation='relu')(fusion_dropout_1)
    fusion_dropout_2 = layers.Dropout(dropout)(fusion_dense)
    # flatten_2 = Flatten(name='flatten_1')(dropout_2)
    output_layer_fusion = layers.Dense(2, activation="softmax")(fusion_dropout_2)    
    fusion_model = models.Model(inputs=[input_layer_abstract, input_layer_claims], output=output_layer_fusion)
    fusion_model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    
    return abstract_model, claims_model, fusion_model

In [None]:
EMBEDDING_DIM = embedding_matrix_abstract.shape[1]
SEQ_LEN_abstract = x_abs_train_seq.shape[1]
SEQ_LEN_claims = x_claims_train_seq.shape[1]
VOCAB_SIZE_abstract = abstract_word_index
VOCAB_SIZE_claims = claims_word_index

abs_lstm_num = 300
abs_dense_num = 32 # 64/32
abs_dense_active = "relu"
abs_output_active = "softmax"
abs_optimizer = optimizers.Adam()
abs_loss = 'binary_crossentropy'
Epoch = 40
BATCH_SIZE = 32
dropout = 0.5 # 0.5

In [None]:
abstract_model, claims_model, fusion_model = fusion_network()

# abstract
filepath = './-'+str(abs_dense_num)+'-'+str(dropout)+'-AbsModel-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
abs_history = abstract_model.fit(x_abs_train_seq, y_category_train, validation_data=(x_abs_valid_seq, y_category_valid), 
                verbose=1, epochs=Epoch, batch_size=BATCH_SIZE, shuffle=False, callbacks=[checkpoint], class_weight=cw)
# abstract_model.save("./abstract_model.hdf5")

## abstract

In [None]:
y_pred = abstract_model.predict(x_abs_test_seq, batch_size=BATCH_SIZE, verbose=0)
# y_pred_1 = np.rint(y_pred) #四舍五入取整
for i in range(len(y_pred)):
    max_value=max(y_pred[i])
    for j in range(len(y_pred[i])):
        if max_value==y_pred[i][j]:
            y_pred[i][j]=1
        else:
            y_pred[i][j]=0
report = classification_report(y_category_test, y_pred, digits=4)
p = re.compile('  |\n', re.S)
report = p.sub(' ', report)
metrics_content = re.findall("([\d]{1}\.[\d]{4})    777", report)

In [None]:
print("acc:  {:.4f}".format(metrics_content[0]))
print("macro:  {:.4f}".format(metrics_content[1]))
print("weighted:  {:.4f}".format(metrics_content[2]))

In [None]:
print(classification_report(y_category_test, y_pred, digits=4))

In [None]:
plot_history(abs_history_list[0])

## claims

In [None]:
# loss, acc = claims_model.evaluate(x_claims_test_seq, y_category_test, batch_size=BATCH_SIZE, verbose=1)
# print("accuracy: {:.4f} loss: {:.4f}".format(acc,loss))

In [None]:
claims_metrics_list = []
for model in tqdm(claims_model_list, ncols=70):
    y_pred = model.predict(x_claims_test_seq, batch_size=BATCH_SIZE, verbose=0)
    y_pred_1 = np.rint(y_pred) #四舍五入取整
    for i in range(len(y_pred)):
        max_value=max(y_pred[i])
        for j in range(len(y_pred[i])):
            if max_value==y_pred[i][j]:
                y_pred[i][j]=1
            else:
                y_pred[i][j]=0
    report = classification_report(y_category_test, y_pred, digits=4)
    p = re.compile('  |\n', re.S)
    report = p.sub(' ', report)
    metrics_content = re.findall("([\d]{1}\.[\d]{4})    777", report)
    claims_metrics_list.append(metrics_content)

In [None]:
claims_metrics_arrat = np.array(claims_metrics_list,dtype=float)
acc = np.mean(claims_metrics_arrat[:,0])
print("acc:  {:.4f}".format(acc))
f1 = np.mean(claims_metrics_arrat[:,1])
print("macro:  {:.4f}".format(f1))
weighted = np.mean(claims_metrics_arrat[:,2])
print("weighted:  {:.4f}".format(weighted))

print(claims_metrics_arrat[:,0].tolist())
print(claims_metrics_arrat[:,1].tolist())
print(claims_metrics_arrat[:,2].tolist())

In [None]:
print(classification_report(y_category_test, y_pred, digits=4))

In [None]:
plot_history(history_list[0])

## fusion

In [None]:
# loss, acc = fusion_model.evaluate([x_abs_test_seq, x_claims_test_seq], test_y, batch_size=BATCH_SIZE, verbose=1)
# print("accuracy: {:.4f} loss: {:.4f}".format(acc,loss))

In [None]:
fusion_metrics_list = []
for model in tqdm(fusion_model_list, ncols=70):
    y_pred = model.predict([x_abs_test_seq,x_claims_test_seq], batch_size=BATCH_SIZE, verbose=0)
    y_pred_1 = np.rint(y_pred) #四舍五入取整
    for i in range(len(y_pred)):
        max_value=max(y_pred[i])
        for j in range(len(y_pred[i])):
            if max_value==y_pred[i][j]:
                y_pred[i][j]=1
            else:
                y_pred[i][j]=0
    report = classification_report(y_category_test, y_pred, digits=4)
    p = re.compile('  |\n', re.S)
    report = p.sub(' ', report)
    metrics_content = re.findall("([\d]{1}\.[\d]{4})    777", report)
    fusion_metrics_list.append(metrics_content)

In [None]:
fusion_metrics_arrat = np.array(fusion_metrics_list,dtype=float)
acc = np.mean(fusion_metrics_arrat[:,0])
print("acc:  {:.4f}".format(acc))
f1 = np.mean(fusion_metrics_arrat[:,1])
print("macro:  {:.4f}".format(f1))
weighted = np.mean(fusion_metrics_arrat[:,2])
print("weighted:  {:.4f}".format(weighted))

print(fusion_metrics_arrat[:,0].tolist())
print(fusion_metrics_arrat[:,1].tolist())
print(fusion_metrics_arrat[:,2].tolist())

In [None]:
print(classification_report(y_category_test, y_pred, digits=4))

In [None]:
plot_history(fusion_history_list[0])