In [1]:
import ast
import itertools
import os

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D, Conv2D, MaxPooling1D, Flatten
from tensorflow.keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tf2crf import ModelWithCRFLoss, CRF

In [2]:
# Load the MSRA training and testing data.
data = pd.read_csv('./MSRA/msra_train.csv')  # training data
test_data = pd.read_csv('./MSRA/msra_test.csv')  # testing data

# Drop test rows whose character never occurs in the training data.
training_char = data.Character.unique()
testing_char = test_data.Character.unique()
illegal_chars = [char for char in testing_char if char not in training_char]

test_data = test_data[~test_data.Character.isin(illegal_chars)]

# Load the radical dictionary (will use chise in the future).
df_radicals = pd.read_csv('./MSRA/chise_radical.csv')
characters = df_radicals['character'].values
radicals = df_radicals['radical_info'].values

# Each `radical_info` cell is expected to be the text of a Python literal
# (presumably a list of radicals -- TODO confirm against the CSV).
# ast.literal_eval parses it without executing arbitrary code from the file,
# which the previous exec()-based assignment would have done.
char_rad_dict = {
    character: ast.literal_eval(radical_info)
    for character, radical_info in zip(characters, radicals)
}

    
class SentenceGetter(object):
    """Group a token-level frame into sentences of (character, tag) pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame providing the columns "Sentence #", "Character" and "Tag".

    Attributes
    ----------
    grouped : result of grouping `data` by "Sentence #"; one list of
        (character, tag) tuples per sentence.
    sentences : list of those per-sentence lists, in group order.
    """

    def __init__(self, data):
        self.n_sent = 1  # key looked up by the next get_next() call
        self.data = data
        self.empty = False

        def agg_func(s):
            # Pair every character of the group with its tag, row by row.
            return list(zip(s["Character"].values.tolist(),
                            s["Tag"].values.tolist()))

        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence stored under the current counter and advance it.

        Returns None (without advancing) when no sentence exists for the
        current counter value.
        """
        try:
            s = self.grouped[self.n_sent]
        except (KeyError, IndexError):
            # Only a missing sentence means "exhausted"; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            return None
        self.n_sent += 1
        return s
# Group the training rows into sentences of (character, tag) pairs.
getter = SentenceGetter(data)
sentences = getter.sentences
# Same grouping for the testing rows.
getter_te = SentenceGetter(test_data)
sentences_te = getter_te.sentences


# Token tables: each CSV pairs a symbol column (char / radical / tag)
# with its `token` value.
df_char_token = pd.read_csv('./MSRA/msra_char_token.csv')
df_radical_token = pd.read_csv('./MSRA/msra_radical_token.csv')
df_tag_token = pd.read_csv('./MSRA/msra_tag_token.csv')

# Forward lookups (symbol -> token) followed by their inverses (token -> symbol).
char2idx = dict(zip(df_char_token.char.values, df_char_token.token.values))
idx2char = {token: char for char, token in char2idx.items()}

radical2idx = dict(zip(df_radical_token.radical.values, df_radical_token.token.values))
idx2radical = {token: radical for radical, token in radical2idx.items()}

tag2idx = dict(zip(df_tag_token.tag.values, df_tag_token.token.values))
idx2tag = {token: tag for tag, token in tag2idx.items()}


max_len_char = 90  # the length for each sentence (including character padding)
max_len_radical = 8  # the length for each character (including radical padding)

# character tokenization
def char_token(sentences, char2idx = char2idx, max_len_char = max_len_char):
    """Encode sentences as a padded array of character indices.

    Each sentence is a sequence of items whose first element is the character.
    Every character is looked up in `char2idx` (a missing character raises
    KeyError). Rows are post-padded with char2idx["PAD"] and post-truncated
    to `max_len_char`.
    """
    encoded = []
    for sentence in sentences:
        encoded.append([char2idx[item[0]] for item in sentence])
    padded = pad_sequences(maxlen=max_len_char, sequences=encoded,
                           value=char2idx["PAD"], padding='post', truncating='post')
    return np.array(padded)

# Character-index matrices for the training and testing sentences.
X_char, X_char_te = char_token(sentences), char_token(sentences_te)


# radical tokenization
def radical_token(sentences, max_len_char = max_len_char, max_len_radical = max_len_radical,
                 radical2idx = radical2idx, char_rad_dict = char_rad_dict):
    """Encode sentences as an array of radical indices.

    Parameters
    ----------
    sentences : sequence of sentences; each sentence is a list of items whose
        first element is the character string.
    max_len_char : number of character positions emitted per sentence.
    max_len_radical : number of radical slots emitted per character position.
    radical2idx : mapping radical -> index; its "PAD" and "UNK" entries are
        used for padding and unknown symbols.
    char_rad_dict : mapping character -> sequence of its radicals.

    Returns
    -------
    numpy.ndarray built from a nested list with `max_len_char` rows per
    sentence and exactly `max_len_radical` entries per row.

    Per position:
    * a character found in `char_rad_dict` contributes at most
      `max_len_radical - 1` of its radicals;
    * a character not in `char_rad_dict` contributes the "UNK" index;
    * positions past the end of the sentence, and non-string characters,
      contribute nothing;
    * the row is then filled with the "PAD" index up to `max_len_radical`.

    Compared with the earlier implementation this no longer reuses the
    sentence index as the radical loop variable, always emits full-length
    rows (an empty radical list used to yield a short row), and maps radicals
    missing from `radical2idx` to "UNK" instead of None.
    """
    pad_idx = radical2idx.get("PAD")
    unk_idx = radical2idx.get("UNK")
    # Radicals kept per character; one slot short of the row length.
    keep = max(max_len_radical - 1, 0)

    X_radical = []
    for sentence in sentences:
        sent_seq = []
        for pos in range(max_len_char):
            # Positions beyond the sentence end become pure padding rows.
            token = sentence[pos][0] if pos < len(sentence) else None
            word_seq = []
            if isinstance(token, str):
                for char in token[:max_len_radical]:
                    if char in char_rad_dict:
                        char_radicals = list(char_rad_dict[char])[:keep]
                        word_seq.extend(radical2idx.get(radical, unk_idx)
                                        for radical in char_radicals)
                    else:
                        word_seq.append(unk_idx)
            # Force exactly max_len_radical entries so the result is rectangular.
            word_seq = word_seq[:max_len_radical]
            word_seq.extend([pad_idx] * (max_len_radical - len(word_seq)))
            sent_seq.append(word_seq)
        X_radical.append(sent_seq)
    return np.array(X_radical)

# Radical-index arrays for the training and testing sentences.
X_radical, X_radical_te = radical_token(sentences), radical_token(sentences_te)


def tag_token(sentences, max_len_char = max_len_char, tag2idx = tag2idx):
    """Encode sentences as a padded array of tag indices.

    Each sentence is a sequence of items whose second element is the tag.
    Every tag is looked up in `tag2idx` (a missing tag raises KeyError). Rows
    are post-padded with tag2idx["PAD"] and post-truncated to `max_len_char`.
    """
    tag_ids = []
    for sentence in sentences:
        tag_ids.append([tag2idx[item[1]] for item in sentence])
    return pad_sequences(maxlen=max_len_char, sequences=tag_ids,
                         value=tag2idx["PAD"], padding='post', truncating='post')

# Tag-index matrices for the training and testing sentences.
y, y_te = tag_token(sentences), tag_token(sentences_te)

In [3]:
# Scheme 1: character embeddings concatenated with a BiLSTM encoding of each
# character's radical sequence, followed by a BiLSTM, a dense layer and a CRF.
char_dim = 300
radical_dim = 200

# Vocabulary sizes are the row counts of the token tables.
n_chars = len(df_char_token)
n_radical = len(df_radical_token)
n_tags = len(df_tag_token)

# input and embeddings for characters
char_in = Input(shape=(max_len_char,), )

# mask_zero=True treats index 0 as padding -- assumes the "PAD" token is 0; TODO confirm.
emb_char = Embedding(input_dim = n_chars, output_dim = char_dim, input_length = max_len_char, mask_zero = True)(char_in)

# input and embeddings for radicals
radical_in = Input(shape=(max_len_char, max_len_radical,), )

emb_radical = TimeDistributed(Embedding(input_dim=n_radical, output_dim=radical_dim,
                           input_length=max_len_radical, mask_zero = True))(radical_in)

# radical BiLSTM applied per character position; return_sequences=False keeps
# only one encoding per character
radical_enc = TimeDistributed(Bidirectional(LSTM(units=radical_dim, return_sequences=False,dropout = 0.5, recurrent_dropout=0.5)))(emb_radical)

x = concatenate([emb_char, radical_enc])
# main LSTM
main_lstm = Bidirectional(LSTM(units=char_dim+radical_dim, return_sequences=True,dropout = 0.5, recurrent_dropout=0.5))(x)
dense = TimeDistributed(Dense(char_dim+radical_dim, activation = None))(main_lstm)
crf = tfa.layers.CRF(n_tags)
out = crf(dense)
scheme_1_model = Model(inputs =[char_in, radical_in], outputs = out)
scheme_1_model = ModelWithCRFLoss(scheme_1_model)
# Restore weights from a saved checkpoint; the layers above presumably have to
# match the architecture that produced it, so do not alter them.
scheme_1_model.load_weights('../diss_result/Scheme_1/cp-0181.ckpt')

  return py_builtins.overload_of(f)(*args)


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fc10b1e68d0>

In [4]:
# Scheme 2: character embeddings concatenated with a convolution + max-pool
# encoding of each character's radical sequence, followed by a BiLSTM, a dense
# layer and a CRF.
# Vocabulary sizes are the row counts of the token tables.
n_chars = len(df_char_token)
n_radical = len(df_radical_token)
n_tags = len(df_tag_token)

char_dim = 300
radical_dim = 300

# input and embeddings for characters
char_in = Input(shape=(max_len_char,), )

# mask_zero=True treats index 0 as padding -- assumes the "PAD" token is 0; TODO confirm.
emb_char = Embedding(input_dim = n_chars, output_dim = char_dim, input_length = max_len_char, mask_zero = True)(char_in)

# input and embeddings for radicals
radical_in = Input(shape=(max_len_char, max_len_radical,), )


# Note: unlike the character embedding, this embedding does not set mask_zero.
emb_radical = TimeDistributed(Embedding(input_dim=n_radical, output_dim=radical_dim,
                           input_length=max_len_radical))(radical_in)

dropout = Dropout(0.5)(emb_radical)

# Convolution over the radical axis of each character; padding='same' with
# strides=1 keeps the radical axis at max_len_radical.
conv1d_out = TimeDistributed(Conv1D(kernel_size = 3, filters=radical_dim, padding='same', activation='tanh', strides=1), name="Convolution")(dropout)

# Pool size equals the radical length, so each character keeps one value per filter.
maxpool_out = TimeDistributed(MaxPooling1D(max_len_radical))(conv1d_out)

# Flatten the pooled output into one vector per character.
radical = TimeDistributed(Flatten())(maxpool_out)


x = concatenate([emb_char, radical])
# main LSTM
main_lstm = Bidirectional(LSTM(units=radical_dim + char_dim, return_sequences=True,dropout = 0.5, recurrent_dropout=0.5))(x)
dense = TimeDistributed(Dense(radical_dim + char_dim, activation = None))(main_lstm)
crf = tfa.layers.CRF(n_tags)
out = crf(dense)
scheme_2_model = Model(inputs =[char_in, radical_in], outputs = out)
scheme_2_model = ModelWithCRFLoss(scheme_2_model)
# Restore weights from a saved checkpoint; the layers above presumably have to
# match the architecture that produced it, so do not alter them.
scheme_2_model.load_weights('../diss_result/Scheme_2/cp-0086.ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fc10b113f50>

In [5]:
# Scheme 3: character-only model (no radical input) -- character embeddings
# followed by a BiLSTM, a dense layer and a CRF.
# Vocabulary sizes are the row counts of the token tables.
n_chars = len(df_char_token)
n_radical = len(df_radical_token)
n_tags = len(df_tag_token)
char_dim = 300
radical_dim = 0  # no radical features, so char_dim + radical_dim below equals char_dim


# input and embeddings for characters
char_in = Input(shape=(max_len_char,), )
# mask_zero=True treats index 0 as padding -- assumes the "PAD" token is 0; TODO confirm.
emb_char = Embedding(input_dim = n_chars, output_dim = char_dim, input_length = max_len_char, mask_zero = True)(char_in)

# main LSTM
main_lstm = Bidirectional(LSTM(units=char_dim + radical_dim, return_sequences=True,dropout = 0.5, recurrent_dropout=0.5))(emb_char)
dense = TimeDistributed(Dense(char_dim + radical_dim, activation = None))(main_lstm)
crf = tfa.layers.CRF(n_tags)
out = crf(dense)
scheme_3_model = Model(inputs =char_in, outputs = out)
scheme_3_model = ModelWithCRFLoss(scheme_3_model)


# Restore weights from a saved checkpoint; the layers above presumably have to
# match the architecture that produced it, so do not alter them.
scheme_3_model.load_weights('../diss_result/Scheme_3/cp-0169.ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fc10af67dd0>

In [6]:
# Filter out the padding positions. The mask compares against tag2idx["PAD"],
# the value the tag matrix is padded with, instead of a hard-coded 0.
y_te_mask = y_te != tag2idx["PAD"]
y_true = y_te[y_te_mask]
# Recover tag strings from tag indices.
# NOTE(review): all test sentences are flattened into ONE sequence here, so
# seqeval cannot see sentence boundaries -- confirm this is intended. The
# prediction cells use the same single-sequence layout, so keep them in sync.
y_true_recovered = [[idx2tag[token] for token in y_true]]

In [7]:
# Scheme 1: predict on the test set; element 0 of the output is the array of
# predicted tag indices used below.
y_pred_result_scheme_1 = scheme_1_model.predict([X_char_te, X_radical_te])[0]
# Keep the non-padding positions only, then map tag indices back to tag strings.
y_pred_scheme_1 = y_pred_result_scheme_1[y_te_mask]
y_pred_recovered_scheme_1 = [list(map(idx2tag.__getitem__, y_pred_scheme_1))]

scheme_1_f1 = f1_score(y_true_recovered, y_pred_recovered_scheme_1)
scheme_1_report = classification_report(y_true_recovered, y_pred_recovered_scheme_1)
print('F1-score of Scheme 1: ', scheme_1_f1)
print(scheme_1_report)

F1-score of Scheme 1:  0.9027613412228797
              precision    recall  f1-score   support

         LOC       0.93      0.91      0.92      2641
         ORG       0.86      0.86      0.86      1197
         PER       0.92      0.88      0.90      1293

   micro avg       0.91      0.89      0.90      5131
   macro avg       0.91      0.89      0.90      5131
weighted avg       0.91      0.89      0.90      5131



In [8]:
# Scheme 2: predict on the test set; element 0 of the output is the array of
# predicted tag indices used below.
y_pred_result_scheme_2 = scheme_2_model.predict([X_char_te, X_radical_te])[0]
# Keep the non-padding positions only, then map tag indices back to tag strings.
y_pred_scheme_2 = y_pred_result_scheme_2[y_te_mask]
y_pred_recovered_scheme_2 = [list(map(idx2tag.__getitem__, y_pred_scheme_2))]

scheme_2_f1 = f1_score(y_true_recovered, y_pred_recovered_scheme_2)
scheme_2_report = classification_report(y_true_recovered, y_pred_recovered_scheme_2)
print('F1-score of Scheme 2: ', scheme_2_f1)
print(scheme_2_report)

F1-score of Scheme 2:  0.8992448330683624
              precision    recall  f1-score   support

         LOC       0.93      0.91      0.92      2641
         ORG       0.88      0.84      0.86      1197
         PER       0.93      0.87      0.90      1293

   micro avg       0.92      0.88      0.90      5131
   macro avg       0.91      0.87      0.89      5131
weighted avg       0.92      0.88      0.90      5131



In [9]:
# Scheme 3 (characters only): predict on the test set; element 0 of the output
# is the array of predicted tag indices used below.
y_pred_result_scheme_3 = scheme_3_model.predict(X_char_te)[0]
# Keep the non-padding positions only, then map tag indices back to tag strings.
y_pred_scheme_3 = y_pred_result_scheme_3[y_te_mask]
y_pred_recovered_scheme_3 = [list(map(idx2tag.__getitem__, y_pred_scheme_3))]

scheme_3_f1 = f1_score(y_true_recovered, y_pred_recovered_scheme_3)
scheme_3_report = classification_report(y_true_recovered, y_pred_recovered_scheme_3)
print('F1-score of Scheme 3: ', scheme_3_f1)
print(scheme_3_report)

F1-score of Scheme 3:  0.896388395500296
              precision    recall  f1-score   support

         LOC       0.93      0.90      0.92      2641
         ORG       0.86      0.85      0.85      1197
         PER       0.91      0.88      0.90      1293

   micro avg       0.91      0.89      0.90      5131
   macro avg       0.90      0.88      0.89      5131
weighted avg       0.91      0.89      0.90      5131



In [10]:
def instance_generator(idx):
    """Build a per-character comparison table for test sentence `idx`.

    Reads the module-level arrays `X_char_te`, `y_te` and
    `y_pred_result_scheme_1/2/3`, and the lookups `idx2char`, `idx2tag`
    and `tag2idx`.

    Parameters
    ----------
    idx : row index of the sentence in the test arrays.

    Returns
    -------
    pandas.DataFrame with one row per non-padding position and the columns
    'char', 'tag', 'Prediction of Scheme 1', 'Prediction of Scheme 2' and
    'Prediction of Scheme 3' (in that order).
    """
    true_tokens = y_te[idx]
    # Padding positions carry tag2idx["PAD"]; compare against it instead of a
    # hard-coded 0 so the mask follows the tag vocabulary.
    mask = true_tokens != tag2idx["PAD"]

    columns = {
        'char': [idx2char[token] for token in X_char_te[idx][mask]],
        'tag': [idx2tag[token] for token in true_tokens[mask]],
    }

    # One prediction column per scheme, decoded over the same positions.
    scheme_predictions = (
        ('Prediction of Scheme 1', y_pred_result_scheme_1),
        ('Prediction of Scheme 2', y_pred_result_scheme_2),
        ('Prediction of Scheme 3', y_pred_result_scheme_3),
    )
    for column, predictions in scheme_predictions:
        columns[column] = [idx2tag[token] for token in predictions[idx][mask]]

    return pd.DataFrame(data=columns)



In [11]:
# Hand-picked test-sentence indices to inspect; the cycle lets the next cell
# be re-run to step through them repeatedly.
positive_instances = [
    638, 674, 2970, 3001,
    722, 799, 2236, 2403,
    2412, 2537, 3140,
]
instances_cycle = itertools.cycle(positive_instances)

In [12]:
# Step to the next hand-picked test sentence and show it with the gold tags
# and the three schemes' predictions.
idx = next(instances_cycle)
print(idx)
# Build the comparison table once and reuse it for both the sentence text and
# the displayed table (it was previously computed twice for the same idx).
instance_df = instance_generator(idx)
original_sentence = ''.join(instance_df.char.values)
print(original_sentence)
instance_df

638
祝九三学社第七次全国代表大会圆满成功！


Unnamed: 0,char,tag,Prediction of Scheme 1,Prediction of Scheme 2,Prediction of Scheme 3
0,祝,O,O,O,B-ORG
1,九,B-ORG,B-ORG,B-ORG,I-ORG
2,三,I-ORG,I-ORG,I-ORG,I-ORG
3,学,I-ORG,I-ORG,I-ORG,I-ORG
4,社,I-ORG,I-ORG,I-ORG,I-ORG
5,第,I-ORG,I-ORG,I-ORG,I-ORG
6,七,I-ORG,I-ORG,I-ORG,I-ORG
7,次,I-ORG,I-ORG,I-ORG,I-ORG
8,全,I-ORG,I-ORG,I-ORG,I-ORG
9,国,I-ORG,I-ORG,I-ORG,I-ORG


In [13]:
# sentence_list = []
# for i in range(len(sentences_te)):
#     if len(sentences_te[i]) < 30:
#         sentence_list.append(i)

In [14]:
# scheme_1_predict = scheme_1_model.predict([X_char_te[sentence_list], X_radical_te[sentence_list]])[0]

In [15]:
# scheme_2_predict = scheme_2_model.predict([X_char_te[sentence_list], X_radical_te[sentence_list]])[0]

In [16]:
# scheme_3_predict = scheme_3_model.predict(X_char_te[sentence_list])[0]

In [17]:
# scheme_1_instance = []
# for i in range(len(sentence_list)):
#     if not np.array_equal(scheme_1_predict[i], scheme_3_predict[i]):
#         scheme_1_instance.append(sentence_list[i])
    

In [18]:
# scheme_2_instance = []
# for i in range(len(sentence_list)):
#     if not np.array_equal(scheme_2_predict[i], scheme_3_predict[i]):
#         scheme_2_instance.append(sentence_list[i])

In [19]:
# set(scheme_1_instance).intersection(set(scheme_2_instance))

In [20]:
# sentences_te[3288]

In [21]:
# a = np.expand_dims(X_char_te[3288], axis=0)
# b = np.expand_dims(X_radical_te[3288], axis=0)

In [22]:
# np.array_equal(scheme_2_model.predict([a,b])[0] ,scheme_1_model.predict([a,b])[0])

In [23]:
# scheme_3_model.predict(a)[0]

In [24]:
# scheme_2_model.predict([a,b])[0]

In [25]:
# scheme_1_model.predict([a,b])[0]

In [26]:
# tag2idx