In [1]:
import re
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
##전역 변수
vocab_size = 4000
max_len = 70
tag_size = 10
word_len = 25
char_size = 52

## 파일을 읽고 

In [3]:
def readfile(filename):
    f = open(filename, 'r')
    tagged_sentences = []
    sentence = []

    for line in f:
        if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\n":
            if len(sentence) > 0:
                tagged_sentences.append(sentence)
                sentence = []
            continue
        splits = line.split(' ') # 공백을 기준으로 속성을 구분한다.
        splits[-1] = re.sub(r'\n', '', splits[-1]) # 줄바꿈 표시 \n을 제거한다.
        #word = splits[0].lower() # 단어들은 소문자로 바꿔서 저장한다.
        sentence.append([splits[0], splits[-1]]) # 단어와 개체명 태깅만 기록한다.
    return tagged_sentences

In [4]:
trainSentences=readfile("train.txt")

In [5]:
validSentences=readfile("valid.txt")
testSentences=readfile("test.txt")

In [6]:
def seperatearray(rawsentence):
    sentences, ner_tags = [], [] 
    for tagged_sentence in rawsentence: # 14,041개의 문장 샘플을 1개씩 불러온다.
        sentence, tag_info = zip(*tagged_sentence) # 각 샘플에서 단어들은 sentence에 개체명 태깅 정보들은 tag_info에 저장.
        sentences.append(list(sentence)) # 각 샘플에서 단어 정보만 저장한다.
        ner_tags.append(list(tag_info)) # 각 샘플에서 개체명 태깅 정보만 저장한다.
    return sentences, ner_tags

In [7]:
train_sentence, train_tag = seperatearray(trainSentences)
valid_sentence, valid_tag = seperatearray(validSentences)
test_sentence, test_tag = seperatearray(testSentences)

In [8]:
train_sentence[:3]

[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['Peter', 'Blackburn'],
 ['BRUSSELS', '1996-08-22']]

In [9]:
def sentencetochar(sentences):
    charpaddedsentence=[]
    for sentence in sentences:
        newSentence=[]
        makesentence =[]
        makesentence.extend(sentence)
        while len(makesentence)<max_len:
            makesentence.append("#")
        if len(makesentence)>max_len:
            makesentence=makesentence[:max_len]
        for words in makesentence:
            if len(words) > char_size:
                words=words[:char_size]
            newSentence.append([words.ljust(char_size,'#')])
            

        charpaddedsentence.append(newSentence)
    return charpaddedsentence

In [10]:
train_char= sentencetochar(train_sentence)

valid_char = sentencetochar(valid_sentence)
test_char = sentencetochar(test_sentence)

In [11]:
char2Idx={"#":0,"UNKNOWN":1}
for c in "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;'\"/\\%$`&=*+@^~|":
    char2Idx[c]=len(char2Idx)

In [12]:
def addCharInformation(Sentences):
    for i,sentence in enumerate(Sentences):
        for j,data in enumerate(sentence):
            chars=[c for c in data[0]]            # Character 분리
            Sentences[i][j]=chars # 단어, Chracter, NER을 리스트로
    return Sentences

In [13]:
train_char = addCharInformation(train_char)
valid_char = addCharInformation(valid_char)
test_char = addCharInformation(test_char)

In [14]:
def addCharInformation2(Sentences):
    total=[]
    for i,sentence in enumerate(Sentences):
        Sen=[]
        for j,data in enumerate(sentence):
            changeInt=[]
            for k, chars in enumerate(data):
                changeInt.append(char2Idx[chars])
                # print(chars)
            # print(changeInt)
            Sen.append(changeInt) # 단어, Chracter, NER을 리스트로
        total.append(Sen)
    return total

In [15]:
X_char_train = addCharInformation2(train_char)
X_char_valid = addCharInformation2(valid_char)
X_char_test = addCharInformation2(test_char)

In [16]:
X_char_train = np.array([np.array(x1) for x1 in X_char_train])
X_char_valid = np.array([np.array(x1) for x1 in X_char_valid])
X_char_test = np.array([np.array(x1) for x1 in X_char_test])

In [17]:
max_words = 4000        #문장 데이터에 있는 모든 단어를 사용하지 않고 높은 빈도수를 가진 상위 약 4,000개의 단어만을 사용
src_tokenizer = Tokenizer(num_words=max_words, oov_token='OOV')         #tokenizer 객체 생성
src_tokenizer.fit_on_texts(train_sentence)                              #인덱스 구축 

tar_tokenizer = Tokenizer()
tar_tokenizer.fit_on_texts(train_tag)

In [18]:
tar_tokenizer

<keras_preprocessing.text.Tokenizer at 0x2e0e9995388>

In [19]:
vocab_size = max_words
tag_size = len(tar_tokenizer.word_index) + 1
print('단어 집합의 크기 : {}'.format(vocab_size))
print('개체명 태깅 정보 집합의 크기 : {}'.format(tag_size))

단어 집합의 크기 : 4000
개체명 태깅 정보 집합의 크기 : 10


In [20]:
X_train = src_tokenizer.texts_to_sequences(train_sentence)
y_train = tar_tokenizer.texts_to_sequences(train_tag)

In [21]:
print(train_sentence[:3])

[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['Peter', 'Blackburn'], ['BRUSSELS', '1996-08-22']]


In [22]:
X_valid = src_tokenizer.texts_to_sequences(valid_sentence)
y_valid = tar_tokenizer.texts_to_sequences(valid_tag)

X_test = src_tokenizer.texts_to_sequences(test_sentence)
y_test = tar_tokenizer.texts_to_sequences(test_tag)

In [23]:
X_train[:3]

[[989, 1, 205, 629, 7, 3939, 216, 1, 3], [774, 1872], [726, 150]]

In [24]:
index_to_word = src_tokenizer.index_word
index_to_ner = tar_tokenizer.index_word


In [25]:
index_to_ner

{1: 'o',
 2: 'b-loc',
 3: 'b-per',
 4: 'b-org',
 5: 'i-per',
 6: 'i-org',
 7: 'b-misc',
 8: 'i-loc',
 9: 'i-misc'}

In [26]:
decoded = []
for index in X_test[0] : # 첫번째 샘플 안의 인덱스들에 대해서
    decoded.append(index_to_word[index]) # 다시 단어로 변환

print('기존 문장 : {}'.format(test_sentence[0]))
print('빈도수가 낮은 단어가 OOV 처리된 문장 : {}'.format(decoded))

기존 문장 : ['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.']
빈도수가 낮은 단어가 OOV 처리된 문장 : ['soccer', '-', 'japan', 'get', 'OOV', 'win', ',', 'china', 'in', 'surprise', 'defeat', '.']


In [27]:
max_len = 70
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
# X_train의 모든 샘플들의 길이를 맞출 때 뒤의 공간에 숫자 0으로 채움.
y_train = pad_sequences(y_train, padding='post', maxlen=max_len)
# y_train의 모든 샘플들의 길이를 맞출 때 뒤의 공간에 숫자0으로 채움.
X_valid = pad_sequences(X_valid, padding='post', maxlen=max_len)
y_valid = pad_sequences(y_valid, padding='post', maxlen=max_len)
X_test = pad_sequences(X_test, padding='post', maxlen=max_len)
y_test = pad_sequences(y_test, padding='post', maxlen=max_len)

In [28]:
y_train = to_categorical(y_train, num_classes=tag_size)
y_valid = to_categorical(y_valid, num_classes=tag_size)
y_test = to_categorical(y_test, num_classes=tag_size)           ##원핫 인코딩 시킴 

In [52]:
y_train.shape

(14041, 70, 10)

In [None]:
# X_char_train = X_char_train.reshape(14041,70,52)

# ==================<< 모델 생성 >>================

In [47]:
words_input = tf.keras.layers.Input(shape=(None, ),dtype='int32', name='modelInput')
words = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True)(words_input)

In [48]:
character_input=tf.keras.layers.Input(shape=(None,52,),name='char_input')
embed_char_out=tf.keras.layers.TimeDistributed(tf.keras.layers.Embedding(len(char2Idx),30,embeddings_initializer=tf.keras.initializers.RandomUniform(minval=-0.5, maxval=0.5)), name='char_embedding')(character_input)
dropout= tf.keras.layers.Dropout(0.5)(embed_char_out)
conv1d_out= tf.keras.layers.TimeDistributed(tf.keras.layers.Conv1D(kernel_size=3, filters=30, padding='same',activation='tanh', strides=1))(dropout)
maxpool_out=tf.keras.layers.TimeDistributed(tf.keras.layers.MaxPooling1D(52))(conv1d_out)
char = tf.keras.layers.TimeDistributed(tf.keras.layers.Flatten())(maxpool_out)
char = tf.keras.layers.Dropout(0.5)(char)

In [49]:
output = tf.keras.layers.concatenate([words, char])
output = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25))(output)
output = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(tag_size, activation='softmax'))(output)

In [50]:
model = tf.keras.Model(inputs=[words_input, character_input], outputs=[output])
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         [(None, None, 52)]   0                                            
__________________________________________________________________________________________________
char_embedding (TimeDistributed (None, None, 52, 30) 2790        char_input[0][0]                 
__________________________________________________________________________________________________
dropout_4 (Dropout)             (None, None, 52, 30) 0           char_embedding[0][0]             
__________________________________________________________________________________________________
time_distributed_11 (TimeDistri (None, None, 52, 30) 2730        dropout_4[0][0]                  
____________________________________________________________________________________________

In [51]:
model.fit([X_train, X_char_train],y_train,epochs=8,batch_size=128)

Train on 14041 samples
Epoch 1/8
  128/14041 [..............................] - ETA: 6:34

InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument:  logits and labels must have the same first dimension, got logits shape [8960,10] and labels shape [89600]
	 [[node loss/time_distributed_14_loss/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at C:\Users\ezcare14\Anaconda3\envs\shyun\lib\site-packages\tensorflow_core\python\framework\ops.py:1751) ]]
	 [[Reshape_10/_96]]
  (1) Invalid argument:  logits and labels must have the same first dimension, got logits shape [8960,10] and labels shape [89600]
	 [[node loss/time_distributed_14_loss/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at C:\Users\ezcare14\Anaconda3\envs\shyun\lib\site-packages\tensorflow_core\python\framework\ops.py:1751) ]]
0 successful operations.
0 derived errors ignored. [Op:__inference_distributed_function_30669]

Function call stack:
distributed_function -> distributed_function


In [None]:
modelCNN = tf.keras.Model(inputs=character_input, outputs=char)

In [None]:
modelCNN.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])

In [None]:
modelCNN.summary()

In [None]:
fit(X_train, y_train, batch_size=128, epochs=8,  validation_data=(X_test, y_test))

# =======================================================================
# 모델 3. Subclaissing Model 
### - many2many bidirectional LSTM with TimeDistributed 
### - Subclaissing

In [None]:
class TestModel (tf.keras.Model):
    def __init__(self,vocab_size,tag_size):
        super().__init__()
        self.fEmbedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True)
        self.fBiLSTM = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True), merge_mode='concat')
        # self.fBiLSTM_1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True), merge_mode='concat')
        # self.fDense = (tf.keras.layers.Dense(128, activation='softmax'))
        self.fTimeDistributed = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(tag_size, activation='softmax'))

    def call(self, x):
        x = self.fEmbedding(x)
        x = self.fBiLSTM(x)
        # x = self.fDense(x)
        # x = self.fBiLSTM(x)
        x = self.fTimeDistributed(x)
        return x


In [None]:
modelSub = TestModel(4000,10)
subClassInputs = tf.keras.layers.Input(shape=(70, ),dtype='int32')
subClassInputs

In [None]:
modelSub(subClassInputs)
modelSub.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])
modelSub.summary()

In [None]:
modelSub.fit(X_train, y_train, batch_size=128, epochs=8,  validation_data=(X_test, y_test))

In [None]:
print("\n 테스트 정확도: %.4f" % (modelSub.evaluate(X_test, y_test,verbose=0)[1]))

# ========================================================================================
# 모델 2. CNN을 추가한 모델

In [None]:
i=10 # 확인하고 싶은 테스트용 샘플의 인덱스.
y_predicted = model.predict(np.array([X_test[i]])) # 입력한 테스트용 샘플에 대해서 예측 y를 리턴
y_predicted = np.argmax(y_predicted, axis=-1) # 원-핫 인코딩을 다시 정수 인코딩으로 변경함.
true = np.argmax(y_test[i], -1) # 원-핫 인코딩을 다시 정수 인코딩으로 변경함.

print("{:15}|{:5}|{}".format("단어", "실제값", "예측값"))
print(35 * "-")

for w, t, pred in zip(X_test[i], true, y_predicted[0]):
    if w != 0: # PAD값은 제외함.
        print("{:17}: {:7} {}".format(index_to_word[w], index_to_ner[t].upper(), index_to_ner[pred].upper()))