## 1. 파일 읽기

In [None]:
import re
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input,Embedding,TimeDistributed,Dropout,Conv1D,Dense,LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate,GRU
from keras.initializers import RandomUniform
from keras.utils import Progbar

Using TensorFlow backend.


In [4]:
def readfile(filename):
    """Parse a token-per-line annotation file into sentences.

    Every non-blank line is split on single spaces; the first field is kept
    as the word and the last field (with its newline removed) as the tag.
    Blank lines and lines starting with '-DOCSTART' close the sentence that
    is currently being collected.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of sentences, each a list of [word, tag] lists.
    """
    sentences=[]
    sentence=[]

    # The context manager closes the file even if parsing raises.
    with open(filename) as f:
        for line in f:
            if(len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\n"):
                # Sentence boundary: store what has been collected so far.
                if(len(sentence)>0):
                    sentences.append(sentence)
                    sentence=[]
                continue
            splits=line.split(' ')
            splits[-1] = re.sub(r'\n','',splits[-1])  # drop the trailing newline from the tag
            sentence.append([splits[0],splits[-1]])
    # Flush the last sentence when the file does not end with a blank line.
    if(len(sentence)>0):
        sentences.append(sentence)
    return sentences

> 단어와 개체명만 뽑아 문장으로 만들어 append 했다.

In [5]:
# Parse the three dataset splits; the paths are relative to the working directory.
trainSentences=readfile("train.txt")
validSentences=readfile("valid.txt")
testSentences=readfile("test.txt")

In [6]:
# Inspect the first parsed training sentence as [word, tag] pairs.
trainSentences[0]

[['EU', 'B-ORG'],
 ['rejects', 'O'],
 ['German', 'B-MISC'],
 ['call', 'O'],
 ['to', 'O'],
 ['boycott', 'O'],
 ['British', 'B-MISC'],
 ['lamb', 'O'],
 ['.', 'O']]

> trainSentences(14041,), validSentences(3250,), testSentences(3453,)

## 2. Character 추출 및 단어, Character, label을 리스트로 구성 

In [10]:
def addCharInformation(Sentences):
    """Expand every token entry to [word, list_of_characters, tag], in place.

    The word is read from the first element of each entry and the tag from
    the last one, so entries that were already expanded come out unchanged.

    Args:
        Sentences: List of sentences, each a list of token entries.

    Returns:
        The same ``Sentences`` object, after modification.
    """
    for sentence in Sentences:
        for position in range(len(sentence)):
            entry=sentence[position]
            token,tag=entry[0],entry[-1]
            # list(str) splits the word into its individual characters.
            sentence[position]=[token,list(token),tag]
    return Sentences

In [11]:
# Attach character lists to every training token (the list is modified in place and rebound).
trainSentences=addCharInformation(trainSentences)

In [12]:
# Preview the first ten sentences as [word, characters, tag] entries.
trainSentences[:10]

[[['EU', ['E', 'U'], 'B-ORG'],
  ['rejects', ['r', 'e', 'j', 'e', 'c', 't', 's'], 'O'],
  ['German', ['G', 'e', 'r', 'm', 'a', 'n'], 'B-MISC'],
  ['call', ['c', 'a', 'l', 'l'], 'O'],
  ['to', ['t', 'o'], 'O'],
  ['boycott', ['b', 'o', 'y', 'c', 'o', 't', 't'], 'O'],
  ['British', ['B', 'r', 'i', 't', 'i', 's', 'h'], 'B-MISC'],
  ['lamb', ['l', 'a', 'm', 'b'], 'O'],
  ['.', ['.'], 'O']],
 [['Peter', ['P', 'e', 't', 'e', 'r'], 'B-PER'],
  ['Blackburn', ['B', 'l', 'a', 'c', 'k', 'b', 'u', 'r', 'n'], 'I-PER']],
 [['BRUSSELS', ['B', 'R', 'U', 'S', 'S', 'E', 'L', 'S'], 'B-LOC'],
  ['1996-08-22', ['1', '9', '9', '6', '-', '0', '8', '-', '2', '2'], 'O']],
 [['The', ['T', 'h', 'e'], 'O'],
  ['European', ['E', 'u', 'r', 'o', 'p', 'e', 'a', 'n'], 'B-ORG'],
  ['Commission', ['C', 'o', 'm', 'm', 'i', 's', 's', 'i', 'o', 'n'], 'I-ORG'],
  ['said', ['s', 'a', 'i', 'd'], 'O'],
  ['on', ['o', 'n'], 'O'],
  ['Thursday', ['T', 'h', 'u', 'r', 's', 'd', 'a', 'y'], 'O'],
  ['it', ['i', 't'], 'O'],
  ['disag

In [13]:
# Attach character lists to every validation token.
validSentences=addCharInformation(validSentences)

In [14]:
# Attach character lists to every test token.
testSentences=addCharInformation(testSentences)

## 3. 단어와 label의 중복 제거

In [15]:
# Collects the distinct tag strings seen in the data.
labelSet=set()
# Lowercased token -> True; the dict is used as a vocabulary set.
words={}

In [17]:
# Walk all three splits to collect the tag set and the lowercased vocabulary.
for dataset in [trainSentences,validSentences,testSentences]:
    for sentence in dataset:
        for token,char,label in sentence:
            labelSet.add(label)       # the set keeps each tag only once
            words[token.lower()]=True # lowercase the word; dict keys keep each word only once

In [18]:
# Number of distinct lowercased tokens across the three splits.
len(words)

26869

> labelSet에 index 붙여주기

In [20]:
# Show the collected tag set.
labelSet

{'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'}

In [21]:
# Assign every tag a contiguous integer id.
# Iterating a set of strings has no stable order between interpreter runs, so
# the labels are sorted first; this keeps the ids (and the idx2Label mapping
# saved later in the notebook) reproducible across kernel restarts.
label2Idx={}
for label in sorted(labelSet):
    label2Idx[label]=len(label2Idx)


## 4. word features

In [18]:
# Casing class name -> integer id; these are the keys getCasing looks up.
case2Idx={'numeric':0,'allLower':1,'allUpper':2,'initialUpper':3,'other':4,'mainly_numeric':5,'contains_digit':6,'PADDING_TOKEN':7}

> casing index를 정수 그대로 쓰면 숫자의 크기가 의미를 갖게 되므로, 8차원 단위 행렬(one-hot vector)로 만들어준다.

In [19]:
# Identity matrix: row i is the one-hot vector for casing class i.
caseEmbeddings=np.identity(len(case2Idx),dtype='float32')

## 5. gloVe Embedding - 40만개 단어 , 100차원

In [20]:
# word -> row index into wordEmbeddings.
word2Idx={}
# Embedding vectors, appended in the same order as the indices in word2Idx.
wordEmbeddings=[]
# Handle on the GloVe text file; it is iterated line by line in the next cell.
fEmbeddings=open("./dataset/glove.6B.100d.txt",encoding='UTF-8')

In [21]:
# Read the embedding file line by line and keep only the vectors of words
# that occur (lowercased) in the dataset vocabulary `words`.
for line in fEmbeddings:
    split=line.strip().split(" ") # strip removes surrounding whitespace
    word=split[0]  # first field is the word, the rest is its vector

    # On the first line, reserve index 0 for padding and index 1 for unknown
    # words; the vector size is taken from this line.
    if(len(word2Idx)==0):
        word2Idx["PADDING_TOKEN"]=len(word2Idx)
        vector=np.zeros(len(split)-1)
        wordEmbeddings.append(vector)

        word2Idx["UNKNOWN_TOKEN"]=len(word2Idx)
        # NOTE(review): no random seed is set in the visible cells, so this
        # vector differs between runs.
        vector=np.random.uniform(-0.25,0.25,len(split)-1)
        wordEmbeddings.append(vector)

    if split[0].lower() in words:
        vector=np.array([float(num) for num in split[1:]])
        wordEmbeddings.append(vector)
        word2Idx[split[0]]=len(word2Idx)

# Release the file handle once every line has been consumed.
fEmbeddings.close()

In [22]:
# Stack the collected vectors into one numpy array; it is used as the Embedding weights later.
wordEmbeddings=np.array(wordEmbeddings)

## 6. Char Features - 단어들의 특징, CNN 학습 데이터

In [23]:
# Character -> integer id; ids 0 and 1 are reserved for the two special entries below.
char2Idx={"PADDING":0,"UNKNOWN":1}

In [24]:
# Give every character of this fixed alphabet the next free id (starting at 2).
for c in "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
    char2Idx[c]=len(char2Idx)

## 7. PADDING - 글자들을 cnn을 위한 padding 작업

In [25]:
def padding(Sentences,maxlen=52):
    """Pad the per-word character indices of every sentence, in place.

    The element at index 2 of each entry (the list of per-word character
    index lists, as built by createMatrices) is replaced by the result of
    ``pad_sequences(..., maxlen, padding='post')``.

    The previous version computed the longest word length but never used it
    and always padded to 52; that dead computation is removed and the length
    is now a parameter with the same default.

    Args:
        Sentences: List of entries whose element at index 2 holds the
            character index lists of one sentence.
        maxlen: Target length per word. The default of 52 matches the
            character input width of the model defined in this notebook.
            Longer words are cut according to pad_sequences' default
            truncating mode.

    Returns:
        The same ``Sentences`` object, after modification.
    """
    for sentence in Sentences:
        sentence[2]=pad_sequences(sentence[2],maxlen,padding='post')
    return Sentences

## 8. WordFeatures - 단어의 특징을 분석

In [26]:
def getCasing(word,caseLookup):
    """Classify the casing/digit pattern of a word and return its id.

    The checks run in this order and the first match wins: all digits,
    more than half digits, all lowercase, all uppercase, uppercase first
    character, contains any digit. Anything else is 'other'.

    Args:
        word: The token text.
        caseLookup: Mapping from casing class name to integer id.

    Returns:
        ``caseLookup[casing]`` for the detected class. An empty word is
        classified as 'other' instead of raising ZeroDivisionError.
    """
    # Guard: the digit fraction below divides by len(word).
    if not word:
        return caseLookup['other']

    casing='other'

    numDigits=sum(1 for char in word if char.isdigit())
    digitFraction=numDigits/float(len(word))

    if(word.isdigit()):
        casing='numeric'
    elif digitFraction>0.5:
        casing='mainly_numeric'
    elif word.islower():
        casing='allLower'
    elif word.isupper():
        casing='allUpper'
    elif word[0].isupper():
        casing='initialUpper'
    elif numDigits>0:
        casing='contains_digit'

    return caseLookup[casing]

## 9. Embedding 단어와 txt 파일의 단어를 비교

> GloVe에는 40만 개의 단어가 있기 때문에, 이를 실제 데이터의 문장과 비교하여
존재하는 단어들의 vector만 가져와 저장하는 함수이다.

In [27]:
def createMatrices(sentences,word2Idx,label2Idx,case2Idx,char2Idx):
    """Convert sentences of [word, characters, tag] into index lists.

    Args:
        sentences: List of sentences, each a list of [word, chars, label].
        word2Idx: Word -> index mapping; must contain 'UNKNOWN_TOKEN'.
        label2Idx: Tag -> index mapping.
        case2Idx: Casing class -> index mapping, passed on to getCasing.
        char2Idx: Character -> index mapping; characters it does not
            contain are mapped to its 'UNKNOWN' entry.

    Returns:
        A list with one entry per sentence:
        [wordIndices, caseIndices, charIndices, labelIndices].
    """
    unknownIdx=word2Idx['UNKNOWN_TOKEN']

    dataset=[]

    for sentence in sentences:
        wordIndices=[]
        caseIndices=[]
        charIndices=[]
        labelIndices=[]

        for word,char,label in sentence:
            # Try the exact spelling first, then the lowercased one.
            if word in word2Idx:
                wordIdx=word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx=word2Idx[word.lower()]
            else:
                wordIdx=unknownIdx
            # Fall back to the reserved 'UNKNOWN' id instead of raising
            # KeyError for characters outside the fixed alphabet.
            charIdx=[char2Idx[x] if x in char2Idx else char2Idx['UNKNOWN'] for x in char]

            wordIndices.append(wordIdx)
            caseIndices.append(getCasing(word,case2Idx))
            charIndices.append(charIdx)
            labelIndices.append(label2Idx[label])

        dataset.append([wordIndices,caseIndices,charIndices,labelIndices])

    return dataset

In [28]:
# Convert every split to index lists and pad the character indices.
train_set=padding(createMatrices(trainSentences,word2Idx,label2Idx,case2Idx,char2Idx))
# The validation split is named validSentences in this notebook; the former
# reference to devSentences raised NameError on a fresh kernel.
dev_set=padding(createMatrices(validSentences,word2Idx,label2Idx,case2Idx,char2Idx))
test_set=padding(createMatrices(testSentences,word2Idx,label2Idx,case2Idx,char2Idx))

In [29]:
# Inverse mapping: integer id -> tag string.
idx2Label={v:k for k,v in label2Idx.items()}
# Persist both mappings. np.save stores a dict as a pickled object array, so
# reading it back with np.load requires allow_pickle=True.
np.save("./output/idx2Label.npy",idx2Label)
np.save("./output/word2Idx.npy",word2Idx)

## 10. 단어들의 길이를 기준으로 1부터 재 정렬하여 리스트에 저장

In [30]:
def createBatches(data):
    """Group samples by sentence length so that each batch is rectangular.

    The sentence length is ``len(sample[0])``. Samples are bucketed in a
    single pass (the previous version rescanned the whole data set once per
    distinct length) and the buckets are emitted in ascending length order
    rather than in set iteration order. Inside a bucket the original order
    of the samples is kept.

    Args:
        data: List of samples; element 0 of each sample is its word list.

    Returns:
        A tuple ``(batches, batch_len)``: ``batches`` is the regrouped list
        of samples and ``batch_len`` holds the cumulative end offset of each
        length group inside ``batches``.
    """
    buckets={}
    for sample in data:
        buckets.setdefault(len(sample[0]),[]).append(sample)

    batches=[]
    batch_len=[]
    for length in sorted(buckets):
        batches.extend(buckets[length])
        batch_len.append(len(batches))  # running total = end offset of this group

    return batches,batch_len

In [31]:
# Regroup every split by sentence length; *_batch_len holds the group end offsets.
train_batch,train_batch_len=createBatches(train_set)
dev_batch,dev_batch_len=createBatches(dev_set)
test_batch,test_batch_len=createBatches(test_set)

## 11. 배열로 만들기

In [32]:
def iterate_minibatches(dataset,batch_len):
    """Yield one tuple of arrays per end offset listed in ``batch_len``.

    Consecutive offsets delimit slices of ``dataset``; every sample in a
    slice is unpacked into (tokens, casing, chars, labels).

    Args:
        dataset: List of 4-element samples.
        batch_len: Increasing end offsets into ``dataset``.

    Yields:
        ``(labels, tokens, casing, char)`` as numpy arrays built with
        np.asarray; each label sequence gets a trailing axis of size 1.
    """
    lower=0
    for upper in batch_len:
        chunk=dataset[lower:upper]
        lower=upper
        # np.expand_dims(..., -1) turns every label list into a column.
        label_rows=[np.expand_dims(lab,-1) for _,_,_,lab in chunk]
        token_rows=[tok for tok,_,_,_ in chunk]
        case_rows=[cas for _,cas,_,_ in chunk]
        char_rows=[chars for _,_,chars,_ in chunk]
        yield np.asarray(label_rows),np.asarray(token_rows),np.asarray(case_rows),np.asarray(char_rows)

## 12. Deep Learning Model 구성

> TimeDistributed : 3차원 텐서를 입력 받을 수 있도록 확장

> char는 52차원으로 vector

In [33]:
# Word branch: frozen embeddings looked up by word index.
words_input = Input(shape=(None,),dtype='int32',name='words_input')
# Bound to `words_embedded` rather than `words`, so the vocabulary dict
# `words` built earlier in the notebook is not overwritten by a tensor
# (the GloVe loading cell tests membership in that dict).
words_embedded = Embedding(input_dim=wordEmbeddings.shape[0], output_dim=wordEmbeddings.shape[1],  weights=[wordEmbeddings], trainable=False)(words_input)

# Casing branch: frozen one-hot rows from the identity matrix caseEmbeddings.
casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
casing = Embedding(output_dim=caseEmbeddings.shape[1], input_dim=caseEmbeddings.shape[0], weights=[caseEmbeddings], trainable=False)(casing_input)

# Character branch: 52 character ids per word (the padding length used in
# `padding`), embedded, convolved and max-pooled over the whole character axis.
character_input=Input(shape=(None,52,),name='char_input')
embed_char_out=TimeDistributed(Embedding(len(char2Idx),30,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name='char_embedding')(character_input)
dropout= Dropout(0.5)(embed_char_out)
conv1d_out= TimeDistributed(Conv1D(kernel_size=3, filters=30, padding='same',activation='tanh', strides=1))(dropout)
maxpool_out=TimeDistributed(MaxPooling1D(52))(conv1d_out)
char = TimeDistributed(Flatten())(maxpool_out)
char = Dropout(0.5)(char)

# Concatenate the three per-word feature vectors and tag every time step.
output = concatenate([words_embedded, casing,char])
output = Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25))(output)
output = TimeDistributed(Dense(len(label2Idx), activation='softmax'))(output)

model = Model(inputs=[words_input, casing_input,character_input], outputs=[output])
# Sparse loss: the targets are integer label ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, 52)     0                                            
__________________________________________________________________________________________________
char_embedding (TimeDistributed (None, None, 52, 30) 2820        char_input[0][0]                 
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, None, 52, 30) 0           char_embedding[0][0]             
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, None, 52, 30) 2730        dropout_1[0][0]                  
____________________________________________________________________________________________

In [34]:
# Number of passes over the training batches in the training loop below.
epochs=50

In [35]:
for epoch in range(epochs):
    # Report epochs 1-based so the last line reads "Epoch 50/50".
    print("Epoch %d/%d"%(epoch+1,epochs))

    a=Progbar(len(train_batch_len))
    # Counting from 1 makes the bar show the number of finished batches and
    # removes the trailing update that failed with NameError when the
    # generator yielded nothing.
    for i,batch in enumerate(iterate_minibatches(train_batch,train_batch_len),start=1):
        labels,tokens,casing,char=batch
        model.train_on_batch([tokens,casing,char],labels)
        a.update(i)
    print(' ')
# model.save("./output/model.h5")

Epoch 0/50


UnknownError:  Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node time_distributed_1/convolution (defined at C:\Users\ezcare14\Anaconda3\envs\shyun\lib\site-packages\tensorflow_core\python\framework\ops.py:1751) ]] [Op:__inference_keras_scratch_graph_4804]

Function call stack:
keras_scratch_graph


-----------------------------------------

## 13. Evaluation

In [36]:
from validation import compute_f1
import random

In [37]:
# Inverse mapping: word index -> word string, used when printing predictions.
idx2Word = {v:k for k,v in word2Idx.items()}

In [38]:
def tag_dataset(dataset):
    """Run the global ``model`` on every sample and collect its predictions.

    Each sample is predicted on its own, wrapped in a batch of size one.

    Args:
        dataset: Sequence of (tokens, casing, char, labels) samples.

    Returns:
        A tuple ``(token_list, predLabels, correctLabels)`` with one entry
        per sample: the token index array, the argmax class per position
        and the labels taken from the sample.
    """
    token_list = []
    correctLabels = []
    predLabels = []

    b = Progbar(len(dataset))

    # Counting from 1 reports the number of finished samples and avoids the
    # unbound loop variable the old trailing update hit on an empty dataset.
    for step,data in enumerate(dataset,start=1):
        tokens, casing,char, labels = data
        # Wrap each input in a list to add the batch axis.
        tokens = np.asarray([tokens])
        casing = np.asarray([casing])
        char = np.asarray([char])

        token_list.append(tokens[0])
        pred = model.predict([tokens, casing,char], verbose=False)[0]
        pred = pred.argmax(axis=-1) # most probable class per position

        correctLabels.append(labels)
        predLabels.append(pred)

        b.update(step)

    return token_list, predLabels, correctLabels

In [39]:
def evaluation_print(token, pred, cor):
    """Print a small random sample of per-word predictions.

    Ten rows are drawn with ``random.choice`` (repeats are possible). Rows
    whose word index equals 1 are drawn but not printed, so fewer than ten
    lines may appear.

    Args:
        token: Per-sentence sequences of word indices.
        pred: Per-sentence sequences of predicted label indices.
        cor: Per-sentence sequences of correct label indices.
    """
    # Flatten the sentences into [word, predicted, correct] rows.
    rows = [
        [word_idx, pred_idx, gold_idx]
        for sent_tokens, sent_pred, sent_gold in zip(token, pred, cor)
        for word_idx, pred_idx, gold_idx in zip(sent_tokens, sent_pred, sent_gold)
    ]

    print("{:15}|{:5}|{:5}|{}".format("단어", "예측 값", "실제 값","정답확인"))
    print(35*'-')

    for _ in range(10):
        word_idx, pred_idx, gold_idx = random.choice(rows)
        # 'O' marks a correct prediction, 'X' a wrong one.
        check = 'O' if pred_idx == gold_idx else 'X'

        # Index 1 is UNKNOWN_TOKEN in the word2Idx built by this notebook.
        if word_idx != 1:
            print('{:17}:{:9}{:9}{}'.format(idx2Word[word_idx], idx2Label[pred_idx], idx2Label[gold_idx], check))

------------------------------------------------

### 13.1. Validation data

In [40]:
# Predict labels for every validation sample.
token_dev, predLabels_dev, correctLabels_dev = tag_dataset(dev_batch)

UnknownError:  Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node time_distributed_1/convolution (defined at C:\Users\ezcare14\Anaconda3\envs\shyun\lib\site-packages\tensorflow_core\python\framework\ops.py:1751) ]] [Op:__inference_keras_scratch_graph_5179]

Function call stack:
keras_scratch_graph


In [41]:
# Print a random sample of validation predictions next to the correct labels.
evaluation_print(token_dev, predLabels_dev, correctLabels_dev)

NameError: name 'token_dev' is not defined

In [42]:
# compute_f1 comes from the project's `validation` module; its result is
# unpacked here as precision, recall and F1 (presumably entity-level — verify there).
pre_dev, rec_dev, f1_dev = compute_f1(predLabels_dev, correctLabels_dev, idx2Label)
print("Dev-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" % (pre_dev, rec_dev, f1_dev))

NameError: name 'predLabels_dev' is not defined

-------------------------------

### 13.2. Test data

In [43]:
# Predict labels for every test sample.
token_test, predLabels_test, correctLabels_test = tag_dataset(test_batch)

UnknownError:  Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node time_distributed_1/convolution (defined at C:\Users\ezcare14\Anaconda3\envs\shyun\lib\site-packages\tensorflow_core\python\framework\ops.py:1751) ]] [Op:__inference_keras_scratch_graph_5179]

Function call stack:
keras_scratch_graph


In [44]:
# Print a random sample of test predictions next to the correct labels.
evaluation_print(token_test, predLabels_test, correctLabels_test)

NameError: name 'token_test' is not defined

In [45]:
# compute_f1 comes from the project's `validation` module; its result is
# unpacked here as precision, recall and F1 for the test split.
pre_test, rec_test, f1_test= compute_f1(predLabels_test, correctLabels_test, idx2Label)
print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" % (pre_test, rec_test, f1_test))

NameError: name 'predLabels_test' is not defined