In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset read later lives under that path.
drive.mount('/content/drive')

Mounted at /content/drive


# 1. 케라스를 이용한 인공신경망 기계번역(Neural Machine Translation)



## 1) 데이터 정렬

In [None]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

# load doc into memory
def load_doc(filename):
	"""Read a UTF-8 text file and return its whole content as one string.

	Args:
		filename: path of the text file to read.

	Returns:
		The complete text of the file.

	Raises:
		OSError: if the file cannot be opened.
		UnicodeDecodeError: if the content is not valid UTF-8.
	"""
	# the context manager closes the handle even when read() raises
	with open(filename, mode='rt', encoding='utf-8') as file:
		return file.read()

# split a loaded document into sentences
def to_pairs(doc):
	"""Split a document into rows, and each row into its tab-separated fields.

	Leading/trailing whitespace of the whole document is stripped first.

	Args:
		doc: text with one record per line and fields separated by tabs.

	Returns:
		A list with one list of field strings per line.
	"""
	rows = []
	for row in doc.strip().split('\n'):
		rows.append(row.split('\t'))
	return rows

# clean a list of lines
def clean_pairs(lines):
	"""Lower-case every sentence and delete ASCII punctuation from its tokens.

	Each sentence is split on whitespace, every token is lower-cased and
	stripped of the characters in ``string.punctuation``, and the tokens are
	joined back with single spaces. Only ASCII punctuation is removed; other
	characters (e.g. Hangul) pass through untouched.

	Args:
		lines: iterable of pairs, each pair being an iterable of sentence strings.

	Returns:
		A numpy array holding the cleaned sentences, one row per input pair.
	"""
	# translation table that deletes every character in string.punctuation
	table = str.maketrans('', '', string.punctuation)
	cleaned = [
		[' '.join(word.lower().translate(table) for word in line.split()) for line in pair]
		for pair in lines
	]
	return array(cleaned)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	"""Pickle ``sentences`` into ``filename`` and print a confirmation line.

	Args:
		sentences: any picklable object (here, the array of cleaned pairs).
		filename: destination path; an existing file is overwritten.

	Raises:
		OSError: if the file cannot be opened for writing.
	"""
	# close the handle deterministically so the pickle is fully flushed to disk
	with open(filename, 'wb') as file:
		dump(sentences, file)
	print('Saved: %s' % filename)

# load dataset
filename = '/content/drive/MyDrive/keTM.txt'
doc = load_doc(filename)
# split into korean-english pairs (column 0 is Korean, column 1 is English)
pairs = to_pairs(doc)
# clean sentences; the result gets its own name so the clean_pairs() function
# is not overwritten and this cell can be run again
cleaned_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(cleaned_pairs, 'korean-english.pkl')	# creates the pickle file holding the data
# spot check, without reading past the end of a short dataset
for i in range(min(100, len(cleaned_pairs))):
	print('[%s] => [%s]' % (cleaned_pairs[i,0], cleaned_pairs[i,1]))

Saved: korean-english.pkl
[제1조목적] => [article 1 purpose]
[이 법은 발명을 보호ㆍ장려하고 그 이용을 도모함으로써 기술의 발전을 촉진하여 산업발전에 이바지함을 목적으로 한다] => [the purpose of this act is to promote the development of technologies and to contribute to industrial development by protecting and supporting inventions and promoting the use of inventions]
[제2조정의] => [article 2 definitions]
[이 법에서 사용하는 용어의 뜻은 다음과 같다] => [the terms used in this act shall be defined as follows]
[발명이란 자연법칙을 이용한 기술적 사상의 창작으로서 고도高度한 것을 말한다] => [the term invention means the highly advanced creation of a technical idea utilizing the laws of nature]
[특허발명이란 특허를 받은 발명을 말한다] => [the term patented invention means an invention for which a patent has been granted]
[실시란 다음 각 목의 구분에 따른 행위를 말한다] => [the term practice means any of the following activities]
[물건의 발명인 경우 그 물건을 생산ㆍ사용ㆍ양도ㆍ대여 또는 수입하거나 그 물건의 양도 또는 대여의 청약양도 또는 대여를 위한 전시를 포함한다이하 같다을 하는 행위] => [an invention of a thing manufacturing using assigning leasing or importing the thing or offering to assign or l

## 2) 데이터 분리

In [None]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
	"""Unpickle and return the object stored in ``filename``.

	Args:
		filename: path of a pickle file.

	Returns:
		Whatever object was pickled into the file.

	Raises:
		OSError: if the file cannot be opened.
	"""
	# NOTE: unpickling can execute arbitrary code; only load trusted files
	with open(filename, 'rb') as file:
		return load(file)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	"""Pickle ``sentences`` into ``filename`` and print a confirmation line.

	Args:
		sentences: any picklable object.
		filename: destination path; an existing file is overwritten.

	Raises:
		OSError: if the file cannot be opened for writing.
	"""
	# close the handle deterministically so the pickle is fully flushed to disk
	with open(filename, 'wb') as file:
		dump(sentences, file)
	print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('korean-english.pkl')

# reduce dataset size
n_sentences = 3000
# number of (shuffled) rows used for training; the remainder becomes the test set
n_train = 2000
dataset = raw_dataset[:n_sentences, :]

# random shuffle, seeded so that the train/test split is the same on every run
from numpy.random import seed
seed(1)
shuffle(dataset)
# split into train/test
train, test = dataset[:n_train], dataset[n_train:]

# save
save_clean_data(dataset, 'korean-english-both.pkl')
save_clean_data(train, 'korean-english-train.pkl')
save_clean_data(test, 'korean-english-test.pkl')

Saved: korean-english-both.pkl
Saved: korean-english-train.pkl
Saved: korean-english-test.pkl


## 3) NMT 모델 생성

In [None]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector	# BPE와 비슷
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# load a clean dataset
def load_clean_sentences(filename):
	"""Unpickle and return the object stored in ``filename``.

	Args:
		filename: path of a pickle file.

	Returns:
		Whatever object was pickled into the file.

	Raises:
		OSError: if the file cannot be opened.
	"""
	# NOTE: unpickling can execute arbitrary code; only load trusted files
	with open(filename, 'rb') as file:
		return load(file)

# fit a tokenizer
def create_tokenizer(lines):
	"""Return a new Keras ``Tokenizer`` fitted on ``lines``.

	Args:
		lines: the sentences to fit on.

	Returns:
		The fitted tokenizer.
	"""
	fitted = Tokenizer()
	fitted.fit_on_texts(lines)
	return fitted

# max sentence length
def max_length(lines):
	"""Return the largest number of whitespace-separated tokens in any line.

	Args:
		lines: non-empty iterable of sentence strings.

	Returns:
		The token count of the longest sentence.
	"""
	token_counts = (len(text.split()) for text in lines)
	return max(token_counts)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	"""Turn sentences into integer id sequences padded to a common length.

	Args:
		tokenizer: fitted tokenizer providing ``texts_to_sequences``.
		length: ``maxlen`` handed to ``pad_sequences``.
		lines: the sentences to encode.

	Returns:
		The result of ``pad_sequences`` with zeros appended after each
		sequence (``padding='post'``).
	"""
	return pad_sequences(tokenizer.texts_to_sequences(lines), maxlen=length, padding='post')

# one hot encode target sequence
def encode_output(sequences, vocab_size):
	"""One-hot encode every token id of a 2-D array of padded sequences.

	Args:
		sequences: 2-D array of integer token ids (rows are sentences).
		vocab_size: number of classes for the one-hot encoding.

	Returns:
		Array of shape ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
	"""
	n_rows, n_steps = sequences.shape[0], sequences.shape[1]
	# encode sentence by sentence, then stack into a single array
	one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
	return one_hot.reshape(n_rows, n_steps, vocab_size)

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	"""Build the (uncompiled) sequence-to-sequence translation model.

	Args:
		src_vocab: size of the source vocabulary (embedding input dimension).
		tar_vocab: size of the target vocabulary (softmax width).
		src_timesteps: length of the padded source sequences.
		tar_timesteps: length of the padded target sequences.
		n_units: embedding size and number of units in both LSTM layers.

	Returns:
		A Keras ``Sequential`` model.
	"""
	layers = [
		# source token ids -> dense vectors; mask_zero=True masks the padding id 0
		Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
		# encoder: reduces the whole source sequence to one n_units vector
		LSTM(n_units),
		# repeat that vector once per target time step to feed the decoder
		RepeatVector(tar_timesteps),
		# decoder: emits one hidden state per target time step
		LSTM(n_units, return_sequences=True),
		# per-time-step softmax over the target vocabulary
		TimeDistributed(Dense(tar_vocab, activation='softmax')),
	]
	model = Sequential()
	for layer in layers:
		model.add(layer)
	return model

# load datasets
dataset = load_clean_sentences('korean-english-both.pkl')
train = load_clean_sentences('korean-english-train.pkl')
test = load_clean_sentences('korean-english-test.pkl')

# NOTE: the eng_*/ger_* names look carried over from an English-German version
# of this code. In this notebook eng_* describes column 0 (Korean) and ger_*
# describes column 1 (English).
# prepare korean tokenizer (column 0)
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('Korean Vocabulary Size: %d' % eng_vocab_size)
print('Korean Max Length: %d' % (eng_length))
# prepare english tokenizer (column 1)
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('English Vocabulary Size: %d' % ger_vocab_size)
print('English Max Length: %d' % (ger_length))

# prepare training data: inputs are English (column 1), targets are Korean (column 0)
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
# one-hot targets have shape (rows, eng_length, eng_vocab_size), which is large
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

# define model (English in, Korean out)
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model; summary() prints by itself and returns None, so it
# must not be wrapped in print()
model.summary()
plot_model(model, to_file='model.png', show_shapes=True)
# fit model, keeping only the weights with the lowest validation loss
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Korean Vocabulary Size: 8695
Korean Max Length: 72
English Vocabulary Size: 3662
English Max Length: 151
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 151, 256)          937472    
_________________________________________________________________
lstm (LSTM)                  (None, 256)               525312    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 72, 256)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 72, 256)           525312    
_________________________________________________________________
time_distributed (TimeDistri (None, 72, 8695)          2234615   
Total params: 4,222,711
Trainable params: 4,222,711
Non-trainable params: 0
_________________________________________________________________
None
Ep

<tensorflow.python.keras.callbacks.History at 0x7f86704725c0>

## 4) NMT 모델 평가 및 예측

In [None]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu	 # BLEU 계산

# load a clean dataset
def load_clean_sentences(filename):
	"""Unpickle and return the object stored in ``filename``.

	Args:
		filename: path of a pickle file.

	Returns:
		Whatever object was pickled into the file.

	Raises:
		OSError: if the file cannot be opened.
	"""
	# NOTE: unpickling can execute arbitrary code; only load trusted files
	with open(filename, 'rb') as file:
		return load(file)

# fit a tokenizer
def create_tokenizer(lines):
	"""Return a new Keras ``Tokenizer`` fitted on ``lines``.

	Args:
		lines: the sentences to fit on.

	Returns:
		The fitted tokenizer.
	"""
	fitted = Tokenizer()
	fitted.fit_on_texts(lines)
	return fitted

# max sentence length
def max_length(lines):
	"""Return the largest number of whitespace-separated tokens in any line.

	Args:
		lines: non-empty iterable of sentence strings.

	Returns:
		The token count of the longest sentence.
	"""
	token_counts = (len(text.split()) for text in lines)
	return max(token_counts)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	"""Turn sentences into integer id sequences padded to a common length.

	Args:
		tokenizer: fitted tokenizer providing ``texts_to_sequences``.
		length: ``maxlen`` handed to ``pad_sequences``.
		lines: the sentences to encode.

	Returns:
		The result of ``pad_sequences`` with zeros appended after each
		sequence (``padding='post'``).
	"""
	return pad_sequences(tokenizer.texts_to_sequences(lines), maxlen=length, padding='post')

# map an integer to a word
def word_for_id(integer, tokenizer):
	"""Look up the word whose index in ``tokenizer.word_index`` equals ``integer``.

	Args:
		integer: the token id to look up.
		tokenizer: object exposing a ``word_index`` mapping of word -> id.

	Returns:
		The first matching word, or ``None`` when no word has that id.
	"""
	matches = (word for word, index in tokenizer.word_index.items() if index == integer)
	return next(matches, None)

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	"""Translate one encoded source sequence into a target-language string.

	The most probable token id is taken at every time step (greedy decoding)
	and mapped back to a word. Decoding stops at the first id that has no
	entry in the tokenizer's vocabulary (for example the 0 used for padding).

	Args:
		model: object whose ``predict(source, verbose=0)`` returns, for each
			input row, one score vector per target time step.
		tokenizer: target-language tokenizer exposing ``word_index``.
		source: a batch containing the single encoded source sequence.

	Returns:
		The decoded words joined by single spaces (possibly an empty string).
	"""
	prediction = model.predict(source, verbose=0)[0]
	# invert the vocabulary once instead of scanning word_index at every time step
	index_to_word = {index: word for word, index in tokenizer.word_index.items()}
	target = list()
	for vector in prediction:
		word = index_to_word.get(int(argmax(vector)))
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	"""Translate every source sequence and report corpus BLEU-1..4.

	The first ten examples are printed for inspection.

	Args:
		model: trained translation model.
		tokenizer: tokenizer of the language the model OUTPUTS; used to turn
			predicted ids back into words.
		sources: 2-D array of encoded source sequences, one row per sentence.
		raw_dataset: the text pairs aligned with ``sources``. Column 0 is the
			reference (target) sentence and column 1 the source sentence,
			matching how this notebook trains the model (inputs encoded from
			column 1, outputs from column 0).
	"""
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		# decode with the tokenizer that was passed in, not with a global one
		translation = predict_sequence(model, tokenizer, source)
		# column 0 is in the model's output language, so it is the reference
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		# corpus_bleu expects a list of reference token lists per hypothesis
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	# NOTE(review): these weights sum to 0.9 rather than 1 — confirm this is intended
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# load datasets
dataset = load_clean_sentences('korean-english-both.pkl')
train = load_clean_sentences('korean-english-train.pkl')
test = load_clean_sentences('korean-english-test.pkl')
# prepare the column-0 tokenizer; despite the eng_ prefix, column 0 holds the Korean sentences
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare the column-1 tokenizer; despite the ger_ prefix, column 1 holds the English sentences
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data: encode the model inputs from column 1, as the training cell does
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load the checkpoint written during training
model = load_model('model.h5')
# test on some training sequences (eng_tokenizer is the output-language tokenizer)
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

train
src=[재정을 청구하는 자가 제출하여야 하는 서류 그 밖에 재정에 관하여 필요한 사항은 대통령령으로 정한다], target=[documents to be submitted by petitioners for adjudication and other matters necessary for adjudication shall be prescribed by presidential decree], predicted=[and and and and and and]
src=[제2항에 따라 보정 후의 청구범위에 대한 국어번역문을 제출하는 경우에는 제204조제1항 및 제2항을 적용하지 아니한다], target=[article 204 1 and 2 shall not apply where a korean translation of the amended claims is submitted under paragraph 2], predicted=[and and and and and and]
src=[제2항에도 불구하고 산업통상자원부령으로 정하는 보완수수료를 납부한 경우에는 다음 각 호의 어느 하나에 해당하는 기간에 제1항제1호를 적용받으려는 취지를 적은 서류 또는 이를 증명할 수 있는 서류를 제출할 수 있다], target=[notwithstanding paragraph 2 if the amendment fee prescribed by ordinance of the ministry of trade industry and energy has been paid documents stating the willingness to become entitled to the application of paragraph 1 1 or documents evidencing such willingness may be submitted during the period set in either of the following subparagraphs], predicted=[and and and and

Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.008424
BLEU-2: 0.001295
BLEU-3: 0.004924
BLEU-4: 0.006875
test
src=[제23조과학기술인력의 양성ㆍ활용], target=[article 23 training and utilization of human resources in science and technology], predicted=[and and and]
src=[선출원이 「실용신안법」 제35조제4항에 따라 취하한 것으로 보는 국제실용신안등록출원인 경우], target=[where the earlier application is an international application for registration of a utility model deemed voluntarily withdrawn under article 35 4 of the utility model act], predicted=[and and and and]
src=[그 밖에 정보통신망을 운영하는 자로서 대통령령으로 정하는 자], target=[other persons specified by presidential decree among those who operate an information and communications network], predicted=[and and and and and]
src=[제35조제3항·제4항 제36조제2항·제4항 또는 제37조제3항을 위반하여 정보주체에게 알려야 할 사항을 알리지 아니한 자], target=[a person who fails to notify a data subject of necessary information in violation of article 35 3 and 4 36 2 and 4 or 37 3], predicted=[and and and and and]
src=[제31조개인정보 보호책임자의 지정], target=[article 31 designation of privacy officers], predi

In [None]:
# Change the working directory to /content.
%cd /content

/content


In [None]:
# Switch into the Drive folder that receives the copied file.
%cd /content/drive/My Drive/mydata

# NOTE(review): no cell in this notebook writes english-german.pkl (the pickles
# saved earlier are named korean-english*.pkl) — confirm this is the intended file.
!cp /content/english-german.pkl .

In [None]:
# Call the function: without the parentheses the cell only displays the function
# object and Drive is neither flushed nor unmounted.
drive.flush_and_unmount()

<function google.colab.drive.flush_and_unmount>