https://github.com/nlpai-lab/nlp-bible-code/blob/master/25%EC%9E%A5_%EB%94%A5%EB%9F%AC%EB%8B%9D%20%EA%B8%B0%EB%B0%98%20%EA%B8%B0%EA%B3%84%EB%B2%88%EC%97%AD/%5B25-2%5DKeras%EB%A5%BC%20%EC%9D%B4%EC%9A%A9%ED%95%9C%20Transformer%20%EC%8B%A4%EC%8A%B5.ipynb

In [None]:
!pip install keras-transformer

In [None]:
import numpy as np
from keras_transformer import get_model

# Example sentence, split on single spaces into word-level tokens.
tokens = '안녕하세요 저의 이름은 박찬준입니다. 만나서 반갑습니다. 저는 자연언어처리를 전공으로 하고 있습니다.'.split(' ')

# Vocabulary: the three special tokens take ids 0-2, and every distinct word
# of the sentence gets the next free id in order of first appearance.
token_dict = {
    '<PAD>': 0,
    '<START>': 1,
    '<END>': 2,
}
for word in tokens:
    token_dict.setdefault(word, len(token_dict))

# Training examples: the sentence is cut at every interior position; the
# prefix becomes the encoder input and the suffix the decoder input/target.
encoder_inputs_no_padding = []
encoder_inputs, decoder_inputs, decoder_outputs = [], [], []

seq_len = len(tokens)
for split_at in range(1, seq_len - 1):
    src_words = tokens[:split_at]
    tgt_words = tokens[split_at:]

    # Padding brings every sequence to seq_len + 2 symbols.
    src_pad = ['<PAD>'] * (seq_len - len(src_words))
    tgt_pad = ['<PAD>'] * (seq_len - len(tgt_words))

    src_seq = ['<START>', *src_words, '<END>', *src_pad]
    tgt_in_seq = ['<START>', *tgt_words, '<END>', *tgt_pad]
    # The target is the decoder input shifted left by one position
    # (no leading <START>, one extra trailing <PAD>).
    tgt_out_seq = [*tgt_words, '<END>', '<PAD>', *tgt_pad]

    src_ids = [token_dict[sym] for sym in src_seq]
    tgt_in_ids = [token_dict[sym] for sym in tgt_in_seq]
    # Each target id is wrapped in its own one-element list.
    tgt_out_ids = [[token_dict[sym]] for sym in tgt_out_seq]

    # <START> + prefix + <END>, i.e. the encoder input without its padding.
    encoder_inputs_no_padding.append(src_ids[:split_at + 2])
    encoder_inputs.append(src_ids)

    decoder_inputs.append(tgt_in_ids)
    decoder_outputs.append(tgt_out_ids)

# Build the model with keras_transformer.
# The embedding width is named once so that `embed_dim` and the shape of the
# initial embedding matrix below cannot drift apart.
embed_dim = 30
model = get_model(
    token_num=len(token_dict),
    embed_dim=embed_dim,
    encoder_num=3,
    decoder_num=2,
    head_num=3,
    hidden_dim=120,
    attention_activation='relu',
    feed_forward_activation='relu',
    dropout_rate=0.05,
    # Random initial embeddings, one row per vocabulary entry. The shape is
    # derived from token_dict instead of a hard-coded (14, 30) so the script
    # keeps working when the example sentence (and thus the vocabulary) changes.
    embed_weights=np.random.random((len(token_dict), embed_dim)),
)

# Compile the model.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
)

# Print the model summary.
model.summary()

# Train the model. Each example list is repeated 1000 times before being
# converted to an array, so the few examples form a larger training set.
model.fit(
    x=[np.asarray(encoder_inputs * 1000), np.asarray(decoder_inputs * 1000)],
    y=np.asarray(decoder_outputs * 1000),
    epochs=5,
)

In [None]:
import numpy as np
from keras_transformer import get_model, decode

# Source sentences, tokenized into words by splitting on single spaces.
source_tokens = [
    sentence.split(' ')
    for sentence in (
        '안녕하세요 저의 이름은 박찬준입니다.',
        '저는 24살입니다.',
    )
]

# Target sentences, tokenized into individual characters (spaces included).
target_tokens = [
    list(sentence)
    for sentence in (
        'Hello My name is Park Chanjun.',
        'I am 24 years old.',
    )
]

# Vocabulary builder.
def build_token_dict(token_list):
    """Build a token -> integer id mapping from tokenized sentences.

    The special tokens '<PAD>', '<START>' and '<END>' always receive ids
    0, 1 and 2. Every other distinct token is assigned the next free id in
    order of first appearance.

    Args:
        token_list: Iterable of sentences, each an iterable of hashable tokens.

    Returns:
        A dict mapping each token to its integer id.
    """
    vocab = {'<PAD>': 0, '<START>': 1, '<END>': 2}
    for sentence in token_list:
        for tok in sentence:
            # Only inserts when the token is new; its id is the current size.
            vocab.setdefault(tok, len(vocab))
    return vocab


source_token_dict = build_token_dict(source_tokens)  # source-side vocabulary
target_token_dict = build_token_dict(target_tokens)  # target-side vocabulary
# Inverse target vocabulary: id -> token.
target_token_dict_inv = dict((idx, tok) for tok, idx in target_token_dict.items())

# Add the special tokens. The decoder target is the decoder input shifted
# left by one position: no leading <START>, one extra trailing <PAD>.
encode_tokens = [['<START>', *sent, '<END>'] for sent in source_tokens]
decode_tokens = [['<START>', *sent, '<END>'] for sent in target_tokens]
output_tokens = [[*sent, '<END>', '<PAD>'] for sent in target_tokens]

# Padding: right-pad every sequence to the longest one on its side.
source_max_len = max(len(sent) for sent in encode_tokens)
target_max_len = max(len(sent) for sent in decode_tokens)

encode_tokens = [sent + ['<PAD>'] * (source_max_len - len(sent)) for sent in encode_tokens]
decode_tokens = [sent + ['<PAD>'] * (target_max_len - len(sent)) for sent in decode_tokens]
output_tokens = [sent + ['<PAD>'] * (target_max_len - len(sent)) for sent in output_tokens]

# Map tokens to ids. Each target id is wrapped in its own one-element list.
encode_input = [[source_token_dict[tok] for tok in sent] for sent in encode_tokens]
decode_input = [[target_token_dict[tok] for tok in sent] for sent in decode_tokens]
decode_output = [[[target_token_dict[tok]] for tok in sent] for sent in output_tokens]



# Build the model. A single vocabulary size is passed: the larger of the
# source and target vocabularies.
model = get_model(
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=32,
    encoder_num=2,
    decoder_num=2,
    head_num=4,
    hidden_dim=128,
    dropout_rate=0.05,
    use_same_embed=False,  # Use different embeddings for different languages
)

# Compile the model.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Print the model summary.
model.summary()

# Train the model. Each example list is repeated 1024 times before being
# converted to an array.
model.fit(
    x=[np.array(encode_input * 1024), np.array(decode_input * 1024)],
    y=np.array(decode_output * 1024),
    epochs=10,
    batch_size=32,
)

# Translate (predict) the source sentences.
decoded = decode(
    model,
    encode_input,
    start_token=target_token_dict['<START>'],
    end_token=target_token_dict['<END>'],
    pad_token=target_token_dict['<PAD>'],
)

# Print each decoded sentence. The first and last ids are dropped before
# mapping back to characters (presumably the <START>/<END> markers).
for sequence in decoded:
    print(''.join(target_token_dict_inv[token_id] for token_id in sequence[1:-1]))