In [None]:
# Mount Google Drive at /content/drive so the project directory stored on Drive is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive



Working directory changed: `cd /content/drive/My\ Drive/Transformer-master/` -> `cd /content/drive/My\ Drive/Colab\ Notebooks/Transformer`

In [None]:
# Switch to the project directory on Drive; the relative paths used later (./DATA/...) are resolved from here.
cd /content/drive/My\ Drive/Colab\ Notebooks/Transformer

/content/drive/My Drive/Colab Notebooks/Transformer


# ライブラリ読み込み

In [None]:
# Install MeCab, its IPA dictionary and the build tools, then the pinned Python binding.
# `-y` keeps apt from waiting on a confirmation prompt in the non-interactive Colab shell.
!apt install -y aptitude
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install mecab-python3==0.6

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  aptitude-common libcgi-fast-perl libcgi-pm-perl libclass-accessor-perl
  libcwidget3v5 libencode-locale-perl libfcgi-perl libhtml-parser-perl
  libhtml-tagset-perl libhttp-date-perl libhttp-message-perl libio-html-perl
  libio-string-perl liblwp-mediatypes-perl libparse-debianchangelog-perl
  libsigc++-2.0-0v5 libsub-name-perl libtimedate-perl liburi-perl libxapian30
Suggested packages:
  aptitude-doc-en | aptitude-doc apt-xapian-index debtags tasksel
  libcwidget-dev libdata-dump-perl libhtml-template-perl libxml-simple-perl
  libwww-perl xapian-tools
The following NEW packages will be installed:
  aptitude aptitude-common libcgi-fast-perl libcgi-pm-perl
  libclass-accessor-perl libcwidget3v5 libencode-locale-perl libfcgi-perl
  libhtml-parser-perl libhtml-tagset-perl libhttp-date-perl
  libhttp-message-perl libio-html-perl libio-string

In [None]:
import numpy as np
import os
import time
import MeCab

import preprocess_utils
import model
import weight_utils

import tensorflow.keras as keras
import tensorflow as tf
print(tf.__version__)

2.4.0


# 日英翻訳データ ダウンロード

In [None]:
# !wget http://www.manythings.org/anki/jpn-eng.zip
# !unzip ./jpn-eng.zip

# データ読み込み

Corpus path changed: `corpus_path = './jpn.txt'` -> `corpus_path = './DATA/kesen3_ex.tsv'`

In [None]:
# Build the dataset from the TSV corpus (shuffle enabled, seed 123) and split it.
dataset = preprocess_utils.CreateData(
    corpus_path='./DATA/kesen3_ex.tsv',
    do_shuffle=True,
    seed_value=123,
    split_percent=0.95,  # proportion of the data used for training
)

(train_source, train_target,
 test_source, test_target,
 train_licence, test_licence) = dataset.split_data()

# Number of examples in each split.
print('**** Amount of data ****')
print(*('{} {}'.format(label, len(items)) for label, items in (
    ('train_source： ', train_source),
    ('train_target： ', train_target),
    ('test_source： ', test_source),
    ('test_target： ', test_target),
)), sep='\n')
print('\n')

# First example of the training split.
print('**** Train data example ****')
print(*('{} {}'.format(label, value) for label, value in (
    ('Source Example： ', train_source[0]),
    ('Target Example： ', train_target[0]),
    ('Licence： ', train_licence[0]),
)), sep='\n')
print('\n')

# First example of the test split.
print('**** Test data example ****')
print(*('{} {}'.format(label, value) for label, value in (
    ('Source Example： ', test_source[0]),
    ('Target Example： ', test_target[0]),
    ('Licence： ', test_licence[0]),
)), sep='\n')

**** Amount of data ****
train_source：  5021
train_target：  5021
test_source：  264
test_target：  264


**** Train data example ****
Source Example：  イロメイショウ
Target Example：  ロバタ
Licence：  気仙沼市


**** Test data example ****
Source Example：  ライチチオヤニ
Target Example：  クル
Licence：  気仙沼市


In [None]:
import pandas as pd
import re
import codecs
import copy


# Read the Kesennuma CSV and print it between two banner lines.
corpus_path = './DATA/Kesennuma.csv'
df = pd.read_csv(corpus_path)
print(
    '**** Amount of data ****',
    df,
    '\n',
    '**** Amount of data ****',
    sep='\n',
)
# A per-row dump of the '項目名' column via df.iterrows() was left disabled here.


**** Amount of data ****
     市区町村                項目名       回答語形
0    気仙沼市               起きない       オギネー
1    気仙沼市                来ない        コネー
2    気仙沼市                しない        シネー
3    気仙沼市                しない        スネー
4    気仙沼市             行かなかった     イガナカッタ
..    ...                ...        ...
242  気仙沼市            知事：一つ仮名  ［ts??z??］
243  気仙沼市            地図：一つ仮名   ［ts?z??］
244  気仙沼市   切符（きっぷ）：拍意識・文字意識      き／っ／ぷ
245  気仙沼市   切符（きっぷ）：拍意識・文字意識       きっ／ぷ
246  気仙沼市  風船（ふうせん）：拍意識・文字意識    ふ／う／せ／ん

[247 rows x 3 columns]


**** Amount of data ****


In [None]:
# Second dataset built from ./jpn.txt with the same settings as the first one.
# NOTE(review): the cell that downloads jpn-eng.zip is commented out above, so
# ./jpn.txt must already exist in the working directory - confirm.
dataset2 = preprocess_utils.CreateData(
    corpus_path='./jpn.txt',
    do_shuffle=True,
    seed_value=123,
    split_percent=0.95,  # proportion of the data used for training
)

(train_source2, train_target2,
 test_source2, test_target2,
 train_licence2, test_licence2) = dataset2.split_data()

# Number of examples in each split.
print('**** Amount of data ****')
print(*('{} {}'.format(label, len(items)) for label, items in (
    ('train_source： ', train_source2),
    ('train_target： ', train_target2),
    ('test_source： ', test_source2),
    ('test_target： ', test_target2),
)), sep='\n')
print('\n')

# First example of the training split.
print('**** Train data example ****')
print(*('{} {}'.format(label, value) for label, value in (
    ('Source Example： ', train_source2[0]),
    ('Target Example： ', train_target2[0]),
    ('Licence： ', train_licence2[0]),
)), sep='\n')
print('\n')

# First example of the test split.
print('**** Test data example ****')
print(*('{} {}'.format(label, value) for label, value in (
    ('Source Example： ', test_source2[0]),
    ('Target Example： ', test_target2[0]),
    ('Licence： ', test_licence2[0]),
)), sep='\n')

# 前処理

In [None]:
BATCH_SIZE = 64 # batch size
MAX_LENGTH = 60 # sequence length
USE_TPU = False # whether to use a TPU
BUFFER_SIZE = 50000 # buffer size passed to Dataset.shuffle when building the input pipeline

In [None]:
# Build the training-side PreprocessData object from the train split, using a MeCab
# tagger in ChaSen output format, then run its preprocessing step.
# Later cells read train_dataset.source_vector / target_vector (model inputs) and
# train_dataset.source_token / target_token (used to derive the vocabulary sizes).
train_dataset = preprocess_utils.PreprocessData(
    mecab = MeCab.Tagger("-Ochasen"),
    source_data = train_source,
    target_data = train_target,
    max_length = MAX_LENGTH,
    batch_size = BATCH_SIZE,
    test_flag = False,
    train_dataset = None,
)

train_dataset.preprocess_data()

In [None]:
# On a Colab TPU runtime: connect to the TPU and create the distribution strategy.
if USE_TPU:
  tpu_grpc_url = "grpc://" + os.environ["COLAB_TPU_ADDR"]
  tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu_grpc_url)
  tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
  tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
  strategy = tf.distribute.experimental.TPUStrategy(tpu_cluster_resolver)

# Input pipeline, one stage per line: slice -> cast to int64 -> shuffle -> batch -> prefetch.
trainset = tf.data.Dataset.from_tensor_slices(
    (train_dataset.source_vector, train_dataset.target_vector)
)
trainset = trainset.map(
    lambda src, tgt: (tf.cast(src, tf.int64), tf.cast(tgt, tf.int64))
)
trainset = trainset.shuffle(buffer_size=BUFFER_SIZE)
trainset = trainset.batch(BATCH_SIZE)
trainset = trainset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# With a TPU strategy the batches are distributed across the replicas.
if USE_TPU:
  trainset = strategy.experimental_distribute_dataset(trainset)

# モデル定義

In [None]:
num_layers=4 # number of layers
d_model=64 # dimensionality of the intermediate (hidden) layers
num_heads=4 # number of heads in Multi Head Attention
dff=2048 # dimensionality of the Feed Forward Network
dropout_rate = 0.1 # dropout rate

source_vocab_size = max(train_dataset.source_token.values()) + 1 # vocabulary size of the source sentences
target_vocab_size = max(train_dataset.target_token.values()) + 1 # vocabulary size of the target sentences

In [None]:
# Weight initialization
def initialize_weight(checkpoint_path, optimizer, transformer, max_length, batch_size, use_tpu=False):
  """Restore optimizer and model weights from ``checkpoint_path + '.pkl'`` if that file exists.

  Before loading, one training step is run on dummy all-ones input - presumably so that
  the model and optimizer variables exist before weights are assigned (confirm against
  weight_utils). Note that this step goes through the regular ``train_step`` and
  therefore also applies one gradient update and touches the training metrics.

  Args:
    checkpoint_path: checkpoint path prefix; the '.pkl' suffix is appended here.
    optimizer: optimizer handed to ``weight_utils.load_weights_from_pickle``.
    transformer: model handed to ``weight_utils.load_weights_from_pickle``.
    max_length: sequence length of the dummy input.
    batch_size: batch size of the dummy input.
    use_tpu: when True, the dummy step is run on a distributed dataset instead.

  Returns:
    None. Progress and failures are reported with ``print``.

  Relies on notebook globals: ``train_step``, ``weight_utils`` and, for the TPU path,
  ``tpu_cluster_resolver``, ``strategy``, ``BUFFER_SIZE`` and ``distributed_train_step``.
  """
  if not os.path.exists(checkpoint_path + '.pkl'):
    print('No available checkpoints.')
    return

  if use_tpu:
    number_of_tpu_cores = tpu_cluster_resolver.num_accelerators()['TPU']
    initialize_source, initialize_target = [[1]*max_length]*number_of_tpu_cores, [[1]*max_length]*number_of_tpu_cores
    initialize_set = tf.data.Dataset.from_tensor_slices((initialize_source, initialize_target))
    initialize_set = initialize_set.map(lambda source, target: (tf.cast(source, tf.int64), tf.cast(target, tf.int64))
        ).shuffle(buffer_size=BUFFER_SIZE).batch(batch_size).prefetch(
            buffer_size=tf.data.experimental.AUTOTUNE
        )
    initialize_set = strategy.experimental_distribute_dataset(initialize_set)

    # NOTE(review): distributed_train_step is not defined in the visible part of this
    # notebook; the TPU path needs it to exist before this function is called.
    for inp, tar in initialize_set:
      distributed_train_step(inp, tar)

  else:
    initialize_set = tf.ones([batch_size, max_length], tf.int64)
    train_step(initialize_set, initialize_set)

  try:
    weight_utils.load_weights_from_pickle(checkpoint_path, optimizer, transformer)
  except Exception as error:
    # Loading stays best-effort (training then starts from scratch), but the cause is
    # reported, and KeyboardInterrupt / SystemExit are no longer swallowed.
    print('Failed to load checkpoints.')
    print('  reason: {!r}'.format(error))

# 学習実行

Checkpoint path changed: `checkpoints/gpu/model` -> `checkpoints_EX/gpu/model`

In [None]:
# Transformer
# Built from the hyperparameters and vocabulary sizes defined in the earlier cells.
# NOTE(review): pe_input / pe_target receive the vocabulary sizes. If these are the
# maximum positional-encoding lengths, they must be at least as long as the sequences
# fed to the model (MAX_LENGTH) - confirm against model.Transformer.
transformer = model.Transformer(num_layers, d_model, num_heads, dff,
                          source_vocab_size, target_vocab_size, 
                          pe_input=source_vocab_size, 
                          pe_target=target_vocab_size,
                          rate=dropout_rate)

# Learning Rate
# Schedule object from the project-local model module, parameterised by d_model.
learning_rate = model.CustomSchedule(d_model)

# Optimizer
# Adam driven by the schedule above instead of a fixed learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

# Loss
# reduction='none' keeps one loss value per target position so padding can be masked out.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

# Loss Function
def loss_function(real, pred):
  """Cross-entropy averaged over the non-padding target positions.

  Args:
    real: integer target token ids; id 0 is treated as padding.
    pred: logits predicted for each target position.

  Returns:
    Scalar tensor: sum of the per-position losses at non-padding positions divided by
    the number of non-padding positions (0 when the batch contains only padding).
  """
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask
  # Normalise by the number of real tokens. A plain reduce_mean would also divide by
  # the padded positions and shrink the loss in proportion to the amount of padding.
  return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

# Metrics
# Running aggregates updated inside train_step; the training loop resets both at the
# start of every epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

# Checkpoint
# Path prefix of the pickle checkpoint: initialize_weight looks for this path with
# '.pkl' appended.
checkpoint_path = "/content/drive/My Drive/Colab Notebooks/Transformer/checkpoints_EX/gpu/model"

# Fixed input signature (two 2-D int64 tensors of any size) for the tf.function below.
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]
@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
  """Run one optimisation step on a batch and update the training metrics.

  Args:
    inp: int64 tensor of source token ids, indexed [batch, position].
    tar: int64 tensor of target token ids, indexed [batch, position].

  Side effects:
    Applies one gradient update to ``transformer`` through ``optimizer`` and updates
    the ``train_loss`` / ``train_accuracy`` metrics.
  """
  # Teacher forcing: the decoder sees the target without its last token and is scored
  # against the target shifted left by one position.
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]

  enc_padding_mask, combined_mask, dec_padding_mask = model.create_masks(inp, tar_inp)

  with tf.GradientTape() as tape:
    # The third positional argument (True) is presumably the training flag - confirm
    # against model.Transformer.
    predictions, _ = transformer(inp, tar_inp,
                                 True,
                                 enc_padding_mask,
                                 combined_mask,
                                 dec_padding_mask)
    loss = loss_function(tar_real, predictions)

  gradients = tape.gradient(loss, transformer.trainable_variables)
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

  train_loss(loss)
  # Score accuracy on real tokens only. Padding uses id 0 (the same convention the loss
  # masks on); counting padded positions would drag the reported accuracy towards zero.
  non_padding = tf.cast(tf.math.not_equal(tar_real, 0), tf.float32)
  train_accuracy(tar_real, predictions, sample_weight=non_padding)

# Initialize Weight
# Restores a previous checkpoint when one exists; otherwise training starts from scratch.
initialize_weight(checkpoint_path, optimizer, transformer, MAX_LENGTH, BATCH_SIZE, use_tpu=USE_TPU)

EPOCHS = 30

for epoch in range(EPOCHS):
  start = time.time()

  # Metrics are reported per epoch, so clear what the previous epoch accumulated.
  train_loss.reset_states()
  train_accuracy.reset_states()

  # enumerate restarts the batch index at 0 every epoch, so the "Batch" number in the
  # log is the position inside the current epoch rather than a running total.
  for batch, (inp, tar) in enumerate(trainset):
    train_step(inp, tar)

    if batch % 50 == 0:
      print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
          epoch + 1, batch, train_loss.result(), train_accuracy.result()))

  # Save a checkpoint every 5 epochs.
  if (epoch + 1) % 5 == 0:
    print('Saving checkpoint for epoch {} at {}'.format(epoch+1, checkpoint_path))
    # Make sure the target directory exists so finished epochs are not lost at save time.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
      os.makedirs(checkpoint_dir, exist_ok=True)
    weight_utils.save_weights_as_pickle(checkpoint_path, optimizer, transformer)

  print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
                                                train_loss.result(),
                                                train_accuracy.result()))

  print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

No available checkpoints.
Epoch 1 Batch 0 Loss 0.2619 Accuracy 0.0000
Epoch 1 Batch 50 Loss 0.2339 Accuracy 0.0079
Epoch 1 Loss 0.2208 Accuracy 0.0110
Time taken for 1 epoch: 28.18260431289673 secs

Epoch 2 Batch 100 Loss 0.2010 Accuracy 0.0169
Epoch 2 Batch 150 Loss 0.1922 Accuracy 0.0169
Epoch 2 Loss 0.1911 Accuracy 0.0169
Time taken for 1 epoch: 10.82460069656372 secs

Epoch 3 Batch 200 Loss 0.1865 Accuracy 0.0169
Epoch 3 Loss 0.1849 Accuracy 0.0170
Time taken for 1 epoch: 10.754085540771484 secs

Epoch 4 Batch 250 Loss 0.1814 Accuracy 0.0169
Epoch 4 Batch 300 Loss 0.1688 Accuracy 0.0172
Epoch 4 Loss 0.1685 Accuracy 0.0173
Time taken for 1 epoch: 10.849831104278564 secs

Epoch 5 Batch 350 Loss 0.1474 Accuracy 0.0187
Saving checkpoint for epoch 5 at /content/drive/My Drive/Colab Notebooks/Transformer/checkpoints_EX/gpu/model
Save checkpoints
Epoch 5 Loss 0.1436 Accuracy 0.0193
Time taken for 1 epoch: 11.072546482086182 secs

Epoch 6 Batch 400 Loss 0.1322 Accuracy 0.0215
Epoch 6 Batch