<a href="https://colab.research.google.com/github/Amplil/nlp-test/blob/master/opmnt_library_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 初期化

## インストール

In [None]:
cd "/content/drive/MyDrive/Colab Notebooks"

In [None]:
ls

In [None]:
import sys
# Make modules stored in the Drive "packages" folder importable.
sys.path.append("/content/drive/MyDrive/Colab Notebooks/packages")
import slacknotice # custom (self-made) module

In [None]:
pip install OpenNMT-py

In [None]:
!pip install PyYaml==5.1

## インポート

In [None]:
import yaml
import torch
import torch.nn as nn
from argparse import Namespace
from collections import defaultdict, Counter

In [None]:
import onmt
from onmt.inputters.inputter import _load_vocab, _build_fields_vocab, get_fields, IterOnDevice
from onmt.inputters.corpus import ParallelCorpus
from onmt.inputters.dynamic_iterator import DynamicDatasetIter
from onmt.translate import GNMTGlobalScorer, Translator, TranslationBuilder
from onmt.utils.misc import set_random_seed

# Enable logging

In [None]:
# Enable logging: import OpenNMT's logging helpers and initialise the logger
# (init_logger is called without arguments, i.e. with its defaults).
from onmt.utils.logging import init_logger, logger
init_logger()

# Set random seed

In [None]:
# Seed the run with the fixed value 1111; the second argument tells
# set_random_seed whether CUDA is available on this machine.
is_cuda = torch.cuda.is_available()
set_random_seed(1111, is_cuda)

In [None]:
!ls toy-ende

# Prepare data and vocab

In [None]:
# Data/vocab configuration kept as YAML text so that the very same content can
# be both parsed in memory and written to disk.
yaml_config = """
## Where the samples will be written
save_data: toy-ende/run/example
## Where the vocab(s) will be written
src_vocab: toy-ende/run/example.vocab.src
tgt_vocab: toy-ende/run/example.vocab.tgt
# Corpus opts:
data:
    corpus:
        path_src: toy-ende/src-train.txt
        path_tgt: toy-ende/tgt-train.txt
        transforms: []
        weight: 1
    valid:
        path_src: toy-ende/src-val.txt
        path_tgt: toy-ende/tgt-val.txt
        transforms: []
"""

# Parsed copy of the configuration.
config = yaml.safe_load(yaml_config)

# Write the raw YAML text to the config file inside the toy-ende directory.
with open("toy-ende/config.yaml", mode="w") as config_file:
    config_file.write(yaml_config)

In [None]:
# NOTE(review): this cell is only a string literal, so none of the code inside
# it is executed. It mentions DynamicArgumentParser, which is not imported
# anywhere in the visible notebook.
"""
from onmt.utils.parse import ArgumentParser
parser = DynamicArgumentParser(description='build_vocab.py')
"""

In [None]:
# Create an OpenNMT ArgumentParser whose description is 'build_vocab.py'.
from onmt.utils.parse import ArgumentParser
parser = ArgumentParser(description='build_vocab.py')

In [None]:
# Presumably registers the data/vocab preparation options on `parser`, limited
# to what vocab building needs (build_vocab_only=True) -- confirm in onmt.opts.
from onmt.opts import dynamic_prepare_opts
dynamic_prepare_opts(parser, build_vocab_only=True)

In [None]:
# Command-line style arguments: the YAML config file and an n_sample of 10000.
base_args = [
    "-config", "toy-ende/config.yaml",
    "-n_sample", "10000",
]
# parse_known_args yields the parsed options plus any unrecognised arguments.
opts, unknown = parser.parse_known_args(base_args)

In [None]:
# Show the parsed options.
opts

In [None]:
# Run OpenNMT's vocab-building entry point with the parsed options.
from onmt.bin.build_vocab import build_vocab_main
build_vocab_main(opts)

# Build fields

We can build the fields from the text files that were just created.

In [None]:
# Same locations as the src_vocab / tgt_vocab entries of the YAML config.
src_vocab_path = "toy-ende/run/example.vocab.src"
tgt_vocab_path = "toy-ende/run/example.vocab.tgt"

In [None]:
# One shared frequency counter, keyed by side, that both loads fill in.
counters = defaultdict(Counter)

# Source-side vocab.
_src_vocab, _src_vocab_size = _load_vocab(src_vocab_path, "src", counters)

# Target-side vocab.
_tgt_vocab, _tgt_vocab_size = _load_vocab(tgt_vocab_path, "tgt", counters)

In [None]:
# Word features are not supported for now, so both feature counts are zero.
src_nfeats = 0
tgt_nfeats = 0

# Initialize the fields for text data.
fields = get_fields("text", src_nfeats, tgt_nfeats)

In [None]:
# Show the fields returned by get_fields.
fields

In [None]:
# Settings for building the fields' vocab.
share_vocab = False
vocab_size_multiple = 1

# Per-side size cap and minimum word frequency.
src_vocab_size, src_words_min_frequency = 30000, 1
tgt_vocab_size, tgt_words_min_frequency = 30000, 1

# Build the vocab of each field from the counters loaded above.
vocab_fields = _build_fields_vocab(
    fields,
    counters,
    "text",
    share_vocab,
    vocab_size_multiple,
    src_vocab_size,
    src_words_min_frequency,
    tgt_vocab_size,
    tgt_words_min_frequency,
)

An alternative way of creating these fields is to run `onmt_train` without actually training, to just output the necessary files.

# Prepare for training: model and optimizer creation

Let's get a few fields/vocab related variables to simplify the model creation a bit:

In [None]:
# Source side: base text field, its vocab, and the index of its padding token.
src_text_field = vocab_fields["src"].base_field
src_vocab = src_text_field.vocab
src_pad_token = src_text_field.pad_token
src_padding = src_vocab.stoi[src_pad_token]

# Target side: same three lookups.
tgt_text_field = vocab_fields["tgt"].base_field
tgt_vocab = tgt_text_field.vocab
tgt_pad_token = tgt_text_field.pad_token
tgt_padding = tgt_vocab.stoi[tgt_pad_token]

Next we specify the core model itself. Here we will build a small model with an encoder and an attention based input feeding decoder. Both models will be RNNs, and the encoder will be bidirectional.

In [None]:
# Model sizes.
emb_size = 100
rnn_size = 500

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Encoder: source embeddings feeding a single-layer bidirectional LSTM.
encoder_embeddings = onmt.modules.Embeddings(
    emb_size, len(src_vocab), word_padding_idx=src_padding)
encoder = onmt.encoders.RNNEncoder(
    hidden_size=rnn_size,
    num_layers=1,
    rnn_type="LSTM",
    bidirectional=True,
    embeddings=encoder_embeddings)

# Decoder: target embeddings feeding a single-layer input-feed LSTM decoder.
decoder_embeddings = onmt.modules.Embeddings(
    emb_size, len(tgt_vocab), word_padding_idx=tgt_padding)
decoder = onmt.decoders.decoder.InputFeedRNNDecoder(
    hidden_size=rnn_size,
    num_layers=1,
    bidirectional_encoder=True,
    rnn_type="LSTM",
    embeddings=decoder_embeddings)

# Assemble the full model and move it to the selected device.
model = onmt.models.model.NMTModel(encoder, decoder)
model.to(device)

# Target word generator: linear projection to the target vocab followed by
# log-softmax, placed on the same device as the model.
model.generator = nn.Sequential(
    nn.Linear(rnn_size, len(tgt_vocab)),
    nn.LogSoftmax(dim=-1),
).to(device)

# Loss computation: summed NLL that ignores target padding positions.
loss = onmt.utils.loss.NMTLossCompute(
    criterion=nn.NLLLoss(ignore_index=tgt_padding, reduction="sum"),
    generator=model.generator)

Now we set up the optimizer. This could be a core torch optim class, or our wrapper which handles learning rate updates and gradient normalization automatically.

In [None]:
# Plain SGD wrapped in OpenNMT's Optimizer, sharing one learning rate.
lr = 1
torch_optimizer = torch.optim.SGD(model.parameters(), lr=lr)
optim = onmt.utils.optimizers.Optimizer(
    torch_optimizer,
    learning_rate=lr,
    max_grad_norm=2,
)

# Create the training and validation data iterators

Now we need to create the dynamic dataset iterator.

This is not very 'library-friendly' for now because of the way the `DynamicDatasetIter` constructor is defined. It may evolve in the future.

In [None]:
# Training files (source / target).
src_train = "toy-ende/src-train.txt"
tgt_train = "toy-ende/tgt-train.txt"

# Validation files (source / target).
src_val = "toy-ende/src-val.txt"
tgt_val = "toy-ende/tgt-val.txt"

# One ParallelCorpus per split, named like the entries of the YAML config.
corpus = ParallelCorpus("corpus", src_train, tgt_train)
valid = ParallelCorpus("valid", src_val, tgt_val)

In [None]:
# build the training iterator
# Training iterator over the "corpus" split: no transforms, batches sized
# by token count (4096 per batch).
train_iter = DynamicDatasetIter(
    corpora={"corpus": corpus},
    corpora_info={"corpus": {"weight": 1}},
    transforms={},
    fields=vocab_fields,
    is_train=True,
    data_type="text",
    batch_type="tokens",
    batch_size=4096,
    batch_size_multiple=1,
)

In [None]:
# Put the training batches on GPU 0 when CUDA is available and on the CPU
# otherwise (IterOnDevice takes -1 for CPU, N for GPU N). Hard-coding 0 breaks
# CPU-only runtimes and disagrees with how the model's device is chosen.
train_iter = iter(
    IterOnDevice(train_iter, 0 if torch.cuda.is_available() else -1))

In [None]:
# Validation iterator over the "valid" split: no transforms, batches of
# 8 sentences.
valid_iter = DynamicDatasetIter(
    corpora={"valid": valid},
    corpora_info={"valid": {"weight": 1}},
    transforms={},
    fields=vocab_fields,
    is_train=False,
    data_type="text",
    batch_type="sents",
    batch_size=8,
    batch_size_multiple=1,
)

In [None]:
# Put the validation batches on GPU 0 when CUDA is available, else on the CPU
# (device id -1), instead of always assuming a GPU.
valid_iter = IterOnDevice(valid_iter, 0 if torch.cuda.is_available() else -1)

# Training

Finally we train.

In [None]:
# Report progress every 50 steps; no TensorBoard writer is attached.
report_manager = onmt.utils.ReportMgr(
    report_every=50,
    start_time=None,
    tensorboard_writer=None,
)

# The same loss object is used for both training and validation.
trainer = onmt.Trainer(
    model=model,
    train_loss=loss,
    valid_loss=loss,
    optim=optim,
    report_manager=report_manager,
    dropout=[0.1],
)

# Train for 1000 steps, with valid_steps set to 500.
trainer.train(
    train_iter=train_iter,
    train_steps=1000,
    valid_iter=valid_iter,
    valid_steps=500,
)

# Translate

For translation, we can build a "traditional" (as opposed to dynamic) dataset for now.

In [None]:
# src_data is only created in a later cell; evaluating it here raised
# NameError when the notebook was run from top to bottom, so the premature
# lookup has been removed. Inspect src_data after the cell that defines it.

In [None]:
# Show the path of the validation source file.
src_val

In [None]:
# Each side pairs its own text reader instance with the validation file path.
src_data = {
    "reader": onmt.inputters.str2reader["text"](),
    "data": src_val,
}
tgt_data = {
    "reader": onmt.inputters.str2reader["text"](),
    "data": tgt_val,
}

# Turn the (side, reader/data) pairs into the readers and data that Dataset expects.
_readers, _data = onmt.inputters.Dataset.config(
    [("src", src_data), ("tgt", tgt_data)])

In [None]:
# Dataset over the validation files, sorted with the "text" sort key.
dataset = onmt.inputters.Dataset(
    vocab_fields,
    readers=_readers,
    data=_data,
    sort_key=onmt.inputters.str2sortkey["text"],
)

In [None]:
# Iterate the dataset in order, in batches of 10, without shuffling.
# Batches are placed on `device` (the device the model was moved to) rather
# than a hard-coded "cuda", so the cell also runs on CPU-only runtimes and the
# batches always live on the same device as the model.
data_iter = onmt.inputters.OrderedIterator(
    dataset=dataset,
    device=device,
    batch_size=10,
    train=False,
    sort=False,
    sort_within_batch=True,
    shuffle=False,
)

In [None]:
# Both sides use the "text" reader.
src_reader = onmt.inputters.str2reader["text"]
tgt_reader = onmt.inputters.str2reader["text"]

# Scorer settings: average length penalty, no coverage penalty.
scorer = GNMTGlobalScorer(
    alpha=0.7,
    beta=0.,
    length_penalty="avg",
    coverage_penalty="none",
)

# GPU 0 when CUDA is available, otherwise -1.
gpu = 0 if torch.cuda.is_available() else -1

translator = Translator(
    model=model,
    fields=vocab_fields,
    src_reader=src_reader,
    tgt_reader=tgt_reader,
    global_scorer=scorer,
    gpu=gpu,
)

# Builder used to turn raw translated batches into translation objects.
builder = TranslationBuilder(data=dataset, fields=vocab_fields)

**Note**: translations will be very poor, because of the very low quantity of data, the absence of proper tokenization, and the brevity of the training.

In [None]:
# Translate only the first batch and print each translation's log output.
for batch in data_iter:
    batch_results = translator.translate_batch(
        batch=batch,
        src_vocabs=[src_vocab],
        attn_debug=False,
    )
    translations = builder.from_batch(batch_results)
    for translation in translations:
        print(translation.log(0))
    # Stop after the first batch.
    break