In [1]:
# Experimental Class for Smiles Enumeration, Iterator and SmilesIterator
# adapted from Keras 1.2.2
from rdkit import Chem
import numpy as np
import threading


class Iterator(object):
    """Abstract base class for data iterators.
    # Arguments
        n: Integer, total number of samples in the dataset to loop over.
        batch_size: Integer, size of a batch.
        shuffle: Boolean, whether to shuffle the data between epochs.
        seed: Random seeding for data shuffling.
    """

    def __init__(self, n, batch_size, shuffle, seed):
        self.n = n
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.batch_index = 0
        self.total_batches_seen = 0
        self.lock = threading.Lock()
        self.index_generator = self._flow_index(n, batch_size, shuffle, seed)
        if n < batch_size:
            raise ValueError('Input data length is shorter than batch_size'
                             'Adjust batch_size')

    def reset(self):
        self.batch_index = 0

    def _flow_index(self, n, batch_size=32, shuffle=False, seed=None):
        # Ensure self.batch_index is 0.
        self.reset()
        while 1:
            if seed is not None:
                np.random.seed(seed + self.total_batches_seen)
            if self.batch_index == 0:
                index_array = np.arange(n)
                if shuffle:
                    index_array = np.random.permutation(n)

            current_index = (self.batch_index * batch_size) % n
            if n > current_index + batch_size:
                current_batch_size = batch_size
                self.batch_index += 1
            else:
                current_batch_size = n - current_index
                self.batch_index = 0
            self.total_batches_seen += 1
            yield (index_array[current_index: current_index +
                               current_batch_size],
                   current_index, current_batch_size)

    def __iter__(self):
        # Needed if we want to do something like:
        # for x, y in data_gen.flow(...):
        return self

    def __next__(self, *args, **kwargs):
        return self.next(*args, **kwargs)


class SmilesIterator(Iterator):
    """Iterator yielding data from a SMILES array.
    # Arguments
        x: Numpy array of SMILES input data.
        y: Numpy array of targets data.
        smiles_data_generator: Instance of `SmilesEnumerator`
            to use for random SMILES generation.
        batch_size: Integer, size of a batch.
        shuffle: Boolean, whether to shuffle the data between epochs.
        seed: Random seed for data shuffling.
        dtype: dtype to use for returned batch.
        Set to keras.backend.floatx if using Keras
    """

    def __init__(self, x, y, smiles_data_generator,
                 batch_size=32, shuffle=False, seed=None,
                 dtype=np.float32
                 ):
        if y is not None and len(x) != len(y):
            raise ValueError('X (images tensor) and y (labels) '
                             'should have the same length. '
                             'Found: X.shape = %s, y.shape = %s' %
                             (np.asarray(x).shape, np.asarray(y).shape))

        self.x = np.asarray(x)

        if y is not None:
            self.y = np.asarray(y)
        else:
            self.y = None
        self.smiles_data_generator = smiles_data_generator
        self.dtype = dtype
        super(SmilesIterator, self).__init__(x.shape[0], batch_size, shuffle,
                                             seed)

    def next(self):
        """For python 2.x.
        # Returns
            The next batch.
        """
        # Keeps under lock only the mechanism which advances
        # the indexing of each batch.
        with self.lock:
            index_array, current_index, current_batch_size =\
                next(self.index_generator)
        # The transformation of images is not under thread lock
        # so it can be done in parallel
        batch_x = np.zeros(
            tuple([current_batch_size] + [self.smiles_data_generator.pad,
                                          self.smiles_data_generator._charlen]),
            dtype=self.dtype)
        for i, j in enumerate(index_array):
            smiles = self.x[j:j + 1]
            x = self.smiles_data_generator.transform(smiles)
            batch_x[i] = x

        if self.y is None:
            return batch_x
        batch_y = self.y[index_array]
        return batch_x, batch_y


class SmilesEnumerator(object):
    """SMILES Enumerator, vectorizer and devectorizer
    #Arguments
        charset: string containing the characters for the vectorization
          can also be generated via the .fit() method
        pad: Length of the vectorization
        leftpad: Add spaces to the left of the SMILES
        isomericSmiles: Generate SMILES containing information about stereogenic centers
        enum: Enumerate the SMILES during transform
        canonical: use canonical SMILES during transform (overrides enum)
    """

    def __init__(self, charset='@C)(=cOn1S2/H[N]\\', pad=120, leftpad=True,
                 isomericSmiles=True, enum=True,
                 canonical=False):
        self._charset = None
        self.charset = charset
        self.pad = pad
        self.leftpad = leftpad
        self.isomericSmiles = isomericSmiles
        self.enumerate = enum
        self.canonical = canonical

    @property
    def charset(self):
        return self._charset

    @charset.setter
    def charset(self, charset):
        self._charset = charset
        self._charlen = len(charset)
        self._char_to_int = dict((c, i) for i, c in enumerate(charset))
        self._int_to_char = dict((i, c) for i, c in enumerate(charset))

    def fit(self, smiles, extra_chars=[], extra_pad=5):
        """Performs extraction of the charset and length of a SMILES datasets
        and sets self.pad and self.charset
        #Arguments
            smiles: Numpy array or Pandas series containing smiles as strings
            extra_chars: List of extra chars to add to the charset
            (e.g. "\\\\" when "/" is present)
            extra_pad: Extra padding to add before or after the
            SMILES vectorization
        """
        charset = set("".join(list(smiles)))
        self.charset = "".join(charset.union(set(extra_chars)))
        self.pad = max([len(smile) for smile in smiles]) + extra_pad

    def randomize_smiles(self, smiles):
        """Perform a randomization of a SMILES string
        must be RDKit sanitizable"""
        m = Chem.MolFromSmiles(smiles)
        ans = list(range(m.GetNumAtoms()))
        np.random.shuffle(ans)
        nm = Chem.RenumberAtoms(m, ans)
        return Chem.MolToSmiles(nm, canonical=self.canonical,
                                isomericSmiles=self.isomericSmiles)

    def transform(self, smiles):
        """Perform an enumeration (randomization) and vectorization of
        a Numpy array of smiles strings
        #Arguments
            smiles: Numpy array or Pandas series containing smiles as strings
        """
        one_hot = np.zeros((smiles.shape[0], self.pad, self._charlen), dtype=np.int8)

        for i, ss in enumerate(smiles):
            if self.enumerate: ss = self.randomize_smiles(ss)
            for j, c in enumerate(ss):
                one_hot[i, j, self._char_to_int[c]] = 1
        return one_hot

    def reverse_transform(self, vect):
        """ Performs a conversion of a vectorized SMILES to a smiles strings
        charset must be the same as used for vectorization.
        #Arguments
            vect: Numpy array of vectorized SMILES.
        """
        smiles = []
        for v in vect:
            # mask v
            v = v[v.sum(axis=1) == 1]
            # Find one hot encoded index with argmax, translate to char
            # and join to string
            smile = "".join(self._int_to_char[i] for i in v.argmax(axis=1))
            smiles.append(smile)
        return np.array(smiles)
   

In [2]:
smiles = np.array(["CCC(=O)O[C@@]1(CC[NH+](C[C@H]1CC=C)C)c2ccccc2",
                       "CCC[S@@](=O)c1ccc2c(c1)[nH]/c(=N/C(=O)OC)/[nH]2"] * 10
                      )
# Test canonical SMILES vectorization
sm_en = SmilesEnumerator(canonical=True, enum=False)
sm_en.fit(smiles, extra_chars=["\\"])
v = sm_en.transform(smiles)
transformed = sm_en.reverse_transform(v)
print(transformed)
if len(set(transformed)) > 2:
    print("Too many different canonical SMILES generated")

# Test enumeration
sm_en.canonical = False
sm_en.enumerate = True
v2 = sm_en.transform(smiles)
transformed = sm_en.reverse_transform(v2)
if len(set(transformed)) < 3: print("Too few enumerated SMILES generated")

# Reconstruction
reconstructed = sm_en.reverse_transform(v[0:5])
for i, smile in enumerate(reconstructed):
    if smile != smiles[i]:
        print("Error in reconstruction %s %s" % (smile, smiles[i]))
        break

# test Pandas
import pandas as pd

df = pd.DataFrame(smiles)
v = sm_en.transform(df[0])
if v.shape != (20, 52, 18): print("Possible error in pandas use")

# BUG, when batchsize > x.shape[0], then it only returns x.shape[0]!
# Test batch generation
sm_it = SmilesIterator(smiles, np.array([1, 2] * 10), sm_en, batch_size=10,
                       shuffle=True)
X, y = sm_it.next()
if sum(y == 1) - sum(y == 2) > 1:
    print("Unbalanced generation of batches")
if len(X) != 10: print("Error in batchsize generation")

['CCC(=O)O[C@@]1(CC[NH+](C[C@H]1CC=C)C)c2ccccc2'
 'CCC[S@@](=O)c1ccc2c(c1)[nH]/c(=N/C(=O)OC)/[nH]2'
 'CCC(=O)O[C@@]1(CC[NH+](C[C@H]1CC=C)C)c2ccccc2'
 'CCC[S@@](=O)c1ccc2c(c1)[nH]/c(=N/C(=O)OC)/[nH]2'
 'CCC(=O)O[C@@]1(CC[NH+](C[C@H]1CC=C)C)c2ccccc2'
 'CCC[S@@](=O)c1ccc2c(c1)[nH]/c(=N/C(=O)OC)/[nH]2'
 'CCC(=O)O[C@@]1(CC[NH+](C[C@H]1CC=C)C)c2ccccc2'
 'CCC[S@@](=O)c1ccc2c(c1)[nH]/c(=N/C(=O)OC)/[nH]2'
 'CCC(=O)O[C@@]1(CC[NH+](C[C@H]1CC=C)C)c2ccccc2'
 'CCC[S@@](=O)c1ccc2c(c1)[nH]/c(=N/C(=O)OC)/[nH]2'
 'CCC(=O)O[C@@]1(CC[NH+](C[C@H]1CC=C)C)c2ccccc2'
 'CCC[S@@](=O)c1ccc2c(c1)[nH]/c(=N/C(=O)OC)/[nH]2'
 'CCC(=O)O[C@@]1(CC[NH+](C[C@H]1CC=C)C)c2ccccc2'
 'CCC[S@@](=O)c1ccc2c(c1)[nH]/c(=N/C(=O)OC)/[nH]2'
 'CCC(=O)O[C@@]1(CC[NH+](C[C@H]1CC=C)C)c2ccccc2'
 'CCC[S@@](=O)c1ccc2c(c1)[nH]/c(=N/C(=O)OC)/[nH]2'
 'CCC(=O)O[C@@]1(CC[NH+](C[C@H]1CC=C)C)c2ccccc2'
 'CCC[S@@](=O)c1ccc2c(c1)[nH]/c(=N/C(=O)OC)/[nH]2'
 'CCC(=O)O[C@@]1(CC[NH+](C[C@H]1CC=C)C)c2ccccc2'
 'CCC[S@@](=O)c1ccc2c(c1)[nH]/c(=N/C(=O)OC)/[nH]2']

In [1]:
from onmt.train_single import main as single_main
from onmt.new_opts import Opts, PreprocessorOpts
from onmt.preprocess import build_save_dataset, build_save_vocab
import onmt.inputters as inputters

In [4]:
def separate_data(data_path, name_to_save, mode):
    lines = open(data_path).read().split('\n')
    with open(name_to_save + "_src-" + mode +".txt", "w") as f1, open(name_to_save + "_tgt-" + mode + ".txt", "w") as f2:
        for line in lines[3: len(lines) - 1]:
            input_text, target_text, *c = line.split('\t')
            input_text,_ = input_text.split('>')
            input_text = input_text.split()
            target_text = target_text.split()
            f1.write(" ".join(input_text) + " \n")
            f2.write(" ".join(target_text) + "\n")

In [5]:
data_path = "data/US_patents_1976-Sep2016_1product_reactions_train.csv"
name_to_save = "data/USP"
mode = "train"
separate_data(data_path, name_to_save, mode)

In [6]:
data_path = "data/US_patents_1976-Sep2016_1product_reactions_valid.csv"
name_to_save = "data/USP"
mode = "valid"
separate_data(data_path, name_to_save, mode)

In [7]:
opt = PreprocessorOpts(train_src=name_to_save + "_src-train.txt", 
                  train_tgt=name_to_save + "_tgt-train.txt", 
                  valid_src=name_to_save + "_src-valid.txt",
                  valid_tgt=name_to_save + "_tgt-valid.txt",
                  save_data="data/USP")

src_nfeats = inputters.get_num_features(
    opt.data_type, opt.train_src, 'src')

tgt_nfeats = inputters.get_num_features(
    opt.data_type, opt.train_tgt, 'tgt')

fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats)

train_dataset_files = build_save_dataset('train', fields, opt)

build_save_dataset('valid', fields, opt)
build_save_vocab(train_dataset_files, fields, opt)

In [4]:
from onmt.train_single import main as single_main
from onmt.new_opts import Opts, PreprocessorOpts
from onmt.preprocess import build_save_dataset, build_save_vocab
import onmt.inputters as inputters

opt = Opts(
        encoder_type="cnn", 
        decoder_type="cnn",
        data="data/USP", 
        save_model="CNN", 
        epochs=10, 
        batch_size=64, 
        gpu_ranks=[0, 1], 
        valid_steps=100, 
        valid_batch_size=1000, 
        decay_steps=1000, 
        start_decay_steps=10000, 
        learning_rate_decay=0.9, 
        optim="adam",
        learning_rate=0.001,
        save_checkpoint_steps=1000,
        dec_layers=4,
        enc_layers=4
          )
single_main(opt, -1)

[2019-01-07 17:33:55,111 INFO] Loading train dataset from data/USP.train.0.pt, number of examples: 189902
[2019-01-07 17:33:55,113 INFO]  * vocabulary size. source = 178; target = 101
[2019-01-07 17:33:55,114 INFO] Building model...
[2019-01-07 17:33:55,275 INFO] NMTModel(
  (encoder): CNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(178, 500, padding_idx=1)
        )
      )
    )
    (linear): Linear(in_features=500, out_features=500, bias=True)
    (cnn): StackedCNN(
      (layers): ModuleList(
        (0): GatedConv(
          (conv): WeightNormConv2d(500, 1000, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0))
          (dropout): Dropout(p=0.3)
        )
        (1): GatedConv(
          (conv): WeightNormConv2d(500, 1000, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0))
          (dropout): Dropout(p=0.3)
        )
        (2): GatedConv(
          (conv): WeightNormConv2d(500, 1000, kernel

KeyboardInterrupt: 

In [None]:
python  train.py -data /tmp/de2/data -save_model /tmp/extra \
        -layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8  \
        -encoder_type transformer -decoder_type transformer -position_encoding \
        -train_steps 200000  -max_generator_batches 2 -dropout 0.1 \
        -batch_size 4096 -batch_type tokens -normalization tokens  -accum_count 2 \
        -optim adam -adam_beta2 0.998 -decay_method noam -warmup_steps 8000 -learning_rate 2 \
        -max_grad_norm 0 -param_init 0  -param_init_glorot \
        -label_smoothing 0.1 -valid_steps 10000 -save_checkpoint_steps 10000 \
        -world_size 4 -gpu_ranks 0 1 2 3 

In [5]:
from onmt.train_single import main as single_main
from onmt.new_opts import Opts, PreprocessorOpts
from onmt.preprocess import build_save_dataset, build_save_vocab
import onmt.inputters as inputters

opt = Opts(
        encoder_type="transformer", 
        decoder_type="transformer",
        layers=4,
        norma
        heads=4, 
        rnn_size=256, 
        word_vec_size=256, 
        transformer_ff=256,
        data="data/USP", 
        save_model="CNN", 
        epochs=10, 
        batch_size=64, 
        gpu_ranks=[0], 
        valid_steps=100, 
        valid_batch_size=64, 
        decay_steps=1000, 
        start_decay_steps=10000, 
        learning_rate_decay=0.9, 
        optim="adam",
        learning_rate=0.001,
        save_checkpoint_steps=1000,
        
          )
single_main(opt, -1)

[2019-01-07 21:54:51,391 INFO] Loading train dataset from data/USP.train.0.pt, number of examples: 189902
[2019-01-07 21:54:51,392 INFO]  * vocabulary size. source = 178; target = 101
[2019-01-07 21:54:51,394 INFO] Building model...
[2019-01-07 21:54:52,036 INFO] NMTModel(
  (encoder): TransformerEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(178, 256, padding_idx=1)
        )
      )
    )
    (transformer): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_keys): Linear(in_features=256, out_features=256, bias=True)
          (linear_values): Linear(in_features=256, out_features=256, bias=True)
          (linear_query): Linear(in_features=256, out_features=256, bias=True)
          (softmax): Softmax()
          (dropout): Dropout(p=0.3)
          (final_linear): Linear(in_features=256, out_features=256, bias=True)
        )
        (feed_for

[2019-01-07 21:54:52,039 INFO] encoder: 1629184
[2019-01-07 21:54:52,040 INFO] decoder: 2690149
[2019-01-07 21:54:52,041 INFO] * number of parameters: 4319333
[2019-01-07 21:54:52,043 INFO] Starting training on GPU: [0]
[2019-01-07 21:54:52,044 INFO] Start training...
[2019-01-07 21:54:53,109 INFO] Loading train dataset from data/USP.train.0.pt, number of examples: 189902
[2019-01-07 21:54:56,005 INFO] Step 50/100000; acc:  26.89; ppl: 26.45; xent: 3.28; lr: 0.00100; 36849/33702 tok/s;      3 sec
[2019-01-07 21:54:58,729 INFO] Step 100/100000; acc:  40.97; ppl:  7.60; xent: 2.03; lr: 0.00100; 41537/37912 tok/s;      6 sec
[2019-01-07 21:54:58,799 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 21:55:06,007 INFO] Validation perplexity: 5.93472
[2019-01-07 21:55:06,008 INFO] Validation accuracy: 44.3341
[2019-01-07 21:55:08,753 INFO] Step 150/100000; acc:  46.09; ppl:  5.57; xent: 1.72; lr: 0.00100; 11109/9980 tok/s;     16 sec
[2019-01-07 21:5

[2019-01-07 21:58:22,149 INFO] Step 1700/100000; acc:  62.65; ppl:  2.89; xent: 1.06; lr: 0.00100; 41502/37438 tok/s;    209 sec
[2019-01-07 21:58:22,218 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 21:58:29,440 INFO] Validation perplexity: 2.61999
[2019-01-07 21:58:29,441 INFO] Validation accuracy: 65.7953
[2019-01-07 21:58:32,201 INFO] Step 1750/100000; acc:  62.83; ppl:  2.88; xent: 1.06; lr: 0.00100; 10959/9890 tok/s;    219 sec
[2019-01-07 21:58:34,920 INFO] Step 1800/100000; acc:  63.35; ppl:  2.82; xent: 1.04; lr: 0.00100; 40778/37577 tok/s;    222 sec
[2019-01-07 21:58:34,990 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 21:58:42,220 INFO] Validation perplexity: 2.64952
[2019-01-07 21:58:42,221 INFO] Validation accuracy: 65.5346
[2019-01-07 21:58:44,891 INFO] Step 1850/100000; acc:  63.00; ppl:  2.84; xent: 1.04; lr: 0.00100; 10934/9917 tok/s;    232 sec
[2019-01-07 21:58:47,625 INFO] S

[2019-01-07 22:01:55,104 INFO] Validation perplexity: 2.34577
[2019-01-07 22:01:55,106 INFO] Validation accuracy: 69.1426
[2019-01-07 22:01:57,782 INFO] Step 3350/100000; acc:  66.11; ppl:  2.56; xent: 0.94; lr: 0.00100; 10735/9892 tok/s;    425 sec
[2019-01-07 22:02:00,507 INFO] Step 3400/100000; acc:  65.94; ppl:  2.58; xent: 0.95; lr: 0.00100; 40085/36104 tok/s;    427 sec
[2019-01-07 22:02:00,577 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:02:07,807 INFO] Validation perplexity: 2.37797
[2019-01-07 22:02:07,809 INFO] Validation accuracy: 68.5458
[2019-01-07 22:02:10,515 INFO] Step 3450/100000; acc:  66.39; ppl:  2.54; xent: 0.93; lr: 0.00100; 11080/10256 tok/s;    437 sec
[2019-01-07 22:02:13,222 INFO] Step 3500/100000; acc:  65.91; ppl:  2.60; xent: 0.95; lr: 0.00100; 40033/36187 tok/s;    440 sec
[2019-01-07 22:02:13,292 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:02:20,519 INFO] 

[2019-01-07 22:05:24,572 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:05:31,824 INFO] Validation perplexity: 2.2783
[2019-01-07 22:05:31,825 INFO] Validation accuracy: 69.8165
[2019-01-07 22:05:31,827 INFO] Saving checkpoint CNN_step_5000.pt
[2019-01-07 22:05:34,682 INFO] Step 5050/100000; acc:  67.30; ppl:  2.48; xent: 0.91; lr: 0.00100; 10899/9967 tok/s;    642 sec
[2019-01-07 22:05:37,381 INFO] Step 5100/100000; acc:  67.66; ppl:  2.45; xent: 0.90; lr: 0.00100; 38951/35450 tok/s;    644 sec
[2019-01-07 22:05:37,453 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:05:44,700 INFO] Validation perplexity: 2.24711
[2019-01-07 22:05:44,702 INFO] Validation accuracy: 70.6391
[2019-01-07 22:05:47,442 INFO] Step 5150/100000; acc:  67.00; ppl:  2.49; xent: 0.91; lr: 0.00100; 11233/10143 tok/s;    654 sec
[2019-01-07 22:05:50,136 INFO] Step 5200/100000; acc:  67.74; ppl:  2.44; xent: 0.89; lr: 0.001

[2019-01-07 22:09:00,531 INFO] Step 6650/100000; acc:  67.97; ppl:  2.42; xent: 0.88; lr: 0.00100; 11414/10479 tok/s;    847 sec
[2019-01-07 22:09:03,206 INFO] Step 6700/100000; acc:  68.67; ppl:  2.37; xent: 0.86; lr: 0.00100; 39800/35938 tok/s;    850 sec
[2019-01-07 22:09:03,276 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:09:10,518 INFO] Validation perplexity: 2.21766
[2019-01-07 22:09:10,519 INFO] Validation accuracy: 71.0918
[2019-01-07 22:09:13,241 INFO] Step 6750/100000; acc:  69.15; ppl:  2.35; xent: 0.86; lr: 0.00100; 10681/10020 tok/s;    860 sec
[2019-01-07 22:09:15,952 INFO] Step 6800/100000; acc:  68.46; ppl:  2.39; xent: 0.87; lr: 0.00100; 39934/36163 tok/s;    863 sec
[2019-01-07 22:09:16,022 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:09:23,255 INFO] Validation perplexity: 2.18417
[2019-01-07 22:09:23,257 INFO] Validation accuracy: 71.2082
[2019-01-07 22:09:25,983 INFO]

[2019-01-07 22:12:34,527 INFO] Validation perplexity: 2.18861
[2019-01-07 22:12:34,529 INFO] Validation accuracy: 70.7779
[2019-01-07 22:12:37,269 INFO] Step 8350/100000; acc:  68.88; ppl:  2.36; xent: 0.86; lr: 0.00100; 11217/10465 tok/s;   1064 sec
[2019-01-07 22:12:39,988 INFO] Step 8400/100000; acc:  69.37; ppl:  2.32; xent: 0.84; lr: 0.00100; 42198/38390 tok/s;   1067 sec
[2019-01-07 22:12:40,059 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:12:47,283 INFO] Validation perplexity: 2.20407
[2019-01-07 22:12:47,285 INFO] Validation accuracy: 71.119
[2019-01-07 22:12:50,029 INFO] Step 8450/100000; acc:  68.98; ppl:  2.35; xent: 0.85; lr: 0.00100; 10930/9909 tok/s;   1077 sec
[2019-01-07 22:12:52,752 INFO] Step 8500/100000; acc:  69.69; ppl:  2.30; xent: 0.83; lr: 0.00100; 40621/37795 tok/s;   1080 sec
[2019-01-07 22:12:52,822 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:13:00,054 INFO] V

[2019-01-07 22:16:05,749 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:16:12,979 INFO] Validation perplexity: 2.12657
[2019-01-07 22:16:12,981 INFO] Validation accuracy: 72.5541
[2019-01-07 22:16:12,983 INFO] Saving checkpoint CNN_step_10000.pt
[2019-01-07 22:16:15,811 INFO] Step 10050/100000; acc:  70.10; ppl:  2.27; xent: 0.82; lr: 0.00090; 10676/9886 tok/s;   1283 sec
[2019-01-07 22:16:18,536 INFO] Step 10100/100000; acc:  70.04; ppl:  2.27; xent: 0.82; lr: 0.00090; 42415/38672 tok/s;   1285 sec
[2019-01-07 22:16:18,606 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:16:25,838 INFO] Validation perplexity: 2.12536
[2019-01-07 22:16:25,839 INFO] Validation accuracy: 72.5653
[2019-01-07 22:16:28,560 INFO] Step 10150/100000; acc:  69.79; ppl:  2.29; xent: 0.83; lr: 0.00090; 11077/10024 tok/s;   1295 sec
[2019-01-07 22:16:31,227 INFO] Step 10200/100000; acc:  70.34; ppl:  2.26; xent: 0.81; lr:

[2019-01-07 22:19:42,418 INFO] Step 11700/100000; acc:  70.83; ppl:  2.22; xent: 0.80; lr: 0.00081; 39638/37004 tok/s;   1489 sec
[2019-01-07 22:19:42,488 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:19:49,721 INFO] Validation perplexity: 2.09454
[2019-01-07 22:19:49,722 INFO] Validation accuracy: 73.2616
[2019-01-07 22:19:52,440 INFO] Step 11750/100000; acc:  70.93; ppl:  2.22; xent: 0.80; lr: 0.00081; 10962/10052 tok/s;   1499 sec
[2019-01-07 22:19:55,149 INFO] Step 11800/100000; acc:  70.48; ppl:  2.25; xent: 0.81; lr: 0.00081; 39879/36603 tok/s;   1502 sec
[2019-01-07 22:19:55,220 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:20:02,450 INFO] Validation perplexity: 2.10202
[2019-01-07 22:20:02,451 INFO] Validation accuracy: 73.0032
[2019-01-07 22:20:05,213 INFO] Step 11850/100000; acc:  70.08; ppl:  2.27; xent: 0.82; lr: 0.00081; 11194/10135 tok/s;   1512 sec
[2019-01-07 22:20:07,944 I

[2019-01-07 22:23:15,690 INFO] Validation perplexity: 2.19505
[2019-01-07 22:23:15,692 INFO] Validation accuracy: 71.3631
[2019-01-07 22:23:18,412 INFO] Step 13350/100000; acc:  71.61; ppl:  2.17; xent: 0.77; lr: 0.00066; 11294/10294 tok/s;   1705 sec
[2019-01-07 22:23:21,100 INFO] Step 13400/100000; acc:  71.50; ppl:  2.18; xent: 0.78; lr: 0.00066; 39376/36197 tok/s;   1708 sec
[2019-01-07 22:23:21,170 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:23:28,396 INFO] Validation perplexity: 2.17601
[2019-01-07 22:23:28,398 INFO] Validation accuracy: 71.9175
[2019-01-07 22:23:31,104 INFO] Step 13450/100000; acc:  71.41; ppl:  2.17; xent: 0.78; lr: 0.00066; 11031/10054 tok/s;   1718 sec
[2019-01-07 22:23:33,815 INFO] Step 13500/100000; acc:  71.54; ppl:  2.17; xent: 0.77; lr: 0.00066; 39250/36523 tok/s;   1721 sec
[2019-01-07 22:23:33,885 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:23:41,115 I

[2019-01-07 22:26:47,052 INFO] Step 15000/100000; acc:  71.87; ppl:  2.15; xent: 0.76; lr: 0.00053; 40185/36761 tok/s;   1914 sec
[2019-01-07 22:26:47,121 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:26:54,355 INFO] Validation perplexity: 2.16129
[2019-01-07 22:26:54,357 INFO] Validation accuracy: 71.6689
[2019-01-07 22:26:54,358 INFO] Saving checkpoint CNN_step_15000.pt
[2019-01-07 22:26:57,213 INFO] Step 15050/100000; acc:  72.03; ppl:  2.14; xent: 0.76; lr: 0.00053; 10928/9958 tok/s;   1924 sec
[2019-01-07 22:26:59,959 INFO] Step 15100/100000; acc:  71.33; ppl:  2.18; xent: 0.78; lr: 0.00053; 41034/37676 tok/s;   1927 sec
[2019-01-07 22:27:00,031 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:27:07,264 INFO] Validation perplexity: 2.23706
[2019-01-07 22:27:07,265 INFO] Validation accuracy: 71.0098
[2019-01-07 22:27:09,991 INFO] Step 15150/100000; acc:  72.23; ppl:  2.13; xent: 0.76; lr:

[2019-01-07 22:30:18,560 INFO] Validation accuracy: 74.0322
[2019-01-07 22:30:21,259 INFO] Step 16650/100000; acc:  72.43; ppl:  2.11; xent: 0.75; lr: 0.00048; 11124/10197 tok/s;   2128 sec
[2019-01-07 22:30:23,937 INFO] Step 16700/100000; acc:  72.35; ppl:  2.11; xent: 0.75; lr: 0.00048; 39362/35945 tok/s;   2131 sec
[2019-01-07 22:30:24,008 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:30:31,241 INFO] Validation perplexity: 2.12126
[2019-01-07 22:30:31,243 INFO] Validation accuracy: 72.73
[2019-01-07 22:30:33,972 INFO] Step 16750/100000; acc:  72.07; ppl:  2.14; xent: 0.76; lr: 0.00048; 11265/10355 tok/s;   2141 sec
[2019-01-07 22:30:36,669 INFO] Step 16800/100000; acc:  72.37; ppl:  2.12; xent: 0.75; lr: 0.00048; 40997/37721 tok/s;   2144 sec
[2019-01-07 22:30:36,741 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:30:43,978 INFO] Validation perplexity: 2.15792
[2019-01-07 22:30:43,980 INF

[2019-01-07 22:33:49,969 INFO] Step 18300/100000; acc:  72.89; ppl:  2.09; xent: 0.74; lr: 0.00039; 42279/38955 tok/s;   2337 sec
[2019-01-07 22:33:50,041 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:33:57,283 INFO] Validation perplexity: 2.14408
[2019-01-07 22:33:57,284 INFO] Validation accuracy: 72.5707
[2019-01-07 22:33:59,990 INFO] Step 18350/100000; acc:  72.95; ppl:  2.09; xent: 0.74; lr: 0.00039; 10424/9379 tok/s;   2347 sec
[2019-01-07 22:34:02,719 INFO] Step 18400/100000; acc:  72.77; ppl:  2.10; xent: 0.74; lr: 0.00039; 42055/38700 tok/s;   2350 sec
[2019-01-07 22:34:02,790 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:34:10,034 INFO] Validation perplexity: 2.10189
[2019-01-07 22:34:10,036 INFO] Validation accuracy: 73.1857
[2019-01-07 22:34:12,764 INFO] Step 18450/100000; acc:  72.48; ppl:  2.11; xent: 0.75; lr: 0.00039; 11310/10164 tok/s;   2360 sec
[2019-01-07 22:34:15,489 IN

[2019-01-07 22:37:24,160 INFO] Step 19950/100000; acc:  73.30; ppl:  2.08; xent: 0.73; lr: 0.00035; 10968/10038 tok/s;   2551 sec
[2019-01-07 22:37:26,876 INFO] Step 20000/100000; acc:  72.54; ppl:  2.11; xent: 0.75; lr: 0.00031; 40049/36090 tok/s;   2554 sec
[2019-01-07 22:37:26,949 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:37:34,190 INFO] Validation perplexity: 2.08223
[2019-01-07 22:37:34,191 INFO] Validation accuracy: 73.2569
[2019-01-07 22:37:34,193 INFO] Saving checkpoint CNN_step_20000.pt
[2019-01-07 22:37:37,033 INFO] Step 20050/100000; acc:  72.92; ppl:  2.09; xent: 0.74; lr: 0.00031; 11243/10405 tok/s;   2564 sec
[2019-01-07 22:37:39,723 INFO] Step 20100/100000; acc:  73.22; ppl:  2.07; xent: 0.73; lr: 0.00031; 40620/36997 tok/s;   2567 sec
[2019-01-07 22:37:39,796 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:37:47,023 INFO] Validation perplexity: 2.08283
[2019-01-07 22:37:4

[2019-01-07 22:40:52,930 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:41:00,176 INFO] Validation perplexity: 2.13479
[2019-01-07 22:41:00,177 INFO] Validation accuracy: 72.0403
[2019-01-07 22:41:02,896 INFO] Step 21650/100000; acc:  73.39; ppl:  2.06; xent: 0.72; lr: 0.00028; 10959/9904 tok/s;   2770 sec
[2019-01-07 22:41:05,638 INFO] Step 21700/100000; acc:  72.94; ppl:  2.09; xent: 0.74; lr: 0.00028; 39826/36278 tok/s;   2773 sec
[2019-01-07 22:41:05,710 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:41:12,962 INFO] Validation perplexity: 2.16763
[2019-01-07 22:41:12,964 INFO] Validation accuracy: 72.0544
[2019-01-07 22:41:15,707 INFO] Step 21750/100000; acc:  73.15; ppl:  2.08; xent: 0.73; lr: 0.00028; 11479/10356 tok/s;   2783 sec
[2019-01-07 22:41:18,434 INFO] Step 21800/100000; acc:  73.21; ppl:  2.07; xent: 0.73; lr: 0.00028; 39663/36536 tok/s;   2785 sec
[2019-01-07 22:41:18,507 IN

[2019-01-07 22:44:30,001 INFO] Step 23300/100000; acc:  73.11; ppl:  2.07; xent: 0.73; lr: 0.00023; 40301/36605 tok/s;   2977 sec
[2019-01-07 22:44:30,073 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:44:37,311 INFO] Validation perplexity: 2.06681
[2019-01-07 22:44:37,313 INFO] Validation accuracy: 73.6026
[2019-01-07 22:44:40,031 INFO] Step 23350/100000; acc:  73.81; ppl:  2.04; xent: 0.71; lr: 0.00023; 10895/10067 tok/s;   2987 sec
[2019-01-07 22:44:42,762 INFO] Step 23400/100000; acc:  73.62; ppl:  2.05; xent: 0.72; lr: 0.00023; 39988/36118 tok/s;   2990 sec
[2019-01-07 22:44:42,833 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:44:50,068 INFO] Validation perplexity: 2.11099
[2019-01-07 22:44:50,070 INFO] Validation accuracy: 72.8685
[2019-01-07 22:44:52,787 INFO] Step 23450/100000; acc:  73.59; ppl:  2.05; xent: 0.72; lr: 0.00023; 10690/9821 tok/s;   3000 sec
[2019-01-07 22:44:55,531 IN

[2019-01-07 22:48:03,138 INFO] Validation accuracy: 73.4798
[2019-01-07 22:48:05,835 INFO] Step 24950/100000; acc:  73.58; ppl:  2.04; xent: 0.71; lr: 0.00021; 11448/10509 tok/s;   3193 sec
[2019-01-07 22:48:08,511 INFO] Step 25000/100000; acc:  73.39; ppl:  2.06; xent: 0.72; lr: 0.00019; 40768/36832 tok/s;   3195 sec
[2019-01-07 22:48:08,582 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:48:15,817 INFO] Validation perplexity: 2.07031
[2019-01-07 22:48:15,819 INFO] Validation accuracy: 73.4041
[2019-01-07 22:48:15,821 INFO] Saving checkpoint CNN_step_25000.pt
[2019-01-07 22:48:18,641 INFO] Step 25050/100000; acc:  73.76; ppl:  2.04; xent: 0.71; lr: 0.00019; 10624/9782 tok/s;   3206 sec
[2019-01-07 22:48:21,343 INFO] Step 25100/100000; acc:  73.93; ppl:  2.02; xent: 0.70; lr: 0.00019; 39648/36440 tok/s;   3208 sec
[2019-01-07 22:48:21,414 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:48:28,6

[2019-01-07 22:51:32,552 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:51:39,777 INFO] Validation perplexity: 2.14841
[2019-01-07 22:51:39,779 INFO] Validation accuracy: 72.3328
[2019-01-07 22:51:42,518 INFO] Step 26650/100000; acc:  73.45; ppl:  2.05; xent: 0.72; lr: 0.00017; 10883/9906 tok/s;   3409 sec
[2019-01-07 22:51:45,279 INFO] Step 26700/100000; acc:  73.58; ppl:  2.05; xent: 0.72; lr: 0.00017; 40984/37635 tok/s;   3412 sec
[2019-01-07 22:51:45,350 INFO] Loading valid dataset from data/USP.valid.0.pt, number of examples: 29551
[2019-01-07 22:51:52,602 INFO] Validation perplexity: 2.12674
[2019-01-07 22:51:52,603 INFO] Validation accuracy: 72.5092
[2019-01-07 22:51:54,784 INFO] Loading train dataset from data/USP.train.0.pt, number of examples: 189902
[2019-01-07 22:51:57,034 INFO] Step 26750/100000; acc:  74.44; ppl:  2.00; xent: 0.69; lr: 0.00017; 8892/8212 tok/s;   3424 sec
[2019-01-07 22:51:59,730 INFO] Step 26800/100000; acc

KeyboardInterrupt: 

In [1]:
from onmt.translate.translator import build_translator
from onmt.new_opts  import TranslateOpts
import onmt.inputters
import onmt.translate
import onmt
import onmt.model_builder
import onmt.modules
import onmt.opts

opt = TranslateOpts(
   gpu=-1,
    models= ["demo_model_step_29000.pt"],
    src="data/USP_src-valid.txt", 
    tgt="data/USP_tgt-valid.txt", 
    replace_unk=True, 
    verbose=True,
    output="data/USP_demo_pred.txt",
          )
translator = build_translator(opt, report_score=True)
translator.translate(src_path=opt.src,
                     tgt_path=opt.tgt,
                     src_dir=opt.src_dir,
                     batch_size=opt.batch_size,
                     attn_debug=opt.attn_debug)

1


  var = torch.tensor(arr, dtype=self.dtype, device=device)


PRED AVG SCORE: -0.0198, PRED PPL: 1.0200
GOLD AVG SCORE: -0.3363, GOLD PPL: 1.3997


([[tensor(-0.8770)],
  [tensor(-0.0124)],
  [tensor(-0.2436)],
  [tensor(-1.5064)],
  [tensor(-0.1061)],
  [tensor(-0.3343)],
  [tensor(-0.0057)],
  [tensor(-2.0630)],
  [tensor(-0.9585)],
  [tensor(-0.3555)],
  [tensor(-0.0467)],
  [tensor(-0.9754)],
  [tensor(-0.5869)],
  [tensor(-1.2384)],
  [tensor(-0.3307)],
  [tensor(-0.1737)],
  [tensor(-0.0034)],
  [tensor(-0.1133)],
  [tensor(-0.3574)],
  [tensor(-1.6835)],
  [tensor(-0.5869)],
  [tensor(-0.1965)],
  [tensor(-0.0340)],
  [tensor(-0.0652)],
  [tensor(-1.0951)],
  [tensor(-0.7087)],
  [tensor(-0.0974)],
  [tensor(-0.0437)],
  [tensor(-0.2391)],
  [tensor(-0.1346)],
  [tensor(-0.0152)],
  [tensor(-0.0012)],
  [tensor(-1.9566)],
  [tensor(-1.9566)],
  [tensor(-0.5214)],
  [tensor(-0.9882)],
  [tensor(-0.9221)],
  [tensor(-0.5694)],
  [tensor(-0.5874)],
  [tensor(-0.7630)],
  [tensor(-0.5233)],
  [tensor(-0.5294)],
  [tensor(-0.4749)],
  [tensor(-0.0228)],
  [tensor(-0.5563)],
  [tensor(-0.1259)],
  [tensor(-0.0519)],
  [tensor(-1.

In [None]:
opt = Opts(
        encoder_type="transformer", 
        decoder_type="transformer",
        layers=4,
        norma
        heads=4, 
        rnn_size=256, 
        word_vec_size=256, 
        transformer_ff=256,
        data="data/USP", 
        save_model="CNN", 
        epochs=10, 
        batch_size=64, 
        gpu_ranks=[0], 
        valid_steps=100, 
        valid_batch_size=64, 
        decay_steps=1000, 
        start_decay_steps=10000, 
        learning_rate_decay=0.9, 
        optim="adam",
        learning_rate=0.001,
        save_checkpoint_steps=1000,
        
          )
