In [1]:
import json
import random
import argparse
import numpy as np
import torch

import src
from src.slurm import init_signal_handler, init_distributed_mode
from src.utils import bool_flag, initialize_exp
from src.model import check_model_params, build_modules
from src.envs import ENVS, build_env
from src.trainer import Trainer
from src.evaluator import Evaluator


#np.seterr(all='raise')


def get_parser(args=None):
    """
    Generate a parameters parser.

    The parser is built in two phases: the generic options are declared first,
    then the environment named by ``--env_name`` is looked up in ``ENVS`` and
    its ``register_args(parser)`` hook adds the environment-specific options.

    Parameters
    ----------
    args : list of str, optional
        Argument list used *only* to discover ``--env_name`` while the parser
        is being built. ``None`` (the default) keeps the historical behaviour
        of reading ``sys.argv[1:]``. Pass the same list you later give to
        ``parser.parse_args(...)`` when the arguments do not come from the
        command line (e.g. in a notebook); otherwise a non-default
        ``--env_name`` in that list is ignored at registration time.

    Returns
    -------
    argparse.ArgumentParser
        The fully populated parser (nothing is parsed into a namespace here).
    """
    # parse parameters
    parser = argparse.ArgumentParser(description="Language transfer")

    # main parameters
    parser.add_argument("--dump_path", type=str, default="./dumped/",
                        help="Experiment dump path")
    parser.add_argument("--exp_name", type=str, default="debug",
                        help="Experiment name")
    parser.add_argument("--save_periodic", type=int, default=0,
                        help="Save the model periodically (0 to disable)")
    parser.add_argument("--exp_id", type=str, default="",
                        help="Experiment ID")

    # float16 / AMP API
    parser.add_argument("--fp16", type=bool_flag, default=False,
                        help="Run model with float16")
    parser.add_argument("--amp", type=int, default=-1,
                        help="Use AMP wrapper for float16 / distributed / gradient accumulation. Level of optimization. -1 to disable.")

    # model parameters
    parser.add_argument("--emb_dim", type=int, default=256,
                        help="Embedding layer size")
    parser.add_argument("--n_enc_layers", type=int, default=4,
                        help="Number of Transformer layers in the encoder")
    parser.add_argument("--n_dec_layers", type=int, default=4,
                        help="Number of Transformer layers in the decoder")
    parser.add_argument("--n_heads", type=int, default=4,
                        help="Number of Transformer heads")
    parser.add_argument("--dropout", type=float, default=0,
                        help="Dropout")
    parser.add_argument("--attention_dropout", type=float, default=0,
                        help="Dropout in the attention layer")
    parser.add_argument("--share_inout_emb", type=bool_flag, default=True,
                        help="Share input and output embeddings")
    parser.add_argument("--sinusoidal_embeddings", type=bool_flag, default=False,
                        help="Use sinusoidal embeddings")

    # training parameters
    parser.add_argument("--env_base_seed", type=int, default=0,
                        help="Base seed for environments (-1 to use timestamp seed)")
    parser.add_argument("--max_len", type=int, default=512,
                        help="Maximum sequences length")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Number of sentences per batch")
    parser.add_argument("--optimizer", type=str, default="adam,lr=0.0001",
                        help="Optimizer (SGD / RMSprop / Adam, etc.)")
    parser.add_argument("--clip_grad_norm", type=float, default=5,
                        help="Clip gradients norm (0 to disable)")
    parser.add_argument("--epoch_size", type=int, default=300000,
                        help="Epoch size / evaluation frequency")
    parser.add_argument("--max_epoch", type=int, default=100000,
                        help="Maximum epoch size")
    parser.add_argument("--stopping_criterion", type=str, default="",
                        help="Stopping criterion, and number of non-increase before stopping the experiment")
    parser.add_argument("--validation_metrics", type=str, default="",
                        help="Validation metrics")
    parser.add_argument("--accumulate_gradients", type=int, default=1,
                        help="Accumulate model gradients over N iterations (N times larger batch sizes)")
    parser.add_argument("--num_workers", type=int, default=10,
                        help="Number of CPU workers for DataLoader")
    parser.add_argument("--same_nb_ops_per_batch", type=bool_flag, default=False,
                        help="Generate sequences with the same number of operators in batches.")

    # export data / reload it
    parser.add_argument("--export_data", type=bool_flag, default=False,
                        help="Export data and disable training.")
    parser.add_argument("--reload_data", type=str, default="",
                        help="Load dataset from the disk (train_path1,valid_path1,test_path1)")
    parser.add_argument("--reload_size", type=int, default=-1,
                        help="Reloaded training set size (-1 for everything)")

    # environment parameters
    parser.add_argument("--env_name", type=str, default="char_sp",
                        help="Environment name")
    # Peek at --env_name with the options declared so far (unknown options are
    # tolerated by parse_known_args) so that the selected environment can add
    # its own arguments. `args=None` falls back to sys.argv[1:].
    env_name = parser.parse_known_args(args)[0].env_name
    ENVS[env_name].register_args(parser)

    # tasks
    #parser.add_argument("--tasks", type=str, default="",
    #                    help="Tasks")

    # beam search configuration
    parser.add_argument("--beam_eval", type=bool_flag, default=False,
                        help="Evaluate with beam search decoding.")
    parser.add_argument("--beam_size", type=int, default=1,
                        help="Beam size, default = 1 (greedy decoding)")
    parser.add_argument("--beam_length_penalty", type=float, default=1,
                        help="Length penalty, values < 1.0 favor shorter sentences, while values > 1.0 favor longer ones.")
    parser.add_argument("--beam_early_stopping", type=bool_flag, default=True,
                        help="Early stopping, stop as soon as we have `beam_size` hypotheses, although longer ones may have better scores.")

    # reload pretrained model / checkpoint
    parser.add_argument("--reload_model", type=str, default="",
                        help="Reload a pretrained model")
    parser.add_argument("--reload_checkpoint", type=str, default="",
                        help="Reload a checkpoint")

    # evaluation
    parser.add_argument("--eval_only", type=bool_flag, default=False,
                        help="Only run evaluations")
    parser.add_argument("--eval_verbose", type=int, default=0,
                        help="Export evaluation details")
    parser.add_argument("--eval_verbose_print", type=bool_flag, default=False,
                        help="Print evaluation details")

    # debug
    parser.add_argument("--debug_slurm", type=bool_flag, default=False,
                        help="Debug multi-GPU / multi-node within a SLURM job")
    parser.add_argument("--debug", help="Enable all debug flags",
                        action="store_true")

    # CPU / multi-gpu / multi-node
    parser.add_argument("--cpu", type=bool_flag, default=False,
                        help="Run on CPU")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Multi-GPU - Local rank")
    parser.add_argument("--master_port", type=int, default=-1,
                        help="Master port (for multi-node SLURM jobs)")

    return parser

In [2]:
def main(params):
    """
    Run evaluation only, or the full train / evaluate / checkpoint loop.

    Parameters
    ----------
    params : argparse.Namespace
        Parsed options from ``get_parser()``. This function reads
        ``params.cpu``, ``params.multi_gpu``, ``params.eval_only``,
        ``params.max_epoch``, ``params.export_data`` and ``params.is_master``.
        ``multi_gpu`` and ``is_master`` are not declared by ``get_parser()``;
        presumably ``init_distributed_mode`` sets them -- TODO confirm.

    Returns
    -------
    None
        In ``eval_only`` mode the function returns right after logging the
        scores. It used to call the interactive ``exit()`` helper there, which
        is not intended for programs and does not reliably stop execution at
        that point inside an IPython kernel.

    Side effects
    ------------
    Sets the module-level flag ``src.utils.CUDA`` to ``not params.cpu``.
    """

    # initialize the multi-GPU / multi-node training
    # initialize experiment / SLURM signal handler for time limit / pre-emption
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    # CPU / CUDA
    if params.cpu:
        assert not params.multi_gpu
    else:
        assert torch.cuda.is_available()
    src.utils.CUDA = not params.cpu

    # build environment / modules / trainer / evaluator
    env = build_env(params)
    modules = build_modules(env, params)
    trainer = Trainer(modules, env, params)
    evaluator = Evaluator(trainer)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals()
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        # Leave main() instead of calling exit(): the training loop below must
        # never run in eval-only mode, whatever the hosting interpreter is.
        return

    # training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        # per-epoch counter; the inner loop runs until it reaches epoch_size
        trainer.n_equations = 0

        while trainer.n_equations < trainer.epoch_size:

            # either dump generated data or take one training step
            if params.export_data:
                trainer.export_data()
            else:
                trainer.enc_dec_step()
            trainer.iter()

            # training steps
            #for task_id in np.random.permutation(len(params.tasks)):
            #    task = params.tasks[task_id]
            #    if params.export_data:
            #        trainer.export_data(task)
            #    else:
            #        trainer.enc_dec_step(task)
            #    trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals()

        # print / JSON log (the JSON line is emitted by the master process only)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
        

In [3]:
# Options for a small CPU-only run, written as the command-line tokens the
# parser expects: one flag and its value per line.
args = [
    "--exp_name", "first_train",
    "--reload_data", "data.prefix.counts.train,data.prefix.counts.valid,data.prefix.counts.test",
    "--reload_size", "238",
    "--emb_dim", "8",
    "--n_enc_layers", "6",
    "--n_dec_layers", "6",
    "--n_heads", "2",
    "--optimizer", "adam,lr=0.0001",
    "--batch_size", "32",
    "--epoch_size", "300",
    "--validation_metrics", "valid_prim_fwd_acc",
    "--cpu", "true",
]

# Build the parser, then turn the token list into the `params` namespace used
# by the cells below.
parser = get_parser()
params = parser.parse_args(args)


In [4]:
# Run the model-parameter check from src.model on the parsed options before
# anything is built.
# NOTE(review): presumably this validates (and may normalise) params -- the
# exact behaviour lives in src.model; confirm there.
check_model_params(params)

In [5]:
# Run the whole pipeline defined in main(): setup, then evaluation only or the
# training loop, depending on params.eval_only.
main(params)
# Observation: with the nan_test_64 dataset, no NaN shows up during training.

INFO - 05/17/22 18:19:52 - 0:00:00 - accumulate_gradients: 1
                                     amp: -1
                                     attention_dropout: 0
                                     balanced: False
                                     batch_size: 32
                                     beam_early_stopping: True
                                     beam_eval: False
                                     beam_length_penalty: 1
                                     beam_size: 1
                                     clip_grad_norm: 5
                                     command: python /usr/local/anaconda3/envs/tf/lib/python3.7/site-packages/ipykernel_launcher.py '-f' '/Users/meililiu/Library/Jupyter/runtime/kernel-ca0e2348-ca40-4b1f-a480-5be45615d868.json' --exp_id "b4d09cg7o7"
                                     cpu: True
                                     datalength: 256
                                     debug: False
                                     debug_slurm:

SLURM job: False
0 - Number of nodes: 1
0 - Node ID        : 0
0 - Local rank     : 0
0 - Global rank    : 0
0 - World size     : 1
0 - GPUs per node  : 1
0 - Master         : True
0 - Multi-node     : False
0 - Multi-GPU      : False
0 - Hostname       : Meilis-MacBook-Pro.local
['data.prefix.counts.train', 'data.prefix.counts.valid', 'data.prefix.counts.test']


INFO - 05/17/22 18:19:52 - 0:00:00 - Initialized random generator for worker 0, with seed [0, 0, 0] (base seed=0).
	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_croot-udngs7fm/pytorch_1648016055234/work/torch/csrc/utils/python_arg_parser.cpp:1050.)
  exp_avg.mul_(beta1).add_(1 - beta1, grad)
INFO - 05/17/22 18:19:52 - 0:00:00 - Creating valid iterator for ...
INFO - 05/17/22 18:19:52 - 0:00:00 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:19:52 - 0:00:00 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:19:52 - 0:00:00 - 0/32
INFO - 05/17/22 18:19:52 - 0:00:00 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:19:52 - 0:00:00 - Creating test iterator for ...
INFO - 05/17/22 18:19:52 - 0:00:00 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:19:52 - 0:00:00 -

INFO - 05/17/22 18:19:53 - 0:00:01 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:19:53 - 0:00:01 - Creating valid iterator for ...
INFO - 05/17/22 18:19:53 - 0:00:01 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:19:53 - 0:00:01 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:19:53 - 0:00:01 - 0/32
INFO - 05/17/22 18:19:53 - 0:00:01 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:19:53 - 0:00:01 - Creating test iterator for ...
INFO - 05/17/22 18:19:53 - 0:00:01 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:19:53 - 0:00:01 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:19:53 - 0:00:02 - 0/32
INFO - 05/17/22 18:19:53 - 0:00:02 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:19:53 - 0:00:02 - epoch -> 3.000000
INFO - 05/17/22 18:19:53 - 0:00:02 - valid_xe_loss -> 48.132345
INFO - 05/17/22 18:19:53 - 0:00:02 - valid_acc -> 0.000000
INFO - 05/1

INFO - 05/17/22 18:19:54 - 0:00:02 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:19:54 - 0:00:02 - Creating valid iterator for ...
INFO - 05/17/22 18:19:54 - 0:00:02 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:19:54 - 0:00:02 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:19:54 - 0:00:02 - 0/32
INFO - 05/17/22 18:19:54 - 0:00:03 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:19:54 - 0:00:03 - Creating test iterator for ...
INFO - 05/17/22 18:19:54 - 0:00:03 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:19:54 - 0:00:03 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:19:54 - 0:00:03 - 0/32
INFO - 05/17/22 18:19:54 - 0:00:03 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:19:54 - 0:00:03 - epoch -> 6.000000
INFO - 05/17/22 18:19:54 - 0:00:03 - valid_xe_loss -> 47.735629
INFO - 05/17/22 18:19:54 - 0:00:03 - valid_acc -> 0.000000
INFO - 05/1

INFO - 05/17/22 18:19:55 - 0:00:03 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:19:55 - 0:00:03 - Creating valid iterator for ...
INFO - 05/17/22 18:19:55 - 0:00:03 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:19:55 - 0:00:03 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:19:55 - 0:00:03 - 0/32
INFO - 05/17/22 18:19:55 - 0:00:04 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:19:55 - 0:00:04 - Creating test iterator for ...
INFO - 05/17/22 18:19:55 - 0:00:04 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:19:55 - 0:00:04 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:19:55 - 0:00:04 - 0/32
INFO - 05/17/22 18:19:55 - 0:00:04 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:19:55 - 0:00:04 - epoch -> 9.000000
INFO - 05/17/22 18:19:55 - 0:00:04 - valid_xe_loss -> 47.331826
INFO - 05/17/22 18:19:55 - 0:00:04 - valid_acc -> 0.000000
INFO - 05/1

INFO - 05/17/22 18:19:56 - 0:00:04 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:19:56 - 0:00:04 - Creating valid iterator for ...
INFO - 05/17/22 18:19:56 - 0:00:04 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:19:56 - 0:00:04 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:19:56 - 0:00:04 - 0/32
INFO - 05/17/22 18:19:56 - 0:00:05 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:19:56 - 0:00:05 - Creating test iterator for ...
INFO - 05/17/22 18:19:56 - 0:00:05 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:19:56 - 0:00:05 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:19:56 - 0:00:05 - 0/32
INFO - 05/17/22 18:19:56 - 0:00:05 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:19:56 - 0:00:05 - epoch -> 12.000000
INFO - 05/17/22 18:19:56 - 0:00:05 - valid_xe_loss -> 46.928397
INFO - 05/17/22 18:19:56 - 0:00:05 - valid_acc -> 0.000000
INFO - 05/

INFO - 05/17/22 18:19:57 - 0:00:05 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:19:57 - 0:00:05 - Creating valid iterator for ...
INFO - 05/17/22 18:19:57 - 0:00:05 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:19:57 - 0:00:05 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:19:57 - 0:00:05 - 0/32
INFO - 05/17/22 18:19:57 - 0:00:06 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:19:57 - 0:00:06 - Creating test iterator for ...
INFO - 05/17/22 18:19:57 - 0:00:06 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:19:57 - 0:00:06 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:19:57 - 0:00:06 - 0/32
INFO - 05/17/22 18:19:57 - 0:00:06 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:19:57 - 0:00:06 - epoch -> 15.000000
INFO - 05/17/22 18:19:57 - 0:00:06 - valid_xe_loss -> 46.523544
INFO - 05/17/22 18:19:57 - 0:00:06 - valid_acc -> 0.000000
INFO - 05/

INFO - 05/17/22 18:19:58 - 0:00:06 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:19:58 - 0:00:06 - Creating valid iterator for ...
INFO - 05/17/22 18:19:58 - 0:00:06 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:19:58 - 0:00:06 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:19:58 - 0:00:06 - 0/32
INFO - 05/17/22 18:19:58 - 0:00:07 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:19:58 - 0:00:07 - Creating test iterator for ...
INFO - 05/17/22 18:19:58 - 0:00:07 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:19:58 - 0:00:07 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:19:58 - 0:00:07 - 0/32
INFO - 05/17/22 18:19:58 - 0:00:07 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:19:58 - 0:00:07 - epoch -> 18.000000
INFO - 05/17/22 18:19:58 - 0:00:07 - valid_xe_loss -> 46.113838
INFO - 05/17/22 18:19:58 - 0:00:07 - valid_acc -> 0.000000
INFO - 05/

INFO - 05/17/22 18:19:59 - 0:00:07 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:19:59 - 0:00:07 - Creating valid iterator for ...
INFO - 05/17/22 18:19:59 - 0:00:07 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:19:59 - 0:00:07 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:19:59 - 0:00:08 - 0/32
INFO - 05/17/22 18:19:59 - 0:00:08 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:19:59 - 0:00:08 - Creating test iterator for ...
INFO - 05/17/22 18:19:59 - 0:00:08 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:19:59 - 0:00:08 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:19:59 - 0:00:08 - 0/32
INFO - 05/17/22 18:19:59 - 0:00:08 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:19:59 - 0:00:08 - epoch -> 21.000000
INFO - 05/17/22 18:19:59 - 0:00:08 - valid_xe_loss -> 45.709687
INFO - 05/17/22 18:19:59 - 0:00:08 - valid_acc -> 0.000000
INFO - 05/

INFO - 05/17/22 18:20:00 - 0:00:08 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:20:00 - 0:00:09 - Creating valid iterator for ...
INFO - 05/17/22 18:20:00 - 0:00:09 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:20:00 - 0:00:09 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:00 - 0:00:09 - 0/32
INFO - 05/17/22 18:20:00 - 0:00:09 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:00 - 0:00:09 - Creating test iterator for ...
INFO - 05/17/22 18:20:00 - 0:00:09 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:20:00 - 0:00:09 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:00 - 0:00:09 - 0/32
INFO - 05/17/22 18:20:00 - 0:00:09 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:00 - 0:00:09 - epoch -> 24.000000
INFO - 05/17/22 18:20:00 - 0:00:09 - valid_xe_loss -> 45.319645
INFO - 05/17/22 18:20:00 - 0:00:09 - valid_acc -> 0.000000
INFO - 05/

INFO - 05/17/22 18:20:01 - 0:00:09 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:20:01 - 0:00:10 - Creating valid iterator for ...
INFO - 05/17/22 18:20:01 - 0:00:10 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:20:01 - 0:00:10 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:01 - 0:00:10 - 0/32
INFO - 05/17/22 18:20:01 - 0:00:10 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:01 - 0:00:10 - Creating test iterator for ...
INFO - 05/17/22 18:20:01 - 0:00:10 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:20:01 - 0:00:10 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:01 - 0:00:10 - 0/32
INFO - 05/17/22 18:20:01 - 0:00:10 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:01 - 0:00:10 - epoch -> 27.000000
INFO - 05/17/22 18:20:01 - 0:00:10 - valid_xe_loss -> 44.954434
INFO - 05/17/22 18:20:01 - 0:00:10 - valid_acc -> 0.000000
INFO - 05/

INFO - 05/17/22 18:20:02 - 0:00:11 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:20:02 - 0:00:11 - Creating valid iterator for ...
INFO - 05/17/22 18:20:02 - 0:00:11 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:20:02 - 0:00:11 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:02 - 0:00:11 - 0/32
INFO - 05/17/22 18:20:02 - 0:00:11 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:02 - 0:00:11 - Creating test iterator for ...
INFO - 05/17/22 18:20:02 - 0:00:11 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:20:02 - 0:00:11 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:02 - 0:00:11 - 0/32
INFO - 05/17/22 18:20:03 - 0:00:11 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:03 - 0:00:11 - epoch -> 30.000000
INFO - 05/17/22 18:20:03 - 0:00:11 - valid_xe_loss -> 44.609956
INFO - 05/17/22 18:20:03 - 0:00:11 - valid_acc -> 0.000000
INFO - 05/

INFO - 05/17/22 18:20:03 - 0:00:12 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:20:03 - 0:00:12 - Creating valid iterator for ...
INFO - 05/17/22 18:20:03 - 0:00:12 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:20:03 - 0:00:12 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:03 - 0:00:12 - 0/32
INFO - 05/17/22 18:20:03 - 0:00:12 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:03 - 0:00:12 - Creating test iterator for ...
INFO - 05/17/22 18:20:03 - 0:00:12 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:20:03 - 0:00:12 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:03 - 0:00:12 - 0/32
INFO - 05/17/22 18:20:03 - 0:00:12 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:03 - 0:00:12 - epoch -> 33.000000
INFO - 05/17/22 18:20:03 - 0:00:12 - valid_xe_loss -> 44.289423
INFO - 05/17/22 18:20:04 - 0:00:12 - valid_acc -> 0.000000
INFO - 05/

INFO - 05/17/22 18:20:04 - 0:00:13 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:20:04 - 0:00:13 - Creating valid iterator for ...
INFO - 05/17/22 18:20:04 - 0:00:13 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:20:04 - 0:00:13 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:04 - 0:00:13 - 0/32
INFO - 05/17/22 18:20:04 - 0:00:13 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:04 - 0:00:13 - Creating test iterator for ...
INFO - 05/17/22 18:20:04 - 0:00:13 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:20:04 - 0:00:13 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:04 - 0:00:13 - 0/32
INFO - 05/17/22 18:20:04 - 0:00:13 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:04 - 0:00:13 - epoch -> 36.000000
INFO - 05/17/22 18:20:04 - 0:00:13 - valid_xe_loss -> 43.987733
INFO - 05/17/22 18:20:04 - 0:00:13 - valid_acc -> 0.000000
INFO - 05/

INFO - 05/17/22 18:20:05 - 0:00:14 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:20:05 - 0:00:14 -      40 -   92.75 equations/s - 48343.19 words/s - LOSS:  4.3233 - model LR: 1.0000e-04
INFO - 05/17/22 18:20:05 - 0:00:14 - Creating valid iterator for ...
INFO - 05/17/22 18:20:05 - 0:00:14 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:20:05 - 0:00:14 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:05 - 0:00:14 - 0/32
INFO - 05/17/22 18:20:05 - 0:00:14 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:05 - 0:00:14 - Creating test iterator for ...
INFO - 05/17/22 18:20:05 - 0:00:14 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:20:05 - 0:00:14 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:05 - 0:00:14 - 0/32
INFO - 05/17/22 18:20:06 - 0:00:14 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:06 - 0:00:14 - epoch -> 39.000000
INFO - 0

INFO - 05/17/22 18:20:06 - 0:00:15 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:20:06 - 0:00:15 - Creating valid iterator for ...
INFO - 05/17/22 18:20:06 - 0:00:15 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:20:06 - 0:00:15 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:06 - 0:00:15 - 0/32
INFO - 05/17/22 18:20:06 - 0:00:15 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:06 - 0:00:15 - Creating test iterator for ...
INFO - 05/17/22 18:20:06 - 0:00:15 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:20:06 - 0:00:15 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:06 - 0:00:15 - 0/32
INFO - 05/17/22 18:20:07 - 0:00:15 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:07 - 0:00:15 - epoch -> 42.000000
INFO - 05/17/22 18:20:07 - 0:00:15 - valid_xe_loss -> 43.448527
INFO - 05/17/22 18:20:07 - 0:00:15 - valid_acc -> 0.000000
INFO - 05/

INFO - 05/17/22 18:20:07 - 0:00:16 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:20:07 - 0:00:16 - Creating valid iterator for ...
INFO - 05/17/22 18:20:07 - 0:00:16 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:20:07 - 0:00:16 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:07 - 0:00:16 - 0/32
INFO - 05/17/22 18:20:07 - 0:00:16 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:07 - 0:00:16 - Creating test iterator for ...
INFO - 05/17/22 18:20:07 - 0:00:16 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:20:07 - 0:00:16 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:08 - 0:00:16 - 0/32
INFO - 05/17/22 18:20:08 - 0:00:16 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:08 - 0:00:16 - epoch -> 45.000000
INFO - 05/17/22 18:20:08 - 0:00:16 - valid_xe_loss -> 43.215658
INFO - 05/17/22 18:20:08 - 0:00:16 - valid_acc -> 0.000000
INFO - 05/

INFO - 05/17/22 18:20:08 - 0:00:17 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:20:08 - 0:00:17 - Creating valid iterator for ...
INFO - 05/17/22 18:20:08 - 0:00:17 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:20:08 - 0:00:17 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:08 - 0:00:17 - 0/32
INFO - 05/17/22 18:20:08 - 0:00:17 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:09 - 0:00:17 - Creating test iterator for ...
INFO - 05/17/22 18:20:09 - 0:00:17 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:20:09 - 0:00:17 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:09 - 0:00:17 - 0/32
INFO - 05/17/22 18:20:09 - 0:00:17 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:09 - 0:00:17 - epoch -> 48.000000
INFO - 05/17/22 18:20:09 - 0:00:17 - valid_xe_loss -> 43.001205
INFO - 05/17/22 18:20:09 - 0:00:17 - valid_acc -> 0.000000
INFO - 05/

INFO - 05/17/22 18:20:09 - 0:00:18 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:20:09 - 0:00:18 - Creating valid iterator for ...
INFO - 05/17/22 18:20:09 - 0:00:18 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:20:09 - 0:00:18 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:10 - 0:00:18 - 0/32
INFO - 05/17/22 18:20:10 - 0:00:18 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:10 - 0:00:18 - Creating test iterator for ...
INFO - 05/17/22 18:20:10 - 0:00:18 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:20:10 - 0:00:18 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:10 - 0:00:18 - 0/32
INFO - 05/17/22 18:20:10 - 0:00:18 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:10 - 0:00:18 - epoch -> 51.000000
INFO - 05/17/22 18:20:10 - 0:00:18 - valid_xe_loss -> 42.803315
INFO - 05/17/22 18:20:10 - 0:00:18 - valid_acc -> 0.000000
INFO - 05/

INFO - 05/17/22 18:20:10 - 0:00:19 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:20:11 - 0:00:19 - Creating valid iterator for ...
INFO - 05/17/22 18:20:11 - 0:00:19 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:20:11 - 0:00:19 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:11 - 0:00:19 - 0/32
INFO - 05/17/22 18:20:11 - 0:00:19 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:11 - 0:00:19 - Creating test iterator for ...
INFO - 05/17/22 18:20:11 - 0:00:19 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:20:11 - 0:00:19 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:11 - 0:00:19 - 0/32
INFO - 05/17/22 18:20:11 - 0:00:19 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:11 - 0:00:19 - epoch -> 54.000000
INFO - 05/17/22 18:20:11 - 0:00:19 - valid_xe_loss -> 42.618046
INFO - 05/17/22 18:20:11 - 0:00:19 - valid_acc -> 0.000000
INFO - 05/

INFO - 05/17/22 18:20:11 - 0:00:20 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:20:12 - 0:00:20 - Creating valid iterator for ...
INFO - 05/17/22 18:20:12 - 0:00:20 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:20:12 - 0:00:20 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:12 - 0:00:20 - 0/32
INFO - 05/17/22 18:20:12 - 0:00:20 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:12 - 0:00:20 - Creating test iterator for ...
INFO - 05/17/22 18:20:12 - 0:00:20 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:20:12 - 0:00:20 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:12 - 0:00:20 - 0/32
INFO - 05/17/22 18:20:12 - 0:00:20 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:12 - 0:00:20 - epoch -> 57.000000
INFO - 05/17/22 18:20:12 - 0:00:20 - valid_xe_loss -> 42.449814
INFO - 05/17/22 18:20:12 - 0:00:20 - valid_acc -> 0.000000
INFO - 05/

INFO - 05/17/22 18:20:13 - 0:00:21 - Saving checkpoint to ./dumped/first_train/b4d09cg7o7/checkpoint.pth ...
INFO - 05/17/22 18:20:13 - 0:00:21 - Creating valid iterator for ...
INFO - 05/17/22 18:20:13 - 0:00:21 - Loading data from data.prefix.counts.valid ...
INFO - 05/17/22 18:20:13 - 0:00:21 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:13 - 0:00:21 - 0/32
INFO - 05/17/22 18:20:13 - 0:00:21 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:13 - 0:00:21 - Creating test iterator for ...
INFO - 05/17/22 18:20:13 - 0:00:21 - Loading data from data.prefix.counts.test ...
INFO - 05/17/22 18:20:13 - 0:00:21 - Loaded 32 equations from the disk.
INFO - 05/17/22 18:20:13 - 0:00:21 - 0/32
INFO - 05/17/22 18:20:13 - 0:00:21 - 0/32 (0.0%) equations were evaluated correctly.
INFO - 05/17/22 18:20:13 - 0:00:21 - epoch -> 60.000000
INFO - 05/17/22 18:20:13 - 0:00:21 - valid_xe_loss -> 42.294672
INFO - 05/17/22 18:20:13 - 0:00:21 - valid_acc -> 0.000000
INFO - 05/

KeyboardInterrupt: 

In [None]:
# With the last nan_test dataset there was no NaN error during training,
# so maybe this NaN problem comes from my data?
# I'll try to generate a new dataset
# and train again using dataset_640.

In [6]:
# Run the setup steps one at a time to try to identify the problem.
# CPU / CUDA: a CPU run must not request multi-GPU; otherwise CUDA must be available.
if params.cpu:
    assert not params.multi_gpu
else:
    assert torch.cuda.is_available()
# Record on the shared utils module whether CUDA is in use.
src.utils.CUDA = not params.cpu

# build environment / modules / trainer / evaluator
# (each object is constructed from the ones built just before it, so the order matters)
env = build_env(params)
modules = build_modules(env, params)
trainer = Trainer(modules, env, params)
evaluator = Evaluator(trainer)

# evaluation
# In eval-only mode: run every evaluation, log each score, then log the whole
# score dict as a single JSON line and stop.
if params.eval_only:
    scores = evaluator.run_all_evals()
    for k, v in scores.items():
        logger.info("%s -> %.6f" % (k, v))
    logger.info("__log__:%s" % json.dumps(scores))
    # NOTE(review): exit() inside a notebook cell presumably ends the kernel session — confirm this is intended.
    exit()

INFO - 05/17/22 12:09:34 - 0:00:31 - Unary operators: []
INFO - 05/17/22 12:09:34 - 0:00:31 - Binary operators: ['add', 'sub']
INFO - 05/17/22 12:09:34 - 0:00:31 - words: {'<s>': 0, '</s>': 1, '<pad>': 2, '(': 3, ')': 4, 'pi': 5, 'E': 6, 'x': 7, 'abs': 8, 'acos': 9, 'acosh': 10, 'acot': 11, 'acoth': 12, 'acsc': 13, 'acsch': 14, 'add': 15, 'asec': 16, 'asech': 17, 'asin': 18, 'asinh': 19, 'atan': 20, 'atanh': 21, 'cos': 22, 'cosh': 23, 'cot': 24, 'coth': 25, 'csc': 26, 'csch': 27, 'div': 28, 'exp': 29, 'inv': 30, 'ln': 31, 'mul': 32, 'pow': 33, 'pow2': 34, 'pow3': 35, 'pow4': 36, 'pow5': 37, 'rac': 38, 'sec': 39, 'sech': 40, 'sign': 41, 'sin': 42, 'sinh': 43, 'sqrt': 44, 'sub': 45, 'tan': 46, 'tanh': 47, 'INT+': 48, 'INT-': 49, 'INT': 50, '0': 51, '1': 52, '2': 53, '3': 54, '4': 55, '5': 56, '6': 57, '7': 58, '8': 59, '9': 60}
INFO - 05/17/22 12:09:34 - 0:00:31 - 11 possible leaves.
INFO - 05/17/22 12:09:34 - 0:00:31 - Checking expressions in [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 1.1, 2.1, 3.

['data.prefix.counts.train', 'data.prefix.counts.valid', 'data.prefix.counts.test']


INFO - 05/17/22 12:09:34 - 0:00:31 - Initialized random generator for worker 0, with seed [0, 0, 0] (base seed=0).


In [7]:
# Zero the trainer's n_equations attribute before stepping through a batch by hand.
trainer.n_equations = 0

#inside the function trainer.enc_dec_step()
# NOTE(review): this self-assignment is a no-op; presumably a leftover of `params = self.params`.
params = params
encoder, decoder = modules['encoder'], modules['decoder']
# Put both modules in training mode.
encoder.train()
decoder.train()

# batch x1 is d, x2 is f
# Each of x1 / x2 comes paired with its lengths tensor; the third returned item is ignored.
(x1, len1), (x2, len2), _ = trainer.get_batch()

In [8]:
# Check the dtype of the encoder input, before and after an explicit FloatTensor conversion.
#print(x1)
print(x1.dtype)
a = torch.FloatTensor(x1)
print(a.dtype)
# Does x1 contain any +/-inf? A single tensor reduction replaces the nested
# Python-level any(...) that visited every element one by one; .item() keeps
# the cell result a plain Python bool, as before.
torch.isinf(x1).any().item()

torch.float32
torch.float32


False

In [9]:
# target words to predict
# alen enumerates positions 0 .. max(len2)-1 on the same device as len2.
alen = torch.arange(len2.max(), dtype=torch.long, device=len2.device)
# Broadcast positions against lengths: True where position < length - 1.
pred_mask = alen[:, None] < len2[None] - 1  # do not predict anything given the last target word
# Targets are x2 shifted by one position, kept only where the mask (without its last row) is True.
y = x2[1:].masked_select(pred_mask[:-1])
# Sanity check: the number of targets equals the sum of (length - 1) over all sequences.
assert len(y) == (len2 - 1).sum().item()

In [10]:
# Inspect the target tensor y.
y

tensor([46, 15, 15, 15, 32, 32, 32, 32, 32, 15, 32, 32, 33, 15, 15, 32, 15, 33,
        32, 15, 15, 15, 32, 15, 33, 32, 31, 15, 15, 32, 32, 32, 32,  7, 48,  7,
         7, 28, 28,  7, 28,  7, 48,  7,  7, 48,  7,  7,  7,  7, 33, 32, 48, 33,
        48, 48,  7,  7, 32,  7,  7, 48, 15, 48, 33, 32, 52, 15, 15, 48, 48, 15,
        48, 15, 54, 33, 49, 55, 32, 15, 32, 49, 20, 48, 52,  7, 54, 52, 49, 46,
        33, 32, 32, 54,  7, 54,  7, 48, 15, 33, 49, 52, 52,  7, 52, 33, 53, 15,
        52, 32, 49, 49, 49, 52,  7, 54, 15, 48, 32, 15, 52, 33,  7, 28, 49, 53,
        33, 32, 48, 56, 32,  7, 52, 48, 48, 32, 48,  7, 33, 33, 56, 49, 52, 55,
        52, 56, 48,  7, 32, 53,  7, 32, 56,  7, 48, 48, 52, 33,  7,  7, 55, 32,
        49, 49,  7, 55, 55,  6, 55, 49,  7,  7,  0, 52, 29, 32, 29,  0, 53, 32,
        49, 32, 15, 49,  0, 48, 55, 52, 29,  7, 48, 15, 15, 33, 52, 52,  0, 32,
        32,  7, 32, 52, 48, 48, 32,  7, 48,  7, 33, 48, 52, 49, 48, 52, 55,  8,
        48,  7, 48, 53, 48, 48,  7,  7, 

In [49]:
#x1, len1, x2, len2, y = to_cuda(x1, len1, x2, len2, y)

# forward / loss
#myencoded = myencoder.fwd(x1, len1, causal=False)
#encoded = encoder.fwd(x1,len1,causal=False)
# Encoder forward pass with causal=False.
encoded = encoder('fwd', x=x1, lengths=len1, causal=False)
# Debug workaround: nan_to_num replaces NaN with 0 (and +/-inf with the largest finite values),
# so any NaN check run on `encoded` after this line can no longer detect the original NaNs.
encoded = torch.nan_to_num(encoded)
# Decoder forward pass with causal=True, attending over the transposed encoder output.
# NOTE(review): `len1/params.emb_dim` is a true division, so src_len is presumably a float
# tensor — confirm the decoder accepts that, or whether integer division was intended.
decoded = decoder('fwd', x=x2, lengths=len2, causal=True, src_enc=encoded.transpose(0, 1), src_len=len1/params.emb_dim)
# Same NaN/inf replacement on the decoder output.
decoded = torch.nan_to_num(decoded)

In [23]:
# Inspect the encoder output tensor.
encoded

tensor([[[ 0.6096,  1.3947, -1.6767,  ...,  0.0805, -0.5045,  1.3094],
         [ 1.3612,  0.3975, -1.1472,  ..., -1.5236,  0.9988, -0.4789],
         [ 0.0827,  1.3161, -2.0203,  ...,  0.8327, -0.9538,  0.7441],
         ...,
         [ 0.1864,  1.1539, -2.4397,  ...,  0.2834, -0.3101,  0.1670],
         [ 1.4445,  0.3796, -0.8511,  ..., -1.6125,  0.8517, -0.4844],
         [ 0.1766,  1.1295, -2.4132,  ...,  0.3841, -0.4750,  0.3182]],

        [[ 0.0578,  0.6110, -2.2818,  ...,  1.1585, -0.6495,  0.6791],
         [ 1.3612,  0.3967, -1.1474,  ..., -1.5234,  0.9994, -0.4785],
         [ 0.0827,  1.3164, -2.0204,  ...,  0.8324, -0.9535,  0.7439],
         ...,
         [ 0.1874,  1.1575, -2.4385,  ...,  0.2802, -0.3105,  0.1653],
         [ 1.4551,  0.3166, -0.8524,  ..., -1.5973,  0.8886, -0.4535],
         [ 0.1770,  1.1307, -2.4129,  ...,  0.3830, -0.4751,  0.3175]],

        [[ 0.4413,  1.4285, -1.9358,  ...,  0.2120, -0.5928,  1.1581],
         [ 1.3612,  0.3959, -1.1475,  ..., -1

In [50]:
# Report the encoder output shape, then check the input, the encoder output and
# the decoder output for NaN / inf values.
# Tensor-level reductions (.any() / .sum()) replace Python-level any()/sum()
# over flattened tensors, which visited every element one at a time.
print(encoded.size())
print("x1 contain nan? ", torch.isnan(x1).any().item())
print("x1 contain inf? ", torch.isinf(x1).any().item())
print("encode contain nan? ", torch.isnan(encoded).any().item())
print("how many encode contain nan? ", torch.isnan(encoded).sum())
print("decode contain nan? ", torch.isnan(decoded).any().item())
# Label fixed: this line counts NaNs in `decoded`, not in `encoded`.
print("how many decode contain nan? ", torch.isnan(decoded).sum())

torch.Size([64, 32, 8])
x1 contain nan?  False
x1 contain inf?  False
encode contain nan?  False
how many encode contain nan?  tensor(0)
decode contain nan?  False
how many encode contain nan?  tensor(0)


In [54]:
# Find which indices along dim 1 of `encoded` contain NaN, then inspect a few inputs.
# The number of columns is read from the tensor instead of being hard-coded to 32,
# and the NaN test is a single tensor reduction rather than a Python-level any().
for i in range(encoded.size(1)):
    if torch.isnan(encoded[:, i, :]).any():
        print(i)
#print(encoded[:,12,:])
print(x1.size())
# First ten entries of a few x1 columns (24, 12, 1, 8) for side-by-side comparison.
print(x1[:10,24])
print(x1[:10,12])
print(x1[:10,1])
print(x1[:10,8])
# Encoder output for column 12, first ten positions.
print(encoded[:10,12,:])

torch.Size([512, 32])
tensor([-1.0000e+01, -1.0000e-15, -9.9216e+00, -1.1254e-15, -9.8431e+00,
        -1.2676e-15, -9.7647e+00, -1.4293e-15, -9.6863e+00, -1.6131e-15])
tensor([-1.0000e+01, -1.0000e-15, -9.9216e+00, -1.1254e-15, -9.8431e+00,
        -1.2676e-15, -9.7647e+00, -1.4293e-15, -9.6863e+00, -1.6131e-15])
tensor([-10.0000, -55.0000,  -9.9216, -54.5294,  -9.8431, -54.0588,  -9.7647,
        -53.5882,  -9.6863, -53.1176])
tensor([-10.0000,   0.0368,  -9.9216,   0.0370,  -9.8431,   0.0373,  -9.7647,
          0.0376,  -9.6863,   0.0379])
tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]], grad_fn=<SliceBackward0>)


In [51]:
def predict(self, tensor, pred_mask, y, get_scores):
    """
    Project the selected hidden states onto the vocabulary and compute the loss.

    Arguments:
        tensor: hidden states; the selected entries are viewed as rows of
            size `self.dim`.
        pred_mask: mask (described upstream as a ByteTensor of shape
            (slen, bs)) set to 1 wherever a word has to be predicted.
        y: LongTensor of target words, of shape (pred_mask.sum(),); it must
            not contain `self.pad_index`.
        get_scores: boolean kept in the signature; this function does not
            read it and always returns the scores.

    Returns:
        A `(scores, loss)` pair: the projected scores viewed as
        (-1, self.n_words) and the mean cross-entropy of `scores` against `y`.
    """
    # Broadcast the mask over the feature dimension and keep the masked positions only.
    feature_mask = pred_mask.unsqueeze(-1).expand_as(tensor)
    selected = tensor[feature_mask].view(-1, self.dim)

    # The padding index must never be used as a prediction target.
    pad_target_count = (y == self.pad_index).sum().item()
    assert pad_target_count == 0

    word_scores = self.proj(selected).view(-1, self.n_words)
    xe_loss = F.cross_entropy(word_scores, y, reduction='mean')
    return word_scores, xe_loss

In [52]:
# Stand-alone replay of the predict() steps on the decoder output.
import torch.nn as nn
import torch.nn.functional as F
tensor = decoded
dim = params.emb_dim
# NOTE: `proj` is a newly constructed nn.Linear with its own initial weights, not the
# decoder's projection layer, so the printed loss is not the model's loss.
proj = nn.Linear(dim, params.n_words, bias=True)

# Keep only the positions selected by pred_mask, one row of size `dim` each.
x = tensor[pred_mask.unsqueeze(-1).expand_as(tensor)].view(-1, dim)
# The padding index must not appear among the targets.
assert (y == params.pad_index).sum().item() == 0
scores = proj(x).view(-1, params.n_words)
loss = F.cross_entropy(scores, y, reduction='mean')
print("scores contain nan? ",any(torch.isnan(scores.flatten())))
print(loss)

scores contain nan?  False
tensor(4.3498, grad_fn=<NllLossBackward0>)


In [15]:
#encoded is all nan
# Compute the loss through the decoder's own 'predict' entry point; the returned scores are discarded.
_, loss = decoder('predict', tensor=decoded, pred_mask=pred_mask, y=y, get_scores=False)
#stats['loss'].append(loss.item())
# Display the loss value.
loss

tensor(nan, grad_fn=<NllLossBackward0>)

In [17]:
# Now step inside the model itself.
import src.model.transformer
from src.model.transformer import TransformerModel
#def __init__(self, params, id2word, is_encoder, with_output):
#modules['encoder'] = TransformerModel(params, env.id2word, is_encoder=True, with_output=False)
# Build a separate encoder instance with the same constructor arguments as shown above;
# being newly constructed, it has its own parameters rather than those of `encoder`.
myencoder= TransformerModel(params,env.id2word,is_encoder=True,with_output=False)
#def fwd(self, x, lengths, causal, src_enc=None, src_len=None, positions=None, cache=None, previous_state=None):
# Call fwd directly on the same batch and print the result.
myencoded = myencoder.fwd(x1, len1, causal=False)
print(myencoded)

tensor([[[ 3.9202e-01,  1.1928e+00, -2.0482e+00,  ..., -2.4918e-01,
           8.5849e-02,  5.4456e-01],
         [ 6.4695e-01, -8.9447e-01, -3.3125e-01,  ..., -3.3541e-01,
           8.0515e-01, -1.9117e+00],
         [ 4.2913e-02,  7.9914e-01, -1.8029e+00,  ...,  1.0210e-01,
          -1.8410e-02,  2.0673e-01],
         ...,
         [ 2.7710e-03,  9.7096e-01, -2.0390e+00,  ..., -3.1017e-01,
           6.7325e-01, -2.9816e-02],
         [ 6.3322e-01, -7.3293e-01, -3.7819e-01,  ..., -3.3661e-01,
           6.8649e-01, -1.9870e+00],
         [ 7.1713e-02,  9.1111e-01, -2.0203e+00,  ..., -2.4973e-01,
           5.2250e-01,  8.4559e-02]],

        [[ 1.0520e-01,  5.3736e-01, -1.7004e+00,  ...,  2.0849e-01,
          -2.0089e-01,  3.4734e-01],
         [ 6.4704e-01, -8.9579e-01, -3.3084e-01,  ..., -3.3502e-01,
           8.0561e-01, -1.9112e+00],
         [ 4.2711e-02,  7.9933e-01, -1.8029e+00,  ...,  1.0206e-01,
          -1.8280e-02,  2.0657e-01],
         ...,
         [ 3.3663e-03,  9

In [18]:
# Print the module structure of the encoder.
print(encoder)

TransformerModel(
  (position_embeddings): Embedding(4096, 8)
  (embeddings): Embedding(61, 8, padding_idx=1)
  (layer_norm_emb): LayerNorm((8,), eps=1e-12, elementwise_affine=True)
  (attentions): ModuleList(
    (0): MultiHeadAttention(
      (q_lin): Linear(in_features=8, out_features=8, bias=True)
      (k_lin): Linear(in_features=8, out_features=8, bias=True)
      (v_lin): Linear(in_features=8, out_features=8, bias=True)
      (out_lin): Linear(in_features=8, out_features=8, bias=True)
    )
    (1): MultiHeadAttention(
      (q_lin): Linear(in_features=8, out_features=8, bias=True)
      (k_lin): Linear(in_features=8, out_features=8, bias=True)
      (v_lin): Linear(in_features=8, out_features=8, bias=True)
      (out_lin): Linear(in_features=8, out_features=8, bias=True)
    )
    (2): MultiHeadAttention(
      (q_lin): Linear(in_features=8, out_features=8, bias=True)
      (k_lin): Linear(in_features=8, out_features=8, bias=True)
      (v_lin): Linear(in_features=8, out_featur

In [19]:
# List the (name, parameter) pairs of `encoder` (the one taken from `modules`).
list(encoder.named_parameters())

[('position_embeddings.weight',
  Parameter containing:
  tensor([[ 0.1324,  0.1120, -0.0131,  ...,  0.4450,  0.0107,  0.1481],
          [-0.1768,  0.0501, -0.7982,  ...,  0.0717,  1.2157,  0.2931],
          [ 0.1268, -0.0051, -0.3031,  ..., -0.0332, -0.0227, -0.1336],
          ...,
          [ 0.1672,  0.0328, -0.1899,  ..., -0.3327, -0.0050,  0.5133],
          [-0.4867, -0.1246,  0.1752,  ...,  0.0074,  0.4142,  0.1713],
          [-0.2636,  0.1546, -0.5727,  ..., -0.0280, -0.1110, -0.2587]],
         requires_grad=True)),
 ('embeddings.weight',
  Parameter containing:
  tensor([[-1.4631e-01,  1.7389e-01,  2.6802e-01,  2.0866e-01,  2.4720e-01,
           -3.0802e-01,  1.4755e-01, -2.4255e-01],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
            0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 4.8880e-02,  6.7004e-01,  8.2589e-01, -2.7859e-01, -8.9150e-02,
           -6.4933e-01, -1.6716e-01,  7.4568e-02],
          [-4.9690e-01, -4.6571e-04, 

In [20]:
# List the (name, parameter) pairs of the separately constructed `myencoder` for comparison.
list(myencoder.named_parameters())

[('position_embeddings.weight',
  Parameter containing:
  tensor([[ 0.4593, -0.3789,  0.2590,  ..., -0.4748, -0.0722, -0.3089],
          [-0.0806,  0.0634, -0.1455,  ...,  0.0085,  0.0771,  0.3567],
          [-0.2189, -0.4403,  0.1464,  ...,  0.6006,  0.0298, -0.6730],
          ...,
          [-0.2781, -0.5924,  0.0908,  ..., -0.3901, -0.5659,  0.4490],
          [ 0.2950, -0.1289,  0.1580,  ..., -0.0771, -0.2744, -0.1959],
          [ 0.4075,  0.4249,  0.4409,  ...,  0.3851, -0.9871, -0.5471]],
         requires_grad=True)),
 ('embeddings.weight',
  Parameter containing:
  tensor([[-1.4591e-01, -8.4725e-01,  4.2503e-01, -1.7194e-01, -6.2619e-01,
            4.2097e-01,  3.5859e-01,  3.8833e-02],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
            0.0000e+00,  0.0000e+00,  0.0000e+00],
          [-7.5563e-02,  2.2542e-01,  7.5695e-01,  2.3786e-01, -1.8234e-01,
           -8.0094e-02,  3.8779e-01, -2.7975e-01],
          [-7.5215e-01, -2.5650e-02, 

In [90]:
# Try to check the layer_norm_emb step in isolation.
# A stand-alone LayerNorm with eps=1e-12 (the eps printed for the encoder's layer_norm_emb).
layer_norm_emb = nn.LayerNorm(dim, eps=1e-12)
x = x1.transpose(0, 1)
# Regroup the values into rows of `dim` features; the leading size 32 is hard-coded.
tensor=x.view(32,-1,dim)
afternorm = layer_norm_emb(tensor)
print(afternorm.size())
# Compare normalised and raw values for rows 30-39 at index 12 of the first dimension.
# NOTE(review): the NaN rows presumably come from the very large input magnitudes
# overflowing inside the normalisation — confirm.
print(afternorm[12,30:40,:])
print(tensor.size())
print(tensor[12,30:40,:])

torch.Size([32, 64, 8])
tensor([[ 0.4009,  0.3995,  0.4009,  0.3887,  0.4009,  0.2509,  0.4009, -2.6426],
        [    nan,     nan,     nan,     nan,     nan,     nan,     nan,     nan],
        [    nan,     nan,     nan,     nan,     nan,     nan,     nan,     nan],
        [-0.4009,  2.6426, -0.4009, -0.2509, -0.4009, -0.3887, -0.4009, -0.3995],
        [-0.4762,  2.6004, -0.4756,  0.1001, -0.4751, -0.3514, -0.4745, -0.4478],
        [ 0.3894,  1.1575,  0.5542, -0.7794,  0.7191, -1.3658,  0.8839, -1.5589],
        [ 0.8290, -0.9788,  0.9404, -0.9962,  1.0518, -1.0032,  1.1631, -1.0061],
        [ 0.8614, -0.9969,  0.9521, -0.9974,  1.0428, -0.9976,  1.1335, -0.9977],
        [ 0.8830, -0.9981,  0.9598, -0.9982,  1.0365, -0.9982,  1.1133, -0.9982],
        [ 0.8988, -0.9986,  0.9653, -0.9986,  1.0319, -0.9986,  1.0985, -0.9986]],
       grad_fn=<SliceBackward0>)
torch.Size([32, 64, 8])
tensor([[-5.8824e-01, -2.8624e+03, -5.0980e-01, -2.4488e+04, -4.3137e-01,
         -3.0006e+05, -3

In [107]:
# Take column 12 of x1 from row 200 onward for a closer look.
test = x1[200:,12]
print(test)

tensor([-2.1569e+00, -9.8325e-06, -2.0784e+00, -1.7138e-05, -2.0000e+00,
        -3.0518e-05, -1.9216e+00, -5.5611e-05, -1.8431e+00, -1.0390e-04,
        -1.7647e+00, -1.9949e-04, -1.6863e+00, -3.9453e-04, -1.6078e+00,
        -8.0602e-04, -1.5294e+00, -1.7066e-03, -1.4510e+00, -3.7590e-03,
        -1.3725e+00, -8.6513e-03, -1.2941e+00, -2.0912e-02, -1.2157e+00,
        -5.3417e-02, -1.1373e+00, -1.4526e-01, -1.0588e+00, -4.2427e-01,
        -9.8039e-01, -1.3459e+00, -9.0196e-01, -4.7009e+00, -8.2353e-01,
        -1.8400e+01, -7.4510e-01, -8.2566e+01, -6.6667e-01, -4.3789e+02,
        -5.8824e-01, -2.8624e+03, -5.0980e-01, -2.4488e+04, -4.3137e-01,
        -3.0006e+05, -3.5294e-01, -6.0879e+06, -2.7451e-01, -2.6402e+08,
        -1.9608e-01, -4.1073e+10, -1.1765e-01, -8.7354e+13, -3.9216e-02,
        -1.2534e+21,  3.9216e-02,  1.2534e+21,  1.1765e-01,  8.7354e+13,
         1.9608e-01,  4.1073e+10,  2.7451e-01,  2.6402e+08,  3.5294e-01,
         6.0879e+06,  4.3137e-01,  3.0006e+05,  5.0

In [99]:
def big_value_filter(data_y, max_abs=1e10, min_abs=1e-10):
    """
    Flag the values of a generated dataset that should be treated as invalid.

    A value is flagged when it is NaN, when its magnitude is at least
    `max_abs`, or when its magnitude is at most `min_abs` (this includes 0).
    Infinite values are flagged too, since they exceed `max_abs`.

    The check is vectorised, so `data_y` may have any shape (the previous
    element-by-element loop only worked for flat, one-dimensional input).

    Args:
        data_y: scalar, (nested) sequence or array of numbers; converted with
            `np.asarray(..., dtype=float)`.
        max_abs: magnitudes greater than or equal to this are invalid.
        min_abs: magnitudes less than or equal to this are invalid.

    Returns:
        Boolean numpy array with the same shape as `data_y`: False marks a
        valid value, True an invalid one.
    """
    values = np.asarray(data_y, dtype=float)
    # Comparisons against NaN evaluate to False; NaN is caught by isnan below.
    # errstate silences the "invalid value" warning some numpy versions emit.
    with np.errstate(invalid='ignore'):
        magnitude = np.abs(values)
        out_of_range = (magnitude >= max_abs) | (magnitude <= min_abs)
    return np.asarray(np.isnan(values) | out_of_range)

In [112]:
# Sanity check: 1e15 is above the 1e10 magnitude limit, so it must be flagged as invalid.
big_value_filter([1000000000000000])

array([ True])

In [63]:
# True if at least one element of `test` is greater than 1e10.
any(test>1e10)

True

In [64]:
# Display the float literal 1e10 in expanded form.
1e10

10000000000.0

In [65]:
# Print the slice currently held in `test`.
print(test)

tensor([-1.0000e+01, -1.0000e-15, -9.9216e+00, -1.1254e-15, -9.8431e+00,
        -1.2676e-15, -9.7647e+00, -1.4293e-15, -9.6863e+00, -1.6131e-15,
        -9.6078e+00, -1.8223e-15, -9.5294e+00, -2.0607e-15, -9.4510e+00,
        -2.3326e-15, -9.3725e+00, -2.6432e-15, -9.2941e+00, -2.9983e-15,
        -9.2157e+00, -3.4047e-15, -9.1373e+00, -3.8705e-15, -9.0588e+00,
        -4.4048e-15, -8.9804e+00, -5.0185e-15, -8.9020e+00, -5.7242e-15,
        -8.8235e+00, -6.5368e-15, -8.7451e+00, -7.4736e-15, -8.6667e+00,
        -8.5550e-15, -8.5882e+00, -9.8049e-15, -8.5098e+00, -1.1251e-14,
        -8.4314e+00, -1.2928e-14, -8.3529e+00, -1.4873e-14, -8.2745e+00,
        -1.7134e-14, -8.1961e+00, -1.9766e-14, -8.1176e+00, -2.2832e-14,
        -8.0392e+00, -2.6412e-14, -7.9608e+00, -3.0596e-14, -7.8824e+00,
        -3.5495e-14, -7.8039e+00, -4.1239e-14, -7.7255e+00, -4.7985e-14,
        -7.6471e+00, -5.5922e-14, -7.5686e+00, -6.5274e-14, -7.4902e+00,
        -7.6314e-14, -7.4118e+00, -8.9367e-14, -7.3

In [69]:
# Note: 10e-5 means 10 * 10**-5, i.e. 1e-4 (0.0001), not 1e-5.
10e-5

0.0001