# Training cycle debugging

This notebook runs code copied from `train_seq2seq.py`, one cell at a time, so that bugs in the training cycle can be caught and inspected interactively.

In [1]:
import os
import sys

# Set ALFRED_ROOT for this kernel. The path is hard-coded to the author's machine;
# adjust it when running elsewhere.
os.environ['ALFRED_ROOT'] = '/root/data/home/hoyeung/alfred/'

# Make the ALFRED root and its `models` sub-directory importable.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entries to sys.path again and again.
for _alfred_path in (
    os.environ['ALFRED_ROOT'],
    os.path.join(os.environ['ALFRED_ROOT'], 'models'),
):
    if _alfred_path not in sys.path:
        sys.path.append(_alfred_path)

# from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
import torch
import pprint
import json
from data.preprocess import Dataset
from importlib import import_module, reload
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
from models.utils.helper_utils import optimizer_to

# Parser

In [2]:
# These are the default flags present in train_seq2seq.py
#
# No arguments are registered on the parser; it is only used to obtain an empty
# Namespace that is then filled in attribute by attribute. An explicit empty list
# is passed so argparse never falls back to the kernel's own sys.argv (the
# previous `parse_args('')` only worked because an empty string iterates to nothing).
# Lines tagged `#!` are boolean switches left at False here
# (presumably `store_true` flags in the script -- verify against train_seq2seq.py).

parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
args = parser.parse_args([])

# settings
args.seed = 123
args.data = '/root/data_alfred/json_feat_2.1.0'
args.splits = '/root/data_alfred/splits/apr25.json'
args.preprocess = False #!
args.pp_folder = 'pp'
args.monitor_train_every = 10
args.save_every_epoch = False #!
args.model = 'seq2seq_per_subgoal'
args.gpu = True
args.dout = 'exp/model:seq2seq_per_subgoal'
args.resume = False #!

# hyper parameters
args.batch = 8
args.epoch = 20
args.lr = 1e-4
args.decay_epoch = 10
args.dhid = 512
args.dframe = 2500
args.demb = 100
args.pframe = 300
args.mask_loss_wt = 1.
args.action_loss_wt = 1.
args.subgoal_aux_loss_wt = 0.
args.pm_aux_loss_wt = 0.

# dropouts
args.zero_goal = False #!
args.zero_instr = False #!
args.act_dropout = 0.
args.lang_dropout = 0.
args.input_dropout = 0.
args.vis_dropout = 0.3
args.hstate_dropout = 0.3
args.attn_dropout = 0.
args.actor_dropout = 0.
args.word_dropout = 0.

# other settings
args.train_teacher_forcing = False #!
args.train_student_forcing_prob = 0.1
args.temp_no_history = False #!

# debugging
args.fast_epoch = False #!
args.dataset_fraction = 0

In [3]:
# Debug-session overrides, applied on top of the default flags.
# `args` is a plain argparse Namespace, so updating its attribute dict is
# equivalent to assigning each attribute individually.
vars(args).update(
    # Turn this to True if running for the first time
    preprocess=False,
    # found under models/model/ directory
    model='seq2seq_per_subgoal',
    dout='/root/data_alfred/exp/model:seq2seq_per_subgoal_fast_epoch',
    train_teacher_forcing=True,
    gpu=False,
    # light setup for debugging
    # Turn fast_epoch to False if running for the first time to preprocess data properly
    fast_epoch=True,
    epoch=20,
)

In [4]:
# Seed torch's random number generator with the configured seed so runs are repeatable.
# manual_seed returns the Generator object, which is why the cell shows its repr.
torch.manual_seed(args.seed)

<torch._C.Generator at 0x7ff9880f15d0>

# Setup and load data splits

In [5]:
# make output dir
# exist_ok=True turns this into a no-op when the directory is already present, so the
# cell can be re-run safely and there is no gap between checking for the directory
# and creating it.
os.makedirs(args.dout, exist_ok=True)

print('args.out ', args.dout)

args.out  /root/data_alfred/exp/model:seq2seq_per_subgoal_fast_epoch


In [6]:
# Read the split definitions (JSON) from the path configured in args.splits.
with open(args.splits) as splits_file:
    splits = json.load(splits_file)

# Show how many entries each split contains.
split_sizes = {split_name: len(entries) for split_name, entries in splits.items()}
pprint.pprint(split_sizes)

{'tests_seen': 1533,
 'tests_unseen': 1529,
 'train': 21023,
 'train_sanity': 250,
 'train_sanity_v1': 250,
 'valid_seen': 820,
 'valid_seen_v1': 251,
 'valid_unseen': 821,
 'valid_unseen_v1': 255}


In [7]:
# Preprocessing only has to happen once; afterwards the saved vocab is simply reloaded.
if not args.preprocess:
    # Already preprocessed: the vocab file is read from the data folder.
    vocab_dir = args.data
else:
    print("\nPreprocessing dataset and saving to %s folders ... This will take a while. Do this once as required." % args.pp_folder)
    dataset = Dataset(args, None)
    dataset.preprocess_splits(splits)
    # Freshly preprocessed: the vocab file is read from the output folder.
    vocab_dir = args.dout

vocab_path = os.path.join(vocab_dir, "%s.vocab" % args.pp_folder)
vocab = torch.load(vocab_path)

print(vocab)

{'action_high': Vocab(93), 'word': Vocab(2360), 'action_low': Vocab(15)}


# Model and Training

In [8]:
# Build the model (or restore it from a checkpoint) for the architecture named in args.model.
args.gpu = False
args.resume = None # os.path.join(args.dout, 'best_seen.pth')

M = import_module('model.{}'.format(args.model))
if not args.resume:
    # Fresh start: new model, no optimizer state, begin at epoch 0.
    model, optimizer, start_epoch = M.Module(args, vocab), None, 0
else:
    print("Loading: " + args.resume)
    model, optimizer, start_epoch = M.Module.load(args.resume)
    print("Restarting at epoch {}/{}".format(start_epoch, args.epoch-1))
    # Nothing left to train if the checkpoint already covers every requested epoch.
    if start_epoch >= args.epoch:
        print('Checkpoint already finished {}/{} epochs.'.format(start_epoch, args.epoch))
        sys.exit(0)

if args.gpu:
    cuda = torch.device('cuda')
    model = model.to(cuda)
    # A restored optimizer is moved to the same device as the model.
    if optimizer is not None:
        optimizer_to(optimizer, cuda)

In [9]:
# examine model layers
# A bare expression on the last line of a cell displays the object's repr,
# which here lists the model's sub-modules.
model

Module(
  (emb_word): Embedding(2360, 100)
  (emb_action_low): Embedding(15, 100)
  (enc): ActionFrameAttnEncoder(
    (emb): Embedding(15, 100)
    (vis_dropout): Dropout(p=0.3, inplace=False)
    (act_dropout): Dropout(p=0.0, inplace=True)
    (vis_encoder): ResnetVisualEncoder(
      (conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
      (conv2): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1))
      (fc): Linear(in_features=3136, out_features=2500, bias=True)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (encoder): LSTM(2600, 512, batch_first=True, bidirectional=True)
    (enc_att): SelfAttn(
      (scorer): Linear(in_features=1024, out_features=1, bias=True)
    )
  )
  (dec): LanguageDecoder(
    (emb): Embedding(2360, 100)
    (cell): LSTMCell(1124, 1024)
    (attn): DotAttn()
    (input_dropout): Dropout(p=0.0, inpl

In [10]:
# main training loop -- debug here if breakpoints were inserted
# `optimizer` is None and `start_epoch` is 0 unless the model was resumed from a checkpoint.
model.run_train(splits, optimizer=optimizer, start_epoch=start_epoch)

epoch:   0%|          | 0/20 [00:00<?, ?it/s]
  empty_tensor = torch.ones(torch.tensor(v[0][0][0]).unsqueeze(0).shape, device=device, dtype=torch.float if ('frames' in k) else torch.long) * self.pad
  seqs.append(torch.tensor(v[subgoal_i][batch_i], device=device, dtype=torch.float if ('frames' in k) else torch.long))


Saving to: /root/data_alfred/exp/model:seq2seq_per_subgoal_fast_epoch
> /root/data/home/hoyeung/alfred/models/model/seq2seq_per_subgoal.py(242)featurize()
-> return feat, time_report
(Pdb) feat.keys()
dict_keys(['lang_instr', 'action_low', 'frames', 'action_low_seq_lengths'])
(Pdb) len(feat['action_low'][0])
8
(Pdb) feat['action_low'][0]
tensor([[3, 5, 4, 4, 4, 4, 5, 8, 8, 8, 8, 8, 0, 0, 0, 0],
        [3, 4, 9, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [3, 9, 4, 9, 4, 4, 4, 4, 5, 4, 4, 9, 4, 4, 4, 3],
        [3, 4, 9, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [3, 4, 4, 4, 4, 5, 4, 4, 4, 3, 0, 0, 0, 0, 0, 0],
        [3, 9, 4, 9, 4, 4, 4, 4, 5, 4, 4, 9, 4, 4, 4, 3],
        [3, 5, 4, 4, 4, 4, 5, 8, 8, 8, 8, 8, 0, 0, 0, 0],
        [3, 5, 4, 4, 4, 4, 5, 8, 8, 8, 8, 8, 0, 0, 0, 0]])
(Pdb) feat['action_low'][11]
tensor([[0.],
        [0.],
        [0.],
        [0.],
        [2.],
        [0.],
        [0.],
        [0.]])
(Pdb) feat['action_low'][10]
tensor([[ 0.],
        [

(Pdb) feat['action_low'][9]
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [8., 9., 4., 4., 4., 4., 4., 4., 9., 4., 4., 9., 9., 8.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
(Pdb) feat['frames'][4]
tensor([[[[[0.0000e+00, 2.4614e-01, 0.0000e+00,  ..., 0.0000e+00,
            0.0000e+00, 0.0000e+00],
           [0.0000e+00, 8.3667e-01, 3.0116e-01,  ..., 0.0000e+00,
            0.0000e+00, 0.0000e+00],
           [8.4543e-01, 7.3377e-01, 0.0000e+00,  ..., 0.0000e+00,
            0.0000e+00, 0.0000e+00],
           ...,
           [0.0000e+00, 2.3158e-01, 6.5495e-01,  ..., 0.0000e+00,
            0.0000e+00, 0.00

(Pdb) feat['frames'][9][4]
tensor([[[[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00, 6.6171e-01,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00, 3.7147e-01,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          ...,
          [0.0000e+00, 0.0000e+00, 6.4533e-02,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00, 5.6986e-01,  ..., 4.3932e-01,
           0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00, 4.9974e-01,  ..., 4.5816e-01,
           0.0000e+00, 0.0000e+00]],

         [[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           1.2515e-01, 4.3942e-03],
          [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           3.9024e-01, 1.9211e-01],
          [2.0333e-01, 2.5544e-01, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          ...,
          [1.8873e+00, 1.8902e+0

(Pdb) feat['frames'][9][4].shape
torch.Size([14, 512, 7, 7])
(Pdb) feat['frames'][9][4][0]
tensor([[[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 6.6171e-01,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 3.7147e-01,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         ...,
         [0.0000e+00, 0.0000e+00, 6.4533e-02,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 5.6986e-01,  ..., 4.3932e-01,
          0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 4.9974e-01,  ..., 4.5816e-01,
          0.0000e+00, 0.0000e+00]],

        [[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          1.2515e-01, 4.3942e-03],
         [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          3.9024e-01, 1.9211e-01],
         [2.0333e-01, 2.5544e-01, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
  

batch:   0%|          | 0/2 [04:48<?, ?it/s]
epoch:   0%|          | 0/20 [04:48<?, ?it/s]


BdbQuit: 

In [11]:
# NOTE(review): this looks like a fragment pasted out of a larger function -- `all_pad_seqs`,
# `batch_size`, `feat` and `k` are not assigned in any visible notebook cell, so it cannot
# run on a fresh kernel. TODO confirm where it came from (presumably the model's featurize code).
# list length=max_num_subgoals, each (B, t, *) with T = max(l)
# list length=max_num_subgoals, each (B, t, 512, 7, 7) for k='frames'
assert all_pad_seqs[-1].shape[0] == batch_size
feat[k] = all_pad_seqs

TypeError: eq() received an invalid combination of arguments - got (NoneType), but expected one of:
 * (Tensor other)
      didn't match because some of the arguments have invalid types: ([31;1mNoneType[0m)
 * (Number other)
      didn't match because some of the arguments have invalid types: ([31;1mNoneType[0m)


In [11]:
# Scratch check: the type of None is NoneType.
type(None)

NoneType

In [14]:
# Scratch check: a tensor is not an instance of NoneType.
isinstance(torch.tensor([1,2,3]), type(None))

False

In [15]:
# Scratch check: None is an instance of NoneType.
isinstance(None, type(None))

True

NameError: name 'np' is not defined