Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

The problems about train.py #4

Open
Yxs-160 opened this issue Mar 28, 2022 · 0 comments
Open

The problems about train.py #4

Yxs-160 opened this issue Mar 28, 2022 · 0 comments

Comments

@Yxs-160
Copy link

Yxs-160 commented Mar 28, 2022

https://github.com/FomalhautB/3D-RETR/blob/a359ec453a930cf507aab442fac57b7bae3029ce/train.py#L76
When I ran the original code, I ran into a problem:

python train.py --model_path SHAPENET_VOXEL --image_path SHAPENET_IMAGE --annot_path data/ShapeNet.json --transformer_config config/3d-retr-s.yaml --gpus 2
{ 'accelerator': None,
'accumulate_grad_batches': 1,
'amp_backend': 'native',
'amp_level': 'O2',
'annot_path': 'data/ShapeNet.json',
'auto_lr_find': False,
'auto_scale_batch_size': False,
'auto_select_gpus': False,
'background': (0, 0, 0),
'benchmark': False,
'check_val_every_n_epoch': 1,
'checkpoint_callback': True,
'continue_from': None,
'data_aug': False,
'default_root_dir': None,
'deterministic': False,
'distributed_backend': None,
'experiment_name': '3D-RETR',
'fast_dev_run': False,
'flush_logs_every_n_steps': 100,
'gpus': 2,
'gradient_clip_algorithm': 'norm',
'gradient_clip_val': 0.0,
'image_path': 'SHAPENET_IMAGE',
'limit_predict_batches': 1.0,
'limit_test_batches': 1.0,
'limit_train_batches': 1.0,
'limit_val_batches': 1.0,
'log_every_n_steps': 50,
'log_gpu_memory': None,
'logger': True,
'loss_type': 'dice',
'lr': 0.0001,
'max_epochs': None,
'max_steps': None,
'max_time': None,
'min_epochs': None,
'min_steps': None,
'model_path': 'SHAPENET_VOXEL',
'move_metrics_to_cpu': False,
'multiple_trainloader_mode': 'max_size_cycle',
'num_nodes': 1,
'num_processes': 1,
'num_sanity_val_steps': 2,
'num_workers': 8,
'overfit_batches': 0.0,
'plugins': None,
'precision': 32,
'prepare_data_per_node': True,
'process_position': 0,
'profiler': None,
'progress_bar_refresh_rate': None,
'reload_dataloaders_every_epoch': False,
'replace_sampler_ddp': True,
'resume_from_checkpoint': None,
'sample_batch_num': 0,
'sched_factor': 1,
'seed': 0,
'stochastic_weight_avg': False,
'sync_batchnorm': False,
'terminate_on_nan': False,
'threshold': 0.5,
'tpu_cores': None,
'track_grad_norm': -1,
'train_batch_size': 8,
'transformer_config': 'config/3d-retr-s.yaml',
'truncated_bptt_steps': None,
'val_batch_size': 8,
'val_check_interval': 1.0,
'view_num': 1,
'weights_save_path': None,
'weights_summary': 'top'}
Global seed set to 0
{ 'decoder_depth': 6,
'decoder_dim': 192,
'decoder_dropout': 0.4,
'decoder_heads': 3,
'decoder_model': 'cnn',
'encoder_dropout': 0.4,
'encoder_model': 'vit_deit_tiny_distilled_patch16_224',
'num_cnn_layers': 3,
'num_resnet_blocks': 2,
'patch_num': 4,
'voxel_size': 32}
Ignored parameter "head.weight" on loading
Ignored parameter "head.bias" on loading
Ignored parameter "head_dist.weight" on loading
Ignored parameter "head_dist.bias" on loading
/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/utilities/distributed.py:69: UserWarning: You requested multiple GPUs but did not specify a backend, e.g. Trainer(accelerator="dp"|"ddp"|"ddp2"). Setting accelerator="ddp_spawn" for you.
warnings.warn(*args, **kwargs)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Traceback (most recent call last):
File "train.py", line 155, in <module>
trainer.fit(model, train_loader, val_loader)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 458, in fit
self._run(model)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 756, in _run
self.dispatch()
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 797, in dispatch
self.accelerator.start_training(self)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py", line 96, in start_training
self.training_type_plugin.start_training(trainer)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 122, in start_training
mp.spawn(self.new_process, **self.mp_spawn_kwargs)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 230, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 179, in start_processes
process.start()
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/multiprocessing/process.py", line 105, in start
self._popen = self._Popen(self)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/multiprocessing/context.py", line 284, in _Popen
return Popen(process_obj)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/multiprocessing/popen_spawn_posix.py", line 32, in __init__
super().__init__(process_obj)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/multiprocessing/popen_fork.py", line 19, in __init__
self._launch(process_obj)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/multiprocessing/popen_spawn_posix.py", line 47, in _launch
reduction.dump(process_obj, fp)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/multiprocessing/reduction.py", line 60, in dump
ForkingPickler(file, protocol).dump(obj)
_pickle.PicklingError: Can't pickle <function <lambda> at 0x7f5b76c48510>: attribute lookup <lambda> on __main__ failed

I tried to solve it by following https://stackoverflow.com/a/25353243/1791279 and changing this line of code to pickle.dumps(lambda x: x[0]), but I still cannot run the code successfully; another error occurred:

python train.py --model_path SHAPENET_VOXEL --image_path SHAPENET_IMAGE --annot_path data/ShapeNet.json --transformer_config config/3d-retr-s.yaml --gpus 2
{ 'accelerator': None,
'accumulate_grad_batches': 1,
'amp_backend': 'native',
'amp_level': 'O2',
'annot_path': 'data/ShapeNet.json',
'auto_lr_find': False,
'auto_scale_batch_size': False,
'auto_select_gpus': False,
'background': (0, 0, 0),
'benchmark': False,
'check_val_every_n_epoch': 1,
'checkpoint_callback': True,
'continue_from': None,
'data_aug': False,
'default_root_dir': None,
'deterministic': False,
'distributed_backend': None,
'experiment_name': '3D-RETR',
'fast_dev_run': False,
'flush_logs_every_n_steps': 100,
'gpus': 2,
'gradient_clip_algorithm': 'norm',
'gradient_clip_val': 0.0,
'image_path': 'SHAPENET_IMAGE',
'limit_predict_batches': 1.0,
'limit_test_batches': 1.0,
'limit_train_batches': 1.0,
'limit_val_batches': 1.0,
'log_every_n_steps': 50,
'log_gpu_memory': None,
'logger': True,
'loss_type': 'dice',
'lr': 0.0001,
'max_epochs': None,
'max_steps': None,
'max_time': None,
'min_epochs': None,
'min_steps': None,
'model_path': 'SHAPENET_VOXEL',
'move_metrics_to_cpu': False,
'multiple_trainloader_mode': 'max_size_cycle',
'num_nodes': 1,
'num_processes': 1,
'num_sanity_val_steps': 2,
'num_workers': 8,
'overfit_batches': 0.0,
'plugins': None,
'precision': 32,
'prepare_data_per_node': True,
'process_position': 0,
'profiler': None,
'progress_bar_refresh_rate': None,
'reload_dataloaders_every_epoch': False,
'replace_sampler_ddp': True,
'resume_from_checkpoint': None,
'sample_batch_num': 0,
'sched_factor': 1,
'seed': 0,
'stochastic_weight_avg': False,
'sync_batchnorm': False,
'terminate_on_nan': False,
'threshold': 0.5,
'tpu_cores': None,
'track_grad_norm': -1,
'train_batch_size': 8,
'transformer_config': 'config/3d-retr-s.yaml',
'truncated_bptt_steps': None,
'val_batch_size': 8,
'val_check_interval': 1.0,
'view_num': 1,
'weights_save_path': None,
'weights_summary': 'top'}
Global seed set to 0
{ 'decoder_depth': 6,
'decoder_dim': 192,
'decoder_dropout': 0.4,
'decoder_heads': 3,
'decoder_model': 'cnn',
'encoder_dropout': 0.4,
'encoder_model': 'vit_deit_tiny_distilled_patch16_224',
'num_cnn_layers': 3,
'num_resnet_blocks': 2,
'patch_num': 4,
'voxel_size': 32}
Ignored parameter "head.weight" on loading
Ignored parameter "head.bias" on loading
Ignored parameter "head_dist.weight" on loading
Ignored parameter "head_dist.bias" on loading
/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/utilities/distributed.py:69: UserWarning: You requested multiple GPUs but did not specify a backend, e.g. Trainer(accelerator="dp"|"ddp"|"ddp2"). Setting accelerator="ddp_spawn" for you.
warnings.warn(*args, **kwargs)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Global seed set to 0
initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/2
Global seed set to 0
initializing ddp: GLOBAL_RANK: 1, MEMBER: 2/2

| Name | Type | Params

0 | encoder | VisionTransformerEncoder | 5.5 M
1 | decoder | VoxelDecoderCNN | 4.9 M

10.4 M Trainable params
0 Non-trainable params
10.4 M Total params
41.761 Total estimated model params size (MB)
Validation sanity check: 0it [00:00, ?it/s]/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/utilities/distributed.py:69: UserWarning: num_workers>0, persistent_workers=False, and accelerator=ddp_spawn may result in data loading bottlenecks. Consider setting persistent_workers=True (this is a limitation of Python .spawn() and PyTorch)
warnings.warn(*args, **kwargs)
Validation sanity check: 0%| | 0/2 [00:00<?, ?it/s]Traceback (most recent call last):
File "train.py", line 155, in <module>
trainer.fit(model, train_loader, val_loader)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 458, in fit
self._run(model)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 756, in _run
self.dispatch()
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 797, in dispatch
self.accelerator.start_training(self)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py", line 96, in start_training
self.training_type_plugin.start_training(trainer)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 122, in start_training
mp.spawn(self.new_process, **self.mp_spawn_kwargs)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 230, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
while not context.join():
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 150, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
fn(i, *args)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 172, in new_process
results = trainer.run_stage()
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 807, in run_stage
return self.run_train()
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 842, in run_train
self.run_sanity_check(self.lightning_module)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 1107, in run_sanity_check
self.run_evaluation()
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 949, in run_evaluation
for batch_idx, batch in enumerate(dataloader):
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 517, in __next__
data = self._next_data()
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1199, in _next_data
return self._process_data(data)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1225, in _process_data
data.reraise()
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/_utils.py", line 429, in reraise
raise self.exc_type(msg)
TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 202, in _worker_loop
data = fetcher.fetch(index)
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/ubuntu/anaconda3/envs/3d-retr/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/ubuntu/YaoXinSheng/3D-RETR-main/src/data/datasets.py", line 25, in __getitem__
return self._dataset[self._indices[index].item()]
File "/home/ubuntu/YaoXinSheng/3D-RETR-main/src/data/datasets.py", line 109, in __getitem__
image = self._image_transforms(image)
File "/home/ubuntu/YaoXinSheng/3D-RETR-main/src/data/transforms.py", line 32, in __call__
rendering_images = t(rendering_images)
TypeError: 'bytes' object is not callable

I do not know how to solve this. Could you help me?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant