From f6c5aaf6ce45e36fa9f0f88ba6a2f2b0d76ab34b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B3=8A=E9=9C=86?= Date: Thu, 23 Nov 2023 19:03:25 +0800 Subject: [PATCH] [ModelZoo] Prevent core dump problem of Saver function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit when using distributed mode. Signed-off-by: 泊霆 --- modelzoo/bst/train.py | 3 ++- modelzoo/dbmtl/train.py | 3 ++- modelzoo/dcn/train.py | 3 ++- modelzoo/dcnv2/train.py | 3 ++- modelzoo/deepfm/train.py | 3 ++- modelzoo/dien/train.py | 6 +++--- modelzoo/din/train.py | 6 +++--- modelzoo/dlrm/train.py | 3 ++- modelzoo/dssm/train.py | 3 ++- modelzoo/esmm/train.py | 5 +++-- modelzoo/masknet/train.py | 3 ++- modelzoo/mlperf/train.py | 3 ++- modelzoo/mmoe/train.py | 3 ++- modelzoo/ple/train.py | 3 ++- modelzoo/simple_multitask/train.py | 5 +++-- modelzoo/wide_and_deep/train.py | 3 ++- 16 files changed, 36 insertions(+), 22 deletions(-) diff --git a/modelzoo/bst/train.py b/modelzoo/bst/train.py index 2fb5e4e90f5..eeeb136678b 100644 --- a/modelzoo/bst/train.py +++ b/modelzoo/bst/train.py @@ -612,9 +612,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dbmtl/train.py b/modelzoo/dbmtl/train.py index 24595073b95..c848cbc76b2 100644 --- a/modelzoo/dbmtl/train.py +++ b/modelzoo/dbmtl/train.py @@ -527,9 +527,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dcn/train.py b/modelzoo/dcn/train.py index b8e1dba5d63..44701e22d9f 100644 --- a/modelzoo/dcn/train.py +++ b/modelzoo/dcn/train.py @@ -594,9 +594,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dcnv2/train.py b/modelzoo/dcnv2/train.py index 7ac4c1a0358..5b572af0425 100644 --- a/modelzoo/dcnv2/train.py +++ b/modelzoo/dcnv2/train.py @@ -610,9 +610,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/deepfm/train.py b/modelzoo/deepfm/train.py index 896295b0ae6..166bedec0d0 100644 --- a/modelzoo/deepfm/train.py +++ b/modelzoo/deepfm/train.py @@ -472,9 +472,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dien/train.py b/modelzoo/dien/train.py index 6c583c3ac19..190695f6ce0 100644 --- a/modelzoo/dien/train.py +++ b/modelzoo/dien/train.py @@ -776,10 +776,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( - local_init_op=tf.group(tf.tables_initializer(), - tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/din/train.py b/modelzoo/din/train.py index 6273e0d15a4..058583ce6fd 100644 --- a/modelzoo/din/train.py +++ b/modelzoo/din/train.py @@ -594,10 +594,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( - local_init_op=tf.group(tf.tables_initializer(), - tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dlrm/train.py b/modelzoo/dlrm/train.py index 0789e9418b8..cc4c045c349 100644 --- a/modelzoo/dlrm/train.py +++ b/modelzoo/dlrm/train.py @@ -507,9 +507,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dssm/train.py b/modelzoo/dssm/train.py index a757851711c..db949aac5e8 100644 --- a/modelzoo/dssm/train.py +++ b/modelzoo/dssm/train.py @@ -478,9 +478,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/esmm/train.py b/modelzoo/esmm/train.py index 58219e19e3e..073b08814d4 100755 --- a/modelzoo/esmm/train.py +++ b/modelzoo/esmm/train.py @@ -534,9 +534,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( - local_init_op=tf.group(tf.local_variables_initializer(), train_init_op), - saver=tf.train.Saver(max_to_keep=keep_checkpoint_max)) + local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=train_steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/masknet/train.py b/modelzoo/masknet/train.py index 0790f200b21..bb96a467701 100644 --- a/modelzoo/masknet/train.py +++ b/modelzoo/masknet/train.py @@ -529,9 +529,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/mlperf/train.py b/modelzoo/mlperf/train.py index db7e077250b..ce34fe5e55c 100644 --- a/modelzoo/mlperf/train.py +++ b/modelzoo/mlperf/train.py @@ -522,9 +522,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/mmoe/train.py b/modelzoo/mmoe/train.py index 251e02c7a72..694eb45da80 100644 --- a/modelzoo/mmoe/train.py +++ b/modelzoo/mmoe/train.py @@ -523,9 +523,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/ple/train.py b/modelzoo/ple/train.py index 2ba98363bbf..b2d2f2057ec 100644 --- a/modelzoo/ple/train.py +++ b/modelzoo/ple/train.py @@ -592,9 +592,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/simple_multitask/train.py b/modelzoo/simple_multitask/train.py index ff90946c96d..4ef1874a521 100644 --- a/modelzoo/simple_multitask/train.py +++ b/modelzoo/simple_multitask/train.py @@ -427,9 +427,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( - local_init_op=tf.group(tf.local_variables_initializer(), train_init_op), - saver=tf.train.Saver(max_to_keep=keep_checkpoint_max)) + local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=train_steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/wide_and_deep/train.py b/modelzoo/wide_and_deep/train.py index b4f4dbc7a65..3024f58024e 100644 --- a/modelzoo/wide_and_deep/train.py +++ b/modelzoo/wide_and_deep/train.py @@ -543,9 +543,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook(