[Dy2Sta]Fix Segment Fault while training multi-card if params have no grad (PaddlePaddle#44485)

* [Dy2Sta]Fix Segment Fault while training multi-card if params have no grad

* fix unittest
Aurelius84 committed Jul 29, 2022
1 parent 0b6eef4 commit 9ced147
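The condition named in the commit title ("params have no grad") is easy to reproduce in plain dygraph: once a layer's output is marked stop_gradient, its parameters never receive a gradient. The sketch below is illustrative only (the class and variable names are made up for this note); the segment fault itself needs paddle.jit.to_static plus paddle.DataParallel on multiple cards, which is exactly what the new unit test in this commit exercises.

import paddle
import paddle.nn as nn

# Minimal, single-card sketch of a parameter that gets no gradient.
class TwoEmb(nn.Layer):
    def __init__(self):
        super().__init__()
        self.emb1 = nn.Embedding(100, 16)
        self.emb2 = nn.Embedding(100, 16)

    def forward(self, ids):
        feat1 = self.emb1(ids)
        feat1.stop_gradient = True  # cuts emb1 out of the backward graph
        return paddle.mean(feat1 + self.emb2(ids))

net = TwoEmb()
loss = net(paddle.randint(low=0, high=100, shape=[4, 10]))
loss.backward()
print(net.emb1.weight.grad)          # None: no gradient is ever produced
print(net.emb2.weight.grad is None)  # False: this parameter trains normally
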
Showing 2 changed files with 86 additions and 0 deletions.
21 changes: 21 additions & 0 deletions paddle/fluid/eager/to_static/run_program_op_func.h
@@ -21,6 +21,23 @@
#include "paddle/fluid/eager/to_static/run_program_op_node.h"
#include "paddle/fluid/eager/utils.h"

// Filter params without grads in the global block. In this case, we tag
// their AutogradMeta with stop_gradient = True to avoid a segment fault
// from the reducer while training on multiple cards.
static void clear_no_grad_edges(
    const std::vector<paddle::experimental::Tensor>& params,
    const paddle::framework::BlockDesc* block_desc,
    egr::GradNodeBase* grad_node,
    size_t slot_id) {
  for (size_t i = 0; i < params.size(); ++i) {
    auto p_grad_name = paddle::framework::GradVarName(params[i].name());
    if (!block_desc->HasVar(p_grad_name)) {
      VLOG(1) << "clear edge of " << p_grad_name;
      grad_node->MutableOutputMeta()[slot_id][i].GetMutableEdge().Clear();
    }
  }
}

inline void run_program_dygraph_function(
    const std::vector<paddle::experimental::Tensor>& x,
    const std::vector<paddle::experimental::Tensor>& params,

@@ -61,12 +78,16 @@ inline void run_program_dygraph_function(
  grad_node->SetAttrMap(attrs);
  // Set TensorWrappers
  grad_node->SetFwdX(x);

  grad_node->SetFwdParams(params);
  grad_node->SetStepScope(step_scope);

  // Set grad out rank the same as fwd input and set stop gradient to bwd
  grad_node->SetGradOutMeta(x, /*slot id*/ 0);
  grad_node->SetGradOutMeta(params, /*slot id*/ 1);
  // The output meta filled by SetGradOutMeta above is what
  // clear_no_grad_edges inspects, so the call order matters.
  auto* global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc*,
                                        attrs.at("global_block"));
  clear_no_grad_edges(params, global_block, grad_node.get(), /*slot id*/ 1);

  grad_node->SetGradInMeta(deref_out, 0);

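A note on the HasVar check in clear_no_grad_edges above: paddle::framework::GradVarName appends Paddle's gradient-variable suffix "@GRAD", so the helper is simply asking whether the program's global block ever defines a gradient variable for that parameter. A rough Python restatement of the check follows; it is illustrative only (the real lookup happens on the C++ BlockDesc, and param_has_grad_var is not a Paddle API).

GRAD_SUFFIX = "@GRAD"  # the suffix GradVarName appends in the C++ framework

def param_has_grad_var(block, param_name):
    # Illustrative restatement of the C++ check: does the block declare a
    # "<param_name>@GRAD" variable at all? If not, the edge gets cleared.
    return block.has_var(param_name + GRAD_SUFFIX)
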
65 changes: 65 additions & 0 deletions (new Python unit test)
@@ -0,0 +1,65 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn
import paddle.distributed as dist

import unittest


class Net(nn.Layer):

    def __init__(self):
        super(Net, self).__init__()
        self.emb1 = nn.Embedding(100, 16)
        self.emb2 = nn.Embedding(100, 16)

    def forward(self, ids):
        feat1 = self.emb1(ids)
        # stop gradient here, so emb1's parameters never receive a gradient
        feat1.stop_gradient = True

        feat2 = self.emb2(ids)

        out = feat1 + feat2
        out = paddle.mean(out)
        return out


def train():
    paddle.distributed.init_parallel_env()
    net = Net()
    net = paddle.jit.to_static(net)

    sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=net.parameters())
    dp_net = paddle.DataParallel(net)
    for i in range(4):
        x = paddle.randint(low=0, high=100, shape=[4, 10])
        loss = dp_net(x)
        loss.backward()
        sgd.step()
        sgd.clear_grad()
        print(loss)


class TestParamsNoGrad(unittest.TestCase):

    def test_two_card(self):
        if paddle.is_compiled_with_cuda() and len(
                paddle.static.cuda_places()) > 1:
            dist.spawn(train, nprocs=2, gpus='0,1')


if __name__ == '__main__':
    unittest.main()
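For contrast with train() above: the same to_static network runs fine on a single card, which is why the regression test has to spawn two processes before the bug shows up. An illustrative single-card variant (not part of the commit) would look like this:

def train_single_card():
    # Illustrative only: no init_parallel_env() and no DataParallel, so the
    # reducer that triggered the segment fault is never involved.
    net = paddle.jit.to_static(Net())
    sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=net.parameters())
    for i in range(4):
        x = paddle.randint(low=0, high=100, shape=[4, 10])
        loss = net(x)
        loss.backward()
        sgd.step()
        sgd.clear_grad()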
