Make async implementation of gradient tracking and add timeline PNG
bichengying committed May 11, 2020
1 parent 545e1b1 commit 6807e99
Showing 3 changed files with 15 additions and 13 deletions.
Binary file added docs/_static/bf_timeline_example2a.png
10 changes: 6 additions & 4 deletions examples/pytorch_least_square.py
@@ -101,7 +101,7 @@ def finalize_plot():
 mse = []
 for i in range(maxite):
     grad = A.T.mm(A.mm(x)-b)  # local gradient
-    psi = x - alpha * grad
+    psi = x - alpha_ed * grad
     phi = psi + x - psi_prev
     x = bf.neighbor_allreduce(phi, name='local variable')
     psi_prev = psi
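(The rename from alpha to alpha_ed presumably distinguishes the exact-diffusion step size from the gradient-tracking step size alpha_gt used below.) For reference, the loop above is the exact diffusion update; written in the same notation as the comments in these examples, with grad_i(x) denoting the local gradient A.T.mm(A.mm(x)-b), one iteration reads roughly:

    psi^{k+1} = x^k - alpha_ed * grad_i(x^k)        # local gradient (adapt) step
    phi^{k+1} = psi^{k+1} + x^k - psi^k             # correction step
    x^{k+1}   = neighbor_allreduce(phi^{k+1})       # combine with neighbors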
@@ -150,9 +150,11 @@ def finalize_plot():
 alpha_gt = 5e-3  # step-size for GT (should be smaller than exact diffusion)
 mse_gt = []
 for i in range(maxite):
-    x = bf.neighbor_allreduce(x, name='local variable x') - alpha_gt * y
-    grad = A.T.mm(A.mm(x)-b)  # local gradient at x^{k+1}
-    y = bf.neighbor_allreduce(y, name='local variable y') + grad - grad_prev
+    x_handle = bf.neighbor_allreduce_async(x, name='Grad.Tracking.x')
+    y_handle = bf.neighbor_allreduce_async(y, name='Grad.Tracking.y')
+    x = bf.synchronize(x_handle) - alpha_gt * y
+    grad = A.T.mm(A.mm(x)-b)  # local gradient at x^{k+1}
+    y = bf.synchronize(y_handle) + grad - grad_prev  # use async to overlap computation and communication
     grad_prev = grad
     if bf.rank() == 0:
         mse_gt.append(torch.norm(x - x_opt, p=2))
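To see the new pattern end to end, here is a minimal, self-contained sketch of the async gradient-tracking loop on a hypothetical local least-squares problem. The data, sizes, step size, and iteration count below are made up for illustration; the bf.neighbor_allreduce_async / bf.synchronize calls are the ones used in this diff, and the script is assumed to be launched across several Bluefog processes.

import torch
import bluefog.torch as bf

bf.init()

# Hypothetical local data: each rank holds its own least-squares problem (A, b).
torch.manual_seed(bf.rank())
n, d = 100, 10
A = torch.randn(n, d)
b = torch.randn(n, 1)

x = torch.zeros(d, 1)
grad_prev = A.T.mm(A.mm(x) - b)   # local gradient at x^0
y = grad_prev.clone()             # gradient tracker starts from the local gradient
alpha_gt = 5e-3
maxite = 200

for i in range(maxite):
    # Launch both neighbor averages up front; averaging y can then overlap
    # with the x-update and the local gradient computation below.
    x_handle = bf.neighbor_allreduce_async(x, name='Grad.Tracking.x')
    y_handle = bf.neighbor_allreduce_async(y, name='Grad.Tracking.y')
    x = bf.synchronize(x_handle) - alpha_gt * y
    grad = A.T.mm(A.mm(x) - b)    # local gradient at x^{k+1}
    y = bf.synchronize(y_handle) + grad - grad_prev
    grad_prev = grad

The key point of the change is ordering: both async calls are issued before either result is needed, and bf.synchronize(y_handle) is only called after the new local gradient is available, so the communication of y overlaps with the matrix products instead of blocking on them.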
18 changes: 9 additions & 9 deletions examples/pytorch_logistic_regression.py
@@ -209,17 +209,17 @@ def logistic_loss_step(x_, tensor_name):
 mse_gt = []
 for i in range(maxite):
-    # w^{k+1} = neighbor_allreduce(w^k) - alpha*q^k
-    w.data = bf.neighbor_allreduce(
-        w.data, name='local variable w') - alpha_gt * q

-    # calculate local gradient
-    logistic_loss_step(w, tensor_name='neighbor.allreduce.local variable w')
-    grad = w.grad.data.clone()  # local gradient at w^{k+1}
-    w.grad.data.zero_()

-    # q^{k+1} = neighbor_allreduce(q^k) + grad(w^{k+1}) - grad(w^k)
-    q = bf.neighbor_allreduce(q, name='local variable q') + grad - grad_prev
+    # Notice the communication of neighbor_allreduce can overlap with gradient computation.
+    w_handle = bf.neighbor_allreduce_async(w.data, name='Grad.Tracking.w')
+    q_handle = bf.neighbor_allreduce_async(q, name='Grad.Tracking.q')
+    w.data = bf.synchronize(w_handle) - alpha_gt * q
+    # calculate local gradient
+    logistic_loss_step(w, tensor_name='neighbor.allreduce.Grad.Tracking.w')
+    grad = w.grad.data.clone()
+    q = bf.synchronize(q_handle) + grad - grad_prev
     grad_prev = grad
+    w.grad.data.zero_()

     # record convergence
     if bf.rank() == 0:
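The comments removed from the old code spell out the recursion that both the blocking and the async versions implement; with grad_i the local gradient, one gradient-tracking iteration is:

    w^{k+1} = neighbor_allreduce(w^k) - alpha_gt * q^k
    q^{k+1} = neighbor_allreduce(q^k) + grad_i(w^{k+1}) - grad_i(w^k)

The async version computes the same update; it only reorders the work so that both neighbor_allreduce operations are already in flight before grad_i(w^{k+1}) is evaluated.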
