Make async implementation of gradient tracking and add timeline PNG
bichengying committed May 11, 2020
1 parent 545e1b1 commit 6807e99
Showing 3 changed files with 15 additions and 13 deletions.
Binary file added docs/_static/bf_timeline_example2a.png
10 changes: 6 additions & 4 deletions examples/pytorch_least_square.py
@@ -101,7 +101,7 @@ def finalize_plot():
 mse = []
 for i in range(maxite):
     grad = A.T.mm(A.mm(x)-b)  # local gradient
-    psi = x - alpha * grad
+    psi = x - alpha_ed * grad
     phi = psi + x - psi_prev
     x = bf.neighbor_allreduce(phi, name='local variable')
     psi_prev = psi
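(The rename from alpha to alpha_ed presumably distinguishes the exact-diffusion step size from the gradient-tracking step size alpha_gt used below.) For reference, the loop above is the exact diffusion update; written in the same notation as the comments in these examples, with grad_i(x) denoting the local gradient A.T.mm(A.mm(x)-b), one iteration reads roughly:

    psi^{k+1} = x^k - alpha_ed * grad_i(x^k)        # local gradient (adapt) step
    phi^{k+1} = psi^{k+1} + x^k - psi^k             # correction step
    x^{k+1}   = neighbor_allreduce(phi^{k+1})       # combine with neighbors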
@@ -150,9 +150,11 @@ def finalize_plot():
 alpha_gt = 5e-3  # step-size for GT (should be smaller than exact diffusion)
 mse_gt = []
 for i in range(maxite):
-    x = bf.neighbor_allreduce(x, name='local variable x') - alpha_gt * y
-    grad = A.T.mm(A.mm(x)-b)  # local gradient at x^{k+1}
-    y = bf.neighbor_allreduce(y, name='local variable y') + grad - grad_prev
+    x_handle = bf.neighbor_allreduce_async(x, name='Grad.Tracking.x')
+    y_handle = bf.neighbor_allreduce_async(y, name='Grad.Tracking.y')
+    x = bf.synchronize(x_handle) - alpha_gt * y
+    grad = A.T.mm(A.mm(x)-b)  # local gradient at x^{k+1}
+    y = bf.synchronize(y_handle) + grad - grad_prev  # use async to overlap computation and communication
     grad_prev = grad
     if bf.rank() == 0:
         mse_gt.append(torch.norm(x - x_opt, p=2))
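To see the new pattern end to end, here is a minimal, self-contained sketch of the async gradient-tracking loop on a hypothetical local least-squares problem. The data, sizes, step size, and iteration count below are made up for illustration; the bf.neighbor_allreduce_async / bf.synchronize calls are the ones used in this diff, and the script is assumed to be launched across several Bluefog processes.

import torch
import bluefog.torch as bf

bf.init()

# Hypothetical local data: each rank holds its own least-squares problem (A, b).
torch.manual_seed(bf.rank())
n, d = 100, 10
A = torch.randn(n, d)
b = torch.randn(n, 1)

x = torch.zeros(d, 1)
grad_prev = A.T.mm(A.mm(x) - b)   # local gradient at x^0
y = grad_prev.clone()             # gradient tracker starts from the local gradient
alpha_gt = 5e-3
maxite = 200

for i in range(maxite):
    # Launch both neighbor averages up front; averaging y can then overlap
    # with the x-update and the local gradient computation below.
    x_handle = bf.neighbor_allreduce_async(x, name='Grad.Tracking.x')
    y_handle = bf.neighbor_allreduce_async(y, name='Grad.Tracking.y')
    x = bf.synchronize(x_handle) - alpha_gt * y
    grad = A.T.mm(A.mm(x) - b)    # local gradient at x^{k+1}
    y = bf.synchronize(y_handle) + grad - grad_prev
    grad_prev = grad

The key point of the change is ordering: both async calls are issued before either result is needed, and bf.synchronize(y_handle) is only called after the new local gradient is available, so the communication of y overlaps with the matrix products instead of blocking on them.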
18 changes: 9 additions & 9 deletions examples/pytorch_logistic_regression.py
@@ -209,17 +209,17 @@ def logistic_loss_step(x_, tensor_name):
 mse_gt = []
 for i in range(maxite):
-    # w^{k+1} = neighbor_allreduce(w^k) - alpha*q^k
-    w.data = bf.neighbor_allreduce(
-        w.data, name='local variable w') - alpha_gt * q

-    # calculate local gradient
-    logistic_loss_step(w, tensor_name='neighbor.allreduce.local variable w')
-    grad = w.grad.data.clone()  # local gradient at w^{k+1}
-    w.grad.data.zero_()

-    # q^{k+1} = neighbor_allreduce(q^k) + grad(w^{k+1}) - grad(w^k)
-    q = bf.neighbor_allreduce(q, name='local variable q') + grad - grad_prev
+    # Notice the communication of neighbor_allreduce can overlap with gradient computation.
+    w_handle = bf.neighbor_allreduce_async(w.data, name='Grad.Tracking.w')
+    q_handle = bf.neighbor_allreduce_async(q, name='Grad.Tracking.q')
+    w.data = bf.synchronize(w_handle) - alpha_gt * q
+    # calculate local gradient
+    logistic_loss_step(w, tensor_name='neighbor.allreduce.Grad.Tracking.w')
+    grad = w.grad.data.clone()
+    q = bf.synchronize(q_handle) + grad - grad_prev
     grad_prev = grad
+    w.grad.data.zero_()

     # record convergence
     if bf.rank() == 0:
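The comments removed from the old code spell out the recursion that both the blocking and the async versions implement; with grad_i the local gradient, one gradient-tracking iteration is:

    w^{k+1} = neighbor_allreduce(w^k) - alpha_gt * q^k
    q^{k+1} = neighbor_allreduce(q^k) + grad_i(w^{k+1}) - grad_i(w^k)

The async version computes the same update; it only reorders the work so that both neighbor_allreduce operations are already in flight before grad_i(w^{k+1}) is evaluated.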
