Update the docs conf and pylint the examples
bichengying committed May 14, 2020
1 parent a9df47a commit fcae7aa
Showing 9 changed files with 103 additions and 61 deletions.
7 changes: 7 additions & 0 deletions docs/conf.py
@@ -71,6 +71,13 @@
#
html_theme = 'sphinx_rtd_theme'

html_theme_config = {
'analytics_id': 'UA-166722495-1',
'display_version': True,
'prev_next_buttons_location': 'bottom',
'style_external_links': False,
}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
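
Note: Sphinx does not read a variable called html_theme_config; settings for sphinx_rtd_theme are conventionally passed through html_theme_options, and an unrecognized name in conf.py simply has no effect. A minimal sketch of the conventional form, keeping the values added above (the variable name html_theme_options is the standard Sphinx/sphinx_rtd_theme one, not something taken from this commit):

    # Conventional way to hand these settings to sphinx_rtd_theme.
    html_theme_options = {
        'analytics_id': 'UA-166722495-1',        # Google Analytics tracking ID
        'display_version': True,                 # show the project version in the sidebar
        'prev_next_buttons_location': 'bottom',  # place prev/next links at the page bottom
        'style_external_links': False,           # no icon on external links
    }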
2 changes: 1 addition & 1 deletion docs/timeline.rst
@@ -79,7 +79,7 @@ improve the training efficiency in real practice.
Example III: Resnet training with one-sided communication
---------------------------------------------------------
In this example, we show the timeline for a real experiment when decentralized SGD is used to
train Resnet with CIFAR10 dataset. We exploit the one-sided communication primitive ``win_put''
train Resnet with CIFAR10 dataset. We exploit the one-sided communication primitive ``win_put``
to exchange information between ranks. It is observed that each phase during the training
is clearly illustrated in the timeline.
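
As a rough sketch of the ``win_put`` pattern this passage refers to (the window-based calls other than ``win_put`` follow the style of the win_accumulate / win_sync_then_collect code later in this diff and are assumptions, not part of the commit):

    # Hypothetical sketch of a one-sided exchange with win_put; only win_put is
    # named in the text above, the surrounding window calls are assumed.
    import torch
    import bluefog.torch as bf

    bf.init()
    x = torch.zeros(10)
    bf.win_create(x, name="x_buff")       # assumed: register a window buffer for x
    for _ in range(100):
        x = x - 0.01 * torch.randn(10)    # stand-in for a local model update
        bf.win_put(x, name="x_buff")      # push x into out-neighbors' windows
        x = bf.win_sync(name="x_buff")    # assumed: average own copy with received ones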

6 changes: 4 additions & 2 deletions examples/pytorch_benchmark.py
@@ -182,9 +182,11 @@ def log(s, nl=True):
# Results
img_sec_mean = np.mean(img_secs)
img_sec_conf = 1.96 * np.std(img_secs)
img_secs_sum = bf.allreduce(torch.from_numpy(np.array(img_secs)), average=False)
img_secs_sum = bf.allreduce(torch.from_numpy(
np.array(img_secs)), average=False)
img_sec_mean_all = np.mean(img_secs_sum.numpy())
img_sec_conf_all = 1.96 * np.std(img_secs_sum.numpy())
print('[%d] Img/sec per %s: %.1f +-%.1f' % (bf.rank(), device, img_sec_mean, img_sec_conf))
print('[%d] Img/sec per %s: %.1f +-%.1f' %
(bf.rank(), device, img_sec_mean, img_sec_conf))
log('Total img/sec on %d %s(s): %.1f +-%.1f' %
(bf.size(), device, img_sec_mean_all, img_sec_conf_all))
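A brief note on the reformatted aggregation above: with average=False, bf.allreduce returns the element-wise sum over all ranks, so each entry of img_secs_sum is the cluster-wide images/sec for one measured window, and the final log line reports total throughput (mean plus a 1.96-sigma interval) over bf.size() workers. To reproduce the numbers, the script would be launched through Bluefog's MPI-style launcher, e.g. `bfrun -np 4 python examples/pytorch_benchmark.py` (the bfrun command line is stated here as an assumption, not something contained in this commit).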
23 changes: 14 additions & 9 deletions examples/pytorch_cifar10_resnet.py
@@ -30,7 +30,8 @@
import tensorboardX
from tqdm import tqdm

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
sys.path.insert(0, os.path.abspath(
os.path.join(os.path.dirname(__file__), "..")))

cwd_folder_loc = os.path.dirname(os.path.abspath(__file__))
# Training settings
@@ -66,7 +67,8 @@
parser.add_argument(
"--val-batch-size", type=int, default=32, help="input batch size for validation"
)
parser.add_argument("--epochs", type=int, default=50, help="number of epochs to train")
parser.add_argument("--epochs", type=int, default=50,
help="number of epochs to train")
parser.add_argument(
"--base-lr", type=float, default=0.0125, help="learning rate for a single GPU"
)
@@ -136,7 +138,8 @@
verbose = 1 if bf.rank() == 0 else 0

# Bluefog: write TensorBoard logs on first worker.
log_writer = tensorboardX.SummaryWriter(args.log_dir) if bf.rank() == 0 else None
log_writer = tensorboardX.SummaryWriter(
args.log_dir) if bf.rank() == 0 else None


kwargs = {"num_workers": 4, "pin_memory": True} if args.cuda else {}
@@ -147,7 +150,8 @@
transform=transforms.Compose(
[
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[
0.229, 0.224, 0.225]),
]
),
)
@@ -167,7 +171,8 @@
transform=transforms.Compose(
[
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[
0.229, 0.224, 0.225]),
]
),
)
@@ -249,8 +254,8 @@ def train(epoch):
optimizer.zero_grad()
# Split data into sub-batches of size batch_size
for i in range(0, len(data), args.batch_size):
data_batch = data[i : i + args.batch_size]
target_batch = target[i : i + args.batch_size]
data_batch = data[i: i + args.batch_size]
target_batch = target[i: i + args.batch_size]
output = model(data_batch)
train_accuracy.update(accuracy(output, target_batch))
loss = F.cross_entropy(output, target_batch)
@@ -335,7 +340,8 @@ def save_checkpoint(epoch):
dirpath = os.path.dirname(filepath)
if not os.path.exists(dirpath):
os.makedirs(dirpath)
state = {"model": model.state_dict(), "optimizer": optimizer.state_dict()}
state = {"model": model.state_dict(
), "optimizer": optimizer.state_dict()}
torch.save(state, filepath)


@@ -359,4 +365,3 @@ def avg(self):
train(epoch)
validate(epoch)
save_checkpoint(epoch)

23 changes: 15 additions & 8 deletions examples/pytorch_least_square.py
@@ -35,12 +35,14 @@
)
args = parser.parse_args()


def finalize_plot():
plt.savefig(args.save_plot_file)
if args.plot_interactive:
plt.show()
plt.close()


bf.init()

# The least squares problem is min_x \sum_i^n \|A_i x - b_i\|^2
@@ -117,7 +119,8 @@ def finalize_plot():
# the norm of the local gradient is expected not to be close to 0
# this is because each rank converges to the global solution, not a local solution
local_grad_norm = torch.norm(A.T.mm(A.mm(x) - b), p=2)
print("[ED] Rank {}: local gradient norm: {}".format(bf.rank(), local_grad_norm))
print("[ED] Rank {}: local gradient norm: {}".format(
bf.rank(), local_grad_norm))

if bf.rank() == 0:
plt.semilogy(mse)
@@ -147,14 +150,16 @@ def finalize_plot():
x = torch.zeros(n, 1).to(torch.double)
y = A.T.mm(A.mm(x)-b)
grad_prev = y.clone()
alpha_gt = 5e-3 # step-size for GT (should be smaller than exact diffusion)
# step-size for GT (should be smaller than exact diffusion)
alpha_gt = 5e-3
mse_gt = []
for i in range(maxite):
x_handle = bf.neighbor_allreduce_async(x, name='Grad.Tracking.x')
x_handle = bf.neighbor_allreduce_async(x, name='Grad.Tracking.x')
y_handle = bf.neighbor_allreduce_async(y, name='Grad.Tracking.y')
x = bf.synchronize(x_handle) - alpha_gt * y
grad = A.T.mm(A.mm(x)-b) # local gradient at x^{k+1}
y = bf.synchronize(y_handle) + grad - grad_prev # use async to overlap computation and communication
grad = A.T.mm(A.mm(x)-b) # local gradient at x^{k+1}
# use async to overlap computation and communication
y = bf.synchronize(y_handle) + grad - grad_prev
grad_prev = grad
if bf.rank() == 0:
mse_gt.append(torch.norm(x - x_opt, p=2))
@@ -168,7 +173,8 @@ def finalize_plot():
# the norm of the local gradient is expected not to be close to 0
# this is because each rank converges to the global solution, not a local solution
local_grad_norm = torch.norm(A.T.mm(A.mm(x) - b), p=2)
print("[GT] Rank {}: local gradient norm: {}".format(bf.rank(), local_grad_norm))
print("[GT] Rank {}: local gradient norm: {}".format(
bf.rank(), local_grad_norm))

if bf.rank() == 0:
plt.semilogy(mse_gt)
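
For readability, the gradient-tracking loop shown above implements the usual recursion, with W standing for the neighbor-averaging weights realized by bf.neighbor_allreduce_async:

    x^{k+1} = W x^k - \alpha_{gt} y^k
    y^{k+1} = W y^k + \nabla f_i(x^{k+1}) - \nabla f_i(x^k),   with y^0 = \nabla f_i(x^0)

so y^k tracks the network-average gradient, while the asynchronous handles let both neighbor_allreduce calls overlap with the local gradient computation (the same recursion is spelled out in the comments of pytorch_logistic_regression.py below).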
@@ -208,7 +214,7 @@ def finalize_plot():
bf.win_accumulate(
w, name="w_buff",
dst_weights={rank: 1.0 / (outdegree + 1)
for rank in bf.out_neighbor_ranks()},
for rank in bf.out_neighbor_ranks()},
require_mutex=True)
w.div_(1+outdegree)
w = bf.win_sync_then_collect(name="w_buff")
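
A note on the weights in this hunk: w.div_(1+outdegree) keeps 1/(outdegree+1) of the local value while win_accumulate deposits 1/(outdegree+1) of it into each out-neighbor's buffer, so every rank distributes total mass

    1/(d+1) + d * 1/(d+1) = 1,   where d = outdegree,

i.e. the implied mixing matrix is column-stochastic, which is what keeps this push-style accumulate averaging consistent (the logistic-regression example below does the same with weights 1/(2*outdegree) and w.div_(2)).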
@@ -233,7 +239,8 @@ def finalize_plot():
# the norm of the local gradient is expected not to be close to 0
# this is because each rank converges to the global solution, not a local solution
local_grad_norm = torch.norm(A.T.mm(A.mm(x) - b), p=2)
print("[PD] Rank {}: local gradient norm: {}".format(bf.rank(), local_grad_norm))
print("[PD] Rank {}: local gradient norm: {}".format(
bf.rank(), local_grad_norm))

if bf.rank() == 0:
plt.semilogy(mse_pd)
17 changes: 12 additions & 5 deletions examples/pytorch_logistic_regression.py
@@ -36,12 +36,14 @@
)
args = parser.parse_args()


def finalize_plot():
plt.savefig(args.save_plot_file)
if args.plot_interactive:
plt.show()
plt.close()


bf.init()

# The logistic regression problem is
@@ -61,6 +63,7 @@ def finalize_plot():
y = 2*y - 1
rho = 1e-2


def logistic_loss_step(x_, tensor_name):
"""Calculate gradient of logistic loss via pytorch autograd."""
with bf.timeline_context(tensor_name=tensor_name,
@@ -70,6 +73,7 @@ def logistic_loss_step(x_, tensor_name):
loss_.backward()
return loss_


# # ================== Distributed gradient descent ================================
# # Calculate the solution with distributed gradient descent:
# # x^{k+1} = x^k - alpha * allreduce(local_grad)
@@ -168,7 +172,8 @@ def logistic_loss_step(x_, tensor_name):
# the norm of the local gradient is expected not to be close to 0
# this is because each rank converges to the global solution, not a local solution
local_grad_norm = torch.norm(w.grad.data, p=2)
print("[ED] Rank {}: local gradient norm: {}".format(bf.rank(), local_grad_norm))
print("[ED] Rank {}: local gradient norm: {}".format(
bf.rank(), local_grad_norm))
w.grad.data.zero_()

if bf.rank() == 0:
@@ -211,7 +216,7 @@ def logistic_loss_step(x_, tensor_name):
# w^{k+1} = neighbor_allreduce(w^k) - alpha*q^k
# q^{k+1} = neighbor_allreduce(q^k) + grad(w^{k+1}) - grad(w^k)
# Notice the communication of neighbor_allreduce can overlap with gradient computation.
w_handle = bf.neighbor_allreduce_async(w.data, name='Grad.Tracking.w')
w_handle = bf.neighbor_allreduce_async(w.data, name='Grad.Tracking.w')
q_handle = bf.neighbor_allreduce_async(q, name='Grad.Tracking.q')
w.data = bf.synchronize(w_handle) - alpha_gt * q
# calculate local gradient
@@ -240,7 +245,8 @@ def logistic_loss_step(x_, tensor_name):
# the norm of the local gradient is expected not to be close to 0
# this is because each rank converges to the global solution, not a local solution
local_grad_norm = torch.norm(w.grad.data, p=2)
print("[GT] Rank {}: local gradient norm: {}".format(bf.rank(), local_grad_norm))
print("[GT] Rank {}: local gradient norm: {}".format(
bf.rank(), local_grad_norm))
w.grad.data.zero_()

if bf.rank() == 0:
@@ -281,7 +287,7 @@ def logistic_loss_step(x_, tensor_name):
bf.win_accumulate(
w, name="w_buff",
dst_weights={rank: 1.0 / (outdegree*2)
for rank in bf.out_neighbor_ranks()},
for rank in bf.out_neighbor_ranks()},
require_mutex=True)
w.div_(2)
w = bf.win_sync_then_collect(name="w_buff")
@@ -315,7 +321,8 @@ def logistic_loss_step(x_, tensor_name):
# the norm of the local gradient is expected not to be close to 0
# this is because each rank converges to the global solution, not a local solution
local_grad_norm = torch.norm(x.grad.data, p=2)
print("[PD] Rank {}: local gradient norm: {}".format(bf.rank(), local_grad_norm))
print("[PD] Rank {}: local gradient norm: {}".format(
bf.rank(), local_grad_norm))
x.grad.data.zero_()

if bf.rank() == 0:
1 change: 1 addition & 0 deletions examples/pytorch_mnist.py
@@ -269,6 +269,7 @@ def test():
)
return test_loss, 100.0 * test_accuracy


record = []
for epoch in range(1, args.epochs + 1):
train(epoch)