Update the docs conf and pylint the examples
bichengying committed May 14, 2020
1 parent a9df47a commit fcae7aa
Showing 9 changed files with 103 additions and 61 deletions.
7 changes: 7 additions & 0 deletions docs/conf.py
@@ -71,6 +71,13 @@
#
html_theme = 'sphinx_rtd_theme'

html_theme_config = {
'analytics_id': 'UA-166722495-1',
'display_version': True,
'prev_next_buttons_location': 'bottom',
'style_external_links': False,
}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
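
Note: Sphinx does not read a variable called html_theme_config; settings for sphinx_rtd_theme are conventionally passed through html_theme_options, and an unrecognized name in conf.py simply has no effect. A minimal sketch of the conventional form, keeping the values added above (the variable name html_theme_options is the standard Sphinx/sphinx_rtd_theme one, not something taken from this commit):

    # Conventional way to hand these settings to sphinx_rtd_theme.
    html_theme_options = {
        'analytics_id': 'UA-166722495-1',        # Google Analytics tracking ID
        'display_version': True,                 # show the project version in the sidebar
        'prev_next_buttons_location': 'bottom',  # place prev/next links at the page bottom
        'style_external_links': False,           # no icon on external links
    }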
2 changes: 1 addition & 1 deletion docs/timeline.rst
@@ -79,7 +79,7 @@ improve the training efficiency in real practice.
Example III: Resnet training with one-sided communication
---------------------------------------------------------
In this example, we show the timeline for a real experiment when decentralized SGD is used to
train Resnet with CIFAR10 dataset. We exploit the one-sided communication primitive ``win_put''
train Resnet with CIFAR10 dataset. We exploit the one-sided communication primitive ``win_put``
to exchange information between ranks. It is observed that each phase during the training
is clearly illustrated in the timeline.
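
As a rough sketch of the ``win_put`` pattern this passage refers to (the window-based calls other than ``win_put`` follow the style of the win_accumulate / win_sync_then_collect code later in this diff and are assumptions, not part of the commit):

    # Hypothetical sketch of a one-sided exchange with win_put; only win_put is
    # named in the text above, the surrounding window calls are assumed.
    import torch
    import bluefog.torch as bf

    bf.init()
    x = torch.zeros(10)
    bf.win_create(x, name="x_buff")       # assumed: register a window buffer for x
    for _ in range(100):
        x = x - 0.01 * torch.randn(10)    # stand-in for a local model update
        bf.win_put(x, name="x_buff")      # push x into out-neighbors' windows
        x = bf.win_sync(name="x_buff")    # assumed: average own copy with received ones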

6 changes: 4 additions & 2 deletions examples/pytorch_benchmark.py
@@ -182,9 +182,11 @@ def log(s, nl=True):
# Results
img_sec_mean = np.mean(img_secs)
img_sec_conf = 1.96 * np.std(img_secs)
img_secs_sum = bf.allreduce(torch.from_numpy(np.array(img_secs)), average=False)
img_secs_sum = bf.allreduce(torch.from_numpy(
np.array(img_secs)), average=False)
img_sec_mean_all = np.mean(img_secs_sum.numpy())
img_sec_conf_all = 1.96 * np.std(img_secs_sum.numpy())
print('[%d] Img/sec per %s: %.1f +-%.1f' % (bf.rank(), device, img_sec_mean, img_sec_conf))
print('[%d] Img/sec per %s: %.1f +-%.1f' %
(bf.rank(), device, img_sec_mean, img_sec_conf))
log('Total img/sec on %d %s(s): %.1f +-%.1f' %
(bf.size(), device, img_sec_mean_all, img_sec_conf_all))
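A brief note on the reformatted aggregation above: with average=False, bf.allreduce returns the element-wise sum over all ranks, so each entry of img_secs_sum is the cluster-wide images/sec for one measured window, and the final log line reports total throughput (mean plus a 1.96-sigma interval) over bf.size() workers. To reproduce the numbers, the script would be launched through Bluefog's MPI-style launcher, e.g. `bfrun -np 4 python examples/pytorch_benchmark.py` (the bfrun command line is stated here as an assumption, not something contained in this commit).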
23 changes: 14 additions & 9 deletions examples/pytorch_cifar10_resnet.py
@@ -30,7 +30,8 @@
import tensorboardX
from tqdm import tqdm

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
sys.path.insert(0, os.path.abspath(
os.path.join(os.path.dirname(__file__), "..")))

cwd_folder_loc = os.path.dirname(os.path.abspath(__file__))
# Training settings
@@ -66,7 +67,8 @@
parser.add_argument(
"--val-batch-size", type=int, default=32, help="input batch size for validation"
)
parser.add_argument("--epochs", type=int, default=50, help="number of epochs to train")
parser.add_argument("--epochs", type=int, default=50,
help="number of epochs to train")
parser.add_argument(
"--base-lr", type=float, default=0.0125, help="learning rate for a single GPU"
)
@@ -136,7 +138,8 @@
verbose = 1 if bf.rank() == 0 else 0

# Bluefog: write TensorBoard logs on first worker.
log_writer = tensorboardX.SummaryWriter(args.log_dir) if bf.rank() == 0 else None
log_writer = tensorboardX.SummaryWriter(
args.log_dir) if bf.rank() == 0 else None


kwargs = {"num_workers": 4, "pin_memory": True} if args.cuda else {}
@@ -147,7 +150,8 @@
transform=transforms.Compose(
[
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[
0.229, 0.224, 0.225]),
]
),
)
@@ -167,7 +171,8 @@
transform=transforms.Compose(
[
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[
0.229, 0.224, 0.225]),
]
),
)
@@ -249,8 +254,8 @@ def train(epoch):
optimizer.zero_grad()
# Split data into sub-batches of size batch_size
for i in range(0, len(data), args.batch_size):
data_batch = data[i : i + args.batch_size]
target_batch = target[i : i + args.batch_size]
data_batch = data[i: i + args.batch_size]
target_batch = target[i: i + args.batch_size]
output = model(data_batch)
train_accuracy.update(accuracy(output, target_batch))
loss = F.cross_entropy(output, target_batch)
@@ -335,7 +340,8 @@ def save_checkpoint(epoch):
dirpath = os.path.dirname(filepath)
if not os.path.exists(dirpath):
os.makedirs(dirpath)
state = {"model": model.state_dict(), "optimizer": optimizer.state_dict()}
state = {"model": model.state_dict(
), "optimizer": optimizer.state_dict()}
torch.save(state, filepath)


@@ -359,4 +365,3 @@ def avg(self):
train(epoch)
validate(epoch)
save_checkpoint(epoch)

23 changes: 15 additions & 8 deletions examples/pytorch_least_square.py
@@ -35,12 +35,14 @@
)
args = parser.parse_args()


def finalize_plot():
plt.savefig(args.save_plot_file)
if args.plot_interactive:
plt.show()
plt.close()


bf.init()

# The least squares problem is min_x \sum_i^n \|A_i x - b_i\|^2
@@ -117,7 +119,8 @@ def finalize_plot():
# the norm of the local gradient is expected not to be close to 0
# this is because each rank converges to the global solution, not a local solution
local_grad_norm = torch.norm(A.T.mm(A.mm(x) - b), p=2)
print("[ED] Rank {}: local gradient norm: {}".format(bf.rank(), local_grad_norm))
print("[ED] Rank {}: local gradient norm: {}".format(
bf.rank(), local_grad_norm))

if bf.rank() == 0:
plt.semilogy(mse)
@@ -147,14 +150,16 @@ def finalize_plot():
x = torch.zeros(n, 1).to(torch.double)
y = A.T.mm(A.mm(x)-b)
grad_prev = y.clone()
alpha_gt = 5e-3 # step-size for GT (should be smaller than exact diffusion)
# step-size for GT (should be smaller than exact diffusion)
alpha_gt = 5e-3
mse_gt = []
for i in range(maxite):
x_handle = bf.neighbor_allreduce_async(x, name='Grad.Tracking.x')
x_handle = bf.neighbor_allreduce_async(x, name='Grad.Tracking.x')
y_handle = bf.neighbor_allreduce_async(y, name='Grad.Tracking.y')
x = bf.synchronize(x_handle) - alpha_gt * y
grad = A.T.mm(A.mm(x)-b) # local gradient at x^{k+1}
y = bf.synchronize(y_handle) + grad - grad_prev # use async to overlap computation and communication
grad = A.T.mm(A.mm(x)-b) # local gradient at x^{k+1}
# use async to overlap computation and communication
y = bf.synchronize(y_handle) + grad - grad_prev
grad_prev = grad
if bf.rank() == 0:
mse_gt.append(torch.norm(x - x_opt, p=2))
@@ -168,7 +173,8 @@ def finalize_plot():
# the norm of the local gradient is expected not to be close to 0
# this is because each rank converges to the global solution, not a local solution
local_grad_norm = torch.norm(A.T.mm(A.mm(x) - b), p=2)
print("[GT] Rank {}: local gradient norm: {}".format(bf.rank(), local_grad_norm))
print("[GT] Rank {}: local gradient norm: {}".format(
bf.rank(), local_grad_norm))

if bf.rank() == 0:
plt.semilogy(mse_gt)
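
For readability, the gradient-tracking loop shown above implements the usual recursion, with W standing for the neighbor-averaging weights realized by bf.neighbor_allreduce_async:

    x^{k+1} = W x^k - \alpha_{gt} y^k
    y^{k+1} = W y^k + \nabla f_i(x^{k+1}) - \nabla f_i(x^k),   with y^0 = \nabla f_i(x^0)

so y^k tracks the network-average gradient, while the asynchronous handles let both neighbor_allreduce calls overlap with the local gradient computation (the same recursion is spelled out in the comments of pytorch_logistic_regression.py below).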
@@ -208,7 +214,7 @@ def finalize_plot():
bf.win_accumulate(
w, name="w_buff",
dst_weights={rank: 1.0 / (outdegree + 1)
for rank in bf.out_neighbor_ranks()},
for rank in bf.out_neighbor_ranks()},
require_mutex=True)
w.div_(1+outdegree)
w = bf.win_sync_then_collect(name="w_buff")
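
A note on the weights in this hunk: w.div_(1+outdegree) keeps 1/(outdegree+1) of the local value while win_accumulate deposits 1/(outdegree+1) of it into each out-neighbor's buffer, so every rank distributes total mass

    1/(d+1) + d * 1/(d+1) = 1,   where d = outdegree,

i.e. the implied mixing matrix is column-stochastic, which is what keeps this push-style accumulate averaging consistent (the logistic-regression example below does the same with weights 1/(2*outdegree) and w.div_(2)).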
@@ -233,7 +239,8 @@ def finalize_plot():
# the norm of the local gradient is expected not to be close to 0
# this is because each rank converges to the global solution, not a local solution
local_grad_norm = torch.norm(A.T.mm(A.mm(x) - b), p=2)
print("[PD] Rank {}: local gradient norm: {}".format(bf.rank(), local_grad_norm))
print("[PD] Rank {}: local gradient norm: {}".format(
bf.rank(), local_grad_norm))

if bf.rank() == 0:
plt.semilogy(mse_pd)
17 changes: 12 additions & 5 deletions examples/pytorch_logistic_regression.py
@@ -36,12 +36,14 @@
)
args = parser.parse_args()


def finalize_plot():
plt.savefig(args.save_plot_file)
if args.plot_interactive:
plt.show()
plt.close()


bf.init()

# The logistic regression problem is
@@ -61,6 +63,7 @@ def finalize_plot():
y = 2*y - 1
rho = 1e-2


def logistic_loss_step(x_, tensor_name):
"""Calculate gradient of logistic loss via pytorch autograd."""
with bf.timeline_context(tensor_name=tensor_name,
@@ -70,6 +73,7 @@ def logistic_loss_step(x_, tensor_name):
loss_.backward()
return loss_


# # ================== Distributed gradient descent ================================
# # Calculate the solution with distributed gradient descent:
# # x^{k+1} = x^k - alpha * allreduce(local_grad)
@@ -168,7 +172,8 @@ def logistic_loss_step(x_, tensor_name):
# the norm of the local gradient is expected not to be close to 0
# this is because each rank converges to the global solution, not a local solution
local_grad_norm = torch.norm(w.grad.data, p=2)
print("[ED] Rank {}: local gradient norm: {}".format(bf.rank(), local_grad_norm))
print("[ED] Rank {}: local gradient norm: {}".format(
bf.rank(), local_grad_norm))
w.grad.data.zero_()

if bf.rank() == 0:
@@ -211,7 +216,7 @@ def logistic_loss_step(x_, tensor_name):
# w^{k+1} = neighbor_allreduce(w^k) - alpha*q^k
# q^{k+1} = neighbor_allreduce(q^k) + grad(w^{k+1}) - grad(w^k)
# Notice the communication of neighbor_allreduce can overlap with gradient computation.
w_handle = bf.neighbor_allreduce_async(w.data, name='Grad.Tracking.w')
w_handle = bf.neighbor_allreduce_async(w.data, name='Grad.Tracking.w')
q_handle = bf.neighbor_allreduce_async(q, name='Grad.Tracking.q')
w.data = bf.synchronize(w_handle) - alpha_gt * q
# calculate local gradient
@@ -240,7 +245,8 @@ def logistic_loss_step(x_, tensor_name):
# the norm of the local gradient is expected not to be close to 0
# this is because each rank converges to the global solution, not a local solution
local_grad_norm = torch.norm(w.grad.data, p=2)
print("[GT] Rank {}: local gradient norm: {}".format(bf.rank(), local_grad_norm))
print("[GT] Rank {}: local gradient norm: {}".format(
bf.rank(), local_grad_norm))
w.grad.data.zero_()

if bf.rank() == 0:
@@ -281,7 +287,7 @@ def logistic_loss_step(x_, tensor_name):
bf.win_accumulate(
w, name="w_buff",
dst_weights={rank: 1.0 / (outdegree*2)
for rank in bf.out_neighbor_ranks()},
for rank in bf.out_neighbor_ranks()},
require_mutex=True)
w.div_(2)
w = bf.win_sync_then_collect(name="w_buff")
@@ -315,7 +321,8 @@ def logistic_loss_step(x_, tensor_name):
# the norm of the local gradient is expected not to be close to 0
# this is because each rank converges to the global solution, not a local solution
local_grad_norm = torch.norm(x.grad.data, p=2)
print("[PD] Rank {}: local gradient norm: {}".format(bf.rank(), local_grad_norm))
print("[PD] Rank {}: local gradient norm: {}".format(
bf.rank(), local_grad_norm))
x.grad.data.zero_()

if bf.rank() == 0:
1 change: 1 addition & 0 deletions examples/pytorch_mnist.py
@@ -269,6 +269,7 @@ def test():
)
return test_loss, 100.0 * test_accuracy


record = []
for epoch in range(1, args.epochs + 1):
train(epoch)