In [1]:
# allreduce_tensor.py
import torch, time
import hivemind

# 1) Bring up the DHT node used for peer discovery/coordination.
#    - First peer: start a fresh node, e.g. hivemind.DHT(start=True).
#    - Other peers: pass the address printed by the first peer via initial_peers=["..."].
dht = hivemind.DHT(client_mode=False, start=True)
print("Share this with other peers:", list(map(str, dht.get_visible_maddrs())))

Share this with other peers: ['/ip4/127.0.0.1/tcp/35493/p2p/12D3KooWFDdqSPnGkDfnt3po3AVemkeunY8NkP8mjDNYtTRiXE2d']


In [None]:
# sn owner:  ['/ip4/127.0.0.1/tcp/46065/p2p/12D3KooWBFxNgqoaYwdeF7v1gtkFnwRX8n2UqRSPLQJkEGHesE2N']

In [None]:

# 2) Build the tensor to all-reduce (sum/average): four copies of one random
#    integer k drawn from [1, 10), i.e. [k, k, k, k].
local = torch.randint(1, 10, ()).item() * torch.ones(4)
print("local before:", local.tolist())

# 3) Wrap the tensor in an averager; every peer must use the SAME prefix.
averager = hivemind.averaging.DecentralizedAverager(
    averaged_tensors=[local],
    dht=dht,
    prefix="demo/allreduce",
    target_group_size=4,
    start=True,
)

local before: [5.0, 5.0, 5.0, 5.0]


Task exception was never retrieved
future: <Task finished name='Task-15' coro=<DecentralizedAverager._declare_for_download_periodically() done, defined at /home/isabella/crucible/subnet-MoE/.venv/lib/python3.10/site-packages/hivemind/averaging/averager.py:600> exception=RuntimeError('Broken pipe')>
Traceback (most recent call last):
  File "/home/isabella/crucible/subnet-MoE/.venv/lib/python3.10/site-packages/hivemind/averaging/averager.py", line 609, in _declare_for_download_periodically
    self.dht.store(
  File "/home/isabella/crucible/subnet-MoE/.venv/lib/python3.10/site-packages/hivemind/dht/dht.py", line 212, in store
    future = MPFuture()
  File "/home/isabella/crucible/subnet-MoE/.venv/lib/python3.10/site-packages/hivemind/utils/mpfuture.py", line 93, in __init__
    self._shared_state_code = SharedBytes.next()
  File "/home/isabella/crucible/subnet-MoE/.venv/lib/python3.10/site-packages/hivemind/utils/mpfuture.py", line 52, in next
    cls._buffer = torch.empty([buffer_size

In [13]:
# Shift `local` in place by another random constant k (1 <= k < 10) on every element.
# NOTE: re-running this cell keeps accumulating into the same tensor.
local.add_(torch.ones(4) * torch.randint(1, 10, ()).item())
print("local before:", local.tolist())

# 4) Run a single all-reduce round (the call blocks until a group forms or the timeout expires).
#    By default, it computes the *average* in-place; set averaging_alpha=1 to overwrite with the average.
#    A weighted average is available through weight=<float>.
info = averager.step(gather={'step': 21}, timeout=30.0)
print("group info:", info)

# Once step() has returned, `local` carries the values averaged over all peers in the group.
print("local after:", local.tolist())

local before: [36.5, 36.5, 36.5, 36.5]


KeyboardInterrupt: 

In [10]:
# Display the value returned by the most recent `averager.step(...)` call.
info

{<libp2p.peer.id.ID (12D3KooWFWDyCB2qF2QDjgQQz4KCxwxhpYtZppHjMMSkTHtGyfRY)>: None,
 <libp2p.peer.id.ID (12D3KooWKgseFFwQJ5rRTh17vfpGomZavLcQYtbxgMmgbXqEf2DN)>: None}

In [None]:

# Stop the averager first, then the DHT instance it was constructed with (`dht=dht`).
averager.shutdown()
dht.shutdown()


In [1]:
from mycelia.config import MinerConfig, ValidatorConfig, parse_args
from mycelia.shared.model import load_base_model 

rank = 0
config = ValidatorConfig() 
# load_base_model returns a pair: the model and `em`. Later cells read
# `em.expert_group_assignment` -- presumably `em` is an expert manager, TODO confirm.
model, em = load_base_model(rank, config)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Collect every parameter name, in the order `named_parameters()` yields them, then display the list.
names = [param_name for param_name, _ in model.named_parameters()]

names

['_orig_mod.model.embed_tokens.weight',
 '_orig_mod.model.layers.0.self_attn.q_a_proj.weight',
 '_orig_mod.model.layers.0.self_attn.q_a_layernorm.weight',
 '_orig_mod.model.layers.0.self_attn.q_b_proj.weight',
 '_orig_mod.model.layers.0.self_attn.kv_a_proj_with_mqa.weight',
 '_orig_mod.model.layers.0.self_attn.kv_a_layernorm.weight',
 '_orig_mod.model.layers.0.self_attn.kv_b_proj.weight',
 '_orig_mod.model.layers.0.self_attn.o_proj.weight',
 '_orig_mod.model.layers.0.mlp.gate_proj.weight',
 '_orig_mod.model.layers.0.mlp.up_proj.weight',
 '_orig_mod.model.layers.0.mlp.down_proj.weight',
 '_orig_mod.model.layers.0.input_layernorm.weight',
 '_orig_mod.model.layers.0.post_attention_layernorm.weight',
 '_orig_mod.model.layers.1.self_attn.q_a_proj.weight',
 '_orig_mod.model.layers.1.self_attn.q_a_layernorm.weight',
 '_orig_mod.model.layers.1.self_attn.q_b_proj.weight',
 '_orig_mod.model.layers.1.self_attn.kv_a_proj_with_mqa.weight',
 '_orig_mod.model.layers.1.self_attn.kv_a_layernorm.weight'

In [21]:
from mycelia.shared.modeling.modeling_mycelia import get_layer_expert_id
# from mycelia.validator.inter_validator_connection import iter_named_grads

# Group the names of expert parameters by the expert group that owns them.
# NOTE: `iter_named_grads` is defined in a later cell of this notebook (its import
# above is commented out), so that cell has to be executed before this one.
all_named = list(iter_named_grads(model))
all_named.sort(key=lambda kv: kv[0])  # deterministic order
name_to_tensor = dict(all_named)

# Start with an empty list per group id; the ids are taken from the first entry
# of the per-layer assignment.
expert_group_to_names = {group_id: [] for group_id in list(em.expert_group_assignment.values())[0]}

for name in name_to_tensor:
    layer_id, expert_id = get_layer_expert_id(name)
    # Compare against None explicitly: a layer id of 0 is a valid id but falsy,
    # so a plain truthiness test would silently drop every parameter of layer 0.
    if layer_id is None or expert_id is None:
        continue
    for group_id, expert_ids in em.expert_group_assignment[layer_id].items():
        if expert_id in expert_ids:
            expert_group_to_names[group_id].append(name)

expert_group_to_names

{0: [],
 1: ['_orig_mod.model.layers.1.mlp.experts.0.down_proj.weight',
  '_orig_mod.model.layers.1.mlp.experts.0.gate_proj.weight',
  '_orig_mod.model.layers.1.mlp.experts.0.up_proj.weight',
  '_orig_mod.model.layers.1.mlp.experts.2.down_proj.weight',
  '_orig_mod.model.layers.1.mlp.experts.2.gate_proj.weight',
  '_orig_mod.model.layers.1.mlp.experts.2.up_proj.weight',
  '_orig_mod.model.layers.1.mlp.experts.4.down_proj.weight',
  '_orig_mod.model.layers.1.mlp.experts.4.gate_proj.weight',
  '_orig_mod.model.layers.1.mlp.experts.4.up_proj.weight',
  '_orig_mod.model.layers.1.mlp.experts.5.down_proj.weight',
  '_orig_mod.model.layers.1.mlp.experts.5.gate_proj.weight',
  '_orig_mod.model.layers.1.mlp.experts.5.up_proj.weight',
  '_orig_mod.model.layers.11.mlp.experts.0.down_proj.weight',
  '_orig_mod.model.layers.11.mlp.experts.0.gate_proj.weight',
  '_orig_mod.model.layers.11.mlp.experts.0.up_proj.weight',
  '_orig_mod.model.layers.11.mlp.experts.4.down_proj.weight',
  '_orig_mod.model.

In [22]:
# Print each parameter name of the model on its own line.
for param_name, _param in model.named_parameters():
    print(param_name)

_orig_mod.model.embed_tokens.weight
_orig_mod.model.layers.0.self_attn.q_a_proj.weight
_orig_mod.model.layers.0.self_attn.q_a_layernorm.weight
_orig_mod.model.layers.0.self_attn.q_b_proj.weight
_orig_mod.model.layers.0.self_attn.kv_a_proj_with_mqa.weight
_orig_mod.model.layers.0.self_attn.kv_a_layernorm.weight
_orig_mod.model.layers.0.self_attn.kv_b_proj.weight
_orig_mod.model.layers.0.self_attn.o_proj.weight
_orig_mod.model.layers.0.mlp.gate_proj.weight
_orig_mod.model.layers.0.mlp.up_proj.weight
_orig_mod.model.layers.0.mlp.down_proj.weight
_orig_mod.model.layers.0.input_layernorm.weight
_orig_mod.model.layers.0.post_attention_layernorm.weight
_orig_mod.model.layers.1.self_attn.q_a_proj.weight
_orig_mod.model.layers.1.self_attn.q_a_layernorm.weight
_orig_mod.model.layers.1.self_attn.q_b_proj.weight
_orig_mod.model.layers.1.self_attn.kv_a_proj_with_mqa.weight
_orig_mod.model.layers.1.self_attn.kv_a_layernorm.weight
_orig_mod.model.layers.1.self_attn.kv_b_proj.weight
_orig_mod.model.la

In [19]:
import torch.nn as nn 

def iter_named_grads(model: nn.Module, skip_none: bool = True):
    """
    Lazily pair every parameter name of ``model`` with that parameter's ``.grad``.

    All entries of ``model.named_parameters()`` are yielded, in that order. The
    second item of a pair is ``None`` when the parameter has no gradient.

    NOTE(review): ``skip_none`` is accepted but currently has no effect -- the
    filtering that used it is disabled, so ``None`` gradients are never skipped.
    """
    yield from ((param_name, param.grad) for param_name, param in model.named_parameters())

# list(iter_named_grads(model))