## Loading a custom mode


In [1]:
import os

from mycelia.shared.config import MinerConfig
from mycelia.shared.expert_manager import ExpertManager
from mycelia.shared.modeling.custom_qwen3_next import CustomQwen3NextModel, get_moe_model_config

os.environ["TORCH_USE_CUDA_DSA"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

from __future__ import annotations

import torch
from transformers import AutoTokenizer

from mycelia.shared.config import ValidatorConfig
from mycelia.shared.dataloader import get_dataloader

  from .autonotebook import tqdm as notebook_tqdm


PyTorch version 2.9.1 available.


In [2]:
# ---- Get config ----
rank = 0
config = ValidatorConfig()
config.task.data.batch_size = 1
config.task.data.sequence_length = 100



In [3]:
config = MinerConfig()
em = ExpertManager(config)



In [5]:
topk = 10
group_ids = None
moe_config = get_moe_model_config(config, topk, group_ids, em)
model = CustomQwen3NextModel(moe_config)

The fast path is not available because one of the required library is not installed. Falling back to torch implementation. To install follow https://github.com/fla-org/flash-linear-attention#installation and https://github.com/Dao-AILab/causal-conv1d


In [9]:
from mycelia.shared.expert_manager import get_layer_expert_id

state_dict = model.state_dict()
expert_groups = em.expert_group_assignment

# output buckets
grouped_state = {gid: {} for gid in expert_groups.keys()}
grouped_state["shared"] = {}

# Build fast-lookup structure:
# expert_lookup[(layer_id, org_expert_id)] = group_id
expert_lookup = {}  # maps (layer, expert) -> group_id
for gid, layer_map in expert_groups.items():
    for layer_id, mappings in layer_map.items():
        for my_eid, _ in mappings:
            expert_lookup[(layer_id, my_eid)] = gid

# Iterate model weights
for name, tensor in state_dict.items():
    layer_id, expert_id = get_layer_expert_id(name)

    # CASE 1: Not an expert parameter → goes to shared
    if layer_id is None or expert_id is None:
        grouped_state["shared"][name] = tensor
        continue

    # CASE 2: Check if this expert (layer_id, expert_id) belongs to any group
    key = (layer_id, expert_id)
    gid = expert_lookup.get(key, None)

    if gid is None:
        # expert exists but not assigned to any group → shared
        grouped_state["shared"][name] = tensor
    else:
        grouped_state[gid][name] = tensor

In [20]:
params_count = 0
for k, v in grouped_state[0].items():
    params_count += v.numel()

params_count / 1e9

0.905969664

In [21]:
grouped_state[0]

{'layers.0.mlp.experts.5.gate_proj.weight': tensor([[-0.0216,  0.0127, -0.0074,  ...,  0.0102,  0.0136, -0.0219],
         [ 0.0116, -0.0212,  0.0188,  ..., -0.0109, -0.0188,  0.0031],
         [ 0.0008,  0.0060,  0.0145,  ...,  0.0034, -0.0006,  0.0125],
         ...,
         [ 0.0087, -0.0172, -0.0099,  ...,  0.0102, -0.0078, -0.0029],
         [ 0.0122,  0.0089, -0.0202,  ...,  0.0099, -0.0061,  0.0015],
         [ 0.0087,  0.0031,  0.0092,  ...,  0.0030, -0.0103, -0.0046]]),
 'layers.0.mlp.experts.5.up_proj.weight': tensor([[ 0.0032,  0.0063,  0.0040,  ..., -0.0139,  0.0217, -0.0182],
         [ 0.0174, -0.0201, -0.0035,  ..., -0.0156, -0.0145,  0.0168],
         [-0.0171,  0.0077,  0.0163,  ...,  0.0144,  0.0108,  0.0004],
         ...,
         [ 0.0141, -0.0197,  0.0028,  ..., -0.0149,  0.0209,  0.0178],
         [ 0.0103,  0.0127, -0.0070,  ...,  0.0205,  0.0052, -0.0116],
         [-0.0078, -0.0209,  0.0156,  ..., -0.0053, -0.0191,  0.0163]]),
 'layers.0.mlp.experts.5.down_pr

In [None]:
# get partial model
topk = 10
group_ids = [0]
moe_config = get_moe_model_config(config, topk, group_ids, em.expert_group_assignment)

# get partial model
moe_config.num_experts = em.num_experts
partial_model = CustomQwen3NextModel(moe_config)

## Model check

In [5]:
gate = model.layers[0].mlp.gate  # .available_experts

In [8]:
_, routing_weights, selected_experts = gate.forward(torch.rand(200, 2048))

expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=25).permute(2, 1, 0)

expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
# routing_weights.sum(dim = 0).shape, expert_hit
expert_mask.sum(dim=1)

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        [1, 0, 1,  ..., 1, 1, 0]])

## Test forward 

In [9]:
# ---- Get tokenizer ----
tokenizer = AutoTokenizer.from_pretrained(config.model.model_path)

# ---- Get dataloader ----
train_dataloader = get_dataloader(config, rank=rank, world_size=config.task.data.world_size, tokenizer=tokenizer)
iter_dataloader = iter(train_dataloader)

Too many dataloader workers: 4 (max is dataset.n_shards=1). Stopping 3 dataloader workers.


Too many dataloader workers: 4 (max is dataset.n_shards=1). Stopping 3 dataloader workers.


In [10]:
model.eval()
outputs = []
for i in range(20):
    tokens = next(iter_dataloader)
    for k, v in tokens.items():
        tokens[k] = v[0]
    del tokens["labels"]
    with torch.no_grad():
        tokens.to(model.device)
        output = model(**tokens)
    outputs.append(output)

expert keys dict_keys(['5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24'])
expert_hit tensor([[ 0],
        [ 1],
        [ 2],
        [ 3],
        [ 4],
        [ 5],
        [ 6],
        [ 7],
        [ 8],
        [ 9],
        [10],
        [11],
        [12],
        [13],
        [14],
        [15],
        [16],
        [17],
        [18],
        [19],
        [20],
        [21],
        [22],
        [23],
        [24]])
expert keys dict_keys(['5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24'])
expert_hit tensor([[ 0],
        [ 1],
        [ 2],
        [ 3],
        [ 4],
        [ 5],
        [ 6],
        [ 7],
        [ 8],
        [ 9],
        [10],
        [11],
        [12],
        [13],
        [14],
        [15],
        [16],
        [17],
        [18],
        [19],
        [20],
        [21],
        [22],
        [23],
        [24]]

## loading a model 

In [4]:
import os

import torch

from mycelia.shared.config import ValidatorConfig
from mycelia.shared.dataloader import get_dataloader
from mycelia.shared.expert_manager import (
    ExpertManager,
)
from mycelia.shared.helper import *

[2m2025-12-05 09:13:20[0m [[32m[1minfo     [0m] [1mfound existing config path /home/isabella/crucible/subnet-MoE/checkpoints/validator/validator/hk1/foundation/config.yaml[0m [[0m[1m[34mmycelia.shared.config[0m][0m
[2m2025-12-05 09:13:25[0m [[32m[1minfo     [0m] [1mrank 0 setup training - load model and expert manager[0m [[0m[1m[34mmycelia.validator.run[0m][0m
[2m2025-12-05 09:13:25[0m [[32m[1minfo     [0m] [1mload_expert_group_assignment - folder[0m [[0m[1m[34mmycelia.shared.expert_manager[0m][0m [36mpositional_args[0m=[35m(PosixPath('/home/isabella/crucible/subnet-MoE/expert_groups/exp_dummy'),)[0m
[2m2025-12-05 09:13:25[0m [[32m[1minfo     [0m] [1mload_expert_group_assignment - folder[0m [[0m[1m[34mmycelia.shared.expert_manager[0m][0m [36mpositional_args[0m=[35m(PosixPath('/home/isabella/crucible/subnet-MoE/expert_groups/exp_math'),)[0m
[2m2025-12-05 09:13:27[0m [[32m[1minfo     [0m] [1mfetching model from chain   [0m [

The fast path is not available because one of the required library is not installed. Falling back to torch implementation. To install follow https://github.com/fla-org/flash-linear-attention#installation and https://github.com/Dao-AILab/causal-conv1d


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 47.40 GiB of which 16.06 MiB is free. Process 3774074 has 44.55 GiB memory in use. Including non-PyTorch memory, this process has 2.82 GiB memory in use. Of the allocated memory 2.57 GiB is allocated by PyTorch, and 1.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [1]:
from mycelia.shared.config import ValidatorConfig
from mycelia.shared.expert_manager import ExpertManager

config = ValidatorConfig.from_path(
    "/home/isabella/crucible/subnet-MoE/checkpoints/validator/validator/hk1/foundation/config.yaml"
)
expert_manager = ExpertManager(config)

[2m2025-12-05 09:18:43[0m [[32m[1minfo     [0m] [1mfound existing config path /home/isabella/crucible/subnet-MoE/checkpoints/validator/validator/hk1/foundation/config.yaml[0m [[0m[1m[34mmycelia.shared.config[0m][0m
[2m2025-12-05 09:18:43[0m [[32m[1minfo     [0m] [1mload_expert_group_assignment - folder[0m [[0m[1m[34mmycelia.shared.expert_manager[0m][0m [36mpositional_args[0m=[35m(PosixPath('/home/isabella/crucible/subnet-MoE/expert_groups/exp_dummy'),)[0m
[2m2025-12-05 09:18:43[0m [[32m[1minfo     [0m] [1mload_expert_group_assignment - folder[0m [[0m[1m[34mmycelia.shared.expert_manager[0m][0m [36mpositional_args[0m=[35m(PosixPath('/home/isabella/crucible/subnet-MoE/expert_groups/exp_math'),)[0m


In [5]:
expert_manager.expert_group_assignment[1].keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47])

## Set up training 

In [1]:
from mycelia.shared.config import ValidatorConfig, MinerConfig
from mycelia.shared.expert_manager import ExpertManager
from mycelia.miner.train import *
from mycelia.shared.modeling import *

config = MinerConfig.from_path(
    "/home/isabella/crucible/subnet-MoE/checkpoints/miner/miner/hk1/foundation/config.yaml"
)
expert_manager = ExpertManager(config)

  from .autonotebook import tqdm as notebook_tqdm


PyTorch version 2.9.1 available.
[2m2025-12-20 21:39:18[0m [[32m[1minfo     [0m] [1mload_expert_group_assignment - folder[0m [[0m[1m[34mmycelia.shared.expert_manager[0m][0m [36mpositional_args[0m=[35m(PosixPath('/home/isabella/crucible/subnet-MoE/expert_groups/exp_dummy'),)[0m
[2m2025-12-20 21:39:18[0m [[32m[1minfo     [0m] [1mload_expert_group_assignment - folder[0m [[0m[1m[34mmycelia.shared.expert_manager[0m][0m [36mpositional_args[0m=[35m(PosixPath('/home/isabella/crucible/subnet-MoE/expert_groups/exp_math'),)[0m


In [2]:
device = "cuda:3"
# === set up chain worker ===
wallet, subtensor = setup_chain_worker(config)

rank = 0 

tokenizer = get_base_tokenizer(config)

(
    model,
    inner_optimizer,
    inner_scaler,
    scheduler,
    expert_manager,
    train_dataloader,
    current_model_meta,
) = setup_training(config, rank, device, tokenizer, subtensor, wallet, current_model_meta=None)

[2m2025-12-20 21:39:22[0m [[32m[1minfo     [0m] [1m(0) Setup training          [0m [[0m[1m[34mmycelia.miner.train[0m][0m
[2m2025-12-20 21:39:22[0m [[32m[1minfo     [0m] [1minit - model and expert manager[0m [[0m[1m[34mmycelia.miner.train[0m][0m
[2m2025-12-20 21:39:22[0m [[32m[1minfo     [0m] [1mload_expert_group_assignment - folder[0m [[0m[1m[34mmycelia.shared.expert_manager[0m][0m [36mpositional_args[0m=[35m(PosixPath('/home/isabella/crucible/subnet-MoE/expert_groups/exp_dummy'),)[0m
[2m2025-12-20 21:39:22[0m [[32m[1minfo     [0m] [1mload_expert_group_assignment - folder[0m [[0m[1m[34mmycelia.shared.expert_manager[0m][0m [36mpositional_args[0m=[35m(PosixPath('/home/isabella/crucible/subnet-MoE/expert_groups/exp_math'),)[0m
[2m2025-12-20 21:39:22[0m [[32m[1minfo     [0m] [1mGet resume info from folder [0m [[0m[1m[34mmycelia.shared.checkpoint[0m][0m [36mmodel_meta[0m=[35mModelMeta(global_ver=254, inner_opt=0, path=Po

The fast path is not available because one of the required library is not installed. Falling back to torch implementation. To install follow https://github.com/fla-org/flash-linear-attention#installation and https://github.com/Dao-AILab/causal-conv1d


[2m2025-12-20 21:41:02[0m [[32m[1minfo     [0m] [1mGet resume info from folder [0m [[0m[1m[34mmycelia.shared.checkpoint[0m][0m [36mmodel_meta[0m=[35mModelMeta(global_ver=254, inner_opt=0, path=PosixPath('/home/isabella/crucible/subnet-MoE/checkpoints/miner/validator_checkpoint/uid_16_hotkey_5DoHdXfDYraqPzkLjrXGMZxvGXYdDYhuC8tGbQdb4zvz2LbH_globalver_254'), role=None, model_hash=None)[0m [36mpath[0m=[35m{PosixPath('/home/isabella/crucible/subnet-MoE/checkpoints/miner/miner/hk1/foundation')}[0m [36mresult[0m=[35mfound[0m
[2m2025-12-20 21:41:02[0m [[32m[1minfo     [0m] [1mGet resume info from folder [0m [[0m[1m[34mmycelia.shared.checkpoint[0m][0m [36mmodel_meta[0m=[35mModelMeta(global_ver=254, inner_opt=0, path=PosixPath('/home/isabella/crucible/subnet-MoE/checkpoints/miner/miner/hk1/foundation/globalver_254_inneropt_0'), role=None, model_hash=None)[0m [36mpath[0m=[35m{PosixPath('/home/isabella/crucible/subnet-MoE/checkpoints/miner/miner/hk1/found

In [23]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters())

def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def print_trainable_parameters(model):
    for n, p in model.named_parameters():
        if p.requires_grad:
            print(n)

def model_require_grad(model):
    p = next((p for p in model.parameters() if p is not None), None)
    if p is not None:
        logger.info("example param requires_grad:", p.requires_grad)

print('total_param', count_parameters(model) / 1e9) 
print('trainable param', count_trainable_parameters(model) / 1e9)
model_require_grad(model)

def required_mem(trainable_param, total_param, bytes_per_param=2):
    """
    Returns required memory in bytes for model parameters + gradients.

    trainable_param: number of trainable parameters
    total_param: total number of parameters
    bytes_per_param: 2 for fp16, 4 for fp32
    """
    non_grad_param = total_param - trainable_param

    mem_bytes = (
        trainable_param * bytes_per_param * 4 +
        non_grad_param * bytes_per_param
    )

    return mem_bytes


required_mem(count_trainable_parameters(model), count_parameters(model)) / 1e9

total_param 3.221699328
trainable param 0.905969664
[2m2025-12-21 07:34:16[0m [[32m[1minfo     [0m] [1mexample param requires_grad:[0m [[0m[1m[34mmycelia.miner.train[0m][0m [36mpositional_args[0m=[35m(False,)[0m


11.87921664

In [20]:
torch.tensor([5]).item()

5

In [7]:

import torch.nn.functional as F
from transformers import AutoConfig, PretrainedConfig


AutoConfig.from_pretrained(config.model.model_path)

Qwen3NextConfig {
  "architectures": [
    "Qwen3NextForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "decoder_sparse_step": 1,
  "dtype": "bfloat16",
  "eos_token_id": 151645,
  "full_attention_interval": 4,
  "head_dim": 256,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5120,
  "layer_types": [
    "linear_attention",
    "linear_attention",
    "linear_attention",
    "full_attention",
    "linear_attention",
    "linear_attention",
    "linear_attention",
    "full_attention",
    "linear_attention",
    "linear_attention",
    "linear_attention",
    "full_attention",
    "linear_attention",
    "linear_attention",
    "linear_attention",
    "full_attention",
    "linear_attention",
    "linear_attention",
    "linear_attention",
    "full_attention",
    "linear_attention",
    "linear_attention",
    "linear_attention",
    "full_attention",
    "linear_attention",
    "lin

In [17]:
import torch 
def _compute_overlap(expert_hit, available_experts): 
    expert_hit_set = set(expert_hit.detach().cpu().flatten().tolist()) 
    print(expert_hit_set)
    available_experts_set = set(available_experts.tolist())
    return torch.tensor(sorted(expert_hit_set.intersection(available_experts_set))).view(-1, 1)

available_experts = torch.tensor([ 5, 6, 7, 8, 9, 10]) 
expert_hit = torch.tensor([[ 0], [ 1], [ 5], [ 6], [ 7], [ 8], [ 9], [10]], device='cuda:0') 

_compute_overlap(expert_hit, available_experts)

{0, 1, 5, 6, 7, 8, 9, 10}


tensor([[ 5],
        [ 6],
        [ 7],
        [ 8],
        [ 9],
        [10]])

In [13]:
for e in expert_hit:
    print(e)

tensor([0], device='cuda:0')
tensor([1], device='cuda:0')
tensor([5], device='cuda:0')
tensor([6], device='cuda:0')
tensor([7], device='cuda:0')
tensor([8], device='cuda:0')
tensor([9], device='cuda:0')
tensor([10], device='cuda:0')


## Dataloader

In [4]:
from mycelia.miner.train import * 

config = MinerConfig.from_path("/home/isabella/crucible/subnet-MoE/checkpoints/miner/miner/hk1/foundation/config.yaml")
eval_rref = None
rank = 0
if rank == 0:
    config.write()

# === set up chain worker ===
wallet, subtensor = setup_chain_worker(config)

# === mis ===
device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
tokenizer = get_base_tokenizer(config)

(
    model,
    inner_optimizer,
    inner_scaler,
    scheduler,
    expert_manager,
    train_dataloader,
    current_model_meta,
) = setup_training(config, rank, device, tokenizer, subtensor, wallet, current_model_meta=None)

[2m2026-01-02 23:20:22[0m [[32m[1minfo     [0m] [1mWrote config to /home/isabella/crucible/subnet-MoE/checkpoints/miner/miner/hk1/foundation/config.yaml[0m [[0m[1m[34mmycelia.shared.config[0m][0m
[2m2026-01-02 23:20:26[0m [[32m[1minfo     [0m] [1m(0) Setup training          [0m [[0m[1m[34mmycelia.miner.train[0m][0m
[2m2026-01-02 23:20:26[0m [[32m[1minfo     [0m] [1minit - model and expert manager[0m [[0m[1m[34mmycelia.miner.train[0m][0m
[2m2026-01-02 23:20:26[0m [[32m[1minfo     [0m] [1mStart model from            [0m [[0m[1m[34mmycelia.shared.checkpoint[0m][0m [36mpositional_args[0m=[35m(ModelMeta(global_ver=918, inner_opt=0, path=PosixPath('/home/isabella/crucible/subnet-MoE/checkpoints/miner/validator_checkpoint/uid_16_hotkey_5DoHdXfDYraqPzkLjrXGMZxvGXYdDYhuC8tGbQdb4zvz2LbH_globalver_918'), role=None, model_hash=None),)[0m
[2m2026-01-02 23:20:28[0m [[32m[1minfo     [0m] [1mScan chain - Max model version on chain[0m [[0m[1m

The fast path is not available because one of the required library is not installed. Falling back to torch implementation. To install follow https://github.com/fla-org/flash-linear-attention#installation and https://github.com/Dao-AILab/causal-conv1d


[2m2026-01-02 23:22:43[0m [[32m[1minfo     [0m] [1mStart model from            [0m [[0m[1m[34mmycelia.shared.checkpoint[0m][0m [36mpositional_args[0m=[35m(ModelMeta(global_ver=918, inner_opt=0, path=PosixPath('/home/isabella/crucible/subnet-MoE/checkpoints/miner/validator_checkpoint/uid_16_hotkey_5DoHdXfDYraqPzkLjrXGMZxvGXYdDYhuC8tGbQdb4zvz2LbH_globalver_918'), role=None, model_hash=None),)[0m
[2m2026-01-02 23:22:44[0m [[32m[1minfo     [0m] [1mloaded checkpoint file      [0m [[0m[1m[34mmycelia.shared.checkpoint[0m][0m [36mloss[0m=[35m-1[0m [36mpath[0m=[35m<OpenFile '/home/isabella/crucible/subnet-MoE/checkpoints/miner/validator_checkpoint/uid_16_hotkey_5DoHdXfDYraqPzkLjrXGMZxvGXYdDYhuC8tGbQdb4zvz2LbH_globalver_918/model_expgroup_0.pt'>[0m
[2m2026-01-02 23:22:44[0m [[32m[1minfo     [0m] [1minit - optimizer            [0m [[0m[1m[34mmycelia.miner.train[0m][0m
[2m2026-01-02 23:22:44[0m [[32m[1minfo     [0m] [1minit - scheduler         

In [None]:
config.task.data.vali

DataCfg(dataset_name='open-r1/OpenR1-Math-220k', data_dir=None, batch_size=1, sequence_length=100, per_device_train_batch_size=2, world_size=10, rank=1, dataset_class='expert_groups.exp_math.dataset:StreamingTorchDataset', vali_fraction=0.1)

In [5]:
train_dataloader

<torchdata.stateful_dataloader.stateful_dataloader.StatefulDataLoader at 0x77cfee938c10>

In [25]:
# load state dict 

import torch 

sd = torch.load("/home/isabella/crucible/subnet-MoE/checkpoints/miner/validator_checkpoint/uid_15_hotkey_5DUqbhkpvth1WzRogVU9NvGeV9bwX3wBHFWHMZk72WEhZjHF_globalver_None/model_expgroup_0.pt")

In [26]:
for k, v in sd['model_state_dict'].items():
    print(k, torch.isfinite(v).all())

model.layers.0.mlp.experts.5.gate_proj.weight tensor(True)
model.layers.0.mlp.experts.5.up_proj.weight tensor(True)
model.layers.0.mlp.experts.5.down_proj.weight tensor(True)
model.layers.0.mlp.experts.6.gate_proj.weight tensor(True)
model.layers.0.mlp.experts.6.up_proj.weight tensor(True)
model.layers.0.mlp.experts.6.down_proj.weight tensor(True)
model.layers.0.mlp.experts.7.gate_proj.weight tensor(True)
model.layers.0.mlp.experts.7.up_proj.weight tensor(True)
model.layers.0.mlp.experts.7.down_proj.weight tensor(True)
model.layers.0.mlp.experts.8.gate_proj.weight tensor(True)
model.layers.0.mlp.experts.8.up_proj.weight tensor(True)
model.layers.0.mlp.experts.8.down_proj.weight tensor(True)
model.layers.0.mlp.experts.9.gate_proj.weight tensor(True)
model.layers.0.mlp.experts.9.up_proj.weight tensor(True)
model.layers.0.mlp.experts.9.down_proj.weight tensor(True)
model.layers.0.mlp.experts.10.gate_proj.weight tensor(True)
model.layers.0.mlp.experts.10.up_proj.weight tensor(True)
model.l

In [29]:
raw_ver = None
global_ver = raw_ver if isinstance(raw_ver, int) else 0
global_ver

0