CrucibleComputingGroup · heroarmor · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,18 @@
+# Byte-compiled / cached Python
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+build/
+dist/
+*.egg-info/
+*.egg
+
+# Editor / IDE
+.vscode/
+.idea/
+*.swp
+
+# OS
+.DS_Store
diff --git a/scmp_kernels/__pycache__/__init__.cpython-311.pyc b/scmp_kernels/__pycache__/__init__.cpython-311.pyc
diff --git a/scmp_kernels/__pycache__/__init__.cpython-39.pyc b/scmp_kernels/__pycache__/__init__.cpython-39.pyc
diff --git a/scmp_kernels/sc/__pycache__/__init__.cpython-311.pyc b/scmp_kernels/sc/__pycache__/__init__.cpython-311.pyc
diff --git a/scmp_kernels/sc/__pycache__/__init__.cpython-39.pyc b/scmp_kernels/sc/__pycache__/__init__.cpython-39.pyc
diff --git a/scmp_kernels/sc/__pycache__/constants.cpython-311.pyc b/scmp_kernels/sc/__pycache__/constants.cpython-311.pyc
diff --git a/scmp_kernels/sc/__pycache__/kernels.cpython-311.pyc b/scmp_kernels/sc/__pycache__/kernels.cpython-311.pyc
diff --git a/scmp_kernels/sc/__pycache__/matmul.cpython-311.pyc b/scmp_kernels/sc/__pycache__/matmul.cpython-311.pyc
diff --git a/scmp_kernels/sc/__pycache__/matmul.cpython-39.pyc b/scmp_kernels/sc/__pycache__/matmul.cpython-39.pyc
diff --git a/scmp_kernels/sc/__pycache__/rng.cpython-311.pyc b/scmp_kernels/sc/__pycache__/rng.cpython-311.pyc
diff --git a/scmp_kernels/sc/__pycache__/sng.cpython-311.pyc b/scmp_kernels/sc/__pycache__/sng.cpython-311.pyc
diff --git a/scmp_kernels/sc/matmul.py b/scmp_kernels/sc/matmul.py
@@ -45,6 +45,7 @@ def sc_matmul(
     group_b: int = 1,
     rng_levels: Optional[int] = None,
     config: Optional[dict] = None,
+    halve_bipolar_stoc_len: bool = False,
 ) -> torch.Tensor:
     """Stochastic-computing matmul ``a @ b.T``.
 
@@ -83,6 +84,15 @@ def sc_matmul(
             specific integer to keep an int8 quant grid while varying
             ``stoc_len``.
         config: optional Sobol RNG/SNG config dict. Auto-built when ``None``.
+        halve_bipolar_stoc_len: enable the uSystolic / HUB sign-magnitude
+            cycle-halving optimization from wu-hpca2022. A bipolar magnitude
+            carries only ``sc_prec - 1`` bits of information, so a stream
+            length and RNG grid of ``2 ** (sc_prec - 1)`` suffice. When
+            ``True`` and ``mode == "bipolar"``, whichever of ``stoc_len`` /
+            ``rng_levels`` is left at ``None`` defaults to
+            ``2 ** (sc_prec - 1)`` (about half the cycles, same magnitude
+            grid); explicitly passed values are kept. Ignored in any other
+            mode. Default ``False`` preserves legacy behavior.
 
     Returns:
         Output tensor. 2D inputs → ``(N, M)`` float32. 3D inputs → ``(BH, N, M)``
@@ -104,6 +114,17 @@ def sc_matmul(
             f"sc_matmul: unknown mode '{mode}'. "
             f"Expected one of {_VALID_MODES}.")
 
+    # ---- uSystolic / HUB sign-magnitude cycle-halving (wu-hpca2022) ----------
+    # Bipolar mode is already sign-magnitude with q_max = 2^(sc_prec-1) - 1,
+    # so the magnitude spans only 2^(sc_prec-1) levels; the default
+    # stoc_len = 2^sc_prec thus uses ~2× the cycles for no resolution gain.
+    if halve_bipolar_stoc_len and mode == "bipolar":
+        halved = 2 ** (sc_prec - 1)
+        if stoc_len is None:
+            stoc_len = halved
+        if rng_levels is None:
+            rng_levels = halved
+
     # ---- chunk_d compatibility gate -----------------------------------------
     # Currently chunk_d is only implemented in the per-row + bipolar MLP
     # fast path. Other granularities and unipolar quantization will silently
@@ -148,7 +169,7 @@ def sc_matmul(
             max_fp_a=a_max, min_fp_a=a_min,
             max_fp_b=b_max, min_fp_b=b_min,
             mode=mode, sc_prec=sc_prec,
-            stoc_len=stoc_len, config=config,
+            stoc_len=stoc_len, rng_levels=rng_levels, config=config,
         )
 
     if granularity == "per_row":