In [1]:
# ─── Core dependencies ────────────────────────────────────────────────
import torch
import numpy as np
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

# ─── DeepSynergy modules ─────────────────────────────────────────────
from deepsynergy import decoders
from deepsynergy.utils_training import train_decoder

# ─── Device configuration ────────────────────────────────────────────
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# Decoder Sanity Check Notebook

This notebook validates the correctness of individual decoder heads in DeepSynergy.

Each section constructs a *synthetic* conditional distribution $p(x \mid z)$ with known entropy. A corresponding **decoder** is then trained to approximate this distribution using neural networks. If successful, the decoder's average cross-entropy will closely match the theoretical conditional entropy $H(X \mid Z)$.

This serves both as a test of decoder implementation and as a diagnostic tool for debugging.


## 1 · BinaryDecoder — Binary Symmetric Channel

We simulate a binary-symmetric channel (BSC) where:

- $Z \in \{0, 1\}$ with $\mathbb{P}(Z = 1) = 0.5$
- The output $X$ is a noisy copy of $Z$, flipped with probability $\varepsilon$

This models the conditional distribution:
$$
p(x \mid z) = 
\begin{cases}
1 - \varepsilon & \text{if } x = z \\
\varepsilon & \text{if } x \neq z
\end{cases}
$$

The theoretical conditional entropy is:
$$
H(X \mid Z) = -\varepsilon \log_2 \varepsilon - (1 - \varepsilon)\log_2(1 - \varepsilon)
$$


In [3]:
# Sanity check for BinaryDecoder: draw (Z, X) pairs from a binary-symmetric
# channel, fit the decoder to p(x | z), and print its reported loss next to
# the analytic conditional entropy H(X|Z).

# ─── Parameters ──────────────────────────────────────────────────────
samples  = 10_000       # Number of (Z, X) pairs
epsilon  = 0.10         # Bit-flip probability (channel noise)
seed     = 0            # Seed for data generation and weight initialisation

# ─── Reproducibility ─────────────────────────────────────────────────
# Seed inside the cell so the result does not depend on which cells were
# executed before this one.
rng = np.random.default_rng(seed)
torch.manual_seed(seed)

# ─── Theoretical entropy H(X|Z) for binary-symmetric channel ─────────
H_theory = -epsilon * np.log2(epsilon) - (1 - epsilon) * np.log2(1 - epsilon)

# ─── Synthetic dataset generation ────────────────────────────────────
Z_np = rng.integers(0, 2, size=(samples, 1))                          # Binary source
noise = (rng.random((samples, 1)) < epsilon).astype(np.int32)         # 1 where the bit is flipped
X_np = np.bitwise_xor(Z_np, noise).astype(np.float32)                 # Flip with prob ε

# ─── Torch tensors + dataloader ─────────────────────────────────────
# Explicit float32 conversion instead of the legacy FloatTensor constructor.
Z = torch.as_tensor(Z_np, dtype=torch.float32)
X = torch.as_tensor(X_np, dtype=torch.float32)
# batch_size equals the dataset size, so every epoch is one full-batch step.
dataloader = DataLoader(TensorDataset(Z, X), batch_size=samples)

# ─── Decoder network setup ───────────────────────────────────────────
decoder = decoders.BinaryDecoder(
    nn.Sequential(
        nn.Linear(1, 16), nn.ReLU(),
        nn.Linear(16, 16), nn.ReLU(),
        nn.Linear(16, 1)                    # One logit per binary variable
    )
).to(device)

# ─── Train decoder to approximate p(x | z) ───────────────────────────
optimizer = torch.optim.Adam(decoder.parameters(), lr=1e-3)
decoder_results = train_decoder(
    model         = decoder,
    dataloader    = dataloader,
    optimizer     = optimizer,
    show_progress = True,
    device        = device,
    epochs        = 1000
)

# ─── Compare analytical and empirical entropy ────────────────────────
# NOTE(review): assumes 'loss' holds one final value per output variable,
# expressed in bits — confirm against train_decoder.
H_decoder = decoder_results['loss'][0]

print(f"H(X|Z)  analytic : {H_theory:.3f} bits")
print(f"H(X|Z)  decoder  : {H_decoder:.3f} bits")


  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [01:31<00:00, 10.97it/s, loss=[0.47147176]]

H(X|Z)  analytic : 0.469 bits
H(X|Z)  decoder  : 0.471 bits





## 2 · CategoricalDecoder — N-ary Symmetric Channel

We simulate an N-ary symmetric channel:

- $Z$ is uniformly distributed over $\{0, 1, \dots, N-1\}$
- With probability $\varepsilon$, the output class $X$ is replaced with an incorrect class chosen uniformly at random from the remaining $N-1$ classes; otherwise $X = Z$

This models a noisy multi-class classification problem with uniform corruption.

The conditional entropy is:

$$
H(X \mid Z) =
-(1 - \varepsilon) \log_2(1 - \varepsilon)
- \varepsilon \log_2\left(\frac{\varepsilon}{N - 1}\right)
$$


In [4]:
# Sanity check for CategoricalDecoder: draw (Z, X) pairs from an N-ary
# symmetric channel, fit the decoder to p(x | z), and print its reported
# loss next to the analytic conditional entropy H(X|Z).

# ─── Parameters ──────────────────────────────────────────────────────
samples = 10_000
N = 5                  # Number of classes
epsilon = 0.20         # Error probability
seed = 0               # Seed for data generation and weight initialisation

# ─── Reproducibility ─────────────────────────────────────────────────
# Seed inside the cell so the result does not depend on which cells were
# executed before this one.
rng = np.random.default_rng(seed)
torch.manual_seed(seed)

# ─── Theoretical entropy H(X|Z) for N-ary symmetric channel ──────────
H_theory = -(1 - epsilon) * np.log2(1 - epsilon) - epsilon * np.log2(epsilon / (N - 1))

# ─── Synthetic dataset generation ────────────────────────────────────
Z_np = rng.integers(N, size=(samples, 1))          # True class, uniform on {0, ..., N-1}
X_np = Z_np.copy()                                 # Initially identical

# Corrupt with probability epsilon. Adding an offset drawn from
# {1, ..., N-1} modulo N always lands on a class different from Z, and
# each wrong class is equally likely.
flip = rng.random((samples, 1)) < epsilon
X_np[flip] = (Z_np[flip] + rng.integers(1, N, size=flip.sum())) % N

# ─── Torch tensors + dataloader ─────────────────────────────────────
# Explicit float32 conversion instead of the legacy FloatTensor constructor.
Z = torch.as_tensor(Z_np, dtype=torch.float32)
X = torch.as_tensor(X_np, dtype=torch.float32)
# batch_size equals the dataset size, so every epoch is one full-batch step.
dataloader = DataLoader(TensorDataset(Z, X), batch_size=samples)

# ─── Decoder network ────────────────────────────────────────────────
# The backbone ends in an 8-dimensional feature vector; presumably the
# decoder adds its own output head on top — see CategoricalDecoder.
decoder = decoders.CategoricalDecoder(
    nn.Sequential(
        nn.Linear(1, 8), nn.ReLU(),
        nn.Linear(8, 16), nn.ReLU(),
        nn.Linear(16, 8), nn.ReLU(),
    ),
    output_dim = 1,
    num_classes = N,
).to(device)

optimizer = torch.optim.Adam(decoder.parameters(), lr=3e-3)

# ─── Training ───────────────────────────────────────────────────────
decoder_results = train_decoder(
    model         = decoder,
    dataloader    = dataloader,
    optimizer     = optimizer,
    show_progress = True,
    device        = device,
    epochs        = 1000,
)

# NOTE(review): assumes 'loss' holds one final value per output variable,
# expressed in bits — confirm against train_decoder.
H_decoder = decoder_results['loss'][0]

print(f"H(X|Z)  analytic : {H_theory:.3f} bits")
print(f"H(X|Z)  decoder  : {H_decoder:.3f} bits")


100%|██████████| 1000/1000 [01:33<00:00, 10.75it/s, loss=[1.1356778]]

H(X|Z)  analytic : 1.122 bits
H(X|Z)  decoder  : 1.136 bits





## 3 · GaussianDecoder — Heteroscedastic Normal

We simulate a conditional Gaussian where the output variance depends on the input:

- $Z \sim \mathcal{N}(0, 1)$
- $X \mid Z = z \sim \mathcal{N}(0, z^2)$

This is a heteroscedastic model where the noise increases with the magnitude of $Z$.

The differential entropy is:

$$
H(X \mid Z)
= \tfrac{1}{2} \log_2(2\pi e)
- \tfrac{1}{2} \cdot \frac{\gamma + \log 2}{\log 2}
\approx 1.131 \ \text{bits}
$$

where $\gamma \approx 0.577$ is the Euler–Mascheroni constant and $\log$ denotes the natural logarithm.

In [2]:
# Sanity check for GaussianDecoder: draw Z ~ N(0, 1) and X | Z ~ N(0, Z^2),
# fit the decoder to p(x | z), and print its reported loss next to the
# analytic conditional differential entropy H(X|Z).

# ─── Parameters ──────────────────────────────────────────────────────
samples = 10_000
seed    = 0            # Seed for data generation and weight initialisation

# ─── Reproducibility ─────────────────────────────────────────────────
# Seed inside the cell so the result does not depend on which cells were
# executed before this one.
rng = np.random.default_rng(seed)
torch.manual_seed(seed)

# ─── Theoretical entropy H(X|Z) for N(0, Z^2) ────────────────────────
# Computed in nats, then divided by log(2) to convert to bits.
H_theory = (
    0.5 * np.log(2 * np.pi * np.e)           # ½ log(2πe)
    - 0.5 * (np.log(2) + np.euler_gamma)     # −½(γ + log 2)
) / np.log(2)

# ─── Synthetic dataset generation ────────────────────────────────────
Z_np = rng.standard_normal((samples, 1))
X_np = rng.standard_normal((samples, 1)) * np.abs(Z_np)    # σ = |Z|

# ─── Torch tensors + dataloader ─────────────────────────────────────
# Explicit float32 conversion instead of the legacy FloatTensor constructor.
Z = torch.as_tensor(Z_np, dtype=torch.float32)
X = torch.as_tensor(X_np, dtype=torch.float32)
# batch_size equals the dataset size, so every epoch is one full-batch step.
dataloader = DataLoader(TensorDataset(Z, X), batch_size=samples)

# ─── Decoder network ────────────────────────────────────────────────
# The backbone ends in an 8-dimensional feature vector; presumably the
# decoder adds its own output head on top — see GaussianDecoder.
decoder = decoders.GaussianDecoder(
    nn.Sequential(
        nn.Linear(1, 8), nn.ReLU(),
        nn.Linear(8, 16), nn.ReLU(),
        nn.Linear(16, 8), nn.ReLU(),
    ),
    output_dim = 1,
).to(device)

optimizer = torch.optim.Adam(decoder.parameters(), lr=3e-3)

# ─── Training ───────────────────────────────────────────────────────
decoder_results = train_decoder(
    model         = decoder,
    dataloader    = dataloader,
    optimizer     = optimizer,
    show_progress = True,
    device        = device,
    epochs        = 1000,
)

# NOTE(review): assumes 'loss' holds one final value per output variable,
# expressed in bits — confirm against train_decoder.
H_decoder = decoder_results['loss'][0]

print(f"H(X|Z)  analytic : {H_theory:.3f} bits")
print(f"H(X|Z)  decoder  : {H_decoder:.3f} bits")


100%|██████████| 1000/1000 [01:23<00:00, 12.01it/s, loss=[1.167509]]

H(X|Z)  analytic : 1.131 bits
H(X|Z)  decoder  : 1.168 bits





## 4 · GaussianMixtureDecoder — Laplace Scale from Z

We simulate a conditional Laplace distribution where the scale depends on the input:

- $Z \sim \mathrm{Exp}(1)$
- $X \mid Z = z \sim \mathrm{Laplace}(0, z)$

This defines a heavy-tailed, heteroscedastic conditional distribution.

The differential entropy of $X \mid Z$ is:

$$
H(X \mid Z) =
\frac{1 + \log 2 - \gamma}{\log 2}
\approx 1.610\ \text{bits}
$$

where $\gamma$ is the Euler–Mascheroni constant and $\log$ denotes the natural logarithm.  
A Gaussian mixture with $K=5$ components should approximate the Laplace distribution sufficiently well.


In [5]:
# Sanity check for GaussianMixtureDecoder: draw Z ~ Exp(1) and
# X | Z ~ Laplace(0, Z), fit a K-component mixture decoder to p(x | z), and
# print its reported loss next to the analytic conditional differential
# entropy H(X|Z).

# ─── Parameters ──────────────────────────────────────────────────────
samples = 10_000
K       = 5                  # Number of mixture components
seed    = 0                  # Seed for data generation and weight initialisation

# ─── Reproducibility ─────────────────────────────────────────────────
# Seed inside the cell so the result does not depend on which cells were
# executed before this one.
rng = np.random.default_rng(seed)
torch.manual_seed(seed)

# ─── Theoretical H(X|Z) for Laplace(0, Z) ────────────────────────────
# Computed in nats, then divided by log(2) to convert to bits.
H_theory = (1 + np.log(2) - np.euler_gamma) / np.log(2)

# ─── Synthetic dataset generation ────────────────────────────────────
Z_np = rng.exponential(scale=1.0, size=(samples, 1))
X_np = rng.laplace(loc=0.0, scale=Z_np)              # Laplace(scale = Z), one draw per row

# ─── Torch tensors + dataloader ─────────────────────────────────────
# Explicit float32 conversion instead of the legacy FloatTensor constructor.
Z = torch.as_tensor(Z_np, dtype=torch.float32)
X = torch.as_tensor(X_np, dtype=torch.float32)
# batch_size equals the dataset size, so every epoch is one full-batch step.
dataloader = DataLoader(TensorDataset(Z, X), batch_size=samples)

# ─── Decoder network ────────────────────────────────────────────────
# The backbone ends in an 8-dimensional feature vector; presumably the
# decoder adds its own output head on top — see GaussianMixtureDecoder.
decoder = decoders.GaussianMixtureDecoder(
    nn.Sequential(
        nn.Linear(1, 8),  nn.ReLU(),
        nn.Linear(8, 16), nn.ReLU(),
        nn.Linear(16, 8), nn.ReLU(),
    ),
    output_dim     = 1,
    num_components = K,
).to(device)

optimizer = torch.optim.Adam(decoder.parameters(), lr=1e-3)

# ─── Training ───────────────────────────────────────────────────────
decoder_results = train_decoder(
    model         = decoder,
    dataloader    = dataloader,
    optimizer     = optimizer,
    show_progress = True,
    device        = device,
    epochs        = 1000,
)

# NOTE(review): assumes 'loss' holds one final value per output variable,
# expressed in bits — confirm against train_decoder.
H_decoder = decoder_results['loss'][0]

print(f"H(X|Z)  analytic : {H_theory:.3f} bits")
print(f"H(X|Z)  decoder  : {H_decoder:.3f} bits")


100%|██████████| 1000/1000 [01:23<00:00, 12.03it/s, loss=[1.672348]]

H(X|Z)  analytic : 1.610 bits
H(X|Z)  decoder  : 1.672 bits



