In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt
import struct

# ---- Hardware constraints (per specification) ----
# Layer widths of the network
INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE = 784, 128, 10
NUM_BANKS = 64
MAX_DSP_OUTPUT = 1 << 47  # 48-bit accumulator

# Integer ranges used when quantizing
QTZ_INT8_MIN, QTZ_INT8_MAX = -128, 127
QTZ_UINT8_MIN, QTZ_UINT8_MAX = 0, 255

# Seed both random number generators with the same fixed value
np.random.seed(42)
torch.manual_seed(42)

print("Libraries loaded and hardware constants defined.")

Libraries loaded and hardware constants defined.


In [2]:
# Data pipeline: training uses plain 0-1 floats; the FPGA path maps the
# same pixels to 0-255 integers later on.
transform = transforms.Compose([transforms.ToTensor()])  # 0-255 image -> 0.0-1.0 float

# MNIST train / test splits, downloaded into ./data when missing
train_dataset = datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.MNIST(root='./data', train=False, transform=transform, download=True)

# Shuffled mini-batches for training, fixed-order large batches for evaluation
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=1000, shuffle=False)

print("MNIST Data downloaded and loaded.")

100%|██████████| 9.91M/9.91M [00:00<00:00, 22.5MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 600kB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 5.57MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 9.08MB/s]

MNIST Data downloaded and loaded.





In [3]:
class FPGA_MLP(nn.Module):
    """Two-layer, bias-free MLP: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of features per sample after flattening.
            Defaults to the notebook-level ``INPUT_SIZE``.
        hidden_size: Width of the hidden layer. Defaults to ``HIDDEN_SIZE``.
        output_size: Number of output logits. Defaults to ``OUTPUT_SIZE``.
    """

    def __init__(self, input_size=None, hidden_size=None, output_size=None):
        super().__init__()
        # Defaults are resolved at call time so that FPGA_MLP() keeps building
        # the network described by the notebook constants.
        if input_size is None:
            input_size = INPUT_SIZE
        if hidden_size is None:
            hidden_size = HIDDEN_SIZE
        if output_size is None:
            output_size = OUTPUT_SIZE

        # Layer 1: input -> hidden. Bias=False to match DSP accumulator logic
        self.fc1 = nn.Linear(input_size, hidden_size, bias=False)
        self.relu = nn.ReLU()
        # Layer 2: hidden -> output. Bias=False
        self.fc2 = nn.Linear(hidden_size, output_size, bias=False)

    def forward(self, x):
        """Flatten ``x`` to ``(-1, input_size)`` and return the raw logits."""
        # Use the layer's own width rather than a literal 784 so the flatten
        # can never disagree with fc1; reshape (unlike view) also accepts
        # non-contiguous inputs.
        x = x.reshape(-1, self.fc1.in_features)
        x = self.fc1(x)
        x = self.relu(x)
        return self.fc2(x)

# Instantiate the network and print its module summary.
model = FPGA_MLP()
print(model)

FPGA_MLP(
  (fc1): Linear(in_features=784, out_features=128, bias=False)
  (relu): ReLU()
  (fc2): Linear(in_features=128, out_features=10, bias=False)
)


In [4]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Classification loss and the optimizer that updates the model's parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train(epoch):
    """Run one optimisation pass over ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion`` and
    ``device``. Prints the current loss on every 100th batch.

    Args:
        epoch: Epoch number, used only in the progress message.
    """
    model.train()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

        # Only report on batches 0, 100, 200, ...
        if step % 100:
            continue
        print(f'Train Epoch: {epoch} [{step * len(inputs)}/{len(train_loader.dataset)} '
              f'({100. * step / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')

def test():
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    acc = 100. * correct / len(test_loader.dataset)
    print(f'\nTest set: Accuracy: {correct}/{len(test_loader.dataset)} ({acc:.2f}%)\n')
    return acc

# Three passes over the training data (usually enough for >92% on MNIST),
# with an evaluation after each pass.
for epoch in (1, 2, 3):
    train(epoch)
    test()


Test set: Accuracy: 9446/10000 (94.46%)


Test set: Accuracy: 9632/10000 (96.32%)


Test set: Accuracy: 9680/10000 (96.80%)



In [5]:
# Observers to store min/max values
stats = {
    'input_min': 0, 'input_max': 0,
    'l1_out_min': 0, 'l1_out_max': 0,
    # Note: Layer 2 output is logits, we don't strictly need to quantize
    # the final output for Argmax, but we will for completeness.
}

model.eval()
with torch.no_grad():
    # Pass a batch of data to calibrate
    data_iter = iter(train_loader)
    images, _ = next(data_iter)
    images = images.to(device)

    # Input Stats (Should be 0.0 to 1.0). Both ends are measured; the minimum
    # used to be left at its placeholder value instead of being observed.
    # The flatten width comes from fc1 so it cannot drift from the model.
    flat_img = images.view(-1, model.fc1.in_features)
    stats['input_min'] = flat_img.min().item()
    stats['input_max'] = flat_img.max().item()

    # Layer 1 Output Stats (Before ReLU)
    l1_out = model.fc1(flat_img)
    stats['l1_out_max'] = l1_out.max().item()
    stats['l1_out_min'] = l1_out.min().item()

    print("Calibration Stats:", stats)

Calibration Stats: {'input_min': 0, 'input_max': 1.0, 'l1_out_min': -12.523642539978027, 'l1_out_max': 8.765558242797852}


In [9]:
import math

def get_shift_only_param(real_multiplier, max_shift=31):
    """Approximate a float multiplier (e.g., 0.0034) using ONLY a bit shift.

    Finds the integer N, clamped to ``[0, max_shift]``, for which ``2**(-N)``
    is closest to ``real_multiplier`` on a log2 scale, and prints the target,
    the chosen shift, the resulting scale and the relative error.

    Args:
        real_multiplier: Positive, finite scale factor to approximate.
        max_shift: Largest shift that may be returned (default 31).

    Returns:
        The shift as an int. Returns 0 (after emitting a ``RuntimeWarning``)
        when ``real_multiplier`` is not a positive finite number.
    """
    import warnings

    # `not (x > 0)` also catches NaN. A non-positive or non-finite multiplier
    # should not happen with ReLU, so make it visible instead of silently
    # handing back a shift of 0 (i.e. a scale of 1.0).
    if not (real_multiplier > 0) or math.isinf(real_multiplier):
        warnings.warn(
            f"get_shift_only_param: invalid multiplier {real_multiplier!r}; "
            "falling back to shift 0",
            RuntimeWarning,
            stacklevel=2,
        )
        return 0

    # We want: real_multiplier ~= 1.0 / (2^shift)
    # So: shift ~= -log2(real_multiplier)
    shift = round(-math.log2(real_multiplier))

    # Clamp shift to the supported range
    shift = max(0, min(max_shift, int(shift)))

    # Calculate the actual scale we ended up with
    actual_scale = 1.0 / (2**shift)
    error = abs(actual_scale - real_multiplier) / real_multiplier

    print(f"Target: {real_multiplier:.5f} | Shift: {shift} | Actual: {actual_scale:.5f} | Error: {error*100:.1f}%")
    return shift

# --- Recalculate Scales for Shift-Only ---
# NOTE(review): S_input, S_w1, S_w2 and S_l1_out are not assigned in any cell
# visible here — confirm the cell that defines them still exists and runs
# before this one, otherwise a fresh-kernel run stops at this point.

# Scale of the layer-2 output (logits).
# presumably maps logits of magnitude up to 10 onto the int8 range — TODO confirm
S_logits = 10.0 / 127.0

# 1. Layer 1: (input scale * weight scale) relative to the layer-1 output scale
M_effective_l1 = S_input * S_w1 / S_l1_out
print("Layer 1 Param:")
shift_l1 = get_shift_only_param(M_effective_l1)

# 2. Layer 2: (layer-1 output scale * weight scale) relative to the logit scale
M_effective_l2 = S_l1_out * S_w2 / S_logits
print("Layer 2 Param:")
shift_l2 = get_shift_only_param(M_effective_l2)

Layer 1 Param:
Target: 0.00067 | Shift: 11 | Actual: 0.00049 | Error: 27.2%
Layer 2 Param:
Target: 0.00346 | Shift: 8 | Actual: 0.00391 | Error: 12.8%


In [10]:
def fpga_layer_sim_shift_only(input_vec, weights, shift, activation='relu'):
    """Simulate one integer layer: matmul, optional ReLU, right shift, clip.

    Args:
        input_vec: Integer input vector; multiplied as ``weights @ input_vec``.
        weights: Integer weight matrix.
        shift: Number of bits for the arithmetic right shift.
        activation: ``'relu'`` clamps negative accumulators to zero before the
            shift; any other value applies no activation.

    Returns:
        ``np.uint8`` array with every value saturated to 0-255.
    """
    # 1. Matrix Vector Multiply. Accumulate in int64: it covers the 48-bit
    #    accumulator range, whereas int32 wraps silently once a sum
    #    exceeds 2**31 - 1.
    acc = np.matmul(weights.astype(np.int64), input_vec.astype(np.int64))

    # 2. ReLU
    if activation == 'relu':
        acc = np.maximum(acc, 0)

    # 3. PURE SHIFT Requantization
    # We perform a standard arithmetic right shift
    output_val = acc >> shift

    # 4. Saturation / Clipping to 0-255 (UInt8)
    output = np.clip(output_val, 0, 255).astype(np.uint8)

    return output

def run_fpga_inference_shift_only(image_tensor):
    """Classify one image with the integer, shift-only two-layer pipeline.

    The tensor is flattened, scaled by 255 and cast to uint8, then pushed
    through layer 1 (ReLU) and layer 2 (no activation) using the
    notebook-level quantized weights and shifts.

    Args:
        image_tensor: Image tensor supporting ``.view(-1).numpy()``.

    Returns:
        Index of the largest layer-2 output.
    """
    pixels = image_tensor.view(-1).numpy()
    activations = (pixels * 255).astype(np.uint8)

    # (weights, shift, activation) for each layer, in execution order
    layers = (
        (W1_q, shift_l1, 'relu'),
        (W2_q, shift_l2, 'none'),
    )
    for layer_weights, layer_shift, layer_act in layers:
        activations = fpga_layer_sim_shift_only(
            activations, layer_weights, layer_shift, activation=layer_act
        )

    return np.argmax(activations)

# Verification: score the shift-only simulation on the first `limit` test images
limit = 1000
correct = total = 0

print("Running Shift-Only Simulation...")
for i in range(limit):
    img, target = test_dataset[i]
    pred = run_fpga_inference_shift_only(img)
    correct += int(pred == target)  # counts 1 for a hit, 0 for a miss
    total += 1

print(f"FPGA (Shift-Only) Accuracy: {100.0 * correct / total:.2f}%")

Running Shift-Only Simulation...
FPGA (Shift-Only) Accuracy: 96.90%


In [15]:
print("--- MODEL EXPORT SUMMARY ---")

# 1. Quantized Weights
# astype(np.int8) performs no range check, so flag anything outside the int8
# range on the source arrays before narrowing them.
if W1_q.min() < QTZ_INT8_MIN or W1_q.max() > QTZ_INT8_MAX:
    print(f"WARNING: W1 has values outside [{QTZ_INT8_MIN}, {QTZ_INT8_MAX}]; the int8 cast will corrupt them!")
if W2_q.min() < QTZ_INT8_MIN or W2_q.max() > QTZ_INT8_MAX:
    print(f"WARNING: W2 has values outside [{QTZ_INT8_MIN}, {QTZ_INT8_MAX}]; the int8 cast will corrupt them!")

# Ensure they are explicitly int8
W1_final = W1_q.astype(np.int8)
W2_final = W2_q.astype(np.int8)

print(f"Layer 1 Weights: {W1_final.shape} | Min: {W1_final.min()} | Max: {W1_final.max()} | dtype: {W1_final.dtype}")
print(f"Layer 2 Weights: {W2_final.shape}  | Min: {W2_final.min()} | Max: {W2_final.max()} | dtype: {W2_final.dtype}")

# 2. Quantization Parameters (Shift Only)
# Ensure they are standard Python integers
shift_l1_final = int(shift_l1)
shift_l2_final = int(shift_l2)

print(f"\nLayer 1 Shift: {shift_l1_final}")
print(f"Layer 2 Shift: {shift_l2_final}")

# 3. Validation
# Expected shapes come from the layer-size constants so this check cannot
# drift from the network definition.
expected_w1_shape = (HIDDEN_SIZE, INPUT_SIZE)
expected_w2_shape = (OUTPUT_SIZE, HIDDEN_SIZE)
if W1_final.shape != expected_w1_shape:
    print(f"WARNING: W1 shape mismatch! Expected {expected_w1_shape}")
if W2_final.shape != expected_w2_shape:
    print(f"WARNING: W2 shape mismatch! Expected {expected_w2_shape}")

--- MODEL EXPORT SUMMARY ---
Layer 1 Weights: (128, 784) | Min: -127 | Max: 67 | dtype: int8
Layer 2 Weights: (10, 128)  | Min: -127 | Max: 56 | dtype: int8

Layer 1 Shift: 11
Layer 2 Shift: 8


In [16]:
import numpy as np

# Save to a .npz archive (np.savez stores the arrays uncompressed;
# np.savez_compressed would be needed for a compressed file).
# This file contains everything needed to run the model on the FPGA
outfile = 'mnist_model.npz'

np.savez(
    outfile,
    # Weights (The Matrices)
    w1=W1_final,
    w2=W2_final,

    # Shifts (The Scalars)
    # We wrap them in numpy arrays because save/load works best with arrays
    shift_l1=np.array(shift_l1_final),
    shift_l2=np.array(shift_l2_final)
)

print(f"Successfully saved model to: {outfile}")

# Read the archive back so the reported keys are what is actually on disk,
# not a hand-maintained list that can drift from the savez call above.
with np.load(outfile) as saved:
    print(f"Keys in file: {saved.files}")

Successfully saved model to: mnist_model.npz
Keys in file: ['w1', 'w2', 'shift_l1', 'shift_l2']
