In [None]:
# Install the Icarus Verilog simulator. -y answers the confirmation prompt so the
# cell cannot stall waiting for input when the package still has to be installed.
!apt-get install -y iverilog
# Install the Python side of the test bench. %pip (instead of !pip) installs into
# the environment of the running kernel.
%pip install cocotb numpy pytest

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
iverilog is already the newest version (11.0-1.1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


In [None]:
%%writefile accelerator_design.sv
`timescale 1ns / 1ps

// Rectified linear unit (purely combinational).
//   WIDTH   - bit width of the signed input and output.
//   in_val  - signed input value.
//   out_val - in_val when it is non-negative, otherwise zero.
module relu #(
    parameter WIDTH = 16
)(
    input  wire signed [WIDTH-1:0] in_val,
    output wire signed [WIDTH-1:0] out_val
);
    // Signed comparison against zero decides whether the input is clamped.
    wire is_negative;
    assign is_negative = (in_val < 0);

    assign out_val = is_negative ? 0 : in_val;
endmodule

// Top-level wrapper around a single serial multiply-accumulate element.
//   clk, rst_n   - clock and active-low reset, forwarded to the element.
//   start        - start pulse, forwarded to the element.
//   data_in      - signed 16-bit data operand, one per clock cycle.
//   weight_in    - signed 16-bit weight operand, one per clock cycle.
//   done         - completion flag of the element.
//   final_result - signed 16-bit result of the element.
module top_accelerator(
    input  wire              clk,
    input  wire              rst_n,
    input  wire              start,
    input  wire signed [15:0] data_in,
    input  wire signed [15:0] weight_in,
    output wire              done,
    output wire signed [15:0] final_result
);
    // The element's done flag and 16-bit result drive the top-level outputs
    // directly; its wide accumulator output is left unconnected.
    serial_pe #(
        .IN_WIDTH  (16),
        .ACC_WIDTH (48),
        .PROD_WIDTH(32),
        .N_IN      (16)    // 16 terms per sum, to support the largest layer
    ) PE (
        .clk      (clk),
        .rst_n    (rst_n),
        .start    (start),
        .input_in (data_in),
        .weight_in(weight_in),
        .done     (done),
        .out_q    (final_result),
        .acc_out  ()
    );
endmodule

// Serial multiply-accumulate processing element.
//
// After a start pulse the element adds input_in * weight_in to its accumulator
// on each of the next N_IN rising clock edges. On the edge that consumes the
// last operand pair it latches the complete sum into acc_out, latches the sum
// arithmetically shifted right by SHIFT into out_q (truncated to IN_WIDTH bits,
// no saturation), and raises done for one clock cycle.
//
// Parameters:
//   IN_WIDTH   - width of the signed operands and of out_q.
//   ACC_WIDTH  - width of the accumulator and of acc_out.
//   PROD_WIDTH - width of the signed product.
//   N_IN       - number of operand pairs summed per start pulse (at least 1).
//
// Ports:
//   clk, rst_n - clock and active-low asynchronous reset.
//   start      - begins a new sum when the element is idle; ignored while busy.
//                Operands present during the start cycle itself are not summed.
//   input_in   - signed data operand.
//   weight_in  - signed weight operand.
//   done       - high for one cycle when out_q / acc_out have been updated.
//   out_q      - shifted, truncated result; holds its value until the next sum ends.
//   acc_out    - full-width sum; holds its value until the next sum ends.
module serial_pe #(
    parameter IN_WIDTH   = 16,
    parameter ACC_WIDTH  = 48,
    parameter PROD_WIDTH = 32,
    parameter N_IN       = 16
)(
    input  wire                         clk,
    input  wire                         rst_n,
    input  wire                         start,
    input  wire signed [IN_WIDTH-1:0] input_in,
    input  wire signed [IN_WIDTH-1:0] weight_in,
    output reg                          done,
    output reg signed [IN_WIDTH-1:0]  out_q,
    output reg signed [ACC_WIDTH-1:0] acc_out
);
    localparam integer SHIFT = 15;
    // The counter runs from 0 to N_IN-1. Its width is derived from N_IN so that
    // a larger N_IN cannot wrap before the terminal count is reached (which
    // would keep done from ever asserting). For N_IN = 16 this gives 5 bits.
    localparam integer CNT_WIDTH = $clog2(N_IN + 1);

    reg busy;
    reg [CNT_WIDTH-1:0] cnt;
    reg signed [ACC_WIDTH-1:0] acc;

    wire signed [PROD_WIDTH-1:0] prod_comb;
    assign prod_comb = $signed(input_in) * $signed(weight_in);

    // Accumulator value including the current product (product sign-extended
    // to the accumulator width). Shared by the running sum and the final
    // result so both are guaranteed to use the same expression.
    wire signed [ACC_WIDTH-1:0] acc_next;
    assign acc_next = acc + {{(ACC_WIDTH-PROD_WIDTH){prod_comb[PROD_WIDTH-1]}}, prod_comb};

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            busy    <= 1'b0;
            cnt     <= 0;
            acc     <= 0;
            out_q   <= 0;
            acc_out <= 0;
            done    <= 1'b0;
        end else begin
            // done is a one-cycle pulse: cleared by default, set below.
            done <= 1'b0;

            if (start && !busy) begin
                // Idle -> busy: clear the accumulator and the term counter.
                busy <= 1'b1;
                cnt  <= 0;
                acc  <= 0;
            end else if (busy) begin
                acc <= acc_next;
                if (cnt == N_IN-1) begin
                    // Last term: publish the results and return to idle.
                    acc_out <= acc_next;
                    out_q   <= acc_next >>> SHIFT;
                    done <= 1'b1;
                    busy <= 1'b0;
                    cnt  <= 0;
                end else begin
                    cnt <= cnt + 1;
                end
            end
        end
    end
endmodule

Overwriting accelerator_design.sv


In [None]:
%%writefile test_accelerator.py
import cocotb
from cocotb.triggers import RisingEdge, FallingEdge, Timer
from cocotb.clock import Clock
import numpy as np

# Q1.15 Utilities
# Constants shared by the float-to-fixed-point conversion and the golden model.
SCALE = 32768.0  # 2**15: factor applied to a float to obtain its Q1.15 integer
MAX_INT = 32767  # upper clamp applied to converted values (largest signed 16-bit value)
MIN_INT = -32768  # lower clamp applied to converted values (smallest signed 16-bit value)
SHIFT = 15  # right shift the golden model applies to the accumulated products

def float_to_q1_15(val):
    """Convert a float to a Q1.15 integer, clamping out-of-range values.

    Args:
        val: Value to convert.

    Returns:
        ``val * SCALE`` rounded to the nearest integer and limited to the
        range ``MIN_INT`` .. ``MAX_INT``.
    """
    # Limit the float first so out-of-range inputs land on the end values.
    if val >= 1.0:
        val = 0.999969
    elif val <= -1.0:
        val = -1.0

    # Rounding can still step one past the top of the range; clamp again.
    scaled = int(round(val * SCALE))
    return max(MIN_INT, min(MAX_INT, scaled))

def golden_neuron_op(inputs, weights):
    """Reference model of one neuron: dot product, shift, wrap to 16 bits.

    Args:
        inputs: Sequence of integer input values.
        weights: Sequence of integer weights; indexed for every position of
            ``inputs``.

    Returns:
        The sum of ``inputs[k] * weights[k]`` shifted right by ``SHIFT``,
        reduced to its low 16 bits and interpreted as a signed 16-bit integer.
    """
    # Only the positions present in `inputs` contribute to the sum.
    dot_product = sum(inputs[k] * weights[k] for k in range(len(inputs)))

    low_16_bits = (dot_product >> SHIFT) & 0xFFFF
    # Values with the top bit set represent negative numbers.
    return low_16_bits - 65536 if low_16_bits > 32767 else low_16_bits

# HARDWARE DRIVER (Supports Padding)
async def run_hardware_neuron(dut, inputs, weights):
    """Run one neuron computation on the DUT and return its signed result.

    Both operand lists are zero-padded to the 16 operand slots of the hardware.
    A one-cycle start pulse is issued, then one input/weight pair is presented
    per clock cycle; all DUT inputs are changed on falling clock edges.

    Args:
        dut: cocotb handle of the design; ``clk``, ``start``, ``data_in``,
            ``weight_in``, ``done`` and ``final_result`` are accessed.
        inputs: Up to 16 integer input values.
        weights: Up to 16 integer weights.

    Returns:
        ``dut.final_result`` read as a signed integer after ``done`` was seen.

    Raises:
        ValueError: If more than 16 inputs or weights are supplied (they would
            otherwise be dropped silently).
        TimeoutError: If ``done`` is not observed within the wait budget
            (instead of stalling the simulation forever).
    """
    num_slots = 16
    max_wait_cycles = 2 * num_slots

    if len(inputs) > num_slots or len(weights) > num_slots:
        raise ValueError(
            f"hardware accepts at most {num_slots} operand pairs, "
            f"got {len(inputs)} inputs and {len(weights)} weights"
        )

    inputs_padded = list(inputs) + [0] * (num_slots - len(inputs))
    weights_padded = list(weights) + [0] * (num_slots - len(weights))

    # 1. Pulse start for one clock cycle with zeroed operands.
    await FallingEdge(dut.clk)
    dut.start.value = 1
    dut.data_in.value = 0
    dut.weight_in.value = 0
    await FallingEdge(dut.clk)
    dut.start.value = 0

    # 2. Feed one operand pair per cycle (always all 16 slots).
    for k in range(num_slots):
        dut.data_in.value = inputs_padded[k]
        dut.weight_in.value = weights_padded[k]
        await FallingEdge(dut.clk)

    dut.data_in.value = 0
    dut.weight_in.value = 0

    # 3. Wait for done, but give up after a bounded number of cycles.
    cycles_waited = 0
    while dut.done.value == 0:
        if cycles_waited >= max_wait_cycles:
            raise TimeoutError(
                f"done was not asserted within {max_wait_cycles} clock cycles"
            )
        await RisingEdge(dut.clk)
        cycles_waited += 1

    await RisingEdge(dut.clk)
    return dut.final_result.value.signed_integer

# MAIN TEST: 8-16-4 ARCHITECTURE
@cocotb.test()
async def test_8_16_4_network(dut):
    """Check the DUT against the golden model for an 8-16-4 network.

    Layer 1 evaluates 16 neurons on 8 random inputs; a ReLU is applied in
    Python to each hardware result. Layer 2 evaluates 4 neurons on the 16
    activations. Every hardware result is compared with ``golden_neuron_op``
    and the test fails unless all 20 results match.
    """
    # Setup: start the clock and drive every DUT input to an idle value so that
    # nothing is left undriven when reset is released.
    cocotb.start_soon(Clock(dut.clk, 10, units="ns").start())
    dut.start.value = 0
    dut.data_in.value = 0
    dut.weight_in.value = 0
    dut.rst_n.value = 0
    await Timer(50, units="ns")
    # Release reset on a falling edge so the release cannot coincide with an
    # active (rising) clock edge.
    await FallingEdge(dut.clk)
    dut.rst_n.value = 1
    await RisingEdge(dut.clk)

    dut._log.info(" Starting 8-16-4 Network Verification ")
    total_calculations = 0
    correct_calculations = 0

    # 1. SETUP DATA

    # Layer 1: 8 Inputs -> 16 Neurons
    # Layer 2: 16 Inputs -> 4 Neurons

    # Seed a local generator from cocotb's seed (when available) and log it, so
    # a failing stimulus can be reproduced by re-running with the same seed.
    seed = getattr(cocotb, "RANDOM_SEED", None)
    rng = np.random.default_rng(seed)
    dut._log.info(f"Stimulus RNG seed: {seed}")

    raw_inputs = rng.uniform(-0.9, 0.9, 8)
    input_vector = [float_to_q1_15(x) for x in raw_inputs]

    # Weights
    L1_weights = [[float_to_q1_15(rng.uniform(-0.5, 0.5)) for _ in range(8)] for _ in range(16)]
    L2_weights = [[float_to_q1_15(rng.uniform(-0.5, 0.5)) for _ in range(16)] for _ in range(4)]


    # 2. RUN LAYER 1 (16 Neurons)

    dut._log.info("\n Processing Layer 1 (16 Neurons) ")
    L1_activations = []

    for i in range(16):
        py_result = golden_neuron_op(input_vector, L1_weights[i])
        hdl_result = await run_hardware_neuron(dut, input_vector, L1_weights[i])

        total_calculations += 1
        if py_result == hdl_result:
            correct_calculations += 1
            dut._log.info(f"L1 Neuron #{i:02d}: MATCH (Py({py_result}) = HDL({hdl_result}))")
        else:
            dut._log.error(f"L1 Neuron #{i:02d}: MISMATCH (Py={py_result}, HDL={hdl_result})")

        # ReLU activation, applied in Python to the hardware result
        activation = hdl_result if hdl_result > 0 else 0
        L1_activations.append(activation)


    # 3. RUN LAYER 2 (4 Neurons)

    dut._log.info("\n Processing Layer 2 (4 Neurons)")

    for i in range(4):
        # Inputs are the 16 activations from Layer 1
        py_result = golden_neuron_op(L1_activations, L2_weights[i])
        hdl_result = await run_hardware_neuron(dut, L1_activations, L2_weights[i])

        total_calculations += 1
        if py_result == hdl_result:
            correct_calculations += 1
            dut._log.info(f"L2 Neuron #{i:02d}: MATCH ({hdl_result})")
        else:
            dut._log.error(f"L2 Neuron #{i:02d}: MISMATCH (Py={py_result}, HDL={hdl_result})")

    # FINAL REPORT
    accuracy = (correct_calculations / total_calculations) * 100.0

    dut._log.info("\n===========================================")
    dut._log.info(f" TOTAL NEURONS SIMULATED: {total_calculations}")
    dut._log.info(f" SUCCESSFUL MATCHES:      {correct_calculations}")
    dut._log.info("-------------------------------------------")
    dut._log.info(f" HARDWARE ACCURACY:       {accuracy:.2f}%")
    dut._log.info("===========================================")

    assert accuracy == 100.0, f"Accuracy was only {accuracy}%"

Overwriting test_accelerator.py


In [None]:
%%writefile Makefile

# Simulator
SIM ?= icarus
TOPLEVEL_LANG = verilog

# Files
VERILOG_SOURCES += $(PWD)/accelerator_design.sv

# Top module name (must match the module name in accelerator_design.sv)
TOPLEVEL = top_accelerator

# Python test module (test_accelerator.py without the .py extension).
# COCOTB_TEST_MODULES replaces MODULE, which the installed cocotb makefiles
# report as deprecated. NOTE(review): cocotb releases that predate this rename
# would still need MODULE - confirm the cocotb version in use.
COCOTB_TEST_MODULES = test_accelerator

include $(shell cocotb-config --makefiles)/Makefile.sim


Overwriting Makefile


In [None]:
# Run the simulation through make with COCOTB_ANSI_OUTPUT=1 set, and drop every
# output line that contains "vpi_iterate".
!COCOTB_ANSI_OUTPUT=1 make | grep -v "vpi_iterate"

rm -f results.xml
"make" -f Makefile results.xml
make[1]: Entering directory '/content'
/usr/local/lib/python3.12/dist-packages/cocotb_tools/makefiles/simulators/Makefile.icarus:66: Using MODULE is deprecated, please use COCOTB_TEST_MODULES instead.
rm -f results.xml
COCOTB_TEST_MODULES=test_accelerator COCOTB_TESTCASE= COCOTB_TEST_FILTER= COCOTB_TOPLEVEL=top_accelerator TOPLEVEL_LANG=verilog \
         /usr/bin/vvp -M /usr/local/lib/python3.12/dist-packages/cocotb/libs -m libcocotbvpi_icarus   sim_build/sim.vvp   
     -.--ns INFO     gpi                                ..mbed/gpi_embed.cpp:93   in _embed_init_python              Using Python 3.12.4 interpreter at /usr/bin/python3
     -.--ns INFO     gpi                                ../gpi/GpiCommon.cpp:79   in gpi_print_registered_impl       VPI registered
     0.00ns INFO    [49m[39m cocotb                             Running on Icarus Verilog version 11.0 (stable)[49m[39m
     0.00ns INFO    [49m[39m cocotb                 