In [1]:
import numpy as np
from tqdm import tqdm
import math
import os
import matplotlib.pyplot as plt
import numba as nb
from numba import cuda


 ## 1. Lecture des fichiers MNIST

In [2]:
def make_uint32(byte_array):
    """ Recompose un entier 32 bits à partir de 4 octets de poids fort à poids faible """
    return ((byte_array[0] << 24) 
          | (byte_array[1] << 16) 
          | (byte_array[2] <<  8) 
          | (byte_array[3] <<  0))

def read_labels(filename):
    """ Lit un fichier de labels MNIST """
    with open(filename, 'rb') as f:
        _ = f.read(4)  # Magic number (non utilisé)
        n_bytes = f.read(4)
        n = make_uint32(n_bytes)
        labels = np.frombuffer(f.read(n), dtype=np.uint8)
    return labels

def read_images(filename):
    """ Lit un fichier d'images MNIST """
    with open(filename, 'rb') as f:
        _ = f.read(4)  # Magic number (non utilisé)
        n_bytes = f.read(4)
        n = make_uint32(n_bytes)
        row_bytes = f.read(4)
        col_bytes = f.read(4)
        rows = make_uint32(row_bytes)
        cols = make_uint32(col_bytes)
        images_raw = f.read(n * rows * cols)
        images = np.frombuffer(images_raw, dtype=np.uint8)
        images = images.reshape(n, rows * cols)
    return images


 ## 2. Fonctions utilitaires

In [3]:
def zero_to_n(n):
    """ Crée un tableau [0, 1, 2, ..., n-1] """
    return np.arange(n, dtype=np.uint32)

def shuffle(t, number_of_switch):
    """ Mélange un tableau t aléatoirement, en réalisant 'number_of_switch' échanges """
    size = len(t)
    for _ in range(number_of_switch):
        x = np.random.randint(0, size)
        y = np.random.randint(0, size)
        tmp = t[x]
        t[x] = t[y]
        t[y] = tmp
        
def init_sigma(nneurons_prev):
    return 1.0 / np.sqrt(nneurons_prev)

def sigmoid(x):
    """ Fonction d'activation vectorisée """
    return 1.0 / (1.0 + np.exp(-x))

def dsigmoid(x):
    """ Dérivée de sigmoid vectorisée """
    s = sigmoid(x)
    return s * (1.0 - s)


 ## 3. Fonctions Matricielles

In [4]:
def alloc_matrix(rows, columns):
    # En C, on fait un malloc puis un tableau 1D rows*columns.
    # En Python, on crée un np.ndarray de shape (rows, columns).
    return np.zeros((rows, columns), dtype=np.float64)


 ### 3.3. Version Parallèle

In [5]:
@cuda.jit
def matrix_dot_cuda(m1, m2, res, nrow1, ncol1, ncol2):
    row, col = cuda.grid(2)
    if row < nrow1 and col < ncol2:
        tmp = 0.0
        for k in range(ncol1):
            tmp += m1[row, k] * m2[k, col]
        res[row, col] = tmp

@cuda.jit
def matrix_sum_cuda(m1, m2, res, nrow, ncol):
    row, col = cuda.grid(2)
    if row < nrow and col < ncol:
        res[row, col] = m1[row, col] + m2[row, col]

@cuda.jit
def matrix_minus_cuda(m1, m2, res, nrow, ncol):
    row, col = cuda.grid(2)
    if row < nrow and col < ncol:
        res[row, col] = m1[row, col] - m2[row, col]

@cuda.jit
def hadamard_product_cuda(m1, m2, res, nrow, ncol):
    row, col = cuda.grid(2)
    if row < nrow and col < ncol:
        res[row, col] = m1[row, col] * m2[row, col]

@cuda.jit
def matrix_transpose_cuda(m, res, nrow, ncol):
    row, col = cuda.grid(2)
    if row < nrow and col < ncol:
        res[col, row] = m[row, col]

@cuda.jit
def matrix_scalar_cuda(m, res, scalar, nrow, ncol):
    row, col = cuda.grid(2)
    if row < nrow and col < ncol:
        res[row, col] = m[row, col] * scalar

@cuda.jit
def matrix_memcpy_cuda(dest, src, nrow, ncol):
    row, col = cuda.grid(2)
    if row < nrow and col < ncol:
        dest[row, col] = src[row, col]
        
@cuda.jit
def matrix_function_id_cuda(m_in, m_out, nrow, ncol, func_id):
    """
    Applique la fonction correspondant à func_id :
      0 -> exp
      1 -> sigmoid
      2 -> dsigmoid
      ...
    """
    row, col = cuda.grid(2)
    if row < nrow and col < ncol:
        val = m_in[row, col]
        if func_id == 0:
            result = math.exp(val)
        elif func_id == 1:
            result = 1.0 / (1.0 + math.exp(-val))
        elif func_id == 2:
            s = 1.0 / (1.0 + math.exp(-val))
            result = s * (1.0 - s)
        else:
            result = val
        m_out[row, col] = result

def matrix_dot_parallel(m1, m2):
    nrow1, ncol1 = m1.shape; nrow2, ncol2 = m2.shape; 
    if ncol1 != nrow2: raise ValueError("Incompatible matrix dimensions")
    m1_gpu = cuda.to_device(m1)
    m2_gpu = cuda.to_device(m2)
    res_gpu = cuda.device_array((nrow1, ncol2), dtype=np.float64)
    threads_per_block = (16,16)
    blocks_per_grid = (math.ceil(ncol2/16), math.ceil(nrow1/16))
    matrix_dot_cuda[blocks_per_grid, threads_per_block](m1_gpu, m2_gpu, res_gpu, nrow1, ncol1, ncol2)
    return res_gpu.copy_to_host()

def matrix_sum_parallel(m1, m2):
    nrow, ncol = m1.shape
    m1_gpu = cuda.to_device(m1); m2_gpu = cuda.to_device(m2)
    res_gpu = cuda.device_array((nrow, ncol), dtype=np.float64)
    threads_per_block = (16,16)
    blocks_per_grid = (math.ceil(nrow/16), math.ceil(ncol/16))
    matrix_sum_cuda[blocks_per_grid, threads_per_block](m1_gpu, m2_gpu, res_gpu, nrow, ncol)
    return res_gpu.copy_to_host()

def matrix_minus_parallel(m1, m2):
    nrow, ncol = m1.shape
    m1_gpu = cuda.to_device(m1); m2_gpu = cuda.to_device(m2)
    res_gpu = cuda.device_array((nrow, ncol), dtype=np.float64)
    threads_per_block = (16,16)
    blocks_per_grid = (math.ceil(nrow/16), math.ceil(ncol/16))
    matrix_minus_cuda[blocks_per_grid, threads_per_block](m1_gpu, m2_gpu, res_gpu, nrow, ncol)
    return res_gpu.copy_to_host()

def hadamard_product_parallel(m1, m2):
    nrow, ncol = m1.shape
    m1_gpu = cuda.to_device(m1); m2_gpu = cuda.to_device(m2)
    res_gpu = cuda.device_array((nrow, ncol), dtype=np.float64)
    threads_per_block = (16,16)
    blocks_per_grid = (math.ceil(nrow/16), math.ceil(ncol/16))
    hadamard_product_cuda[blocks_per_grid, threads_per_block](m1_gpu, m2_gpu, res_gpu, nrow, ncol)
    return res_gpu.copy_to_host()

def matrix_transpose_parallel(m):
    nrow, ncol = m.shape
    m_gpu = cuda.to_device(m)
    res_gpu = cuda.device_array((ncol, nrow), dtype=np.float64)
    threads_per_block = (16,16)
    blocks_per_grid = (math.ceil(nrow/16), math.ceil(ncol/16))
    matrix_transpose_cuda[blocks_per_grid, threads_per_block](m_gpu, res_gpu, nrow, ncol)
    return res_gpu.copy_to_host()

def matrix_scalar_parallel(m, s):
    nrow, ncol = m.shape
    m_gpu = cuda.to_device(m)
    res_gpu = cuda.device_array((nrow, ncol), dtype=np.float64)
    threads_per_block = (16,16)
    blocks_per_grid = (math.ceil(nrow/16), math.ceil(ncol/16))
    matrix_scalar_cuda[blocks_per_grid, threads_per_block](m_gpu, res_gpu, s, nrow, ncol)
    return res_gpu.copy_to_host()

def matrix_memcpy_parallel(dest, src):
    nrow, ncol = src.shape
    dest_gpu = cuda.to_device(dest); src_gpu = cuda.to_device(src)
    threads_per_block = (16,16)
    blocks_per_grid = (math.ceil(nrow/16), math.ceil(ncol/16))
    matrix_memcpy_cuda[blocks_per_grid, threads_per_block](dest_gpu, src_gpu, nrow, ncol)
    dest[:] = dest_gpu.copy_to_host()

def matrix_function_parallel(m1, func_name):
    func_id = 0 if func_name == "exp" else \
              1 if func_name == "sigmoid" else \
              2 if func_name == "dsigmoid" else \
              None
    if func_id is None: raise ValueError(f"Unknown function ID : {func_name}")
    nrow, ncol = m1.shape
    m1_gpu = cuda.to_device(m1)
    res_gpu = cuda.device_array((nrow, ncol), dtype=np.float64)
    threads_per_block = (16,16)
    blocks_per_grid = (math.ceil(nrow/16), math.ceil(ncol/16))
    matrix_function_id_cuda[blocks_per_grid, threads_per_block](m1_gpu, res_gpu, nrow, ncol, func_id)
    return res_gpu.copy_to_host()


 ## 4. Réseau de Neurones

In [6]:
class Layer:
    def __init__(self, layer_number, number_of_neurons, nneurons_previous_layer, minibatch_size):
        self.number_of_neurons = number_of_neurons
        self.minibatch_size = minibatch_size
        self.activations = alloc_matrix(number_of_neurons, minibatch_size)
        self.z           = alloc_matrix(number_of_neurons, minibatch_size)
        self.delta       = alloc_matrix(number_of_neurons, minibatch_size)
        self.weights     = alloc_matrix(number_of_neurons, nneurons_previous_layer)
        self.biases      = alloc_matrix(number_of_neurons, 1)
        
        if layer_number > 0:
            self.init_weight(nneurons_previous_layer)
    
    def init_weight(self, nneurons_prev):
        sigma = init_sigma(nneurons_prev)
        r, c = self.weights.shape
        self.weights = np.random.normal(0.0, sigma, size=(r, c))

class ANN:
    def __init__(self, alpha, minibatch_size, number_of_layers, nneurons_per_layer):
        self.alpha = alpha
        self.minibatch_size = minibatch_size
        self.number_of_layers = number_of_layers
        self.layers = []
        for i in range(number_of_layers):
            if i == 0:
                self.layers.append(
                    Layer(i, nneurons_per_layer[i], 
                          nneurons_per_layer[i],  
                          minibatch_size)
                )
            else:
                self.layers.append(
                    Layer(i, nneurons_per_layer[i],
                          nneurons_per_layer[i-1],
                          minibatch_size)
                )


In [7]:
def set_input_parallel(nn, input_matrix):
    matrix_memcpy_parallel(nn.layers[0].activations, input_matrix)

def forward_parallel(nn):
    for l in range(1, nn.number_of_layers):
        layer_l = nn.layers[l]
        layer_prev = nn.layers[l-1]
        z1 = matrix_dot_parallel(layer_l.weights, layer_prev.activations)
        ones = np.ones((1, nn.minibatch_size), dtype=np.float64)
        z2 = matrix_dot_parallel(layer_l.biases, ones)
        layer_l.z = matrix_sum_parallel(z1, z2)
        layer_l.activations = matrix_function_parallel(layer_l.z, "sigmoid")

def backward_parallel(nn, y):
    L = nn.number_of_layers - 1
    layer_L = nn.layers[L]
    tmp = matrix_minus_parallel(layer_L.activations, y)
    dfzL = matrix_function_parallel(layer_L.z, "dsigmoid")
    layer_L.delta = hadamard_product_parallel(tmp, dfzL)
    for l in range(L, 1, -1):
        layer_l = nn.layers[l]
        layer_lm1 = nn.layers[l-1]
        w_l_transp = matrix_transpose_parallel(layer_l.weights)
        delta_tmp = matrix_dot_parallel(w_l_transp, layer_l.delta)
        dfz = matrix_function_parallel(layer_lm1.z, "dsigmoid")
        layer_lm1.delta = hadamard_product_parallel(delta_tmp, dfz)
    for l in range(1, nn.number_of_layers):
        layer_l = nn.layers[l]
        layer_lm1 = nn.layers[l-1]
        a_lm1_transp = matrix_transpose_parallel(layer_lm1.activations)
        w1 = matrix_dot_parallel(layer_l.delta, a_lm1_transp)
        w1 = matrix_scalar_parallel(w1, nn.alpha / nn.minibatch_size)
        layer_l.weights = matrix_minus_parallel(layer_l.weights, w1)
        ones = np.ones((nn.minibatch_size, 1), dtype=np.float64)
        b1 = matrix_dot_parallel(layer_l.delta, ones)
        b1 = matrix_scalar_parallel(b1, nn.alpha / nn.minibatch_size)
        layer_l.biases = matrix_minus_parallel(layer_l.biases, b1)


 ## 5. Fonctions d'entraînement (Version Parallèle)

In [8]:
def populate_parallel(x, y, minibatch_idx, train_img, train_label):
    """
    Remplit les matrices x et y avec le mini-batch.
    x -> shape (784, minibatch_size)
    y -> shape (10, minibatch_size)
    """
    x_batch = train_img[minibatch_idx].astype(np.float64) / 255.0
    matrix_memcpy_parallel(x, x_batch.T)  # Transpose pour avoir (784, batch_size)
    y.fill(0.0)
    indices = train_label[minibatch_idx]
    y[indices, np.arange(len(minibatch_idx))] = 1.0

def accuracy_parallel(nn, test_img, test_label, minibatch_size):
    """
    Compute the accuracy (%) on the test set using parallel matrix operations.
    """
    test_size = test_img.shape[0]
    nbatches = (test_size // minibatch_size) * minibatch_size
    correct = 0
    for i in range(0, nbatches, minibatch_size):
        batch_indices = np.arange(i, i + minibatch_size)
        x = test_img[batch_indices].T.astype(np.float64) / 255.0
        set_input_parallel(nn, x)
        forward_parallel(nn)
        preds = np.argmax(nn.layers[-1].activations, axis=0)
        correct += np.sum(preds == test_label[batch_indices])
    return (100.0 * correct) / nbatches

def cross_entropy_parallel(y_pred, y_true, eps=1e-12):
    """
    Compute cross-entropy error for a mini-batch using parallel operations.
    """
    y_pred = np.clip(y_pred, eps, 1.0 - eps)
    return -np.sum(y_true * np.log(y_pred)) / y_true.shape[1]


### premiere amelioration

In [9]:
def forward_parallel_v2(nn):
    """
    We assume nn.layers[0].activations is already set (by set_input_parallel).
    This function simply propagates forward through all layers.
    """
    for l in range(1, nn.number_of_layers):
        layer_l    = nn.layers[l]
        layer_prev = nn.layers[l-1]
        # Z1 = W * A_{l-1}
        Z1 = matrix_dot_parallel(layer_l.weights, layer_prev.activations)
        # Z2 = bias repeated across columns
        ones = np.ones((1, Z1.shape[1]), dtype=np.float64)
        Z2   = matrix_dot_parallel(layer_l.biases, ones)
        # Z = Z1 + Z2
        layer_l.z = matrix_sum_parallel(Z1, Z2)
        # A_l = sigmoid(Z)
        layer_l.activations = matrix_function_parallel(layer_l.z, "sigmoid")
    # final activations are now in nn.layers[-1].activations

def accuracy_parallel_v2(nn, test_img, test_label, minibatch_size, n_sub_batches=4):
    """
    Compute accuracy in parallel, but group multiple mini‐batches
    into one big batch for better GPU usage.
    """
    test_size = test_img.shape[0]
    nbatches = (test_size // minibatch_size) * minibatch_size
    correct = 0
    for i in range(0, nbatches, minibatch_size * n_sub_batches):
        # how many sub‐batches can we handle this round?
        actual_subbatches = min(n_sub_batches, (nbatches - i) // minibatch_size)
        if actual_subbatches == 0:
            break
        total_cols = minibatch_size * actual_subbatches
        big_data   = np.zeros((nn.layers[0].number_of_neurons, total_cols), dtype=np.float64)
        big_labels = np.zeros(total_cols, dtype=test_label.dtype)
        # pack sub‐batches into 'big_data'
        for sb in range(actual_subbatches):
            start_idx = i + sb * minibatch_size
            end_idx   = start_idx + minibatch_size
            batch_indices = np.arange(start_idx, end_idx)
            x_sub = test_img[batch_indices].T.astype(np.float64) / 255.0
            col_start = sb * minibatch_size
            col_end   = col_start + minibatch_size
            big_data[:, col_start:col_end]  = x_sub
            big_labels[col_start:col_end]   = test_label[batch_indices]
        # Forward pass over big_data (just set it to layer 0, then forward)
        nn.layers[0].activations = big_data
        forward_parallel_v2(nn)
        # predictions for all sub‐batches at once
        preds = np.argmax(nn.layers[-1].activations, axis=0)
        correct += np.sum(preds == big_labels)
    return (100.0 * correct) / nbatches

### deuxieme amelioration

In [10]:
def guess_minibatch_size(minimum=16, maximum=1024):
    """
    Naive heuristic: pick a batch size that's a power of 2 between 'minimum' and 'maximum'
    so that we can spawn enough threads to fill the GPU. 
    This doesn't do advanced memory checks. It's purely a simple guess based on SM count.
    """
    device = cuda.get_current_device()
    mp_count = device.MULTIPROCESSOR_COUNT      # Number of SMs
    warp_size = device.WARP_SIZE               # Usually 32
    # Suppose we want to ensure at least (mp_count * some_factor) threads 
    # are in flight per kernel. We'll pick e.g. 8 warps per SM => 8 * warp_size = 256 threads each SM
    desired_threads_per_sm = 8 * warp_size      # 8 warps per SM
    total_desired_threads = mp_count * desired_threads_per_sm
    
    # Now we guess a batch_size that, when multiplied by (some dimension) 
    # or some expected usage, doesn't trivially under‐utilize the GPU.
    # For instance, if the main dimension is 784, we might want batch_size * 784 ~ total_desired_threads
    # We'll keep it simple: batch_size >= total_desired_threads / 784 
    approx_needed = math.ceil(total_desired_threads / 784)
    
    # Round that up to the nearest power of 2
    # but clamp it between 'minimum' and 'maximum'
    candidate = 1
    while candidate < approx_needed:
        candidate <<= 1  # candidate *= 2
    if candidate < minimum:
        candidate = minimum
    if candidate > maximum:
        candidate = maximum
    return candidate

In [11]:
def guess_n_sub_batches(nn, test_size, default=4, max_sub=16):
    """
    Naive heuristic to pick a sub-batch count for accuracy checks.
    We want: sub-batches <= max_sub, and (sub-batches * minibatch_size) <= test_size
    We'll also consider the GPU SM count to see if it's large or small.
    """
    device = cuda.get_current_device()
    mp_count = device.MULTIPROCESSOR_COUNT  # how many SMs
    # If we have a lot of SMs, we might want more sub-batches to keep the GPU busier.
    # Let's do a simple approach: if mp_count >= 20, we'll try 8 by default, else 4.
    suggested = 8 if mp_count >= 20 else default

    # But clamp it so that sub-batches * minibatch_size isn't bigger than test_size
    max_possible = test_size // nn.minibatch_size
    final_sub = min(suggested, max_possible, max_sub)
    if final_sub < 1:
        final_sub = 1

    print(f"Guessed n_sub_batches = {final_sub} based on GPU SMs={mp_count} and test_size={test_size}")
    return final_sub

### troisieme amelioration

In [12]:
@cuda.jit
def matrix_dot_bias_activation_cuda(W, A, B, out, nrowW, ncolW, ncolA):
    """
    out[row,col] = sigmoid( sum_{k in 0..(ncolW-1)} W[row,k]*A[k,col] + B[row,0] )
    """
    row, col = cuda.grid(2)
    if row < nrowW and col < ncolA:
        # do the dot
        tmp = 0.0
        for k in range(ncolW):
            tmp += W[row, k] * A[k, col]
        # add bias
        tmp += B[row, 0]
        # apply sigmoid
        val = 1.0 / (1.0 + math.exp(-tmp))
        out[row, col] = val

def matrix_dot_bias_activation_parallel(W, A, B):
    """
    Single GPU kernel that does:
       out[row,col] = sigmoid( W[row,:] dot A[:,col] + B[row,0] )
    Returns out with shape (W.shape[0], A.shape[1]).
    """
    nrowW, ncolW = W.shape  # W is shape (nrowW, ncolW)
    nrowA, ncolA = A.shape  # A is shape (nrowA, ncolA)
    if ncolW != nrowA:
        raise ValueError("Incompatible shapes for matrix-dot-bias-activation!")
    nrowB, ncolB = B.shape
    if not (nrowB == nrowW and ncolB == 1):
        raise ValueError("Bias shape mismatch! B should be (nrowW, 1)")

    # allocate output
    out_gpu = cuda.device_array((nrowW, ncolA), dtype=np.float64)

    # copy inputs to GPU
    W_gpu = cuda.to_device(W)
    A_gpu = cuda.to_device(A)
    B_gpu = cuda.to_device(B)

    threads_per_block = (16, 16)
    blocks_per_grid = (math.ceil(ncolA / 16), math.ceil(nrowW / 16))

    matrix_dot_bias_activation_cuda[blocks_per_grid, threads_per_block](
        W_gpu, A_gpu, B_gpu, out_gpu, nrowW, ncolW, ncolA
    )

    return out_gpu.copy_to_host()

def forward_parallel_v3(nn):
    """
    New forward pass that merges (dot + bias + sigmoid) 
    into one GPU kernel for each layer.
    """
    for l in range(1, nn.number_of_layers):
        layer_l    = nn.layers[l]
        layer_prev = nn.layers[l-1]
        # single call merges the 3 steps:
        layer_l.activations = matrix_dot_bias_activation_parallel(
            layer_l.weights, 
            layer_prev.activations,
            layer_l.biases
        )
        # we don't need separate layer_l.z if we want just the final activation;
        # but if you want to keep z for backprop, you can store it as well:
        layer_l.z = layer_l.activations.copy()


def accuracy_parallel_v3(nn, test_img, test_label, minibatch_size, n_sub_batches=4):
    """
    Compute accuracy in parallel, but group multiple mini‐batches
    into one big batch for better GPU usage.
    """
    test_size = test_img.shape[0]
    nbatches = (test_size // minibatch_size) * minibatch_size
    correct = 0
    for i in range(0, nbatches, minibatch_size * n_sub_batches):
        # how many sub‐batches can we handle this round?
        actual_subbatches = min(n_sub_batches, (nbatches - i) // minibatch_size)
        if actual_subbatches == 0:
            break
        total_cols = minibatch_size * actual_subbatches
        big_data   = np.zeros((nn.layers[0].number_of_neurons, total_cols), dtype=np.float64)
        big_labels = np.zeros(total_cols, dtype=test_label.dtype)
        # pack sub‐batches into 'big_data'
        for sb in range(actual_subbatches):
            start_idx = i + sb * minibatch_size
            end_idx   = start_idx + minibatch_size
            batch_indices = np.arange(start_idx, end_idx)
            x_sub = test_img[batch_indices].T.astype(np.float64) / 255.0
            col_start = sb * minibatch_size
            col_end   = col_start + minibatch_size
            big_data[:, col_start:col_end]  = x_sub
            big_labels[col_start:col_end]   = test_label[batch_indices]
        # Forward pass over big_data (just set it to layer 0, then forward)
        nn.layers[0].activations = big_data
        forward_parallel_v3(nn)
        # predictions for all sub‐batches at once
        preds = np.argmax(nn.layers[-1].activations, axis=0)
        correct += np.sum(preds == big_labels)
    return (100.0 * correct) / nbatches

 ## 6. Lecture des données MNIST

In [13]:
DATA_PATH = "DATA"

train_img = read_images(os.path.join(DATA_PATH, "train-images.idx3-ubyte"))
train_label = read_labels(os.path.join(DATA_PATH, "train-labels.idx1-ubyte"))
test_img = read_images(os.path.join(DATA_PATH, "t10k-images.idx3-ubyte"))
test_label = read_labels(os.path.join(DATA_PATH, "t10k-labels.idx1-ubyte"))

train_size = train_img.shape[0]
test_size = test_img.shape[0]
print(f"Nombre de données d'entraînement : {train_size}")
print(f"Nombre de données de test : {test_size}")


Nombre de données d'entraînement : 60000
Nombre de données de test : 10000


### 6.4. Entraînement du réseau (Version Parallèle)

In [14]:
alpha = 0.05
minibatch_size = guess_minibatch_size(minimum=16, maximum=1024)
print("Guessed minibatch_size =", minibatch_size)

number_of_layers = 3
nneurons_per_layer = [784, 30, 10]  # 28*28 = 784
nn = ANN(alpha, minibatch_size, number_of_layers, nneurons_per_layer)

shuffled_idx = zero_to_n(train_size)
x = alloc_matrix(784, minibatch_size)
y = alloc_matrix(10, minibatch_size)

Guessed minibatch_size = 16


In [15]:
NEPOCHS = 5
for epoch in range(NEPOCHS):
    shuffle(shuffled_idx, train_size)
    nbatches = (train_size // minibatch_size) * minibatch_size
    batch_iter = range(0, nbatches, minibatch_size)
    ce_total = 0.0
    n_train_batches = 0
    acc = accuracy_parallel_v3(nn, test_img, test_label, minibatch_size)
    desc = f'Epoch {epoch} - Acc: {acc:.2f}%'
    for i in tqdm(batch_iter, desc=desc):
        batch_indices = shuffled_idx[i : i + minibatch_size]
        populate_parallel(x, y, batch_indices, train_img, train_label)
        set_input_parallel(nn, x)
        forward_parallel_v3(nn)
        y_pred = nn.layers[-1].activations
        ce_batch = cross_entropy_parallel(y_pred, y)
        ce_total += ce_batch
        n_train_batches += 1
        backward_parallel(nn, y)
    ce_mean = ce_total / n_train_batches
    acc = accuracy_parallel_v3(nn, test_img, test_label, minibatch_size)
    print(f'Epoch {epoch} - Acc: {acc:.2f}%, CE: {ce_mean:.4f}')

acc_end = accuracy_parallel_v3(nn, test_img, test_label, minibatch_size)
ce_end = cross_entropy_parallel(y_pred, y)
print("Final Model : Accuracy = {:.2f}%, Cross-Entropy Error = {:.4f}".format(acc_end, ce_end))

Epoch 0 - Acc: 9.53%: 100%|██████████| 3750/3750 [02:36<00:00, 23.90it/s]


Epoch 0 - Acc: 10.93%, CE: 2.2274


Epoch 1 - Acc: 10.89%: 100%|██████████| 3750/3750 [02:12<00:00, 28.26it/s]


Epoch 1 - Acc: 12.05%, CE: 2.2343


Epoch 2 - Acc: 12.07%: 100%|██████████| 3750/3750 [02:37<00:00, 23.83it/s]


Epoch 2 - Acc: 12.07%, CE: 3.7070


Epoch 3 - Acc: 11.92%: 100%|██████████| 3750/3750 [02:35<00:00, 24.05it/s]


Epoch 3 - Acc: 13.19%, CE: 2.1315


Epoch 4 - Acc: 13.07%: 100%|██████████| 3750/3750 [02:23<00:00, 26.13it/s]


Epoch 4 - Acc: 14.27%, CE: 2.1158
Final Model : Accuracy = 14.34%, Cross-Entropy Error = 2.1599
