In [1]:
import numpy as np
from tqdm import tqdm
import math
import os
import matplotlib.pyplot as plt
import numba as nb
from numba import cuda


 ## 1. Lecture des fichiers MNIST

In [2]:
def make_uint32(byte_array):
    """ Recompose un entier 32 bits à partir de 4 octets de poids fort à poids faible """
    return ((byte_array[0] << 24) 
          | (byte_array[1] << 16) 
          | (byte_array[2] <<  8) 
          | (byte_array[3] <<  0))

def read_labels(filename):
    """ Lit un fichier de labels MNIST """
    with open(filename, 'rb') as f:
        _ = f.read(4)  # Magic number (non utilisé)
        n_bytes = f.read(4)
        n = make_uint32(n_bytes)
        labels = np.frombuffer(f.read(n), dtype=np.uint8)
    return labels

def read_images(filename):
    """ Lit un fichier d'images MNIST """
    with open(filename, 'rb') as f:
        _ = f.read(4)  # Magic number (non utilisé)
        n_bytes = f.read(4)
        n = make_uint32(n_bytes)
        row_bytes = f.read(4)
        col_bytes = f.read(4)
        rows = make_uint32(row_bytes)
        cols = make_uint32(col_bytes)
        images_raw = f.read(n * rows * cols)
        images = np.frombuffer(images_raw, dtype=np.uint8)
        images = images.reshape(n, rows * cols)
    return images


 ## 2. Fonctions utilitaires

In [3]:
def zero_to_n(n):
    """ Crée un tableau [0, 1, 2, ..., n-1] """
    return np.arange(n, dtype=np.uint32)

def shuffle(t, number_of_switch):
    """ Mélange un tableau t aléatoirement, en réalisant 'number_of_switch' échanges """
    size = len(t)
    for _ in range(number_of_switch):
        x = np.random.randint(0, size)
        y = np.random.randint(0, size)
        tmp = t[x]
        t[x] = t[y]
        t[y] = tmp
        
def init_sigma(nneurons_prev):
    return 1.0 / np.sqrt(nneurons_prev)

def sigmoid(x):
    """ Fonction d'activation vectorisée """
    return 1.0 / (1.0 + np.exp(-x))

def dsigmoid(x):
    """ Dérivée de sigmoid vectorisée """
    s = sigmoid(x)
    return s * (1.0 - s)


 ## 3. Fonctions Matricielles

In [4]:
def alloc_matrix(rows, columns):
    # En C, on fait un malloc puis un tableau 1D rows*columns.
    # En Python, on crée un np.ndarray de shape (rows, columns).
    return np.zeros((rows, columns), dtype=np.float64)


 ### 3.3. Version Parallèle

In [5]:
@cuda.jit
def matrix_dot_cuda(m1, m2, res, nrow1, ncol1, ncol2):
    row, col = cuda.grid(2)
    if row < nrow1 and col < ncol2:
        tmp = 0.0
        for k in range(ncol1):
            tmp += m1[row, k] * m2[k, col]
        res[row, col] = tmp

@cuda.jit
def matrix_sum_cuda(m1, m2, res, nrow, ncol):
    row, col = cuda.grid(2)
    if row < nrow and col < ncol:
        res[row, col] = m1[row, col] + m2[row, col]

@cuda.jit
def matrix_minus_cuda(m1, m2, res, nrow, ncol):
    row, col = cuda.grid(2)
    if row < nrow and col < ncol:
        res[row, col] = m1[row, col] - m2[row, col]

@cuda.jit
def hadamard_product_cuda(m1, m2, res, nrow, ncol):
    row, col = cuda.grid(2)
    if row < nrow and col < ncol:
        res[row, col] = m1[row, col] * m2[row, col]

@cuda.jit
def matrix_transpose_cuda(m, res, nrow, ncol):
    row, col = cuda.grid(2)
    if row < nrow and col < ncol:
        res[col, row] = m[row, col]

@cuda.jit
def matrix_scalar_cuda(m, res, scalar, nrow, ncol):
    row, col = cuda.grid(2)
    if row < nrow and col < ncol:
        res[row, col] = m[row, col] * scalar

@cuda.jit
def matrix_memcpy_cuda(dest, src, nrow, ncol):
    row, col = cuda.grid(2)
    if row < nrow and col < ncol:
        dest[row, col] = src[row, col]
        
@cuda.jit
def matrix_function_id_cuda(m_in, m_out, nrow, ncol, func_id):
    """
    Applique la fonction correspondant à func_id :
      0 -> exp
      1 -> sigmoid
      2 -> dsigmoid
      ...
    """
    row, col = cuda.grid(2)
    if row < nrow and col < ncol:
        val = m_in[row, col]
        if func_id == 0:
            result = math.exp(val)
        elif func_id == 1:
            result = 1.0 / (1.0 + math.exp(-val))
        elif func_id == 2:
            s = 1.0 / (1.0 + math.exp(-val))
            result = s * (1.0 - s)
        else:
            result = val
        m_out[row, col] = result

def matrix_dot_parallel(m1, m2):
    nrow1, ncol1 = m1.shape; nrow2, ncol2 = m2.shape; 
    if ncol1 != nrow2: raise ValueError("Incompatible matrix dimensions")
    m1_gpu = cuda.to_device(m1)
    m2_gpu = cuda.to_device(m2)
    res_gpu = cuda.device_array((nrow1, ncol2), dtype=np.float64)
    threads_per_block = (16,16)
    blocks_per_grid = (math.ceil(ncol2/16), math.ceil(nrow1/16))
    matrix_dot_cuda[blocks_per_grid, threads_per_block](m1_gpu, m2_gpu, res_gpu, nrow1, ncol1, ncol2)
    return res_gpu.copy_to_host()

def matrix_sum_parallel(m1, m2):
    nrow, ncol = m1.shape
    m1_gpu = cuda.to_device(m1); m2_gpu = cuda.to_device(m2)
    res_gpu = cuda.device_array((nrow, ncol), dtype=np.float64)
    threads_per_block = (16,16)
    blocks_per_grid = (math.ceil(nrow/16), math.ceil(ncol/16))
    matrix_sum_cuda[blocks_per_grid, threads_per_block](m1_gpu, m2_gpu, res_gpu, nrow, ncol)
    return res_gpu.copy_to_host()

def matrix_minus_parallel(m1, m2):
    nrow, ncol = m1.shape
    m1_gpu = cuda.to_device(m1); m2_gpu = cuda.to_device(m2)
    res_gpu = cuda.device_array((nrow, ncol), dtype=np.float64)
    threads_per_block = (16,16)
    blocks_per_grid = (math.ceil(nrow/16), math.ceil(ncol/16))
    matrix_minus_cuda[blocks_per_grid, threads_per_block](m1_gpu, m2_gpu, res_gpu, nrow, ncol)
    return res_gpu.copy_to_host()

def hadamard_product_parallel(m1, m2):
    nrow, ncol = m1.shape
    m1_gpu = cuda.to_device(m1); m2_gpu = cuda.to_device(m2)
    res_gpu = cuda.device_array((nrow, ncol), dtype=np.float64)
    threads_per_block = (16,16)
    blocks_per_grid = (math.ceil(nrow/16), math.ceil(ncol/16))
    hadamard_product_cuda[blocks_per_grid, threads_per_block](m1_gpu, m2_gpu, res_gpu, nrow, ncol)
    return res_gpu.copy_to_host()

def matrix_transpose_parallel(m):
    nrow, ncol = m.shape
    m_gpu = cuda.to_device(m)
    res_gpu = cuda.device_array((ncol, nrow), dtype=np.float64)
    threads_per_block = (16,16)
    blocks_per_grid = (math.ceil(nrow/16), math.ceil(ncol/16))
    matrix_transpose_cuda[blocks_per_grid, threads_per_block](m_gpu, res_gpu, nrow, ncol)
    return res_gpu.copy_to_host()

def matrix_scalar_parallel(m, s):
    nrow, ncol = m.shape
    m_gpu = cuda.to_device(m)
    res_gpu = cuda.device_array((nrow, ncol), dtype=np.float64)
    threads_per_block = (16,16)
    blocks_per_grid = (math.ceil(nrow/16), math.ceil(ncol/16))
    matrix_scalar_cuda[blocks_per_grid, threads_per_block](m_gpu, res_gpu, s, nrow, ncol)
    return res_gpu.copy_to_host()

def matrix_memcpy_parallel(dest, src):
    nrow, ncol = src.shape
    dest_gpu = cuda.to_device(dest); src_gpu = cuda.to_device(src)
    threads_per_block = (16,16)
    blocks_per_grid = (math.ceil(nrow/16), math.ceil(ncol/16))
    matrix_memcpy_cuda[blocks_per_grid, threads_per_block](dest_gpu, src_gpu, nrow, ncol)
    dest[:] = dest_gpu.copy_to_host()

def matrix_function_parallel(m1, func_name):
    func_id = 0 if func_name == "exp" else \
              1 if func_name == "sigmoid" else \
              2 if func_name == "dsigmoid" else \
              None
    if func_id is None: raise ValueError(f"Unknown function ID : {func_name}")
    nrow, ncol = m1.shape
    m1_gpu = cuda.to_device(m1)
    res_gpu = cuda.device_array((nrow, ncol), dtype=np.float64)
    threads_per_block = (16,16)
    blocks_per_grid = (math.ceil(nrow/16), math.ceil(ncol/16))
    matrix_function_id_cuda[blocks_per_grid, threads_per_block](m1_gpu, res_gpu, nrow, ncol, func_id)
    return res_gpu.copy_to_host()


 ## 4. Réseau de Neurones

In [6]:
class Layer:
    def __init__(self, layer_number, number_of_neurons, nneurons_previous_layer, minibatch_size):
        self.number_of_neurons = number_of_neurons
        self.minibatch_size = minibatch_size
        self.activations = alloc_matrix(number_of_neurons, minibatch_size)
        self.z           = alloc_matrix(number_of_neurons, minibatch_size)
        self.delta       = alloc_matrix(number_of_neurons, minibatch_size)
        self.weights     = alloc_matrix(number_of_neurons, nneurons_previous_layer)
        self.biases      = alloc_matrix(number_of_neurons, 1)
        
        if layer_number > 0:
            self.init_weight(nneurons_previous_layer)
    
    def init_weight(self, nneurons_prev):
        sigma = init_sigma(nneurons_prev)
        r, c = self.weights.shape
        self.weights = np.random.normal(0.0, sigma, size=(r, c))

class ANN:
    def __init__(self, alpha, minibatch_size, number_of_layers, nneurons_per_layer):
        self.alpha = alpha
        self.minibatch_size = minibatch_size
        self.number_of_layers = number_of_layers
        self.layers = []
        for i in range(number_of_layers):
            if i == 0:
                self.layers.append(
                    Layer(i, nneurons_per_layer[i], 
                          nneurons_per_layer[i],  
                          minibatch_size)
                )
            else:
                self.layers.append(
                    Layer(i, nneurons_per_layer[i],
                          nneurons_per_layer[i-1],
                          minibatch_size)
                )


In [7]:
def set_input_parallel(nn, input_matrix):
    matrix_memcpy_parallel(nn.layers[0].activations, input_matrix)

def forward_parallel(nn):
    for l in range(1, nn.number_of_layers):
        layer_l = nn.layers[l]
        layer_prev = nn.layers[l-1]
        z1 = matrix_dot_parallel(layer_l.weights, layer_prev.activations)
        ones = np.ones((1, nn.minibatch_size), dtype=np.float64)
        z2 = matrix_dot_parallel(layer_l.biases, ones)
        layer_l.z = matrix_sum_parallel(z1, z2)
        layer_l.activations = matrix_function_parallel(layer_l.z, "sigmoid")

def backward_parallel(nn, y):
    L = nn.number_of_layers - 1
    layer_L = nn.layers[L]
    tmp = matrix_minus_parallel(layer_L.activations, y)
    dfzL = matrix_function_parallel(layer_L.z, "dsigmoid")
    layer_L.delta = hadamard_product_parallel(tmp, dfzL)
    for l in range(L, 1, -1):
        layer_l = nn.layers[l]
        layer_lm1 = nn.layers[l-1]
        w_l_transp = matrix_transpose_parallel(layer_l.weights)
        delta_tmp = matrix_dot_parallel(w_l_transp, layer_l.delta)
        dfz = matrix_function_parallel(layer_lm1.z, "dsigmoid")
        layer_lm1.delta = hadamard_product_parallel(delta_tmp, dfz)
    for l in range(1, nn.number_of_layers):
        layer_l = nn.layers[l]
        layer_lm1 = nn.layers[l-1]
        a_lm1_transp = matrix_transpose_parallel(layer_lm1.activations)
        w1 = matrix_dot_parallel(layer_l.delta, a_lm1_transp)
        w1 = matrix_scalar_parallel(w1, nn.alpha / nn.minibatch_size)
        layer_l.weights = matrix_minus_parallel(layer_l.weights, w1)
        ones = np.ones((nn.minibatch_size, 1), dtype=np.float64)
        b1 = matrix_dot_parallel(layer_l.delta, ones)
        b1 = matrix_scalar_parallel(b1, nn.alpha / nn.minibatch_size)
        layer_l.biases = matrix_minus_parallel(layer_l.biases, b1)


 ## 5. Fonctions d'entraînement (Version Parallèle)

In [8]:
def populate_parallel(x, y, minibatch_idx, train_img, train_label):
    """
    Remplit les matrices x et y avec le mini-batch.
    x -> shape (784, minibatch_size)
    y -> shape (10, minibatch_size)
    """
    x_batch = train_img[minibatch_idx].astype(np.float64) / 255.0
    matrix_memcpy_parallel(x, x_batch.T)  # Transpose pour avoir (784, batch_size)
    y.fill(0.0)
    indices = train_label[minibatch_idx]
    y[indices, np.arange(len(minibatch_idx))] = 1.0

def accuracy_parallel(nn, test_img, test_label, minibatch_size):
    """
    Compute the accuracy (%) on the test set using parallel matrix operations.
    """
    test_size = test_img.shape[0]
    nbatches = (test_size // minibatch_size) * minibatch_size
    correct = 0
    for i in range(0, nbatches, minibatch_size):
        batch_indices = np.arange(i, i + minibatch_size)
        x = test_img[batch_indices].T.astype(np.float64) / 255.0
        set_input_parallel(nn, x)
        forward_parallel(nn)
        preds = np.argmax(nn.layers[-1].activations, axis=0)
        correct += np.sum(preds == test_label[batch_indices])
    return (100.0 * correct) / nbatches

def cross_entropy_parallel(y_pred, y_true, eps=1e-12):
    """
    Compute cross-entropy error for a mini-batch using parallel operations.
    """
    y_pred = np.clip(y_pred, eps, 1.0 - eps)
    return -np.sum(y_true * np.log(y_pred)) / y_true.shape[1]


 ## 6. Lecture des données MNIST

In [9]:
DATA_PATH = "DATA"

train_img = read_images(os.path.join(DATA_PATH, "train-images.idx3-ubyte"))
train_label = read_labels(os.path.join(DATA_PATH, "train-labels.idx1-ubyte"))
test_img = read_images(os.path.join(DATA_PATH, "t10k-images.idx3-ubyte"))
test_label = read_labels(os.path.join(DATA_PATH, "t10k-labels.idx1-ubyte"))

train_size = train_img.shape[0]
test_size = test_img.shape[0]
print(f"Nombre de données d'entraînement : {train_size}")
print(f"Nombre de données de test : {test_size}")


Nombre de données d'entraînement : 60000
Nombre de données de test : 10000


### 6.4. Entraînement du réseau (Version Parallèle)

In [None]:
alpha = 0.05
minibatch_size = 16
number_of_layers = 3
nneurons_per_layer = [784, 30, 10]  # 28*28 = 784
nn = ANN(alpha, minibatch_size, number_of_layers, nneurons_per_layer)

shuffled_idx = zero_to_n(train_size)
x = alloc_matrix(784, minibatch_size)
y = alloc_matrix(10, minibatch_size)

NEPOCHS = 5

for epoch in range(NEPOCHS):
    shuffle(shuffled_idx, train_size)
    nbatches = (train_size // minibatch_size) * minibatch_size
    batch_iter = range(0, nbatches, minibatch_size)
    ce_total = 0.0
    n_train_batches = 0
    acc = accuracy_parallel(nn, test_img, test_label, minibatch_size)
    desc = f'Epoch {epoch} - Acc: {acc:.2f}%'
    for i in tqdm(batch_iter, desc=desc):
        batch_indices = shuffled_idx[i : i + minibatch_size]
        populate_parallel(x, y, batch_indices, train_img, train_label)
        set_input_parallel(nn, x)
        forward_parallel(nn)
        y_pred = nn.layers[-1].activations  
        ce_batch = cross_entropy_parallel(y_pred, y)
        ce_total += ce_batch
        n_train_batches += 1
        backward_parallel(nn, y)
    ce_mean = ce_total / n_train_batches
    acc = accuracy_parallel(nn, test_img, test_label, minibatch_size)
    print(f'Epoch {epoch} - Acc: {acc:.2f}%, CE: {ce_mean:.4f}')

acc_end = accuracy_parallel(nn, test_img, test_label, minibatch_size)
ce_end = cross_entropy_parallel(y_pred, y)
print("Final Model : Accuracy = {:.2f}%, Cross-Entropy Error = {:.4f}".format(acc_end, ce_end))


Epoch 0 - Acc: 10.40%: 100%|██████████| 3750/3750 [02:05<00:00, 29.84it/s]


Epoch 0 - Acc: 13.79%, CE: 2.2423


Epoch 1 - Acc: 14.53%: 100%|██████████| 3750/3750 [02:00<00:00, 31.14it/s]


Epoch 1 - Acc: 15.84%, CE: 2.3240


Epoch 2 - Acc: 16.67%: 100%|██████████| 3750/3750 [01:54<00:00, 32.82it/s]


Epoch 2 - Acc: 17.92%, CE: 2.2923


Epoch 3 - Acc: 18.88%: 100%|██████████| 3750/3750 [01:56<00:00, 32.23it/s]


Epoch 3 - Acc: 19.87%, CE: 2.2639


Epoch 4 - Acc: 20.26%:  63%|██████▎   | 2380/3750 [01:12<00:51, 26.41it/s]