<a href="https://colab.research.google.com/github/AndreSlavescu/Intermediate-Gauss-Seidel-Decoding/blob/main/Intermediate_Gauss_Seidel_Decoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Wed Jan 24 00:36:44 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Install PyCUDA with the %pip magic so the package lands in the environment
# of the running kernel (a `!pip` subshell may resolve to a different Python).
%pip install pycuda

In [68]:
import numpy as np
import numpy.linalg as la
import random

import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
from pycuda.driver import Event

# typing
from typing import List, Tuple

# Kernel code
"""
Idea:

Within a single iteration, the Gauss-Seidel iteration method can compute the
entries of the vector x in Ax = b in parallel, but it is inherently sequential
across n > 1 iterations, because each iteration depends on the previous one.
The idea with the kernel below is to perform a sort of "jump iteration",
where even indices of x are computed on even iterations and odd indices on odd iterations,
allowing for two-fold parallelism in the convergence towards the solution.
The implementation below, together with the test for 100 iterations, suggests that this approach
may be effective and could be applied to problems such as parallel token decoding in LLMs,
as seen in lookahead decoding (https://lmsys.org/blog/2023-11-21-lookahead-decoding/).
"""


kernel_code = '''
// Jump-iteration relaxation for A x = b: even-indexed unknowns are refreshed on
// even iterations and odd-indexed unknowns on odd iterations, ping-ponging
// between the two solution buffers x_1 and x_2.
//
//   A          : size * size coefficients, row-major (A[row * size + col])
//   b          : right-hand side, size entries
//   x_1, x_2   : solution buffers, size entries each; both are zeroed here,
//                so any initial guess stored in x_1 is discarded
//   size       : number of unknowns
//   iterations : number of half-sweeps; the final values always end up in x_1
//
// NOTE: __syncthreads() only synchronizes the threads of ONE block, so the
// result is only well defined when the whole system fits in a single block
// (size <= blockDim.x).
__global__ void jump_iteration_gauss_seidel(float *A, float *b, float *x_1, float *x_2, int size, int iterations) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    // Out-of-range threads must not leave the kernel before the barriers below:
    // every thread of a block has to reach each __syncthreads(), so their work
    // is masked with this flag instead of exiting early.
    bool active = (index < size);

    // Parity of the unknown owned by this thread; it does not change between
    // iterations, so it is computed once.
    bool is_even_index = (index % 2 == 0);

    float *x_read, *x_write;

    if (active) {
        // initialize both buffers
        x_1[index] = 0.0f;
        x_2[index] = 0.0f;
    }
    __syncthreads();

    #pragma unroll
    for (int iter = 0; iter < iterations; ++iter) {
        bool even_iteration = (iter % 2 == 0);

        // Determine the read and write buffers
        if (even_iteration) {
            x_read = x_1;
            x_write = x_2;
        } else {
            x_read = x_2;
            x_write = x_1;
        }

        if (active) {
            // jump iteration update logic: refresh this unknown only when its
            // parity matches the parity of the iteration
            if (even_iteration == is_even_index) {
                float sum = 0.0f;

                #pragma unroll
                for (int j = 0; j < size; ++j) {
                    if (j != index) {
                        sum += A[index * size + j] * x_read[j];
                    }
                }
                x_write[index] = (b[index] - sum) / A[index * size + index];
            } else {
                // carry the untouched value over so x_write is complete
                x_write[index] = x_read[index];
            }
        }

        // all writes of this iteration must land before the buffers swap roles
        __syncthreads();
    }

    // assign latest values to x_1 (an odd iteration count finishes in x_2)
    if (active && iterations % 2 != 0) {
        x_1[index] = x_2[index];
    }
}
'''


# Compile the kernel code
# `mod` holds the module built from the CUDA source in `kernel_code`;
# the kernel entry point is then looked up by its __global__ function name and
# bound to a Python callable that run_gauss_seidel_gpu launches.
mod = SourceModule(kernel_code)
jump_iteration_gauss_seidel = mod.get_function("jump_iteration_gauss_seidel")

def run_gauss_seidel_gpu(
    A: np.ndarray,
    b: np.ndarray,
    x: np.ndarray,
    size: int,
    iterations: int
  ) -> np.ndarray:
    """Run the ``jump_iteration_gauss_seidel`` kernel for ``A x = b`` and return ``x``.

    Args:
        A: Coefficient matrix with at least ``size * size`` entries, laid out
            row-major. Converted to contiguous float32 if it is not already,
            because the kernel reads the raw buffer as ``float``.
        b: Right-hand side with at least ``size`` entries (float32).
        x: Output buffer with at least ``size`` entries (float32). Its contents
            are uploaded, but the kernel zeroes both solution buffers before
            iterating, so they do not act as an initial guess.
        size: Number of unknowns.
        iterations: Number of kernel iterations to perform.

    Returns:
        The solution estimate. This is ``x`` itself (filled in place) when ``x``
        is already a contiguous float32 array, otherwise a float32 copy.

    Raises:
        ValueError: If ``A``, ``b`` or ``x`` is too small for ``size`` unknowns.

    Side effects:
        Prints the measured kernel execution time in milliseconds.
    """
    import warnings

    # The kernel indexes raw float32 memory; any other dtype or a strided view
    # would be misread silently. These calls are no-ops for conforming inputs.
    A = np.ascontiguousarray(A, dtype=np.float32)
    b = np.ascontiguousarray(b, dtype=np.float32)
    x = np.ascontiguousarray(x, dtype=np.float32)

    size = int(size)
    if A.size < size * size or b.size < size or x.size < size:
        raise ValueError(
            f"buffers too small for size={size}: "
            f"A has {A.size} entries, b has {b.size}, x has {x.size}"
        )

    block_size = 256
    grid_size = (size + block_size - 1) // block_size
    if grid_size > 1:
        # The kernel separates iterations with __syncthreads(), which only
        # synchronizes threads within one block.
        warnings.warn(
            f"size={size} needs {grid_size} thread blocks, but the kernel only "
            "synchronizes within a block; results may be inaccurate",
            RuntimeWarning,
            stacklevel=2,
        )

    device_buffers = []
    try:
        # A, b and the two solution buffers (both sized like x)
        for host_array in (A, b, x, x):
            device_buffers.append(cuda.mem_alloc(host_array.nbytes))
        A_gpu, b_gpu, x_gpu_1, x_gpu_2 = device_buffers

        # host to device copy
        cuda.memcpy_htod(A_gpu, A)
        cuda.memcpy_htod(b_gpu, b)
        cuda.memcpy_htod(x_gpu_1, x)

        # Time a single kernel launch that performs all requested iterations
        start = cuda.Event()
        end = cuda.Event()
        start.record()

        jump_iteration_gauss_seidel(A_gpu, b_gpu, x_gpu_1, x_gpu_2, np.int32(size), np.int32(iterations),
                                  block=(block_size, 1, 1), grid=(grid_size, 1))

        end.record()
        end.synchronize()

        # Calculate the elapsed time
        elapsed_time = start.time_till(end)
        print(f"Kernel execution time: {elapsed_time} milliseconds")

        # device to host copy (the kernel leaves the final values in x_1)
        cuda.memcpy_dtoh(x, x_gpu_1)
    finally:
        # Release device memory deterministically, even if a step above failed
        for device_buffer in device_buffers:
            device_buffer.free()

    return x

def generate_test_equation(size: int) -> Tuple[np.array, np.array]:
    """Build a random float32 system ``(A, b)`` of dimension ``size``.

    ``A`` starts as uniform samples in [0, 1) and then has ``size`` added to
    every diagonal entry, which makes it diagonally dominant so that the
    iterative solver converges. ``b`` is a uniform float32 vector.
    """
    matrix = np.random.rand(size, size).astype(np.float32)

    # Shift the whole diagonal in one indexed operation
    diagonal = np.arange(size)
    matrix[diagonal, diagonal] += size

    rhs = np.random.rand(size).astype(np.float32)
    return matrix, rhs

def test_gauss_seidel_gpu(
    test_cases: int = 20,
    size_range: Tuple[int, int] = (3, 100),
    iterations: int = 100
  ) -> bool:
    """Check the GPU solver against ``numpy.linalg.solve`` on random systems.

    For each of ``test_cases`` rounds a system size is drawn from
    ``size_range`` (inclusive), a random system is generated, and the GPU
    result after ``iterations`` iterations is compared with the direct
    solution using an absolute tolerance of 1e-3.

    Returns:
        ``True`` if every case matches; ``False`` as soon as one does not
        (after printing both results and the norm of their difference).
    """
    for _case in range(test_cases):
        size = random.randint(*size_range)
        matrix, rhs = generate_test_equation(size)

        # A fresh all-zero vector is handed to the solver as its x buffer
        x_gpu = run_gauss_seidel_gpu(matrix, rhs, np.zeros_like(rhs), rhs.shape[0], iterations)
        x_real = la.solve(matrix, rhs)

        # Compare the results
        if np.allclose(x_gpu, x_real, atol=1e-3):
            print(f"Test Passed for size: {size}\n")
            continue

        # First mismatch: report it and stop
        print(f"Test Failed for size {size}")
        print("GPU Result:", x_gpu)
        print("True Result:", x_real)
        print(f"Difference: {np.linalg.norm(x_gpu - x_real)}\n")
        return False

    print("\n\n#################\n\nAll tests passed! \n\n#################")
    return True

if __name__ == "__main__":
  # Run the test harness with its default settings
  # (20 random systems, sizes 3..100, 100 iterations each).
  test_gauss_seidel_gpu()

Kernel execution time: 0.2595199942588806 milliseconds
Test Passed for size: 27

Kernel execution time: 0.6021119952201843 milliseconds
Test Passed for size: 78

Kernel execution time: 0.3318080008029938 milliseconds
Test Passed for size: 47

Kernel execution time: 0.30163198709487915 milliseconds
Test Passed for size: 41

Kernel execution time: 0.753600001335144 milliseconds
Test Passed for size: 92

Kernel execution time: 0.15772800147533417 milliseconds
Test Passed for size: 16

Kernel execution time: 0.30559998750686646 milliseconds
Test Passed for size: 44

Kernel execution time: 0.16291199624538422 milliseconds
Test Passed for size: 15

Kernel execution time: 0.3742719888687134 milliseconds
Test Passed for size: 56

Kernel execution time: 0.8007680177688599 milliseconds
Test Passed for size: 97

Kernel execution time: 0.2457599937915802 milliseconds
Test Passed for size: 31

Kernel execution time: 0.3624959886074066 milliseconds
Test Passed for size: 53

Kernel execution time: 0.