In [1]:
import cupy as cp
import numpy as np
import time
import random

# Define baseline matrix multiplication using NumPy
def baseline_matmul(A, B):
    return np.dot(A, B)

# CUDA Kernel Templates
CUDA_KERNEL_TEMPLATES = [
    """
    extern "C" __global__
    void matmul_kernel(float *A, float *B, float *C, int N) {
        int row = blockIdx.y * blockDim.y + threadIdx.y;
        int col = blockIdx.x * blockDim.x + threadIdx.x;

        if(row < N && col < N) {
            float sum = 0.0;
            for(int k = 0; k < N; k++) {
                sum += A[row * N + k] * B[k * N + col];
            }
            C[row * N + col] = sum;
        }
    }
    """,
    """
    extern "C" __global__
    void matmul_kernel(float *A, float *B, float *C, int N) {
        __shared__ float As[16][16];
        __shared__ float Bs[16][16];

        int row = blockIdx.y * blockDim.y + threadIdx.y;
        int col = blockIdx.x * blockDim.x + threadIdx.x;
        float sum = 0.0;

        for (int tile = 0; tile < N / 16; ++tile) {
            if (row < N && (tile * 16 + threadIdx.x) < N) {
                As[threadIdx.y][threadIdx.x] = A[row * N + (tile * 16 + threadIdx.x)];
            } else {
                As[threadIdx.y][threadIdx.x] = 0.0f;
            }
            if ((tile * 16 + threadIdx.y) < N && col < N) {
                Bs[threadIdx.y][threadIdx.x] = B[(tile * 16 + threadIdx.y) * N + col];
            } else {
                Bs[threadIdx.y][threadIdx.x] = 0.0f;
            }
            __syncthreads();

            for (int k = 0; k < 16; ++k)
                sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];
            __syncthreads();
        }
        if (row < N && col < N) {
            C[row * N + col] = sum;
        }
    }
    """
]

# Compile CUDA kernels
def compile_cuda_kernel(kernel_code):
    try:
        return cp.RawKernel(kernel_code, "matmul_kernel")
    except Exception as e:
        print(f"CUDA Compilation Error: {e}")
        return None

# Swarm Optimization
class SwarmCUDAOptimizer:
    def __init__(self, N=64, num_agents=5, max_iters=10):
        self.N = N  # Matrix size
        self.num_agents = num_agents
        self.max_iters = max_iters
        self.knowledge_archive = []

    def generate_random_kernel(self):
        # Select a random kernel template and modify block sizes
        base_kernel = random.choice(CUDA_KERNEL_TEMPLATES)
        block_size = random.choice([8, 16, 32])
        modified_kernel = base_kernel.replace("16", str(block_size))  # Adjust shared memory sizes
        return modified_kernel, block_size

    def evaluate_kernel(self, kernel_code, block_size, A, B):
        try:
            kernel = compile_cuda_kernel(kernel_code)
            if kernel is None:
                return float('inf')  # Compilation failure

            C = cp.zeros((self.N, self.N), dtype=cp.float32)
            d_A = cp.asarray(A)
            d_B = cp.asarray(B)
            d_C = cp.asarray(C)
            grid_size = ( (self.N + block_size - 1) // block_size, (self.N + block_size - 1) // block_size)  # Correct grid size
            block_size_tuple = (block_size, block_size)  # Use a tuple for block size

            start = time.time()
            kernel(grid_size, block_size_tuple, (d_A, d_B, d_C, self.N))
            cp.cuda.Device(0).synchronize()
            end = time.time()

            execution_time = end - start
            print(f"Kernel Execution Time: {execution_time:.6f}s | Block Size: {block_size} | Grid Size: {grid_size}")
            return execution_time  # Ensure only a single float value is returned

        except Exception as e:
            print(f"CUDA Execution Error: {e}")
            return float('inf')  # If kernel fails, assign worst time


    def optimize(self):
        A = np.random.rand(self.N, self.N).astype(np.float32)
        B = np.random.rand(self.N, self.N).astype(np.float32)

        best_time = float('inf')
        best_kernel = None

        for iter in range(self.max_iters):
            print(f"Iteration {iter+1}/{self.max_iters}")
            candidates = []

            for agent in range(self.num_agents):
                kernel_code, block_size = self.generate_random_kernel()
                exec_time = self.evaluate_kernel(kernel_code, block_size, A, B)

                # No need to check type, evaluate_kernel handles errors
                if exec_time < best_time:
                    best_time = exec_time
                    best_kernel = kernel_code
                    print(f"New best kernel found by agent {agent} with time {best_time:.6f}s")

                candidates.append((exec_time, kernel_code, block_size))  # Always append

            # Swarm-inspired selection: keep top 3
            candidates.sort()  # Sorts by the first element (exec_time)
            self.knowledge_archive.append(candidates[:3])

        return best_kernel

# Run the swarm-based optimizer with debugging
optimizer = SwarmCUDAOptimizer(N=64, num_agents=5, max_iters=10)
best_kernel_code = optimizer.optimize()

# Show the best discovered CUDA kernel
print("\nBest Discovered CUDA Kernel:\n")
print(best_kernel_code)

Iteration 1/10
Kernel Execution Time: 0.143615s | Block Size: 32 | Grid Size: (2, 2)
New best kernel found by agent 0 with time 0.143615s
Kernel Execution Time: 0.000145s | Block Size: 32 | Grid Size: (2, 2)
New best kernel found by agent 1 with time 0.000145s
Kernel Execution Time: 0.000026s | Block Size: 8 | Grid Size: (8, 8)
New best kernel found by agent 2 with time 0.000026s
Kernel Execution Time: 0.054827s | Block Size: 16 | Grid Size: (4, 4)
Kernel Execution Time: 0.052365s | Block Size: 8 | Grid Size: (8, 8)
Iteration 2/10
Kernel Execution Time: 0.000025s | Block Size: 32 | Grid Size: (2, 2)
New best kernel found by agent 0 with time 0.000025s
Kernel Execution Time: 0.000018s | Block Size: 16 | Grid Size: (4, 4)
New best kernel found by agent 1 with time 0.000018s
Kernel Execution Time: 0.000014s | Block Size: 8 | Grid Size: (8, 8)
New best kernel found by agent 2 with time 0.000014s
Kernel Execution Time: 0.000013s | Block Size: 8 | Grid Size: (8, 8)
New best kernel found by a

In [2]:
import cupy as cp
import numpy as np
import time
import random

def check_cuda_availability():
    """Check if CUDA is available and initialize the device"""
    try:
        device = cp.cuda.Device(0)  # Get the current device
        device.synchronize()  # Ensure device is ready
        return True
    except cp.cuda.runtime.CUDARuntimeError as e:
        print(f"CUDA Error: {e}")
        return False
    except Exception as e:  # Catch other potential exceptions
        print(f"Unexpected CUDA initialization error: {e}")
        return False

def generate_kernel_template(block_size):
    return f"""
    extern "C" __global__
    void matmul_kernel(const float *A, const float *B, float *C, const int N) {{
        const int tx = threadIdx.x;
        const int ty = threadIdx.y;
        const int bx = blockIdx.x;
        const int by = blockIdx.y;

        const int row = by * {block_size} + ty;
        const int col = bx * {block_size} + tx;

        if(row < N && col < N) {{
            float sum = 0.0f;
            for(int k = 0; k < N; k++) {{
                sum += A[row * N + k] * B[k * N + col];
            }}
            C[row * N + col] = sum;
        }}
    }}
    """

class SwarmCUDAOptimizer:
    def __init__(self, N=64):
        if not check_cuda_availability():
            print("CUDA is not available. Exiting.")
            self.N = None  # Set N to None to indicate CUDA failure
            return

        self.N = N
        self.best_kernel = None
        self.best_block_size = None
        self.best_time = float('inf')

        print("Generating test data...")
        self.A_host = np.random.rand(self.N, self.N).astype(np.float32)
        self.B_host = np.random.rand(self.N, self.N).astype(np.float32)

        try:
            with cp.cuda.Device(0):
                self.A = cp.asarray(self.A_host)
                self.B = cp.asarray(self.B_host)
                print("Data successfully transferred to GPU")
        except Exception as e:
            print(f"Failed to transfer data to GPU: {e}")  #Print and return
            self.N = None #Set N to none to avoid operations
            return

    def evaluate_kernel(self, block_size):
        try:
            kernel_code = generate_kernel_template(block_size)
            kernel = cp.RawKernel(kernel_code, 'matmul_kernel')

            C = cp.zeros((self.N, self.N), dtype=cp.float32)
            grid_dim = (self.N + block_size - 1) // block_size

            with cp.cuda.Device(0):
                start = time.perf_counter()
                kernel((grid_dim, grid_dim), (block_size, block_size), (self.A, self.B, C, self.N))
                cp.cuda.Stream.null.synchronize()
                end = time.perf_counter()

                execution_time = end - start

                if cp.any(cp.isnan(C)):
                    print("Warning: NaN values detected in output")
                    return float('inf')

                return execution_time

        except Exception as e:
            print(f"Error evaluating kernel: {e}")
            return float('inf')

    def optimize(self):
        block_sizes = [8, 16, 32]
        iterations = 5

        print("\nStarting optimization...")
        for iteration in range(iterations):
            print(f"Iteration {iteration + 1}/{iterations}")
            for block_size in block_sizes:
                time = self.evaluate_kernel(block_size)

                if time < self.best_time:
                    self.best_time = time
                    self.best_block_size = block_size
                    self.best_kernel = generate_kernel_template(block_size)
                    print(f"New best configuration - Block size: {block_size}, Time: {time:.6f}s")

        return self.best_kernel, self.best_block_size, self.best_time

# Run optimization with error handling
try:
    print("Initializing CUDA optimizer...")
    optimizer = SwarmCUDAOptimizer(N=64)
    if optimizer.N is None:  # Check if CUDA initialization failed
        exit()
    best_kernel, best_block_size, best_time = optimizer.optimize()

    # Compare with NumPy baseline
    if best_kernel:
        print("\nRunning NumPy comparison...")
        start_time = time.time()
        np.dot(optimizer.A_host, optimizer.B_host)  # Use A_host and B_host for NumPy
        numpy_time = time.time() - start_time

        print(f"\nFinal Results:")
        print(f"Best CUDA time: {best_time:.6f}s")
        print(f"NumPy time: {numpy_time:.6f}s")
        print(f"Best block size: {best_block_size}")
        if best_time < numpy_time:
            print(f"Speedup over NumPy: {numpy_time/best_time:.2f}x")

except Exception as e:
    print(f"Error during optimization: {e}")

Initializing CUDA optimizer...
Generating test data...
Data successfully transferred to GPU

Starting optimization...
Iteration 1/5
New best configuration - Block size: 8, Time: 0.049121s
New best configuration - Block size: 16, Time: 0.045454s
New best configuration - Block size: 32, Time: 0.044713s
Iteration 2/5
New best configuration - Block size: 8, Time: 0.000018s
New best configuration - Block size: 16, Time: 0.000014s
Iteration 3/5
New best configuration - Block size: 8, Time: 0.000013s
New best configuration - Block size: 16, Time: 0.000012s
Iteration 4/5
New best configuration - Block size: 8, Time: 0.000011s
Iteration 5/5
New best configuration - Block size: 8, Time: 0.000011s

Running NumPy comparison...

Final Results:
Best CUDA time: 0.000011s
NumPy time: 0.008014s
Best block size: 8
Speedup over NumPy: 712.97x


In [3]:
import cupy as cp
import numpy as np
import time
import random

def check_cuda_availability():
    """Check if CUDA is available and initialize the device"""
    try:
        device = cp.cuda.Device(0)  # Get the current device
        device.synchronize()  # Ensure device is ready
        return True
    except cp.cuda.runtime.CUDARuntimeError as e:
        print(f"CUDA Error: {e}")
        return False
    except Exception as e:
        print(f"Unexpected CUDA initialization error: {e}")
        return False

def generate_kernel_template(block_size):
    return f"""
    extern "C" __global__
    void matmul_kernel(const float *A, const float *B, float *C, const int N) {{
        const int tx = threadIdx.x;
        const int ty = threadIdx.y;
        const int bx = blockIdx.x;
        const int by = blockIdx.y;

        const int row = by * {block_size} + ty;
        const int col = bx * {block_size} + tx;

        if(row < N && col < N) {{
            float sum = 0.0f;
            for(int k = 0; k < N; k++) {{
                sum += A[row * N + k] * B[k * N + col];
            }}
            C[row * N + col] = sum;
        }}
    }}
    """

class SwarmCUDAFishOptimizer:
    def __init__(self, N=64):
        if not check_cuda_availability():
            print("CUDA is not available. Exiting.")
            self.N = None
            return

        self.N = N
        self.best_kernel = None
        self.best_block_size = None
        self.best_time = float('inf')

        print("Generating test data...")
        self.A_host = np.random.rand(self.N, self.N).astype(np.float32)
        self.B_host = np.random.rand(self.N, self.N).astype(np.float32)

        try:
            with cp.cuda.Device(0):
                self.A = cp.asarray(self.A_host)
                self.B = cp.asarray(self.B_host)
                print("Data successfully transferred to GPU")
        except Exception as e:
            print(f"Failed to transfer data to GPU: {e}")
            self.N = None
            return

    def evaluate_kernel(self, block_size):
        try:
            kernel_code = generate_kernel_template(block_size)
            kernel = cp.RawKernel(kernel_code, 'matmul_kernel')

            C = cp.zeros((self.N, self.N), dtype=cp.float32)
            grid_dim = (self.N + block_size - 1) // block_size

            with cp.cuda.Device(0):
                start = time.perf_counter()
                kernel((grid_dim, grid_dim), (block_size, block_size), (self.A, self.B, C, self.N))
                cp.cuda.Stream.null.synchronize()
                end = time.perf_counter()

                execution_time = end - start

                if cp.any(cp.isnan(C)):
                    print("Warning: NaN values detected in output")
                    return float('inf')

                return execution_time

        except Exception as e:
            print(f"Error evaluating kernel: {e}")
            return float('inf')

    def viscosity(self, time):
        """Define viscosity as an inverse function of execution time"""
        return 1 / (time + 1e-6)

    def optimize(self):
        block_sizes = [8, 16, 32]
        fish_population = [{"size": b, "velocity": 0.1, "time": float('inf')} for b in block_sizes]
        iterations = 10
        max_jump = 8  # Maximum random movement per iteration

        print("\nStarting Fish Swarm Optimization...")
        for iteration in range(iterations):
            print(f"Iteration {iteration + 1}/{iterations}")

            for fish in fish_population:
                block_size = fish["size"]
                execution_time = self.evaluate_kernel(block_size)

                # Update best solution
                if execution_time < self.best_time:
                    self.best_time = execution_time
                    self.best_block_size = block_size
                    self.best_kernel = generate_kernel_template(block_size)
                    print(f"New best - Block size: {block_size}, Time: {execution_time:.6f}s")

                # Compute viscosity-based movement
                fish["time"] = execution_time
                fish["velocity"] = self.viscosity(execution_time)

            # Apply fluid-based movement
            avg_time = sum(f["time"] for f in fish_population) / len(fish_population)
            for fish in fish_population:
                viscosity_factor = self.viscosity(fish["time"]) / self.viscosity(avg_time)

                # Fish move toward faster execution zones
                if random.random() < viscosity_factor:
                    fish["size"] = max(8, min(32, fish["size"] + random.randint(-max_jump, max_jump)))

        return self.best_kernel, self.best_block_size, self.best_time

# Run Fish Swarm Optimization with error handling
try:
    print("Initializing CUDA optimizer with Fish Swarm Dynamics...")
    optimizer = SwarmCUDAFishOptimizer(N=64)
    if optimizer.N is None:
        exit()
    best_kernel, best_block_size, best_time = optimizer.optimize()

    # Compare with NumPy baseline
    if best_kernel:
        print("\nRunning NumPy comparison...")
        start_time = time.time()
        np.dot(optimizer.A_host, optimizer.B_host)
        numpy_time = time.time() - start_time

        print(f"\nFinal Results:")
        print(f"Best CUDA time: {best_time:.6f}s")
        print(f"NumPy time: {numpy_time:.6f}s")
        print(f"Best block size: {best_block_size}")
        if best_time < numpy_time:
            print(f"Speedup over NumPy: {numpy_time/best_time:.2f}x")

except Exception as e:
    print(f"Error during optimization: {e}")

Initializing CUDA optimizer with Fish Swarm Dynamics...
Generating test data...
Data successfully transferred to GPU

Starting Fish Swarm Optimization...
Iteration 1/10
New best - Block size: 8, Time: 0.000043s
New best - Block size: 16, Time: 0.000017s
Iteration 2/10
New best - Block size: 8, Time: 0.000014s
Iteration 3/10
Iteration 4/10
Iteration 5/10
New best - Block size: 13, Time: 0.000012s
Iteration 6/10
Iteration 7/10
Iteration 8/10
Iteration 9/10
Iteration 10/10

Running NumPy comparison...

Final Results:
Best CUDA time: 0.000012s
NumPy time: 0.000026s
Best block size: 13
Speedup over NumPy: 2.16x
