In [None]:
%%writefile vectoraddby2.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>

__global__ void add(const float *a, float *c, int size){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for(; idx<size; idx += blockDim.x * gridDim.x){
        c[idx] = a[idx] + 2.0f; // add 2.0 to each element
    }
}

int main() {
    const int N = 10000;
    size_t size = N * sizeof(float);

    // allocate memory on the host
    float *h_a = (float *)malloc(size);
    float *h_c = (float *)malloc(size);

    if (!h_a || !h_c) {
        printf("Host memory allocation failed!\n");
        return 1;
    }

    srand((unsigned int)time(NULL));   // seed for random number generation
    // initialize input data
    for(int i=0; i<N; i++){
        h_a[i] = rand() % 100; // random values between 0 and 99
    }

    // allocate memory on the device
    float *d_a, *d_c;
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_c, size);

    // copy data from host to device
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);

    int threadsperblock = 256;
    int blocks = (N + threadsperblock - 1) / threadsperblock;
    // kernel
    add<<<blocks, threadsperblock>>>(d_a,d_c,N);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess){
        printf("CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    cudaDeviceSynchronize();

    // cpy results back from device to host
    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    for (int i=0; i<5; i++){
        printf("%f + 2.0 = %f\n", h_a[i], h_c[i]);
    }

    // Free all Memory
    cudaFree(d_a);
    cudaFree(d_c);
    free(h_a);
    free(h_c);

    return 0;
}

Writing vectoraddby2.cu


In [None]:
!nvcc -arch=sm_75 vectoraddby2.cu -o vectoraddby2

In [None]:
!./vectoraddby2

89.000000 + 2.0 = 91.000000
91.000000 + 2.0 = 93.000000
87.000000 + 2.0 = 89.000000
62.000000 + 2.0 = 64.000000
25.000000 + 2.0 = 27.000000


In [None]:
%%writefile arrayMulCPUvsGPU.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>
#include <chrono>

// Kernel
__global__ void arrayMul(const float *a, const float *b, float *c, int size){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for(; idx < size; idx+= blockDim.x * gridDim.x){
        c[idx] = a[idx] * b[idx];
    }
}

void arrayMulCPU(const float *a, const float *b, float *c, int size){
    for(int i=0; i<size; i++){
        c[i] = a[i] * b[i];
    }
}

int main(){
    const int N = 10000;
    size_t size = N * sizeof(float);

    const int blockSizes[] = {64, 128, 256};
    const int numTests = 3;

    float *h_a = (float *)malloc(size);
    float *h_b = (float *)malloc(size);
    float *h_c = (float *)malloc(size);
    float *h_c_cpu = (float *)malloc(size);

    for(int i=0; i<N; i++){
        h_a[i] = rand() / (float)RAND_MAX;
        h_b[i] = rand() / (float)RAND_MAX;
    }

    float *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    auto cpu_start = std::chrono::high_resolution_clock::now();
    arrayMulCPU(h_a, h_b, h_c_cpu, N);
    auto cpu_end = std::chrono::high_resolution_clock::now();
    float cpu_time = std::chrono::duration<float, std::milli>(cpu_end - cpu_start).count();
    printf("CPU execution time : %.3f ms\n", cpu_time);

    for (int i=0; i< numTests; i++){
        int threadsPerBlock = blockSizes[i];
        int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

        printf("\nTesting block size: %d (blocks: %d)\n", threadsPerBlock, blocks);

        dim3 dimBlock(threadsPerBlock);
        dim3 dimGrid(blocks);

        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        cudaEventRecord(start);
        arrayMul<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
        cudaEventRecord(stop);

        cudaEventSynchronize(stop);

        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            printf("CUDA Error: %s\n", cudaGetErrorString(err));
        }

        float gpu_time = 0;
        cudaEventElapsedTime(&gpu_time, start, stop);
        printf("GPU Time: %f ms\n", gpu_time);

        cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

        bool correct = true;
        for(int j=0; j<N; j++){
            if (fabs(h_c[j] - h_c_cpu[j]) > 1e-5) {
                printf("verification failed at index %d: GPU = %.2f, CPU = %.2f\n", j, h_c[j], h_c_cpu[j]);
                correct = false;
                break;
            }
        }

        if (correct){
            printf("Result verification: SUCCESS\n");
            if(gpu_time > 0.0)
                printf("speedup: %.2fX\n", cpu_time / gpu_time);
        }

        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(h_a);
    free(h_b);
    free(h_c);
    free(h_c_cpu);

    return 0;
}

Writing arrayMulCPUvsGPU.cu


In [None]:
!nvcc -arch=sm_75 arrayMulCPUvsGPU.cu -o arrayMulCPUvsGPU

In [None]:
!./arrayMulCPUvsGPU

CPU execution time : 0.058 ms

Testing block size: 64 (blocks: 157)
GPU Time: 0.916864 ms
Result verification: SUCCESS
speedup: 0.06X

Testing block size: 128 (blocks: 79)
GPU Time: 0.015136 ms
Result verification: SUCCESS
speedup: 3.80X

Testing block size: 256 (blocks: 40)
GPU Time: 0.012992 ms
Result verification: SUCCESS
speedup: 4.43X


In [4]:
%%writefile matrixAddition.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel for matrix addition
__global__ void matrixAdd(const float *a, float *b, float *c, int rows, int cols){
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int stride_x = blockDim.x * gridDim.x;
    int stride_y = blockDim.y * gridDim.y;

    for(; row < rows; row += stride_y){
        for (int j = col; j < cols; j+= stride_x){
            int idx = row * cols + j;
            c[idx] = a[idx] + b[idx];
        }
    }
}

int main(){
    // Define matrix dimensions
    const int ROWS = 100;
    const int COLS = 100;
    size_t size = ROWS * COLS * sizeof(float);
    const dim3 blockSizes[] = {dim3(16, 16), dim3(32, 32)};
    int numtests = 2;

    // allocate memory for matrices
    float *h_a = (float *)malloc(size);
    float *h_b = (float *)malloc(size);
    float *h_c = (float *)malloc(size);

    // Initialize matrices
    for (int i = 0; i < ROWS * COLS; i++){
        h_a[i] = rand() / (float)RAND_MAX;
        h_b[i] = rand() / (float)RAND_MAX;
    }

    // Allocate device memory
    float *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Copy matrices from host to device
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    for (int t=0; t< numtests; t++){
        dim3 blockSize = blockSizes[t];
        dim3 gridSize((COLS + blockSize.x - 1) / blockSize.x, (ROWS + blockSize.y - 1) / blockSize.y);
        printf("Testing block size: %dx%d (grid: %d x %d)\n", blockSize.x, blockSize.y, gridSize.x, gridSize.y);


        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        cudaEventRecord(start);
        matrixAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, ROWS, COLS);

        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess){
            printf("CUDA error: %s\n", cudaGetErrorString(err));
            return -1;
        }

        cudaEventRecord(stop);
        cudaEventSynchronize(stop);

        float gpu_time = 0;
        cudaEventElapsedTime(&gpu_time, start, stop);
        printf("GPU execution time: %.3f ms\n", gpu_time);

        // Copy result back to host
        cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

        // Verify results
        printf("First 5 result (row 0):/n");
        for (int j=0; j<5; j++){
            int idx = 0 * COLS + j;
			printf("c[0][%d] = %.2f (a[0][%d] = %.2f + b[0][%d] = %.2f)\n", j, h_c[idx], j, h_a[idx], j, h_b[idx]);
        }

        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    // Free host memory
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}

Overwriting matrixAddition.cu


In [5]:
!nvcc -arch=sm_75 matrixAddition.cu -o matrixAddition

In [6]:
!./matrixAddition

Testing block size: 16x16 (grid: 7 x 7)
GPU execution time: 0.124 ms
First 5 result (row 0):/nc[0][0] = 1.23 (a[0][0] = 0.84 + b[0][0] = 0.39)
c[0][1] = 1.58 (a[0][1] = 0.78 + b[0][1] = 0.80)
c[0][2] = 1.11 (a[0][2] = 0.91 + b[0][2] = 0.20)
c[0][3] = 1.10 (a[0][3] = 0.34 + b[0][3] = 0.77)
c[0][4] = 0.83 (a[0][4] = 0.28 + b[0][4] = 0.55)
Testing block size: 32x32 (grid: 4 x 4)
GPU execution time: 0.016 ms
First 5 result (row 0):/nc[0][0] = 1.23 (a[0][0] = 0.84 + b[0][0] = 0.39)
c[0][1] = 1.58 (a[0][1] = 0.78 + b[0][1] = 0.80)
c[0][2] = 1.11 (a[0][2] = 0.91 + b[0][2] = 0.20)
c[0][3] = 1.10 (a[0][3] = 0.34 + b[0][3] = 0.77)
c[0][4] = 0.83 (a[0][4] = 0.28 + b[0][4] = 0.55)
