In [None]:
%%writefile vectoradd.cu

//using malloc

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256

__global__ void vectorAdd(int *a, int *b, int *c, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) {
        c[id] = a[id] + b[id];
    }
}

int main(int argc, char *argv[]) {
    int *h_a, *h_b, *h_c;
    int *d_a, *d_b, *d_c;
    int n = 1024; // default size

    if (argc > 1) n = atoi(argv[1]);
    printf("Vector size: %d elements\n", n);

    size_t size = n * sizeof(int);

    // Timing variables
    cudaEvent_t start, stop;
    clock_t host_start, host_end;
    float kernel_time = 0.0f;

    // Create CUDA events
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Start host timer
    host_start = clock();

    // Host memory allocation
    h_a = (int*)malloc(size);
    h_b = (int*)malloc(size);
    h_c = (int*)malloc(size);

    // Initialize with random values
    srand(time(NULL));
    for(int i = 0; i < n; i++) {
        h_a[i] = rand() % 100;
        h_b[i] = rand() % 100;
    }

    // Device memory allocation
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

    // Copy data to device
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    // Kernel launch configuration
    int numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;

    // Launch kernel with timing
    cudaEventRecord(start);
    vectorAdd<<<numBlocks, BLOCK_SIZE>>>(d_a, d_b, d_c, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    // Calculate kernel time
    cudaEventElapsedTime(&kernel_time, start, stop);

    // Copy result back to host
    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    // Stop host timer
    host_end = clock();

    // Verification (optional)
    printf("\nSample results (first 5 elements):\n");
    for(int i = 0; i < 5; i++) {
        printf("%d + %d = %d\n", h_a[i], h_b[i], h_c[i]);
    }

    // Print timing results
    printf("\nTiming statistics:\n");
    printf("Kernel execution time: %.4f ms\n", kernel_time);
    printf("Total host time: %.4f ms\n",
          ((double)(host_end - host_start) / CLOCKS_PER_SEC) * 1000);

    // Cleanup
    free(h_a);
    free(h_b);
    free(h_c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}



Writing vectoradd.cu


In [None]:
!nvcc -o vectoradd vectoradd.cu -lcurand -arch=sm_75
!./vectoradd

Vector size: 1024 elements

Sample results (first 5 elements):
71 + 2 = 0
19 + 57 = 0
2 + 52 = 0
29 + 4 = 0
52 + 46 = 0

Timing statistics:
Kernel execution time: 0.0000 ms
Total host time: 0.0920 ms


Q1 assign 4

In [None]:
%%writefile sum.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define N 1024

__global__ void sumKernel(int *input, int *output) {
    int tid = threadIdx.x;
    if (tid == 0) {
        // Task A: Iterative Sum
        int sum = 0;
        for (int i = 0; i < N; i++) {
            sum += input[i];
        }
        output[0] = sum;
    }
    else if (tid == 1) {
        output[1] = (N * (N - 1)) / 2;
    }
}

int main() {
    int h_input[N], h_output[2] = {0};

    for (int i = 0; i < N; i++) {
        h_input[i] = i;
    }

    int *d_input, *d_output;
    cudaMalloc(&d_input, N * sizeof(int));
    cudaMalloc(&d_output, 2 * sizeof(int));

    cudaMemcpy(d_input, h_input, N * sizeof(int), cudaMemcpyHostToDevice);

    cudaEvent_t start, stop;
    float milliseconds = 0;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);

    sumKernel<<<1, 2>>>(d_input, d_output);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);

    cudaMemcpy(h_output, d_output, 2 * sizeof(int), cudaMemcpyDeviceToHost);

    printf("Sum using Iteration (Thread 0): %d\n", h_output[0]);
    printf("Sum using Formula   (Thread 1): %d\n", h_output[1]);
    printf("GPU Kernel Execution Time: %.6f ms\n", milliseconds);

    cudaFree(d_input);
    cudaFree(d_output);

    return 0;
}


Writing sum.cu


In [None]:
!nvcc --gpu-architecture=sm_70 sum.cu -o sum
!./sum

Sum using Iteration (Thread 0): 0
Sum using Formula   (Thread 1): 0
GPU Kernel Execution Time: 0.000000 ms


Q2 assign 4

In [None]:
%%writefile merge.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256

__device__ void merge(int *arr, int l, int m, int r, int *temp) {
    int i = l, j = m + 1, k = 0;

    while (i <= m && j <= r) {
        if (arr[i] <= arr[j]) {
            temp[k++] = arr[i++];
        } else {
            temp[k++] = arr[j++];
        }
    }

    while (i <= m) temp[k++] = arr[i++];
    while (j <= r) temp[k++] = arr[j++];

    for (i = l, k = 0; i <= r; i++, k++) {
        arr[i] = temp[k];
    }
}

__global__ void mergeSortKernel(int *arr, int n, int step) {
    extern __shared__ int temp[];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int l = idx * step * 2;
    int m = l + step - 1;
    int r = min(l + step * 2 - 1, n - 1);

    if (l < n && m < n && r < n) {
        merge(arr, l, m, r, temp);
    }
}

void cudaMergeSort(int *h_arr, int n) {
    int *d_arr;
    size_t size = n * sizeof(int);

    // Allocate device memory
    cudaMalloc(&d_arr, size);
    cudaMemcpy(d_arr, h_arr, size, cudaMemcpyHostToDevice);

    // Kernel configuration
    dim3 threadsPerBlock(BLOCK_SIZE);
    dim3 blocksPerGrid((n + threadsPerBlock.x - 1) / threadsPerBlock.x);

    // Perform iterative merge sort
    for (int step = 1; step < n; step *= 2) {
        mergeSortKernel<<<blocksPerGrid, threadsPerBlock, BLOCK_SIZE * sizeof(int)>>>(d_arr, n, step);
        cudaDeviceSynchronize();
    }

    // Copy sorted array back to host
    cudaMemcpy(h_arr, d_arr, size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_arr);
}

int main() {
    int n = 1000;
    int *arr = (int*)malloc(n * sizeof(int));

    // Initialize array with random values
    srand(time(NULL));
    for (int i = 0; i < n; i++) {
        arr[i] = rand() % 1000;
    }

    printf("Unsorted Array:\n");
    for (int i = 0; i < 10; i++) { // Print first 10 elements
        printf("%d ", arr[i]);
    }
    printf("\n");

    // Time CUDA Merge Sort
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    cudaMergeSort(arr, n);
    cudaEventRecord(stop);

    cudaEventSynchronize(stop);

    float milliseconds;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Sorted Array:\n");
    for (int i = 0; i < 10; i++) { // Print first 10 elements
        printf("%d ", arr[i]);
    }

    printf("\nCUDA Merge Sort Time: %.4f ms\n", milliseconds);

    free(arr);

    return 0;
}


Writing merge.cu


In [None]:
!nvcc -o merge merge.cu -lcurand -arch=sm_75
!./merge

Unsorted Array:
997 771 589 278 528 560 53 219 100 249 
Sorted Array:
997 771 589 278 528 560 53 219 100 249 
CUDA Merge Sort Time: 0.0000 ms


Assignment 5

In [17]:
%%writefile vector_add.cu
#include <stdio.h>
#include <cuda.h>

#define N 1024  // Size of vectors

// Use unified memory accessible from both host and device
__managed__ float A[N], B[N], C[N];

// CUDA kernel for vector addition
__global__ void vectorAdd() {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < N)
        C[i] = A[i] + B[i];
}

// Error checker utility
void check(cudaError_t err, const char *msg) {
    if (err != cudaSuccess) {
        printf("CUDA error %s: %s\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

int main() {
    // Initialize input arrays
    for (int i = 0; i < N; ++i) {
        A[i] = i * 1.0f;
        B[i] = i * 2.0f;
    }

    // CUDA event setup for timing
    cudaEvent_t start, stop;
    float time_ms = 0;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    // Launch the kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>();
    check(cudaGetLastError(), "Kernel launch");

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time_ms, start, stop);

    // Print first 5 results
    printf("C = [");
    for (int i = 0; i < 5; ++i)
        printf("%.1f ", C[i]);
    printf("...]\n");

    // Device properties
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    float memClock = prop.memoryClockRate * 1e3;   // Hz
    float busWidth = prop.memoryBusWidth;          // bits
    float theoreticalBW = 2 * memClock * busWidth / 8 / 1e9;  // GB/s
    printf("Theoretical Bandwidth: %.2f GB/s\n", theoreticalBW);

    // Actual bandwidth
    float totalBytes = 3 * N * sizeof(float); // A & B read, C written
    float measuredBW = totalBytes / (time_ms / 1000.0f) / 1e9; // GB/s
    printf("Measured Bandwidth: %.2f GB/s\n", measuredBW);
    printf("Execution Time: %.4f ms\n", time_ms);

    return 0;
}


Overwriting vector_add.cu


In [18]:
!nvcc --gpu-architecture=sm_70 vector_add.cu -o vector_add
!./vector_add