In [None]:
%%writefile iterative_sum.cu
#include <stdio.h>
#include <cuda.h>

#define N 1024

// Adds the first N elements of `input` one after another and stores the total
// in `*output`. Every thread that runs this kernel performs the whole sum and
// writes the same location, so the host launches it with a single thread.
__global__ void iterativeSumKernel(int *input, int *output) {
    int total = 0;
    int idx = 0;
    while (idx < N) {
        total += input[idx];
        ++idx;
    }
    output[0] = total;
}

// Evaluates a CUDA runtime call; on failure prints the error to stderr and
// makes the enclosing function return 1. Intended for use inside main() only.
// Device resources still held at that point are reclaimed at process exit.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(status_));                 \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Sums the integers 0..N-1 on the GPU with a single-thread kernel, then prints
// the result and the kernel time measured with CUDA events.
// Returns 0 on success, 1 if any CUDA call fails.
int main() {
    // h_output is zero-initialised so a failure path can never print garbage.
    int h_input[N], h_output = 0;
    int *d_input = NULL, *d_output = NULL;

    // Fill input
    for (int i = 0; i < N; ++i)
        h_input[i] = i;

    size_t input_size = N * sizeof(int);
    size_t output_size = sizeof(int);

    printf("Iterative Sum\n");
    printf("Allocating %zu bytes for input\n", input_size);
    printf("Allocating %zu bytes for output\n", output_size);

    CUDA_CHECK(cudaMalloc((void**)&d_input, input_size));
    CUDA_CHECK(cudaMalloc((void**)&d_output, output_size));

    CUDA_CHECK(cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice));

    // Timing
    cudaEvent_t start, stop;
    float ms = 0;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start));

    // Kernel call
    iterativeSumKernel<<<1, 1>>>(d_input, d_output);
    // A launch does not return a status; configuration errors show up here.
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaEventRecord(stop));
    // Waiting on the stop event also surfaces faults raised while the kernel ran.
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(&h_output, d_output, output_size, cudaMemcpyDeviceToHost));

    printf("Result: %d\n", h_output);
    printf("Time taken (iterative): %.6f ms\n", ms);

    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}


Writing iterative_sum.cu


In [None]:
%%writefile formula_sum.cu
#include <stdio.h>
#include <cuda.h>

#define N 1024

// Stores the closed-form value of 0 + 1 + ... + (N - 1) in `*output`
// without reading any input data.
__global__ void formulaSumKernel(int *output) {
    const int closedForm = (N * (N - 1)) / 2;
    output[0] = closedForm;
}

// Evaluates a CUDA runtime call; on failure prints the error to stderr and
// makes the enclosing function return 1. Intended for use inside main() only.
// Device resources still held at that point are reclaimed at process exit.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(status_));                 \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Computes 0 + 1 + ... + (N - 1) on the GPU with the closed-form kernel, then
// prints the result and the kernel time measured with CUDA events.
// Returns 0 on success, 1 if any CUDA call fails.
int main() {
    // Zero-initialised so a failure path can never print garbage.
    int h_output = 0;
    int *d_output = NULL;

    size_t output_size = sizeof(int);

    // The format strings use "\n" (newline); "\\n" would print a literal
    // backslash followed by 'n' and keep all output on one line.
    printf("Direct Formula Sum\n");
    printf("Allocating %zu bytes for output\n", output_size);

    CUDA_CHECK(cudaMalloc((void**)&d_output, output_size));

    // Timing
    cudaEvent_t start, stop;
    float ms = 0;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start));

    // Kernel call
    formulaSumKernel<<<1, 1>>>(d_output);
    // A launch does not return a status; configuration errors show up here.
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaEventRecord(stop));
    // Waiting on the stop event also surfaces faults raised while the kernel ran.
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(&h_output, d_output, output_size, cudaMemcpyDeviceToHost));

    printf("Result: %d\n", h_output);
    printf("Time taken (formula): %.6f ms\n", ms);

    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}


Writing formula_sum.cu


In [None]:
!nvcc --gpu-architecture=sm_70 iterative_sum.cu -o iterative_sum
!nvcc --gpu-architecture=sm_70 formula_sum.cu -o formula_sum

!./iterative_sum
!./formula_sum



Iterative Sum\nAllocating 4096 bytes for input\nAllocating 4 bytes for output\nResult: 523776\nTime taken (iterative): 0.200320 ms\nDirect Formula Sum\nAllocating 4 bytes for output\nResult: 523776\nTime taken (formula): 0.102368 ms\n

In [None]:
%%writefile cpu_mergesort.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N 1000

// Merges the two adjacent sorted runs arr[left..mid] and arr[mid+1..right]
// (bounds inclusive) into one sorted run stored back in arr[left..right].
// Equal elements keep their relative order (left run first).
//
// If either run is empty there is nothing to merge and arr is left untouched.
// If the scratch buffer cannot be allocated, an error is written to stderr and
// the process exits with EXIT_FAILURE instead of dereferencing a NULL pointer.
void merge(int arr[], int left, int mid, int right) {
    int n1 = mid - left + 1;   // length of the left run
    int n2 = right - mid;      // length of the right run

    // An empty side means the other side is already in its final position;
    // returning here also avoids a zero-byte allocation.
    if (n1 <= 0 || n2 <= 0) return;

    int total = n1 + n2;

    // One scratch buffer holds both runs back to back, which works because the
    // runs are contiguous in arr.
    int* scratch = (int*)malloc((size_t)total * sizeof(int));
    if (scratch == NULL) {
        fprintf(stderr, "merge: failed to allocate scratch space for %d ints\n", total);
        exit(EXIT_FAILURE);
    }

    for (int t = 0; t < total; t++) scratch[t] = arr[left + t];

    const int* L = scratch;        // copy of arr[left..mid]
    const int* R = scratch + n1;   // copy of arr[mid+1..right]

    int i = 0, j = 0, k = left;

    // Take the smaller head element each time; "<=" keeps the merge stable.
    while (i < n1 && j < n2) {
        if (L[i] <= R[j])
            arr[k++] = L[i++];
        else
            arr[k++] = R[j++];
    }

    // At most one of these loops runs: flush whichever run still has elements.
    while (i < n1) arr[k++] = L[i++];
    while (j < n2) arr[k++] = R[j++];

    free(scratch);
}

// Sorts arr[left..right] (bounds inclusive) in ascending order by recursively
// sorting each half and then combining the halves with merge().
void mergeSort(int arr[], int left, int right) {
    // A range holding fewer than two elements is already sorted.
    if (left >= right) return;

    // Split point written as left + half to avoid overflowing (left + right).
    const int half = (right - left) / 2;
    const int mid = left + half;

    mergeSort(arr, left, mid);
    mergeSort(arr, mid + 1, right);
    merge(arr, left, mid, right);
}

// Fills an N-element array with pseudo-random values from a fixed seed, sorts
// it with mergeSort(), prints the CPU time spent sorting, and verifies the
// result. Returns 0 when the array is sorted, 1 otherwise.
int main() {
    int arr[N];
    srand(42); // fixed seed for comparison

    for (int i = 0; i < N; i++)
        arr[i] = rand() % 10000;

    clock_t start = clock();

    mergeSort(arr, 0, N - 1);

    clock_t end = clock();

    // The format strings use "\n" (newline); "\\n" would print a literal
    // backslash followed by 'n' and keep all output on one line.
    printf("CPU Merge Sort Time: %.6f ms\n", 1000.0 * (end - start) / CLOCKS_PER_SEC);

    // Optional: verify it's sorted
    for (int i = 1; i < N; i++) {
        if (arr[i - 1] > arr[i]) {
            printf("Error: Array not sorted.\n");
            return 1;
        }
    }

    printf("CPU Merge Sort completed successfully.\n");
    return 0;
}


Writing cpu_mergesort.cu


In [None]:
%%writefile gpu_mergesort.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define N 1024 // Must be power of 2 for simplicity
#define BLOCK_SIZE 512

// One pass of a bottom-up merge sort over the N-element array `d_input`.
//
// Each thread handles one pair of adjacent sorted runs of length `width`,
// starting at index tid * 2 * width, and writes the merged run to the same
// index range of `d_output`. `d_input` and `d_output` must be distinct
// buffers. Threads whose pair starts at or beyond N do nothing.
//
// A trailing run with no right-hand partner (possible when N is not a multiple
// of 2 * width) is copied to `d_output` unchanged, so it is still present when
// the caller swaps the buffers for the next pass.
__global__ void mergeKernel(int* d_input, int* d_output, int width) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // 64-bit product so a surplus thread cannot wrap around to a valid index.
    long long first = 2LL * tid * width;
    if (first >= N) return;

    int start = (int)first;

    // Clamp both run lengths to what is left of the array. Subtracting from N
    // instead of adding to start keeps the arithmetic free of overflow.
    int remaining = N - start;
    int leftLen = (width < remaining) ? width : remaining;
    int mid = start + leftLen - 1;

    remaining -= leftLen;
    int rightLen = (width < remaining) ? width : remaining;
    int end = mid + rightLen;   // equals mid when the right run is empty

    int i = start;
    int j = mid + 1;
    int k = start;

    // Take the smaller head element each time; "<=" keeps the merge stable.
    while (i <= mid && j <= end) {
        if (d_input[i] <= d_input[j])
            d_output[k++] = d_input[i++];
        else
            d_output[k++] = d_input[j++];
    }

    // Flush whichever run still has elements (the whole left run when there
    // is no right run).
    while (i <= mid) d_output[k++] = d_input[i++];
    while (j <= end) d_output[k++] = d_input[j++];
}

// Evaluates a CUDA runtime call; on failure prints the error to stderr and
// makes the enclosing function return 1. Intended for use inside main() only.
// Device resources still held at that point are reclaimed at process exit.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(status_));                 \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Sorts N pseudo-random integers on the GPU with a bottom-up merge sort,
// prints the elapsed device time, and verifies the result on the host.
// Returns 0 when the array is sorted, 1 on a CUDA failure or a bad result.
int main() {
    int h_input[N];
    int *d_input = NULL, *d_output = NULL;

    srand(42);
    for (int i = 0; i < N; i++)
        h_input[i] = rand() % 10000;

    CUDA_CHECK(cudaMalloc(&d_input, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_output, N * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_input, h_input, N * sizeof(int), cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    float time = 0.0f;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start));

    for (int width = 1; width < N; width *= 2) {
        // One single-thread block per pair of runs. Rounding up gives a
        // trailing partial pair its own thread; for a power-of-two N this is
        // exactly N / (2 * width).
        int numBlocks = (N + 2 * width - 1) / (2 * width);
        mergeKernel<<<numBlocks, 1>>>(d_input, d_output, width);
        // A launch does not return a status; configuration errors show up here.
        CUDA_CHECK(cudaGetLastError());

        // No device-wide sync is needed between passes: kernels issued to the
        // same stream run in issue order, so the next pass sees this output.

        // Swap pointers for next level
        int* temp = d_input;
        d_input = d_output;
        d_output = temp;
    }

    CUDA_CHECK(cudaEventRecord(stop));
    // Waiting on the stop event also surfaces faults raised while a kernel ran.
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&time, start, stop));

    // After the final swap the sorted data is in d_input.
    CUDA_CHECK(cudaMemcpy(h_input, d_input, N * sizeof(int), cudaMemcpyDeviceToHost));

    // The format strings use "\n" (newline); "\\n" would print a literal
    // backslash followed by 'n' and keep all output on one line.
    printf("GPU Merge Sort Time: %.6f ms\n", time);

    // Verify
    int status = 0;
    for (int i = 1; i < N; i++) {
        if (h_input[i - 1] > h_input[i]) {
            printf("Error: Array not sorted.\n");
            status = 1;
            break;
        }
    }

    if (status == 0)
        printf("GPU Merge Sort completed successfully.\n");

    // Release device resources on both the success and the "not sorted" path.
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    return status;
}


Writing gpu_mergesort.cu


In [None]:
!nvcc --gpu-architecture=sm_70 cpu_mergesort.cu -o cpu_mergesort
!nvcc --gpu-architecture=sm_70 gpu_mergesort.cu -o gpu_mergesort

!./cpu_mergesort
!./gpu_mergesort


CPU Merge Sort Time: 0.164000 ms\nCPU Merge Sort completed successfully.\nGPU Merge Sort Time: 0.610464 ms\nGPU Merge Sort completed successfully.\n

In [None]:
%%writefile vector_add.cu
#include <stdio.h>
#include <cuda.h>

#define N 1024  // Size of vectors

// 1.1 Statically defined global memory
// A and B are the input vectors and C receives their element-wise sum.
// The __managed__ qualifier makes the arrays addressable from both sides:
// main() fills A and B and reads C on the host, while vectorAdd reads and
// writes them on the device, with no explicit cudaMemcpy.
__device__ __managed__ float A[N], B[N], C[N];

// Element-wise vector addition: each thread computes one element of
// C = A + B on the file-scope managed arrays. Threads whose global index is
// N or greater do nothing, so the grid may overshoot N.
__global__ void vectorAdd() {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= N) return;
    C[gid] = A[gid] + B[gid];
}

// Evaluates a CUDA runtime call; on failure prints the error to stderr and
// makes the enclosing function return 1. Intended for use inside main() only.
// Device resources still held at that point are reclaimed at process exit.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(status_));                 \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Runs the vectorAdd kernel on the managed arrays A, B and C, then prints the
// kernel time, the first few results, the theoretical memory bandwidth of
// device 0 and the bandwidth the kernel achieved.
// Returns 0 on success, 1 if any CUDA call fails.
int main() {
    // Initialize input vectors
    for (int i = 0; i < N; i++) {
        A[i] = float(i);
        B[i] = float(2 * i);
    }

    // 1.2 Timing kernel execution
    cudaEvent_t start, stop;
    float milliseconds = 0;

    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start));

    // Launch kernel
    vectorAdd<<<(N + 255)/256, 256>>>();
    // A launch does not return a status; configuration errors show up here.
    CUDA_CHECK(cudaGetLastError());

    // Record the stop event straight after the launch so the measured interval
    // covers the kernel only, not a host-side wait for the device.
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

    // Make sure the device is idle before the host reads the managed array C.
    CUDA_CHECK(cudaDeviceSynchronize());

    // The format strings use "\n" (newline); "\\n" would print a literal
    // backslash followed by 'n' and keep all output on one line.
    printf("Kernel execution time: %.6f ms\n", milliseconds);

    // Check result (optional)
    for (int i = 0; i < 5; i++)
        printf("C[%d] = %.1f\n", i, C[i]);

    // 1.3 Query CUDA Device Properties
    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));

    float memClock = prop.memoryClockRate * 2.0f; // DDR = double pumped
    float busWidth = prop.memoryBusWidth;

    // memClock is in kHz and busWidth in bits:
    // kHz * bits / 8 = kB/s, and kB/s / 1e6 = GB/s.
    float theoreticalBW = (memClock * busWidth) / 8.0f / 1e6f;

    printf("Theoretical Bandwidth: %.2f GB/s\n", theoreticalBW);

    // 1.4 Calculate Measured Bandwidth
    // RBytes = 2 * N * sizeof(float) [A and B]
    // WBytes = N * sizeof(float)     [C]
    float RBytes = 2 * N * sizeof(float);
    float WBytes = N * sizeof(float);
    float totalBytes = RBytes + WBytes;

    // bytes / seconds / 1e9 = GB/s
    float measuredBW = totalBytes / (milliseconds / 1000.0f) / 1e9f;

    printf("Measured Bandwidth: %.2f GB/s\n", measuredBW);

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}


Overwriting vector_add.cu


In [None]:
!nvcc --gpu-architecture=sm_70 vector_add.cu -o vector_add
!./vector_add


Kernel execution time: 0.220256 ms\nC[0] = 0.0\nC[1] = 3.0\nC[2] = 6.0\nC[3] = 9.0\nC[4] = 12.0\nTheoretical Bandwidth: 320.06 GB/s\nMeasured Bandwidth: 0.06 GB/s\n