In [1]:
%%writefile partialSum.cu
#include <stdio.h>

/**
 * Per-block inclusive prefix sum over pairwise-combined inputs.
 *
 * Each block covers a window of 2 * blockDim.x consecutive input elements
 * starting at blockIdx.x * blockDim.x * 2. Thread `tid` first adds element
 * `index` and element `index + blockDim.x` of that window, then the block
 * performs an inclusive scan of those blockDim.x pair sums in shared memory.
 * The scanned value of thread `tid` is stored to output[index]; the upper
 * half of each window in `output` is not written by this kernel.
 *
 * Launch requirements (1D grid, 1D block):
 *   - dynamic shared memory: blockDim.x * sizeof(int) bytes.
 *
 * @param input  device array of at least n ints.
 * @param output device array of at least n ints.
 * @param n      number of valid elements in `input`.
 */
__global__ void partialSumKernel(int *input, int *output, int n) {
    // Shared memory
    extern __shared__ int sharedMemory[];

    int tid = threadIdx.x;
    int index = blockIdx.x * blockDim.x*2 + tid;

    // Threads past the end of the data must still reach every barrier below,
    // so the range test only gates memory accesses, never __syncthreads().
    const bool inRange = index < n;

    // Load input into shared memory; both loads are coalesced across the warp.
    // Out-of-range threads contribute 0, which cannot affect in-range results
    // because they always have a higher tid than any in-range thread.
    int pairSum = 0;
    if (inRange) {
        pairSum = input[index];
        const int partner = index + (int)blockDim.x;
        if (partner < n) {  // second element may fall off the end of the array
            pairSum += input[partner];
        }
    }
    sharedMemory[tid] = pairSum;
    __syncthreads();

    // Perform inclusive scan in shared memory
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        int temp = 0;
        if (tid >= stride) {
            temp = sharedMemory[tid - stride];
        }
        // All reads of the previous step must finish before anyone overwrites.
        __syncthreads();
        sharedMemory[tid] += temp;
        __syncthreads();
        if (inRange) {
            printf("Block %d, Thread %d, Stride %d, Shared Memory Value: %d\n", blockIdx.x, tid, stride, sharedMemory[tid]);
        }
    }

    // Write result to global memory
    if (inRange) {
        output[index] = sharedMemory[tid];
    }
}

// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Demo driver: runs partialSumKernel on the integers 1..16 and prints the
 * input and the resulting output buffer.
 *
 * Returns 0 on success, 1 if any CUDA call or the kernel itself failed.
 */
int main() {
    const int N = 16;
    const int blockSize = 8;

    // The kernel indexes with blockIdx.x * blockDim.x * 2, i.e. every block
    // consumes 2 * blockSize inputs, so size the grid by ceil-division on that.
    const int elemsPerBlock = 2 * blockSize;
    const int gridSize = (N + elemsPerBlock - 1) / elemsPerBlock;

    int h_input[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
    int h_output[N] = {0};

    // Null-initialised so the unconditional cudaFree calls below are safe
    // even when an allocation failed.
    int *d_input = nullptr, *d_output = nullptr;
    size_t size = N * sizeof(int);

    bool ok = cudaOk(cudaMalloc(&d_input, size), "cudaMalloc(d_input)")
           && cudaOk(cudaMalloc(&d_output, size), "cudaMalloc(d_output)")
           && cudaOk(cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(host -> device)")
           // The kernel stores only to output[index] for the first blockDim.x
           // slots of each 2 * blockDim.x window; zero the buffer so the
           // remaining slots print as 0 instead of uninitialised memory.
           && cudaOk(cudaMemset(d_output, 0, size), "cudaMemset(d_output)");

    if (ok) {
        partialSumKernel<<<gridSize, blockSize, blockSize * sizeof(int)>>>(d_input, d_output, N);

        ok = cudaOk(cudaGetLastError(), "partialSumKernel launch")          // bad launch config
          && cudaOk(cudaDeviceSynchronize(), "partialSumKernel execution")  // in-kernel faults
          && cudaOk(cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device -> host)");
    }

    if (ok) {
        printf("Input: ");
        for (int i = 0; i < N; i++) {
            printf("%d ", h_input[i]);
        }
        printf("\nOutput: ");
        for (int i = 0; i < N; i++) {
            printf("%d ", h_output[i]);
        }
        printf("\n");
    }

    cudaFree(d_input);
    cudaFree(d_output);

    return ok ? 0 : 1;
}

Writing partialSum.cu


In [2]:
# Compile with the specified architecture
!nvcc partialSum.cu -o partialSum -gencode arch=compute_75,code=sm_75

# Run the executable
!./partialSum

Block 0, Thread 0, Stride 1, Shared Memory Value: 10
Block 0, Thread 1, Stride 1, Shared Memory Value: 22
Block 0, Thread 2, Stride 1, Shared Memory Value: 26
Block 0, Thread 3, Stride 1, Shared Memory Value: 30
Block 0, Thread 4, Stride 1, Shared Memory Value: 34
Block 0, Thread 5, Stride 1, Shared Memory Value: 38
Block 0, Thread 6, Stride 1, Shared Memory Value: 42
Block 0, Thread 7, Stride 1, Shared Memory Value: 46
Block 0, Thread 0, Stride 2, Shared Memory Value: 10
Block 0, Thread 1, Stride 2, Shared Memory Value: 22
Block 0, Thread 2, Stride 2, Shared Memory Value: 36
Block 0, Thread 3, Stride 2, Shared Memory Value: 52
Block 0, Thread 4, Stride 2, Shared Memory Value: 60
Block 0, Thread 5, Stride 2, Shared Memory Value: 68
Block 0, Thread 6, Stride 2, Shared Memory Value: 76
Block 0, Thread 7, Stride 2, Shared Memory Value: 84
Block 0, Thread 0, Stride 4, Shared Memory Value: 10
Block 0, Thread 1, Stride 4, Shared Memory Value: 22
Block 0, Thread 2, Stride 4, Shared Memory Val