In [None]:
%%writefile Sigmoid.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>

// Element-wise logistic sigmoid: O[i] = 1 / (1 + e^(-A[i])) for 0 <= i < N.
// Each thread handles at most one element, so a 1-D launch must supply at
// least N threads in total; threads whose index falls past N exit at once.
__global__ void Sigmoid(const float *A, float *O, int N){
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads of the last block have nothing to compute.
    if (gid >= N) return;

    const float x = A[gid];
    O[gid] = 1.0f / (1.0f + expf(-x));
}

// Host launcher for the Sigmoid kernel.
//   A : device pointer to N input floats
//   O : device pointer to N output floats
//   N : element count; nothing is launched when N <= 0
// Blocks until the kernel has finished. Errors are printed to stdout and not
// returned to the caller.
void solve(const float *A, float *O, int N){
    // A non-positive size would yield a zero-block grid, which the runtime
    // rejects as an invalid launch configuration.
    if (N <= 0){
        return;
    }

    const int threadsPerBlock = 256;
    // ceil(N / threadsPerBlock), written so the addition cannot overflow int.
    const int blocks = (N - 1) / threadsPerBlock + 1;
    Sigmoid<<<blocks, threadsPerBlock>>>(A, O, N);

    // Launch-configuration errors surface through cudaGetLastError(); faults
    // raised while the kernel runs are returned by the synchronizing call.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess){
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess){
        printf("Cuda Error: %s\n", cudaGetErrorString(err));
    }
}

// Demo driver: evaluates the sigmoid of 2^24 pseudo-random floats in
// [-0.5, 9.5] on the GPU and prints the first five input/output pairs.
// Returns 0 on success and 1 if a device allocation or copy fails.
int main(){
    const int N = 1 << 24;
    const size_t size = N * sizeof(float);

    float *h_i = new float[N];
    float *h_o = new float[N];

    for (int i = 0; i < N; i++){
        h_i[i] = -0.5f + 10.0f * static_cast<float>(rand()) / RAND_MAX;
    }

    float *d_i = nullptr, *d_o = nullptr;

    // Each step runs only if all previous ones succeeded, so the first
    // failure is the one that gets reported.
    cudaError_t status = cudaMalloc((void **)&d_i, size);
    if (status == cudaSuccess){
        status = cudaMalloc((void **)&d_o, size);
    }
    if (status == cudaSuccess){
        status = cudaMemcpy(d_i, h_i, size, cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess){
        solve(d_i, d_o, N);
        status = cudaMemcpy(h_o, d_o, size, cudaMemcpyDeviceToHost);
    }

    if (status == cudaSuccess){
        printf("First 5 sigmoid outputs: \n");
        for (int i = 0; i < 5 && i < N; i++) {
            printf("y[%d] = %f (x[%d] = %f)\n", i, h_o[i], i, h_i[i]);
        }
    } else {
        fprintf(stderr, "Cuda Error: %s\n", cudaGetErrorString(status));
    }

    // cudaFree accepts a null pointer, so cleanup is safe after a partial failure.
    cudaFree(d_i);
    cudaFree(d_o);
    delete[] h_i;
    delete[] h_o;
    return (status == cudaSuccess) ? 0 : 1;
}

Overwriting Sigmoid.cu


In [None]:
!nvcc -arch=sm_75 Sigmoid.cu -o Sigmoid

In [None]:
!./Sigmoid

First 5 sigmoid outputs: 
y[0] = 0.999630 (x[0] = 7.901877)
y[1] = 0.969047 (x[1] = 3.443829)
y[2] = 0.999345 (x[2] = 7.330992)
y[3] = 0.999438 (x[3] = 7.484400)
y[4] = 0.999819 (x[4] = 8.616474)


In [None]:
%%writefile Relu.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>

// Element-wise ReLU: O[i] = max(0, I[i]) for 0 <= i < N.
// Every thread starts at its global index and then advances by the total
// number of launched threads, so any 1-D launch shape covers all N elements.
__global__ void Relu(const float *I, float *O, int N){
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int step = blockDim.x * gridDim.x;  // total threads in the grid

    int pos = first;
    while (pos < N){
        O[pos] = fmaxf(0.0f, I[pos]);
        pos += step;
    }
}
// Host launcher for the Relu kernel.
//   I : device pointer to N input floats
//   O : device pointer to N output floats
//   N : element count; nothing is launched when N <= 0
// Blocks until the kernel has finished. Errors are printed to stdout and not
// returned to the caller.
void solve(const float *I, float *O, int N){
    // A non-positive size would yield a zero-block grid, which the runtime
    // rejects as an invalid launch configuration.
    if (N <= 0){
        return;
    }

    const int threadsPerBlock = 256;
    // ceil(N / threadsPerBlock), written so the addition cannot overflow int.
    const int blocks = (N - 1) / threadsPerBlock + 1;

    Relu<<<blocks, threadsPerBlock>>>(I, O, N);

    // Launch-configuration errors surface through cudaGetLastError(); faults
    // raised while the kernel runs are returned by the synchronizing call.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess){
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess){
        printf("Cuda Error: %s\n", cudaGetErrorString(err));
    }
}

// Demo driver: applies ReLU to 2^20 pseudo-random floats in [-5, 5] on the
// GPU and prints the first five input/output pairs.
// Returns 0 on success and 1 if a device allocation or copy fails.
int main(){
    const int N = 1 << 20;
    const size_t size = N * sizeof(float);

    float *h_i = new float[N];
    float *h_o = new float[N];

    for (int t = 0; t < N; t++){
        h_i[t] = -5.0f + 10.0f * static_cast<float>(rand()) / RAND_MAX;
    }

    float *d_i = nullptr, *d_o = nullptr;

    // Each step runs only if all previous ones succeeded, so the first
    // failure is the one that gets reported.
    cudaError_t status = cudaMalloc((void **)&d_i, size);
    if (status == cudaSuccess){
        status = cudaMalloc((void **)&d_o, size);
    }
    if (status == cudaSuccess){
        status = cudaMemcpy(d_i, h_i, size, cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess){
        solve(d_i, d_o, N);
        status = cudaMemcpy(h_o, d_o, size, cudaMemcpyDeviceToHost);
    }

    if (status == cudaSuccess){
        printf("First 5 ReLU outputs: \n");
        for (int i = 0; i < 5 && i < N; i++) {
            printf("O[%d] = %f (I[%d] = %f)\n", i, h_o[i], i, h_i[i]);
        }
    } else {
        fprintf(stderr, "Cuda Error: %s\n", cudaGetErrorString(status));
    }

    // cudaFree accepts a null pointer, so cleanup is safe after a partial failure.
    cudaFree(d_i);
    cudaFree(d_o);
    delete[] h_i;
    delete[] h_o;
    return (status == cudaSuccess) ? 0 : 1;
}

Overwriting Relu.cu


In [None]:
!nvcc -arch=sm_75 Relu.cu -o Relu

In [None]:
!./Relu

First 5 ReLU outputs: 
O[0] = 3.401877 (I[0] = 3.401877)
O[1] = 0.000000 (I[1] = -1.056171)
O[2] = 2.830992 (I[2] = 2.830992)
O[3] = 2.984400 (I[3] = 2.984400)
O[4] = 4.116474 (I[4] = 4.116474)


In [None]:
%%writefile l2Norm.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Evaluates a CUDA runtime call exactly once. On any status other than
// cudaSuccess it prints file, line and the error string to stderr and
// terminates the process with EXIT_FAILURE.
// The argument is parenthesized and the status is kept in a macro-private
// name, so an argument expression that itself mentions a variable called
// `err` is not shadowed by the macro's own local.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cuda_check_err_ = (call); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA CHECK ERROR %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(cuda_check_err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

// Warp-wide sum using register shuffles.
// All 32 lanes of the warp must call this together, because the shuffle mask
// names every lane. Only lane 0 is guaranteed to end up with the sum of all
// 32 inputs.
static inline __device__ float warpReduceSum(float val){
    int delta = 16;
    while (delta > 0){
        // Fetch the running sum held by the lane `delta` positions above.
        const float partner = __shfl_down_sync(0xffffffff, val, delta);
        val += partner;
        delta >>= 1;
    }
    return val;
}

// Block-wide sum: warp shuffles plus one shared-memory slot per warp.
//   val    : this thread's contribution
//   shared : scratch space holding at least ceil(blockDim.x / 32) floats
// Every thread of the block must call this, since it contains a
// __syncthreads(). Only thread 0 is guaranteed to receive the block total;
// threads outside warp 0 get back their own warp's shuffle result.
__device__ __forceinline__ float blockReduceSum(float val, float* shared) {
    const int laneId = threadIdx.x & 31;            // position inside the warp
    const int warpId = threadIdx.x >> 5;            // index of the warp in the block
    const int warpCount = (blockDim.x + 31) / 32;   // warps in this block, rounded up

    // Stage 1: collapse each warp's 32 values into its lane 0.
    val = warpReduceSum(val);

    // Stage 2: lane 0 of every warp publishes its partial sum.
    if (laneId == 0) {
        shared[warpId] = val;
    }
    __syncthreads();

    // Stage 3: only warp 0 combines the per-warp partials.
    if (warpId != 0) {
        return val;
    }

    // Lanes without a matching warp contribute zero.
    float partial = 0.0f;
    if (laneId < warpCount) {
        partial = shared[laneId];
    }
    return warpReduceSum(partial);
}


// Kernel 1: adds the sum of squares of input[0..N) into *globalsum.
//
// Launch: 1-D grid of 1-D blocks of any size; the strided loops below cover
//         all N elements regardless of how many threads are launched.
// Dynamic shared memory: one float per warp, i.e. ceil(blockDim.x / 32) floats.
// *globalsum must be zeroed by the caller before the launch. Each block adds
// its partial sum with one atomicAdd, so the order of the float additions
// across blocks is not fixed.
// float4 loads are used only when `input` is 16-byte aligned; for any other
// address every element is read through the scalar loop instead.
__global__ void L2SquaredSumKernel(const float * __restrict__ input, float *globalsum, int N){
    extern __shared__ float sdata[];

    float acc = 0.0f;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // A float4 access needs a 16-byte aligned address. If the caller passed
    // an offset pointer, skip the vector path (vecN = 0) rather than fault.
    const bool aligned16 = (reinterpret_cast<size_t>(input) % sizeof(float4)) == 0;
    int vecN = aligned16 ? N / 4 : 0;
    const float4 *vinput = reinterpret_cast<const float4*>(input);

    // Vector path: four elements per load, accumulated in a register.
    for (int i = idx; i < vecN; i += stride) {
        float4 v = vinput[i];
        acc += v.x * v.x;
        acc += v.y * v.y;
        acc += v.z * v.z;
        acc += v.w * v.w;
    }

    // Scalar path: the N % 4 leftover elements, or everything when unaligned.
    for (int i = vecN * 4 + idx; i < N; i += stride) {
        float x = input[i];
        acc += x * x;
    }

    // Combine the per-thread sums of this block; the total lands in thread 0.
    acc = blockReduceSum(acc, sdata);

    // One atomic per block keeps contention on the global accumulator low.
    if (threadIdx.x == 0){
        atomicAdd(globalsum, acc);
    }
}

// Kernel 2: output[i] = input[i] * invNorm for 0 <= i < N.
//
// invNorm is a scalar computed beforehand by the caller.
// Launch: 1-D grid of 1-D blocks of any size; the strided loops below cover
//         all N elements regardless of how many threads are launched.
// float4 loads/stores are used only when both `input` and `output` are
// 16-byte aligned; otherwise every element takes the scalar loop.
__global__ void NormalizeKernel(const float * __restrict__ input, float * __restrict__ output, float invNorm, int N){
    // OR-ing the two addresses lets one test cover both pointers: the low
    // four bits of the result are zero only if they are zero in each address.
    const size_t addrBits = reinterpret_cast<size_t>(input) | reinterpret_cast<size_t>(output);
    const bool aligned16 = (addrBits % sizeof(float4)) == 0;

    int vecN = aligned16 ? N / 4 : 0;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    const float4 *vinput = reinterpret_cast<const float4*>(input);
    float4 *voutput = reinterpret_cast<float4*>(output);

    // Vector path: scale four elements per load/store.
    for (int i = idx; i < vecN; i += stride) {
        float4 v = vinput[i];
        v.x *= invNorm;
        v.y *= invNorm;
        v.z *= invNorm;
        v.w *= invNorm;
        voutput[i] = v;
    }
    // Scalar path: the N % 4 leftover elements, or everything when unaligned.
    for (int i = vecN * 4 + idx; i < N; i += stride) {
        output[i] = input[i] * invNorm;
    }
}

// L2-normalizes N floats on the GPU: d_output = d_input / ||d_input||_2.
//   d_input  : device pointer to N input floats
//   d_output : device pointer to N output floats
//   N        : element count; the call returns immediately when N <= 0
// If the norm is not larger than 1e-12 the output is all zeros.
// Prints the elapsed GPU time of both passes. Any CUDA error terminates the
// process through CUDA_CHECK.
void l2_normalize_cuda(const float* d_input, float* d_output, int N) {
    // A non-positive size would yield a zero-block grid, which the runtime
    // rejects as an invalid launch configuration.
    if (N <= 0) {
        return;
    }

    const int threadsPerBlock = 256;
    // Sized so that each thread handles about four elements (one float4).
    const int blocks = (N + threadsPerBlock * 4 - 1) / (threadsPerBlock * 4);

    float *d_sum = nullptr;
    CUDA_CHECK(cudaMalloc(&d_sum, sizeof(float)));
    CUDA_CHECK(cudaMemset(d_sum, 0, sizeof(float)));

    // Shared memory: one float per warp
    int warpsPerBlock = (threadsPerBlock + 31) / 32;
    size_t smemBytes = warpsPerBlock * sizeof(float);

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));

    // Step 1: Compute sum of squares
    L2SquaredSumKernel<<<blocks, threadsPerBlock, smemBytes>>>(d_input, d_sum, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Step 2: Compute norm and inverse on the host
    float h_sum = 0.0f;
    CUDA_CHECK(cudaMemcpy(&h_sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost));

    float L2_norm = sqrtf(h_sum);
    // Guard against dividing by a (near) zero norm.
    float invNorm = (L2_norm > 1e-12f) ? 1.0f / L2_norm : 0.0f;

    // Step 3: Normalize
    NormalizeKernel<<<blocks, threadsPerBlock>>>(d_input, d_output, invNorm, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float elapsed_ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
    // N / (ms * 1e3) equals millions of elements per second.
    printf("L2 Normalization GPU time: %.3f ms (N = %d, %.2f M elements/sec)\n",
           elapsed_ms, N, N / (elapsed_ms * 1e3f));

    // Released after the stop event so the deallocation is not part of the
    // measured interval.
    CUDA_CHECK(cudaFree(d_sum));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
}

// Demo driver: fills 2^20 floats with pseudo-random values in [-1, 1],
// normalizes them on the GPU and checks on the host that the result has an
// L2 norm close to 1. Returns 1 if a host allocation fails, 0 otherwise.
int main() {
    const int count = 1 << 20; // 1048576
    const size_t nbytes = count * sizeof(float);

    float* hostIn = static_cast<float*>(malloc(nbytes));
    float* hostOut = static_cast<float*>(malloc(nbytes));

    if (hostIn == nullptr || hostOut == nullptr) {
        fprintf(stderr, "host alloc failed\n");
        return 1;
    }

    // rand() is never seeded in this program, so the sequence repeats on every run.
    for (float* p = hostIn; p != hostIn + count; ++p) {
        *p = static_cast<float>(rand()) / RAND_MAX * 2.0f - 1.0f;
    }

    float* devIn = nullptr;
    float* devOut = nullptr;
    CUDA_CHECK(cudaMalloc(&devIn, nbytes));
    CUDA_CHECK(cudaMalloc(&devOut, nbytes));
    CUDA_CHECK(cudaMemcpy(devIn, hostIn, nbytes, cudaMemcpyHostToDevice));

    l2_normalize_cuda(devIn, devOut, count);

    // Bring the normalized vector back to the host.
    CUDA_CHECK(cudaMemcpy(hostOut, devOut, nbytes, cudaMemcpyDeviceToHost));

    // Host-side check: squares are formed in float and accumulated in double.
    double normSq = 0.0;
    int k = 0;
    while (k < count) {
        const float squared = hostOut[k] * hostOut[k];
        normSq += squared;
        ++k;
    }
    printf("\nVerification: L2 norm of output = %.8f (should be ~1.0)\n", sqrt(normSq));

    printf("First 10 values:\n");
    const int shown = (count < 10) ? count : 10;
    for (int j = 0; j < shown; ++j) {
        printf("in[%d] = %8.5f  -> out[%d] = %8.5f\n", j, hostIn[j], j, hostOut[j]);
    }

    // Release device buffers first, then host buffers, then the device context.
    CUDA_CHECK(cudaFree(devIn));
    CUDA_CHECK(cudaFree(devOut));
    free(hostIn);
    free(hostOut);

    CUDA_CHECK(cudaDeviceReset());
    return 0;
}

Writing l2Norm.cu


In [None]:
!nvcc -arch=sm_75 l2Norm.cu -o l2Norm

In [None]:
!./l2Norm

L2 Normalization GPU time: 0.397 ms (N = 1048576, 2639.39 M elements/sec)

Verification: L2 norm of output = 0.99999999 (should be ~1.0)
First 10 values:
in[0] =  0.68038  -> out[0] =  0.00115
in[1] = -0.21123  -> out[1] = -0.00036
in[2] =  0.56620  -> out[2] =  0.00096
in[3] =  0.59688  -> out[3] =  0.00101
in[4] =  0.82329  -> out[4] =  0.00139
in[5] = -0.60490  -> out[5] = -0.00102
in[6] = -0.32955  -> out[6] = -0.00056
in[7] =  0.53646  -> out[7] =  0.00091
in[8] = -0.44445  -> out[8] = -0.00075
in[9] =  0.10794  -> out[9] =  0.00018


In [None]:
%%writefile OptmizedL2Norm.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>

// Evaluates a CUDA runtime call exactly once. On any status other than
// cudaSuccess it prints file, line and the error string to stderr and
// terminates the process with EXIT_FAILURE.
// The argument is parenthesized and the status is kept in a macro-private
// name, so an argument expression that itself mentions a variable called
// `err` is not shadowed by the macro's own local.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cuda_check_err_ = (call); \
        if (cuda_check_err_ != cudaSuccess){ \
            fprintf(stderr, "CUDA CHECK ERROR %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(cuda_check_err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

// Sums `val` across the 32 lanes of a warp with shuffle-down exchanges.
// Must be called by all 32 lanes at once (full participation mask).
// Only lane 0 is guaranteed to hold the complete sum on return.
__device__  __forceinline__ float warpReduceSum(float val){
    // Exchange distances 16, 8, 4, 2, 1: each round halves the span that
    // still has to be combined.
    for (int shift = 4; shift >= 0; --shift){
        const int distance = 1 << shift;
        val += __shfl_down_sync(0xffffffff, val, distance);
    }
    return val;
}

// Sums `val` over all threads of the block.
//   shared : scratch array with at least ceil(blockDim.x / 32) floats
// Must be reached by every thread of the block because of the barrier inside.
// Thread 0 receives the block total; threads outside warp 0 get back their
// own warp's shuffle result.
__device__ __forceinline__ float blockReduceSum(float val, float* shared){
    const unsigned int tid = threadIdx.x;
    const int lane = tid & 31;    // lane within the warp
    const int wid = tid >> 5;     // warp within the block

    // Step 1: reduce inside each warp.
    val = warpReduceSum(val);

    // Step 2: the first lane of each warp stores that warp's partial sum.
    if (lane == 0){
        shared[wid] = val;
    }
    __syncthreads();

    // Step 3: warp 0 reads one partial per warp (zero for unused lanes)
    // and reduces them to the block total.
    if (wid == 0){
        const int numWarps = (blockDim.x + 31) / 32;
        float warpPartial = 0.0f;
        if (lane < numWarps){
            warpPartial = shared[lane];
        }
        val = warpReduceSum(warpPartial);
    }
    return val;
}

// Kernel 1: adds the sum of squares of input[0..N) into *globalsum.
//
// Launch: 1-D grid of 1-D blocks of any size; the strided loops below cover
//         all N elements regardless of how many threads are launched.
// Dynamic shared memory: one float per warp, i.e. ceil(blockDim.x / 32) floats.
// *globalsum must be zeroed by the caller before the launch. Each block adds
// its partial sum with one atomicAdd, so the order of the float additions
// across blocks is not fixed.
// float4 loads are used only when `input` is 16-byte aligned; for any other
// address every element is read through the scalar loop instead.
__global__ void L2SquaredKernel(const float * __restrict__ input, float *globalsum, int N){
    extern __shared__ float sdata[];

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    float acc = 0.0f;

    // A float4 access needs a 16-byte aligned address. If the caller passed
    // an offset pointer, skip the vector path (vecN = 0) rather than fault.
    const bool aligned16 = (reinterpret_cast<size_t>(input) % sizeof(float4)) == 0;
    int vecN = aligned16 ? N / 4 : 0;
    const float4 *vinput = reinterpret_cast<const float4*>(input);

    // Vector path: four elements per load, accumulated in a register.
    for (int i=idx; i<vecN; i+= stride){
        float4 v = vinput[i];
        acc += v.x * v.x;
        acc += v.y * v.y;
        acc += v.z * v.z;
        acc += v.w * v.w;
    }
    // Scalar path: the N % 4 leftover elements, or everything when unaligned.
    for (int tail = vecN * 4 + idx; tail<N; tail+= stride){
        float x = input[tail];
        acc += x * x;
    }
    // Combine the per-thread sums of this block; the total lands in thread 0.
    acc = blockReduceSum(acc, sdata);
    // One atomic per block keeps contention on the global accumulator low.
    if (threadIdx.x == 0){
        atomicAdd(globalsum, acc);
    }
}

// Kernel 2: output[i] = input[i] * invNorm for 0 <= i < N.
//
// invNorm is a scalar computed beforehand by the caller.
// Launch: 1-D grid of 1-D blocks of any size; the strided loops below cover
//         all N elements regardless of how many threads are launched.
// float4 loads/stores are used only when both `input` and `output` are
// 16-byte aligned; otherwise every element takes the scalar loop.
__global__ void NormalizedKernel(const float * __restrict__ input, float * __restrict__ output, float invNorm, int N){
    // OR-ing the two addresses lets one test cover both pointers: the low
    // four bits of the result are zero only if they are zero in each address.
    const size_t addrBits = reinterpret_cast<size_t>(input) | reinterpret_cast<size_t>(output);
    const bool aligned16 = (addrBits % sizeof(float4)) == 0;

    int vecN = aligned16 ? N / 4 : 0;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    const float4 *vinput = reinterpret_cast<const float4*>(input);
    float4 *voutput = reinterpret_cast<float4*>(output);

    // Vector path: scale four elements per load/store.
    for (int i = idx; i<vecN; i+=stride){
        float4 v = vinput[i];
        v.x *= invNorm;
        v.y *= invNorm;
        v.z *= invNorm;
        v.w *= invNorm;
        voutput[i] = v;
    }
    // Scalar path: the N % 4 leftover elements, or everything when unaligned.
    for (int tail = vecN * 4 + idx; tail<N; tail+= stride){
        output[tail] = input[tail] * invNorm;
    }
}

// Benchmarks the two-pass L2 normalization (sum of squares, then scaling)
// for block sizes 128, 256 and 512, printing one table row per configuration
// and the fastest one at the end.
//   d_input  : device pointer to N input floats
//   d_output : device pointer to N output floats; holds the result of the
//              last configuration tested when the function returns
//   N        : element count
// Any CUDA error terminates the process through CUDA_CHECK.
void L2Normalize_cuda_kernel(const float *d_input, float *d_output, int N){
    const int blockSizes[] = {128, 256, 512};
    // Derived from the array so the two can never get out of step.
    const int numTests = sizeof(blockSizes) / sizeof(blockSizes[0]);

    float best_time = 1e9f;
    int best_block = 256;

    printf("Tuning L2 normalization on N = %d (%.2f M elements)\n\n", N, N/1e6f);
    printf("%-8s %8s %12s %12s\n", "Block", "Grid", "Time [ms]", "GB/s");

    // The scalar accumulator and the timing events do not depend on the block
    // size, so they are created once and reused by every configuration.
    float *d_sum = nullptr;
    CUDA_CHECK(cudaMalloc(&d_sum, sizeof(float)));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    for (int t=0; t<numTests; t++){
        int threadsPerBlock = blockSizes[t];

        // About four elements (one float4) per thread, clamped to [1, 65535]
        // blocks; the kernels' strided loops pick up whatever a capped grid
        // does not cover in one pass.
        int min_grid = (N + threadsPerBlock*4 - 1) / (threadsPerBlock * 4);
        int grid = max(min_grid, 1);
        grid = min(grid, 65535);

        // Dynamic shared memory: one float per warp.
        int warps_per_block = (threadsPerBlock + 31) / 32;
        size_t bytes = warps_per_block * sizeof(float);

        // warm-up (not timed)
        CUDA_CHECK(cudaMemset(d_sum, 0, sizeof(float)));
        L2SquaredKernel<<<grid, threadsPerBlock, bytes>>>(d_input, d_sum, N);
        CUDA_CHECK(cudaGetLastError());   // launch-configuration errors
        CUDA_CHECK(cudaDeviceSynchronize());

        // Timed region: reset accumulator, pass 1, host-side norm, pass 2.
        CUDA_CHECK(cudaEventRecord(start));
        CUDA_CHECK(cudaMemset(d_sum, 0, sizeof(float)));
        L2SquaredKernel<<<grid, threadsPerBlock, bytes>>>(d_input, d_sum, N);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaDeviceSynchronize());

        float h_sum = 0.0f;
        CUDA_CHECK(cudaMemcpy(&h_sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost));
        float norm = sqrtf(h_sum);
        // Guard against dividing by a (near) zero norm.
        float invNorm = (norm > 1e-12f) ? 1.0f / norm : 0.0f;

        NormalizedKernel<<<grid, threadsPerBlock>>>(d_input, d_output, invNorm, N);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaDeviceSynchronize());

        CUDA_CHECK(cudaEventRecord(stop));
        CUDA_CHECK(cudaEventSynchronize(stop));

        float ms = 0.0f;
        CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

        // Counts one read and one write of the array. NOTE(review): the timed
        // region reads the input twice (once per pass), so the actual memory
        // traffic is higher than this figure suggests.
        float gbs = (2.0f * N * sizeof(float)) / (ms * 1e6f); // read + write

        printf("%-8d %8d %12.3f %12.2f\n", threadsPerBlock, grid, ms, gbs);

        if (ms < best_time) {
            best_time = ms;
            best_block  = threadsPerBlock;
        }
    }

    CUDA_CHECK(cudaFree(d_sum));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    printf("\nBest Block: block size = %d (%.3f ms, %.2f GB/s)\n", best_block, best_time,
           (2.0f*N*sizeof(float)/(best_time*1e6f)));
}

// Demo driver: fills 2^24 floats with pseudo-random values in [-1, 1], runs
// the block-size tuning benchmark on the GPU and checks on the host that the
// normalized output has an L2 norm close to 1.
// Returns 1 if a host allocation fails, 0 otherwise.
int main() {
    const int N = 1 << 24;
    size_t bytes = N * sizeof(float);

    float *h_in  = (float*)malloc(bytes);
    float *h_out = (float*)malloc(bytes);

    // Two 64 MiB buffers: stop cleanly instead of dereferencing a null pointer.
    if (!h_in || !h_out) {
        fprintf(stderr, "host alloc failed\n");
        free(h_in);   // free(NULL) is a no-op
        free(h_out);
        return 1;
    }

    for (int i = 0; i < N; ++i)
        h_in[i] = (float)rand() / RAND_MAX * 2.0f - 1.0f;

    float *d_in = nullptr, *d_out = nullptr;
    CUDA_CHECK(cudaMalloc(&d_in,  bytes));
    CUDA_CHECK(cudaMalloc(&d_out, bytes));
    CUDA_CHECK(cudaMemcpy(d_in, h_in, bytes, cudaMemcpyHostToDevice));

    L2Normalize_cuda_kernel(d_in, d_out, N);

    // copy back and verify: squares are formed in float, accumulated in double
    CUDA_CHECK(cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost));
    double norm2 = 0.0;
    for (int i = 0; i < N; ++i) norm2 += h_out[i] * h_out[i];
    printf("\nFinal output L2 norm = %.9f (should be ≈1.0)\n", sqrt(norm2));

    // Cleanup
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));
    free(h_in); free(h_out);

    return 0;
}

Writing OptmizedL2Norm.cu


In [None]:
!nvcc -arch=sm_75 OptmizedL2Norm.cu -o OptmizedL2Norm

In [None]:
!./OptmizedL2Norm

Tuning L2 normalization on N = 16777216 (16.78 M elements)

Block        Grid    Time [ms]         GB/s
128         32768        0.885       151.65
256         16384        0.878       152.90
512          8192        0.890       150.89

Best Block: block size = 256 (0.878 ms, 152.90 GB/s)

Final output L2 norm = 1.000001015 (should be ≈1.0)
