In [None]:
%%writefile cuda_sqrt.cu
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cuda_runtime.h>

// Kernel: element-wise single-precision square root, C[i] = sqrtf(A[i]).
//
// Parameters:
//   A - input array of at least n floats (only read)
//   C - output array of at least n floats (only written)
//   n - number of elements to process; values <= 0 result in no work
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its flat global
// index and advances by the total number of launched threads, so every
// element in [0, n) is handled regardless of how many blocks are launched.
// No shared memory is used. Each element is read and written by the same
// thread, so passing the same pointer for A and C is permitted.
__global__ void vectorSqrt(const float *A, float *C, int n)
{
    // Index in size_t so the stride arithmetic cannot wrap for large grids.
    const size_t count = n > 0 ? (size_t)n : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < count;
         i += stride)
    {
        C[i] = sqrtf(A[i]);
    }
}

// Abort with file/line and the CUDA error string when a runtime call fails.
// Without this, an early failure (e.g. out of device memory) is silently
// ignored and only shows up later as garbage results.
#define CUDA_CHECK(call)                                                  \
    do                                                                    \
    {                                                                     \
        cudaError_t cudaCheckErr_ = (call);                               \
        if (cudaCheckErr_ != cudaSuccess)                                 \
        {                                                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,        \
                    __LINE__, cudaGetErrorString(cudaCheckErr_));         \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Entry point: fills an array of <array_size> random floats in [0, 100],
// computes their square roots with the vectorSqrt kernel, times one kernel
// launch with CUDA events, and compares the first (up to 10) results with
// the host sqrtf.
//
// Returns 0 on success, non-zero on bad usage, invalid size, allocation
// failure, or when any of the checked elements differs from the host value.
int main(int argc, char *argv[])
{
    // Get array size from command line argument
    if (argc != 2)
    {
        printf("Usage: %s <array_size>\n", argv[0]);
        return 1;
    }

    // Parse strictly: reject non-numeric input, trailing characters, zero,
    // negative values and anything that does not fit in int (the kernel's
    // length parameter is an int).
    errno = 0;
    char *parseEnd = NULL;
    long parsed = strtol(argv[1], &parseEnd, 10);
    if (errno != 0 || parseEnd == argv[1] || *parseEnd != '\0' ||
        parsed <= 0 || parsed > INT_MAX)
    {
        fprintf(stderr,
                "Invalid array size '%s': expected an integer in [1, %d]\n",
                argv[1], INT_MAX);
        return 1;
    }

    int n = (int)parsed;
    size_t size = (size_t)n * sizeof(float);

    // Allocate input vector h_A and output vector h_C in host memory
    float *h_A = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    if (h_A == NULL || h_C == NULL)
    {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        free(h_A);
        free(h_C);
        return 1;
    }

    // Initialize input vector h_A
    for (int i = 0; i < n; i++)
    {
        h_A[i] = rand() / (float)RAND_MAX * 100.0f; // Random values between 0 and 100
    }

    // Allocate vectors in device memory
    float *d_A = NULL;
    float *d_C = NULL;
    CUDA_CHECK(cudaMalloc(&d_A, size));
    CUDA_CHECK(cudaMalloc(&d_C, size));

    // Copy vector h_A from host memory to device memory d_A
    CUDA_CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));

    // Set up timing
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Define grid and block dimensions. The ceil-division is done in
    // long long so that n close to INT_MAX cannot overflow the addition.
    int threadsPerBlock = 256;
    int blocksPerGrid =
        (int)(((long long)n + threadsPerBlock - 1) / threadsPerBlock);

    // Warm-up launch (untimed): the first launch of a kernel carries one-time
    // startup cost that would otherwise dominate the measurement for small n.
    vectorSqrt<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Start timer
    CUDA_CHECK(cudaEventRecord(start));

    // Launch kernel on GPU
    vectorSqrt<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, n);
    // A launch does not return an error itself; query it explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Stop timer; synchronizing on the stop event also surfaces any fault
    // raised while the kernel was executing.
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float milliseconds = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

    // Copy result from device memory to host memory
    CUDA_CHECK(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    // Verify result (check a few values)
    int checkLimit = n < 10 ? n : 10;
    int mismatches = 0;
    printf("Verifying first %d results:\n", checkLimit);
    for (int i = 0; i < checkLimit; i++)
    {
        float expected = sqrtf(h_A[i]);
        float diff = fabsf(h_C[i] - expected);
        if (diff > 1e-5f)
        {
            printf("Verification failed at element %d: CPU=%f, GPU=%f\n",
                   i, expected, h_C[i]);
            mismatches++;
        }
        else
        {
            printf("Element %d: CPU=%f, GPU=%f\n", i, expected, h_C[i]);
        }
    }

    printf("\nArray size: %d\n", n);
    printf("Execution time: %f milliseconds\n", milliseconds);

    // Release timing events
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    // Free device memory
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_C));

    // Free host memory
    free(h_A);
    free(h_C);

    // Report verification failures through the exit status as well.
    return mismatches == 0 ? 0 : 1;
}



Writing cuda_sqrt.cu


In [None]:
# Compile the code
!nvcc  --gpu-architecture=sm_70  -o cuda_sqrt cuda_sqrt.cu

# Run for different array sizes and collect results
array_sizes = [50000, 500000, 5000000, 50000000]
results = []

for size in array_sizes:
    print(f"\nRunning with array size: {size}")
    !./cuda_sqrt {size}


Running with array size: 50000
Verifying first 10 results:
Element 0: CPU=9.166176, GPU=9.166176
Element 1: CPU=6.279992, GPU=6.279992
Element 2: CPU=8.849289, GPU=8.849289
Element 3: CPU=8.935547, GPU=8.935547
Element 4: CPU=9.548023, GPU=9.548023
Element 5: CPU=4.444675, GPU=4.444675
Element 6: CPU=5.789842, GPU=5.789842
Element 7: CPU=8.764871, GPU=8.764871
Element 8: CPU=5.270434, GPU=5.270434
Element 9: CPU=7.442916, GPU=7.442916

Array size: 50000
Execution time: 0.126272 milliseconds

Running with array size: 500000
Verifying first 10 results:
Element 0: CPU=9.166176, GPU=9.166176
Element 1: CPU=6.279992, GPU=6.279992
Element 2: CPU=8.849289, GPU=8.849289
Element 3: CPU=8.935547, GPU=8.935547
Element 4: CPU=9.548023, GPU=9.548023
Element 5: CPU=4.444675, GPU=4.444675
Element 6: CPU=5.789842, GPU=5.789842
Element 7: CPU=8.764871, GPU=8.764871
Element 8: CPU=5.270434, GPU=5.270434
Element 9: CPU=7.442916, GPU=7.442916

Array size: 500000
Execution time: 0.089376 milliseconds

Run