Ques3

In [14]:
%%writefile abcd.cu
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * Element-wise kernel: for every element of the M x N array `a`, writes
 * (2 * a[i])^2 to the same position of `c`.
 *
 * Parameters:
 *   a - device pointer to the M * N input values.
 *   c - device pointer to the M * N output values.
 *   M - number of rows.
 *   N - number of columns.
 *
 * Launch layout: any grid/block shape is accepted. Each thread derives one
 * flat id from all grid and block dimensions and then strides over the
 * M * N elements, so the output no longer depends on the launch geometry
 * matching the row/column split (a 1-D launch over a tall, narrow array
 * previously processed only the first column entries it happened to reach).
 * No shared memory is used. Nothing is written when M or N is not positive.
 */
__global__ void doubleValue(float *a, float *c, int M, int N) {
    if (M <= 0 || N <= 0) {
        return;
    }

    // size_t arithmetic: M * N and the flat thread id can exceed the int range.
    const size_t total = (size_t)M * (size_t)N;

    const size_t threadsPerBlock =
        (size_t)blockDim.x * blockDim.y * blockDim.z;
    const size_t blockId =
        ((size_t)blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
    const size_t threadInBlock =
        ((size_t)threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
    const size_t stride =
        threadsPerBlock * gridDim.x * gridDim.y * gridDim.z;

    for (size_t i = blockId * threadsPerBlock + threadInBlock; i < total; i += stride) {
        // Keep the intermediate in a register instead of writing c[i] twice.
        const float doubled = a[i] * 2.0f; // Double the value
        c[i] = doubled * doubled;          // Square the doubled value
    }
}

// Reports a failed CUDA runtime call on stderr; returns true when `err` is cudaSuccess.
static bool checkCuda(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

/*
 * Fills a host array with sample values, runs doubleValue on the device,
 * prints the kernel time measured with CUDA events, copies the result back
 * and checks it against a host-computed reference.
 *
 * Returns 0 on success and 1 on any allocation, CUDA or verification failure.
 * Every resource acquired before a failure is released on the way out.
 */
int main() {
    const int M = 1000000; // Number of rows
    const int N = 1;       // Number of columns (since we have 1D array)

    // Do the size arithmetic in size_t so M * N cannot overflow int.
    const size_t count = (size_t)M * (size_t)N;
    const size_t bytes = count * sizeof(float);

    // Everything starts out null so the single cleanup section below is
    // safe no matter where the work section bails out.
    float *h_a = nullptr;
    float *h_c = nullptr;
    float *d_a = nullptr;
    float *d_c = nullptr;
    cudaEvent_t start = nullptr;
    cudaEvent_t stop = nullptr;
    int status = 1;

    do {
        // The kernel takes int dimensions; refuse sizes it cannot express.
        if (count > 2147483647u) {
            fprintf(stderr, "Array too large for the kernel's int dimensions!\n");
            break;
        }

        // Allocate host memory for arrays a and c
        h_a = (float *)malloc(bytes);
        h_c = (float *)malloc(bytes);
        if (h_a == nullptr || h_c == nullptr) {
            fprintf(stderr, "Failed to allocate host memory!\n");
            break;
        }

        // Initialize array a with sample values (replace with your initialization logic)
        for (size_t i = 0; i < count; ++i) {
            h_a[i] = (float)i + 0.5f; // Example initialization
        }

        // Allocate device memory for arrays a and c
        if (!checkCuda(cudaMalloc((void **)&d_a, bytes), "cudaMalloc(d_a)") ||
            !checkCuda(cudaMalloc((void **)&d_c, bytes), "cudaMalloc(d_c)")) {
            fprintf(stderr, "Failed to allocate device memory!\n");
            break;
        }

        // Copy data from host to device memory
        if (!checkCuda(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice),
                       "cudaMemcpy(host to device)")) {
            break;
        }

        if (!checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)") ||
            !checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)")) {
            break;
        }

        // Define kernel launch parameters: 1-D grid, integer ceil-division.
        const int threadsPerBlock = 256; // Adjust based on your GPU architecture
        const unsigned int blocksInGrid =
            (unsigned int)((count + threadsPerBlock - 1) / threadsPerBlock);

        // Start timing (events are recorded on the default stream, the same
        // stream the kernel is launched on).
        if (!checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)")) {
            break;
        }

        // Launch the kernel. The operation is element-wise, so the array is
        // handed over as 1 row x count columns: the x dimension of this 1-D
        // launch then spans every element, whereas passing (M, N) would leave
        // the row index tied to an unused y dimension.
        doubleValue<<<blocksInGrid, threadsPerBlock>>>(d_a, d_c, 1, (int)count);
        if (!checkCuda(cudaGetLastError(), "kernel launch")) {
            break;
        }

        // Stop timing and wait for kernel execution to finish
        if (!checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)") ||
            !checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)") ||
            !checkCuda(cudaDeviceSynchronize(), "kernel execution")) {
            break;
        }

        float elapsedMs = 0.0f;
        if (!checkCuda(cudaEventElapsedTime(&elapsedMs, start, stop),
                       "cudaEventElapsedTime")) {
            break;
        }
        const double time_taken = (double)elapsedMs / 1000.0;
        printf("Time taken: %f seconds\n", time_taken);

        // Copy results from device to host memory
        if (!checkCuda(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(device to host)")) {
            break;
        }

        // Verify against a host reference: c[i] must equal (2 * a[i])^2.
        bool resultsOk = true;
        for (size_t i = 0; i < count; ++i) {
            const float doubled = h_a[i] * 2.0f;
            const float expected = doubled * doubled;
            if (fabsf(h_c[i] - expected) > 1e-5f * fabsf(expected)) {
                fprintf(stderr, "Result mismatch at index %zu: got %f, expected %f\n",
                        i, h_c[i], expected);
                resultsOk = false;
                break;
            }
        }
        if (!resultsOk) {
            break;
        }

        // Optional: Print the results on the host (h_c is still valid here)
        // for (size_t i = 0; i < count; i++) {
        //     printf("%f ", h_c[i]);
        //     printf("\n");
        // }

        status = 0;
    } while (0);

    // Release the timing events, if they were created
    if (stop != nullptr) {
        cudaEventDestroy(stop);
    }
    if (start != nullptr) {
        cudaEventDestroy(start);
    }

    // Free device memory (cudaFree accepts a null pointer)
    cudaFree(d_a);
    cudaFree(d_c);

    // Free host memory with free(), matching malloc()
    free(h_a);
    free(h_c);

    return status;
}

Writing abcd.cu


In [15]:
!nvcc -o abcd abcd.cu

In [18]:
!./abcd

Time taken: 0.059071 seconds
