<a href="https://colab.research.google.com/github/Abdallah-RA/CUDA-Course/blob/main/CUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!nvidia-smi
!nvcc --version

Mon Mar 31 08:21:27 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   47C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [6]:
%%writefile hello.cu
#include <iostream>
#include <cstdio>         // for printf in the kernel
#include <cuda_runtime.h> // for cuda functions

// Kernel that prints a greeting from device code.
// Only the thread with threadIdx.x == 0 in the block with blockIdx.x == 0
// prints; every other thread returns immediately.
__global__ void helloFromGPU() {
    const bool isFirstThread = (blockIdx.x == 0) && (threadIdx.x == 0);
    if (!isFirstThread) {
        return;
    }
    printf("Hello World from GPU!\n");
}

// Launches helloFromGPU with a single thread, waits for it to finish, then
// prints a greeting from the host.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
    // Launch the kernel (1 block, 1 thread)
    helloFromGPU<<<1, 1>>>();

    // Launch-configuration errors are not returned by the launch itself;
    // they have to be fetched with cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "Kernel launch error: "
                  << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // Wait for the GPU to finish so the kernel's printf output is flushed.
    // Errors raised while the kernel was running are reported here, so the
    // return value must be checked as well.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "Kernel execution error: "
                  << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // Print from the CPU
    std::cout << "Hello from CPU!" << std::endl;

    return 0;
}


Writing hello.cu


In [7]:
!nvcc -arch=sm_75 hello.cu -o hello

In [8]:
!./hello


Hello World from GPU!
Hello from CPU!


In [17]:
%%writefile Code1.cu
#include <iostream>
#include <cstdio>
#include <cuda_runtime.h>

// Host-side helper: returns twice its argument.
__host__ int hostOnlyFunction(int x) {
    const int doubled = 2 * x;
    return doubled;
}

// Device-side helper: returns its argument incremented by one.
__device__ int deviceOnlyFunction(int x) {
    const int incremented = 1 + x;
    return incremented;
}


// Kernel: each thread whose global x-index i is below N stores
// deviceOnlyFunction(i) into dArray[i]. Threads at or past N do nothing.
__global__ void computeOnGPU(int* dArray, int N) {
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;
    if (gid >= N) {
        return;
    }
    dArray[gid] = deviceOnlyFunction(gid);
}

// Runs computeOnGPU over a 5-element device array, copies the result back
// and prints each value together with hostOnlyFunction applied to it.
// Returns 0 on success, 1 if any CUDA call fails (the device buffer is
// released before returning in that case).
int main() {
    const int N = 5;
    int hArray[N];          // host array
    int *dArray = nullptr;  // device pointer

    // Reports a failed CUDA call and releases the device buffer.
    // Returns true when `status` is cudaSuccess.
    auto succeeded = [&dArray](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << " error: "
                  << cudaGetErrorString(status) << std::endl;
        cudaFree(dArray);  // no-op while dArray is still nullptr
        dArray = nullptr;
        return false;
    };

    if (!succeeded(cudaMalloc((void**)&dArray, N * sizeof(int)),
                   "cudaMalloc")) {
        return 1;
    }

    computeOnGPU<<<1, N>>>(dArray, N);
    // Launch errors are only visible through cudaGetLastError().
    if (!succeeded(cudaGetLastError(), "Kernel (computeOnGPU) launch")) {
        return 1;
    }
    // Errors raised while the kernel runs surface at the synchronization.
    if (!succeeded(cudaDeviceSynchronize(),
                   "Kernel (computeOnGPU) execution")) {
        return 1;
    }

    // hArray is uninitialized until this copy succeeds, so do not print
    // anything if it fails.
    if (!succeeded(cudaMemcpy(hArray, dArray, N * sizeof(int),
                              cudaMemcpyDeviceToHost),
                   "cudaMemcpy (device to host)")) {
        return 1;
    }

    std::cout << "\nResults from computeOnGPU (with deviceOnlyFunction):\n";
    for (int i = 0; i < N; i++) {
        int doubledVal = hostOnlyFunction(hArray[i]); // __host__ function
        std::cout << "Index " << i
                  << " => GPU value: " << hArray[i]
                  << ", hostOnlyFunction result: " << doubledVal
                  << std::endl;
    }

    // Free device memory
    cudaFree(dArray);

    return 0;
}


Overwriting Code1.cu


In [14]:
!nvcc -arch=sm_75 Code1.cu -o Code1
!./Code1


Hello from CPU!

Results from computeOnGPU (with deviceOnlyFunction):
Index 0 => GPU value: 1, hostOnlyFunction result: 2
Index 1 => GPU value: 2, hostOnlyFunction result: 4
Index 2 => GPU value: 3, hostOnlyFunction result: 6
Index 3 => GPU value: 4, hostOnlyFunction result: 8
Index 4 => GPU value: 5, hostOnlyFunction result: 10


In [18]:
%%writefile multiDimKernels.cu
#include <iostream>
#include <cstdio>
#include <cuda_runtime.h>

// ------------------------------
// 1D Kernel
// ------------------------------
// Each thread whose global x-index i is below N writes i into dArr[i].
// Threads at or past N return without touching memory.
__global__ void fillArray1D(int* dArr, int N) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= N) {
        return;
    }
    // The stored value is simply the thread's own global index.
    dArr[gid] = gid;
}

// ------------------------------
// 2D Kernel
// ------------------------------
// Each thread maps to one (row, col) cell of a width x height array stored
// row-major in dArr and writes row*100 + col into it.
//   dArr   - output buffer holding at least width*height ints
//   width  - number of columns (x extent)
//   height - number of rows (y extent)
// Threads that fall outside the array do nothing.
__global__ void fillArray2D(int* dArr, int width, int height) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Check boundary
    if (row < height && col < width) {
        // Flatten 2D (row, col) into a 1D index. The arithmetic is done in
        // size_t so the offset cannot overflow int when width*height is
        // larger than INT_MAX.
        size_t idx = static_cast<size_t>(row) * static_cast<size_t>(width)
                   + static_cast<size_t>(col);
        // Example: store row*100 + col
        dArr[idx] = row * 100 + col;
    }
}

// ------------------------------
// 3D Kernel
// ------------------------------
// Each thread maps to one (x, y, z) cell of a width x height x depth volume
// stored in dArr with x fastest and z slowest, and writes x + 10*y + 100*z
// into it.
//   dArr   - output buffer holding at least width*height*depth ints
//   width  - x extent
//   height - y extent
//   depth  - z extent
// Threads that fall outside the volume do nothing.
__global__ void fillArray3D(int* dArr, int width, int height, int depth) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int z = blockIdx.z * blockDim.z + threadIdx.z;

    if (x < width && y < height && z < depth) {
        // Flatten 3D (x, y, z) into 1D index:
        //   z*(width*height) + y*width + x == (z*height + y)*width + x
        // The arithmetic is done in size_t so the offset cannot overflow int
        // when the volume holds more than INT_MAX elements.
        size_t idx = (static_cast<size_t>(z) * static_cast<size_t>(height)
                      + static_cast<size_t>(y)) * static_cast<size_t>(width)
                   + static_cast<size_t>(x);
        // Example: store x + 10*y + 100*z
        dArr[idx] = x + 10 * y + 100 * z;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << what << " failed: "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Reports a failed host allocation on stderr.
// Returns true when `ptr` is non-null, false otherwise.
static bool hostAllocOk(const void* ptr, const char* what) {
    if (ptr != nullptr) {
        return true;
    }
    std::cerr << what << " failed: out of host memory" << std::endl;
    return false;
}

// --------------------------------
// 1) 1D Example
// --------------------------------
// Fills an 8-element array with fillArray1D and prints it.
// Returns true on success; buffers are released on every path.
static bool run1DExample() {
    std::cout << "=== 1D Example ===\n";
    const int N = 8;
    const size_t size1D = N * sizeof(int);

    // Allocate host and device memory
    int* hArr1D = (int*)malloc(size1D);
    int* dArr1D = nullptr;
    bool ok = hostAllocOk(hArr1D, "Host allocation (1D)")
           && cudaOk(cudaMalloc(&dArr1D, size1D), "cudaMalloc (1D)");

    if (ok) {
        // Calculate block / grid
        int threads = 4;          // let's choose 4 threads
        int blocks  = (N + threads - 1) / threads;

        // Launch the kernel, then check the launch, the execution and the
        // copy back to the host; evaluation stops at the first failure.
        fillArray1D<<<blocks, threads>>>(dArr1D, N);
        ok = cudaOk(cudaGetLastError(), "fillArray1D launch")
          && cudaOk(cudaDeviceSynchronize(), "fillArray1D execution")
          && cudaOk(cudaMemcpy(hArr1D, dArr1D, size1D,
                               cudaMemcpyDeviceToHost),
                    "cudaMemcpy (1D)");
    }

    if (ok) {
        // Print entire 1D array
        std::cout << "1D array contents:\n";
        for (int i = 0; i < N; i++) {
            std::cout << "hArr1D[" << i << "] = " << hArr1D[i] << "\n";
        }
    }

    // Cleanup (free(nullptr) and cudaFree(nullptr) are both no-ops)
    free(hArr1D);
    cudaFree(dArr1D);
    return ok;
}

// --------------------------------
// 2) 2D Example
// --------------------------------
// Fills a 4x3 array with fillArray2D and prints it in row-major order.
// Returns true on success; buffers are released on every path.
static bool run2DExample() {
    std::cout << "\n=== 2D Example ===\n";
    int width  = 4;
    int height = 3;  // total = 12 elements
    const size_t size2D =
        static_cast<size_t>(width) * height * sizeof(int);

    int* hArr2D = (int*)malloc(size2D);
    int* dArr2D = nullptr;
    bool ok = hostAllocOk(hArr2D, "Host allocation (2D)")
           && cudaOk(cudaMalloc(&dArr2D, size2D), "cudaMalloc (2D)");

    if (ok) {
        // 2D thread-block, 2D grid (ceil-div so the grid covers the array)
        dim3 block2D(2, 2);  // 2x2 threads
        dim3 grid2D( (width  + block2D.x - 1)/block2D.x,
                     (height + block2D.y - 1)/block2D.y );

        // Launch 2D kernel and copy the results back
        fillArray2D<<<grid2D, block2D>>>(dArr2D, width, height);
        ok = cudaOk(cudaGetLastError(), "fillArray2D launch")
          && cudaOk(cudaDeviceSynchronize(), "fillArray2D execution")
          && cudaOk(cudaMemcpy(hArr2D, dArr2D, size2D,
                               cudaMemcpyDeviceToHost),
                    "cudaMemcpy (2D)");
    }

    if (ok) {
        // Print the entire 2D array in row-major format
        std::cout << "2D array contents (row-major):\n";
        for (int r = 0; r < height; r++) {
            for (int c = 0; c < width; c++) {
                int idx = r * width + c;
                std::cout << hArr2D[idx] << "\t";
            }
            std::cout << "\n";
        }
    }

    // Cleanup
    free(hArr2D);
    cudaFree(dArr2D);
    return ok;
}

// --------------------------------
// 3) 3D Example
// --------------------------------
// Fills a 3x2x2 volume with fillArray3D and prints it layer by layer.
// Returns true on success; buffers are released on every path.
static bool run3DExample() {
    std::cout << "\n=== 3D Example ===\n";
    int width  = 3;
    int height = 2;
    int depth  = 2;  // total = 3*2*2 = 12 elements
    const size_t size3D =
        static_cast<size_t>(width) * height * depth * sizeof(int);

    int* hArr3D = (int*)malloc(size3D);
    int* dArr3D = nullptr;
    bool ok = hostAllocOk(hArr3D, "Host allocation (3D)")
           && cudaOk(cudaMalloc(&dArr3D, size3D), "cudaMalloc (3D)");

    if (ok) {
        // 3D block / grid (ceil-div so the grid covers the volume)
        dim3 block3D(2, 2, 1);
        dim3 grid3D( (width  + block3D.x - 1)/block3D.x,
                     (height + block3D.y - 1)/block3D.y,
                     (depth  + block3D.z - 1)/block3D.z );

        // Launch 3D kernel and copy the results back
        fillArray3D<<<grid3D, block3D>>>(dArr3D, width, height, depth);
        ok = cudaOk(cudaGetLastError(), "fillArray3D launch")
          && cudaOk(cudaDeviceSynchronize(), "fillArray3D execution")
          && cudaOk(cudaMemcpy(hArr3D, dArr3D, size3D,
                               cudaMemcpyDeviceToHost),
                    "cudaMemcpy (3D)");
    }

    if (ok) {
        // Print 3D data layer by layer
        std::cout << "3D array contents (z-layers, row-major within each layer):\n";
        for (int z = 0; z < depth; z++) {
            std::cout << "Layer z=" << z << ":\n";
            for (int y = 0; y < height; y++) {
                for (int x = 0; x < width; x++) {
                    int idx = z * (width * height) + y * width + x;
                    std::cout << hArr3D[idx] << "\t";
                }
                std::cout << "\n";
            }
            std::cout << "\n";
        }
    }

    // Cleanup
    free(hArr3D);
    cudaFree(dArr3D);
    return ok;
}

// Runs the 1D, 2D and 3D fill examples in order.
// Returns 0 when all three succeed, 1 as soon as one of them fails.
int main() {
    if (!run1DExample() || !run2DExample() || !run3DExample()) {
        return 1;
    }

    std::cout << "\nDone.\n";
    return 0;
}


Writing multiDimKernels.cu


In [19]:
!nvcc -arch=sm_75 multiDimKernels.cu -o multiDim
!./multiDim


=== 1D Example ===
1D array contents:
hArr1D[0] = 0
hArr1D[1] = 1
hArr1D[2] = 2
hArr1D[3] = 3
hArr1D[4] = 4
hArr1D[5] = 5
hArr1D[6] = 6
hArr1D[7] = 7

=== 2D Example ===
2D array contents (row-major):
0	1	2	3	
100	101	102	103	
200	201	202	203	

=== 3D Example ===
3D array contents (z-layers, row-major within each layer):
Layer z=0:
0	1	2	
10	11	12	

Layer z=1:
100	101	102	
110	111	112	


Done.
