In [1]:

!nvcc --version
!pip install git+https://github.com/afnan47/cuda.git
%load_ext nvcc_plugin


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting git+https://github.com/afnan47/cuda.git
  Cloning https://github.com/afnan47/cuda.git to /tmp/pip-req-build-4any113t
  Running command git clone --filter=blob:none --quiet https://github.com/afnan47/cuda.git /tmp/pip-req-build-4any113t
  Resolved https://github.com/afnan47/cuda.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4289 sha256=7ca17e541c4a029b45cd484b77537c1e5f23c131deb79f78e5f877617bba9385
  Stored in directory: /tmp/pip-ephem-wheel-cache-3p7teppe/wheels/aa/f3/44/e10c1d226ec561d971fcd4b0463f6bff08602afa928a3e

In [11]:
#include <iostream>
#include <vector>
#include <cuda_runtime.h>

// Naive dense product of two N x N int matrices stored row-major: c = a * b.
//
// Launch layout: 2D blocks in a 2D grid; the x dimension walks columns and
// the y dimension walks rows. Each thread produces at most one element of c.
// Threads whose (row, col) falls outside the N x N range exit without
// touching memory, so the grid is allowed to overshoot the matrix.
// No shared memory is used. Every in-range thread also emits one device-side
// printf line naming the element it wrote.
__global__ void matrixMultiplication(const int *a, const int *b, int *c, int N) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Guard clause: nothing to do for threads past the matrix edge.
    if (row >= N || col >= N) {
        return;
    }

    // Offset of the first element of this thread's row in a and c.
    const int rowBase = row * N;

    // Dot product of row `row` of a with column `col` of b.
    int acc = 0;
    int k = 0;
    while (k < N) {
        acc += a[rowBase + k] * b[k * N + col];
        ++k;
    }

    c[rowBase + col] = acc;
    printf("Thread (%d, %d) is computing c[%d][%d]\n", threadIdx.y, threadIdx.x, row, col);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Fills every element of `matrix` from stdin, in order.
// Returns false as soon as an extraction fails (EOF or non-integer input).
static bool readMatrix(std::vector<int> &matrix) {
    for (size_t i = 0; i < matrix.size(); ++i) {
        if (!(std::cin >> matrix[i])) {
            return false;
        }
    }
    return true;
}

// Reads a size N and two N x N int matrices from stdin, multiplies them with
// the matrixMultiplication kernel, and prints the product to stdout.
// Returns 0 on success and 1 on invalid input or any CUDA runtime failure.
int main() {
    // The kernel indexes with int arithmetic (row * N + col), so N * N must
    // fit in an int: 46340 * 46340 = 2147395600 is the largest such square.
    const int kMaxN = 46340;

    int N = 0;
    std::cout << "Enter the size of the square matrices: ";
    if (!(std::cin >> N) || N <= 0 || N > kMaxN) {
        std::cerr << "Matrix size must be an integer between 1 and " << kMaxN << "." << std::endl;
        return 1;
    }

    // Do the size arithmetic in size_t so neither the element count nor the
    // byte count is computed in (overflow-prone) int.
    const size_t count = static_cast<size_t>(N) * static_cast<size_t>(N);
    const size_t bytes = count * sizeof(int);

    // Allocate memory for matrices on host
    std::vector<int> host_a(count);
    std::vector<int> host_b(count);
    std::vector<int> host_c(count);

    // Input elements of matrices A and B
    std::cout << "Enter elements of matrix A (" << N << "x" << N << "):" << std::endl;
    if (!readMatrix(host_a)) {
        std::cerr << "Failed to read matrix A." << std::endl;
        return 1;
    }

    std::cout << "Enter elements of matrix B (" << N << "x" << N << "):" << std::endl;
    if (!readMatrix(host_b)) {
        std::cerr << "Failed to read matrix B." << std::endl;
        return 1;
    }

    // Device pointers start out null so the cleanup below is safe no matter
    // which step failed (cudaFree on a null pointer is a no-op).
    int *device_a = nullptr;
    int *device_b = nullptr;
    int *device_c = nullptr;

    // Allocate device buffers and upload the inputs; && stops at the first failure.
    bool ok = checkCuda(cudaMalloc(&device_a, bytes), "cudaMalloc(device_a)") &&
              checkCuda(cudaMalloc(&device_b, bytes), "cudaMalloc(device_b)") &&
              checkCuda(cudaMalloc(&device_c, bytes), "cudaMalloc(device_c)") &&
              checkCuda(cudaMemcpy(device_a, host_a.data(), bytes, cudaMemcpyHostToDevice),
                        "cudaMemcpy(host_a -> device_a)") &&
              checkCuda(cudaMemcpy(device_b, host_b.data(), bytes, cudaMemcpyHostToDevice),
                        "cudaMemcpy(host_b -> device_b)");

    if (ok) {
        // 16x16 threads per block; ceil-divide so the grid covers all N rows and columns.
        dim3 blockDim(16, 16);
        dim3 gridDim((N + blockDim.x - 1) / blockDim.x, (N + blockDim.y - 1) / blockDim.y);

        matrixMultiplication<<<gridDim, blockDim>>>(device_a, device_b, device_c, N);

        // A launch returns no status itself: cudaGetLastError() picks up a bad
        // launch configuration, and the blocking copy back surfaces any fault
        // raised while the kernel was running.
        ok = checkCuda(cudaGetLastError(), "matrixMultiplication launch") &&
             checkCuda(cudaMemcpy(host_c.data(), device_c, bytes, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(device_c -> host_c)");
    }

    // Free device memory on every path. Return codes are deliberately ignored:
    // after an earlier failure these calls may just repeat the same error.
    cudaFree(device_a);
    cudaFree(device_b);
    cudaFree(device_c);

    if (!ok) {
        return 1;
    }

    // Print result matrix
    std::cout << "Result matrix (" << N << "x" << N << "):" << std::endl;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            std::cout << host_c[static_cast<size_t>(i) * N + j] << " ";
        }
        std::cout << std::endl;
    }

    return 0;
}


Writing mul.cu


In [12]:
!nvcc mul.cu -o mul
!./mul

Enter the size of the square matrices: 3 
Enter elements of matrix A (3x3):
1 2 3 
5 6 3
2 3 4
Enter elements of matrix B (3x3):
2 3 4
5 6 7
2 3 4
Thread (2, 0) is computing c[2][0]
Thread (2, 1) is computing c[2][1]
Thread (2, 2) is computing c[2][2]
Thread (0, 0) is computing c[0][0]
Thread (0, 1) is computing c[0][1]
Thread (0, 2) is computing c[0][2]
Thread (1, 0) is computing c[1][0]
Thread (1, 1) is computing c[1][1]
Thread (1, 2) is computing c[1][2]
Result matrix (3x3):
18 24 30 
46 60 74 
27 36 45 
