<a href="https://colab.research.google.com/github/Ayon150/Parallel_Processing/blob/main/Matrix_cu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Sat Feb  7 03:26:15 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
%%writefile matrix.cu
#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>

using namespace std;

/*
 * Batched dense matrix multiply in which ONE THREAD computes ONE whole product.
 *
 * A holds consecutive row-major M x N matrices, B holds consecutive row-major
 * N x P matrices and R receives the consecutive row-major M x P results.
 * The thread with index threadIdx.x works on matrix number
 * (threadIdx.x + batchOffset) of each array.
 *
 * Launch contract:
 *   - Only threadIdx.x is used to pick the matrix, so the kernel is meant to be
 *     launched with a single 1D block; additional blocks would just repeat the
 *     same matrices.
 *   - There is no bounds check (the kernel is not told how many matrices
 *     exist), so the caller must launch no more threads than there are
 *     matrices left starting at batchOffset.
 *
 * Parameters:
 *   A, B        - input matrix batches (read only by this kernel)
 *   R           - output matrix batch (every element of the selected matrix
 *                 is overwritten)
 *   M, N, P     - matrix dimensions: A is M x N, B is N x P, R is M x P
 *   batchOffset - index of the first matrix handled by this launch
 */
__global__ void matrixMul(float *A, float *B, float *R, int M, int N, int P, int batchOffset) {
  // Widen to size_t before multiplying so the element offset of a matrix deep
  // into a large batch cannot overflow 32-bit int arithmetic.
  const size_t k = (size_t)threadIdx.x + (size_t)batchOffset;
  const float *a = A + k * M * N;
  const float *b = B + k * N * P;
  float *r = R + k * M * P;

  // The product is recomputed 100 times; every pass writes the same values.
  // NOTE(review): presumably artificial load so the run is long enough to
  // time - confirm before removing.
  for(int outer = 0; outer < 100; outer++) {
    for(int i = 0; i < M; i++) {
      for(int l = 0; l < P; l++) {
        // Accumulate the dot product locally and store once, instead of doing
        // a global-memory read-modify-write on r[] for every term.
        float acc = 0.0f;
        for(int j = 0; j < N; j++) {
          acc += a[i * N + j] * b[j * P + l];
        }
        r[i * P + l] = acc;
      }
    }
  }
}

// Writes the row-major M x N matrix stored at A to standard output.
// Each entry is printed with zero decimal places followed by one space,
// and every row is terminated by a newline (which also flushes the stream).
void printMatrix(float *A, int M, int N) {
  int row = 0;
  while(row < M) {
    int col = 0;
    while(col < N) {
      const float value = A[row * N + col];
      printf("%.0f ", value);
      ++col;
    }
    std::cout << std::endl;
    ++row;
  }
}

// Terminates the program with a diagnostic when a CUDA runtime call fails.
// `what` names the step that was being attempted, for the error message.
static void checkCuda(cudaError_t status, const char *what) {
  if(status != cudaSuccess) {
    fprintf(stderr, "CUDA error while %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Entry point.
//
// Usage: matrix <threads> <K> <M> <N> <P>
//   threads - maximum number of matrices multiplied per kernel launch
//             (one GPU thread per matrix, single block)
//   K       - number of matrix pairs in the batch
//   M, N, P - dimensions: every A is M x N, every B is N x P, every R is M x P
//
// Fills K random A/B pairs with digits 0..9, multiplies them on the GPU in
// batches of at most `threads` matrices, then prints the first A, B and R.
// Returns 0 on success and a non-zero status on invalid arguments or on any
// allocation / CUDA failure.
int main(int argc, char* argv[]) {
  if(argc < 6) {
    fprintf(stderr, "Usage: %s <threads> <K> <M> <N> <P>\n", argc > 0 ? argv[0] : "matrix");
    return EXIT_FAILURE;
  }

  int threads = atoi(argv[1]);
  int K = atoi(argv[2]);
  int M = atoi(argv[3]);
  int N = atoi(argv[4]);
  int P = atoi(argv[5]);

  // A non-positive thread count would keep the batching loop below from ever
  // making progress, and non-positive dimensions would make the final prints
  // read past the (empty) buffers.
  if(threads <= 0 || K <= 0 || M <= 0 || N <= 0 || P <= 0) {
    fprintf(stderr, "threads, K, M, N and P must all be positive integers\n");
    return EXIT_FAILURE;
  }

  // The kernel is launched with a single block, so the batch size cannot
  // exceed the device's per-block thread limit; otherwise every launch fails.
  int device = 0;
  int maxThreadsPerBlock = 0;
  checkCuda(cudaGetDevice(&device), "querying the current device");
  checkCuda(cudaDeviceGetAttribute(&maxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, device),
            "querying the maximum threads per block");
  if(threads > maxThreadsPerBlock) {
    fprintf(stderr, "Requested %d threads per launch; clamping to device limit %d\n",
            threads, maxThreadsPerBlock);
    threads = maxThreadsPerBlock;
  }

  // Element counts are computed in size_t so K * M * N cannot overflow int.
  const size_t size_of_a = (size_t)K * M * N;
  const size_t size_of_b = (size_t)K * N * P;
  const size_t size_of_r = (size_t)K * M * P;

  float *h_A = (float*)malloc(size_of_a * sizeof(float));
  float *h_B = (float*)malloc(size_of_b * sizeof(float));
  float *h_R = (float*)malloc(size_of_r * sizeof(float));
  if(h_A == NULL || h_B == NULL || h_R == NULL) {
    fprintf(stderr, "Host memory allocation failed\n");
    free(h_A);
    free(h_B);
    free(h_R);
    return EXIT_FAILURE;
  }

  // Small integer entries keep the products exactly representable and easy to read.
  for(size_t i = 0; i < size_of_a; i++) h_A[i] = rand() % 10;
  for(size_t i = 0; i < size_of_b; i++) h_B[i] = rand() % 10;

  float *d_A = NULL, *d_B = NULL, *d_R = NULL;
  checkCuda(cudaMalloc(&d_A, size_of_a * sizeof(float)), "allocating device matrix A");
  checkCuda(cudaMalloc(&d_B, size_of_b * sizeof(float)), "allocating device matrix B");
  checkCuda(cudaMalloc(&d_R, size_of_r * sizeof(float)), "allocating device matrix R");

  checkCuda(cudaMemcpy(d_A, h_A, size_of_a * sizeof(float), cudaMemcpyHostToDevice),
            "copying A to the device");
  checkCuda(cudaMemcpy(d_B, h_B, size_of_b * sizeof(float), cudaMemcpyHostToDevice),
            "copying B to the device");
  checkCuda(cudaMemset(d_R, 0, size_of_r * sizeof(float)), "clearing device matrix R");

  int remainingMatrices = K;
  int batchOffset = 0;

  // Process the K matrices in batches of at most `threads`, one thread each.
  while(remainingMatrices > 0) {
    int currentBatchSize = min(remainingMatrices, threads);
    matrixMul<<<1, currentBatchSize>>>(d_A, d_B, d_R, M, N, P, batchOffset);
    // A launch itself returns nothing; configuration errors surface here.
    checkCuda(cudaGetLastError(), "launching matrixMul");
    remainingMatrices -= currentBatchSize;
    batchOffset += currentBatchSize;
  }
  // Launches on the same stream already run in order, so one synchronisation
  // after the loop is enough to wait for them and report execution errors.
  checkCuda(cudaDeviceSynchronize(), "running matrixMul");

  checkCuda(cudaMemcpy(h_R, d_R, size_of_r * sizeof(float), cudaMemcpyDeviceToHost),
            "copying R back to the host");

  cout << "Matrix A[0]:" << endl;
  printMatrix(h_A, M, N);
  cout << "Matrix B[0]:" << endl;
  printMatrix(h_B, N, P);
  cout << "Matrix R[0]:" << endl;
  printMatrix(h_R, M, P);

  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_R);
  free(h_A);
  free(h_B);
  free(h_R);
  return 0;
}


Writing matrix.cu


In [4]:
!nvcc matrix.cu -o matrix


In [5]:
!./matrix 400 20 20 20 20


Matrix A[0]:
3 6 7 5 3 5 6 2 9 1 2 7 0 9 3 6 0 6 2 6 
1 8 7 9 2 0 2 3 7 5 9 2 2 8 9 7 3 6 1 2 
9 3 1 9 4 7 8 4 5 0 3 6 1 0 6 3 2 0 6 1 
5 5 4 7 6 5 6 9 3 7 4 5 2 5 4 7 4 4 3 0 
7 8 6 8 8 4 3 1 4 9 2 0 6 8 9 2 6 6 4 9 
5 0 4 8 7 1 7 2 7 2 2 6 1 0 6 1 5 9 4 9 
0 9 1 7 7 1 1 5 9 7 7 6 7 3 6 5 6 3 9 4 
8 1 2 9 3 9 0 8 8 5 0 9 6 3 8 5 6 1 1 5 
9 8 4 8 1 0 3 0 4 4 4 4 7 6 3 1 7 5 9 6 
2 1 7 8 5 7 4 1 8 5 9 7 5 3 8 8 3 1 8 9 
6 4 3 3 3 8 6 0 4 8 8 8 9 7 7 6 4 3 0 3 
0 9 2 5 4 0 5 9 4 6 9 2 2 4 7 7 5 4 8 1 
2 8 9 3 6 8 0 2 1 0 5 1 1 0 8 5 0 6 4 6 
2 5 8 6 2 8 4 7 2 4 0 6 2 9 9 0 8 1 3 1 
1 0 3 4 0 3 9 1 9 6 9 3 3 8 0 5 6 6 4 0 
0 4 6 2 6 7 5 6 9 8 7 2 8 2 9 9 6 0 2 7 
6 1 3 2 1 5 9 9 1 4 9 1 0 7 5 8 7 0 4 8 
0 4 2 9 6 1 0 4 2 2 2 0 5 5 2 9 0 2 8 3 
8 0 4 0 9 1 9 6 2 5 4 4 9 9 3 6 0 5 0 2 
9 4 3 5 1 7 4 3 1 4 6 9 4 2 2 6 4 1 2 8 
Matrix B[0]:
2 1 9 0 1 2 8 3 2 9 8 3 4 8 7 4 3 7 8 4 
3 7 4 1 1 9 6 5 3 4 9 6 5 1 6 6 3 6 9 7 
7 7 3 1 8 0 5 1 9 6 8 2 3 4 6 6 3 2 1 7 
6 3 3 1 4 1 7 9 9 8 6 6 6 9 0 6

In [6]:
!time ./matrix 400 20 20 20 20

Matrix A[0]:
3 6 7 5 3 5 6 2 9 1 2 7 0 9 3 6 0 6 2 6 
1 8 7 9 2 0 2 3 7 5 9 2 2 8 9 7 3 6 1 2 
9 3 1 9 4 7 8 4 5 0 3 6 1 0 6 3 2 0 6 1 
5 5 4 7 6 5 6 9 3 7 4 5 2 5 4 7 4 4 3 0 
7 8 6 8 8 4 3 1 4 9 2 0 6 8 9 2 6 6 4 9 
5 0 4 8 7 1 7 2 7 2 2 6 1 0 6 1 5 9 4 9 
0 9 1 7 7 1 1 5 9 7 7 6 7 3 6 5 6 3 9 4 
8 1 2 9 3 9 0 8 8 5 0 9 6 3 8 5 6 1 1 5 
9 8 4 8 1 0 3 0 4 4 4 4 7 6 3 1 7 5 9 6 
2 1 7 8 5 7 4 1 8 5 9 7 5 3 8 8 3 1 8 9 
6 4 3 3 3 8 6 0 4 8 8 8 9 7 7 6 4 3 0 3 
0 9 2 5 4 0 5 9 4 6 9 2 2 4 7 7 5 4 8 1 
2 8 9 3 6 8 0 2 1 0 5 1 1 0 8 5 0 6 4 6 
2 5 8 6 2 8 4 7 2 4 0 6 2 9 9 0 8 1 3 1 
1 0 3 4 0 3 9 1 9 6 9 3 3 8 0 5 6 6 4 0 
0 4 6 2 6 7 5 6 9 8 7 2 8 2 9 9 6 0 2 7 
6 1 3 2 1 5 9 9 1 4 9 1 0 7 5 8 7 0 4 8 
0 4 2 9 6 1 0 4 2 2 2 0 5 5 2 9 0 2 8 3 
8 0 4 0 9 1 9 6 2 5 4 4 9 9 3 6 0 5 0 2 
9 4 3 5 1 7 4 3 1 4 6 9 4 2 2 6 4 1 2 8 
Matrix B[0]:
2 1 9 0 1 2 8 3 2 9 8 3 4 8 7 4 3 7 8 4 
3 7 4 1 1 9 6 5 3 4 9 6 5 1 6 6 3 6 9 7 
7 7 3 1 8 0 5 1 9 6 8 2 3 4 6 6 3 2 1 7 
6 3 3 1 4 1 7 9 9 8 6 6 6 9 0 6