In [None]:
!nvidia-smi

Tue Jun 17 04:50:10 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   49C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
%%writefile matMult.cu
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>

#define N 2000

// CUDA kernel for matrix multiplication: C = A * B for square, row-major
// width x width matrices.
//
// Launch layout: 2D grid of 2D blocks. Each thread computes one element of C;
// the x dimension selects the column and the y dimension selects the row. The
// grid must span at least width threads in both x and y; threads that fall
// outside the matrix return without touching memory. No shared memory is used.
//
// Parameters:
//   A, B  - input matrices (only read), width * width floats each
//   C     - output matrix (only written), width * width floats; must not
//           overlap A or B (required by __restrict__)
//   width - edge length of the matrices
__global__ void matrixMulCUDA(const float* __restrict__ A,
                              const float* __restrict__ B,
                              float* __restrict__ C,
                              int width) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Grid tail guard (also rejects width <= 0 before any index is formed).
    if (row >= width || col >= width) return;

    // Offsets are computed in 64 bits: row * width no longer fits in a 32-bit
    // int once width exceeds 46340.
    const size_t n = static_cast<size_t>(width);
    const size_t rowBase = static_cast<size_t>(row) * n;

    float sum = 0.0f;
    for (size_t k = 0; k < n; ++k)
        sum += A[rowBase + k] * B[k * n + col];
    C[rowBase + col] = sum;
}

// Single-threaded reference product C = A * B for square, row-major matrices.
//   A, B  - inputs, width * width floats each
//   C     - output, width * width floats; every element is overwritten
//   width - edge length of the matrices
void matrixMulCPU(const float* A, const float* B, float* C, int width) {
    for (int r = 0; r < width; ++r) {
        // Row r of A and of C both start at the same flat offset.
        const float* aRow = A + r * width;
        float* cRow = C + r * width;

        for (int c = 0; c < width; ++c) {
            // Column c of B: walk down it with a stride of one row.
            const float* bCol = B + c;

            float acc = 0.0f;
            for (int k = 0; k < width; ++k) {
                acc += aRow[k] * bCol[k * width];
            }
            cRow[c] = acc;
        }
    }
}

// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// (file, line, CUDA error string) if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        const cudaError_t cudaStatus_ = (call);                              \
        if (cudaStatus_ != cudaSuccess) {                                    \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__     \
                      << ": " << cudaGetErrorString(cudaStatus_) << "\n";    \
            std::exit(EXIT_FAILURE);                                         \
        }                                                                    \
    } while (0)

// Multiplies two N x N matrices on the CPU and on the GPU, prints the time
// taken by each, and compares the two results element by element.
//
// Returns EXIT_SUCCESS when the results agree within the tolerance and
// EXIT_FAILURE when they differ; any failing CUDA call terminates the program
// through CUDA_CHECK.
int main() {
    // Widen before multiplying so the element count is not computed in int.
    const size_t count = static_cast<size_t>(N) * N;
    const size_t bytes = count * sizeof(float);

    // Host buffers; std::vector owns the memory and reports allocation
    // failure by throwing instead of handing back a null pointer.
    std::vector<float> h_A(count, 1.0f);
    std::vector<float> h_B(count, 2.0f);
    std::vector<float> h_C_cpu(count);
    std::vector<float> h_C_gpu(count);

    // ----------- CPU Execution ------------
    std::cout << "Running CPU matrix multiplication..." << std::endl;
    auto start_cpu = std::chrono::high_resolution_clock::now();
    matrixMulCPU(h_A.data(), h_B.data(), h_C_cpu.data(), N);
    auto end_cpu = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> cpu_time = end_cpu - start_cpu;
    std::cout << "CPU time: " << cpu_time.count() << " seconds\n";

    // ----------- GPU Execution ------------
    std::cout << "Running GPU matrix multiplication..." << std::endl;

    // Allocate device memory
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    CUDA_CHECK(cudaMalloc(&d_A, bytes));
    CUDA_CHECK(cudaMalloc(&d_B, bytes));
    CUDA_CHECK(cudaMalloc(&d_C, bytes));

    // Copy input data to device
    CUDA_CHECK(cudaMemcpy(d_A, h_A.data(), bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B.data(), bytes, cudaMemcpyHostToDevice));

    // Events bracket the kernel only; the host<->device copies are not timed.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // One thread per output element, 16 x 16 threads per block, grid rounded
    // up so the whole N x N matrix is covered.
    constexpr int kBlockEdge = 16;
    const dim3 threadsPerBlock(kBlockEdge, kBlockEdge);
    const dim3 numBlocks((N + kBlockEdge - 1) / kBlockEdge,
                         (N + kBlockEdge - 1) / kBlockEdge);

    CUDA_CHECK(cudaEventRecord(start));
    matrixMulCUDA<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A kernel launch returns nothing; a bad launch configuration is only
    // visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));

    // Wait for the kernel to finish (this also surfaces faults raised while
    // it was running) and read the elapsed time.
    CUDA_CHECK(cudaEventSynchronize(stop));
    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

    // Copy result back to host
    CUDA_CHECK(cudaMemcpy(h_C_gpu.data(), d_C, bytes, cudaMemcpyDeviceToHost));
    std::cout << "GPU time: " << milliseconds / 1000.0 << " seconds\n";

    // ----------- Verification ------------
    bool correct = true;
    for (size_t i = 0; i < count; ++i) {
        // std::fabs keeps the comparison in floating point; an unqualified
        // abs() can bind to the integer overload and truncate the difference.
        if (std::fabs(h_C_cpu[i] - h_C_gpu[i]) > 1e-3f) {
            correct = false;
            std::cout << "Mismatch at index " << i << ": CPU=" << h_C_cpu[i] << ", GPU=" << h_C_gpu[i] << "\n";
            break;
        }
    }
    std::cout << (correct ? "Verification PASSED\n" : "Verification FAILED\n");

    // Cleanup (host buffers are released by their std::vector destructors)
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    // Report the verification outcome through the exit status as well.
    return correct ? EXIT_SUCCESS : EXIT_FAILURE;
}



Overwriting matMult.cu


In [None]:
!nvcc -arch=sm_75 matMult.cu -o hello

In [None]:
%%time
!./hello

Running CPU matrix multiplication...
CPU time: 66.7878 seconds
Running GPU matrix multiplication...
GPU time: 0.0588552 seconds
Verification PASSED
CPU times: user 256 ms, sys: 47.2 ms, total: 304 ms
Wall time: 1min 7s
