In [1]:
!pip install virtualenv

Collecting virtualenv
  Downloading virtualenv-20.25.0-py3-none-any.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting distlib<1,>=0.3.7 (from virtualenv)
  Downloading distlib-0.3.8-py2.py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.9/468.9 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: distlib, virtualenv
Successfully installed distlib-0.3.8 virtualenv-20.25.0


In [2]:
!virtualenv /content/cuda_env

created virtual environment CPython3.10.12.final.0-64 in 773ms
  creator CPython3Posix(dest=/content/cuda_env, clear=False, no_vcs_ignore=False, global=False)
  seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/root/.local/share/virtualenv)
    added seed packages: pip==23.3.1, setuptools==69.0.2, wheel==0.42.0
  activators BashActivator,CShellActivator,FishActivator,NushellActivator,PowerShellActivator,PythonActivator


In [3]:
!source /content/cuda_env/bin/activate;

In [4]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-9wm7nqsq
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-9wm7nqsq
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4293 sha256=be4d9e2bb27c7673f87e4f32bb1221f268450edeb529285baf6f0c3f1f8cfcd6
  Stored in directory: /tmp/pip-ephem-wheel-cache-rbmb2bvz/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /content

# **TEST CASE 1**

In [5]:
%%cu
#include <iostream>
#include <iomanip>
#include <cuda_runtime.h>
#include <chrono>

// Matrix dimension used by the host code. It also fixes the size of the
// shared-memory tiles declared inside matrixMulShared.
const int N = 3;

// Tiled matrix multiply C = A * B for square, row-major n x n int matrices,
// staging one tile of A and one tile of B in shared memory per iteration.
//
// Launch requirements:
//   - 2D square thread block (blockDim.x == blockDim.y) with side <= N,
//     because the tiles are N x N and are indexed directly by threadIdx.
//   - One thread per output element; threads mapped outside the n x n output
//     still help load tiles (as zero padding) but write nothing.
// Shared memory: two static N x N int tiles per block.
// A, B, C: device pointers to n * n ints each.
//
// A launch that violates the block-shape requirement returns without touching
// C instead of indexing outside the shared tiles.
__global__ void matrixMulShared(const int *A, const int *B, int *C, int n) {
    __shared__ int tileA[N][N];
    __shared__ int tileB[N][N];

    // The condition is identical for every thread of the launch, so either all
    // threads leave here or none do; no thread is left waiting at a barrier.
    if (blockDim.x != blockDim.y || blockDim.x > static_cast<unsigned int>(N)) {
        return;
    }

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    int result = 0;

    // Walk the shared dimension one tile (blockDim.x columns of A / rows of B)
    // at a time.
    for (int t = 0; t < n; t += blockDim.x) {
        // Each thread loads one element of each tile; positions that fall
        // outside the matrices are filled with 0 so they add nothing below.
        tileA[threadIdx.y][threadIdx.x] = (row < n && t + threadIdx.x < n) ? A[row * n + t + threadIdx.x] : 0;
        tileB[threadIdx.y][threadIdx.x] = (col < n && t + threadIdx.y < n) ? B[(t + threadIdx.y) * n + col] : 0;

        // Both tiles must be completely written before any thread reads them.
        __syncthreads();

        // Accumulate this tile's contribution to the dot product.
        for (int k = 0; k < blockDim.x; ++k) {
            result += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        }

        // Keep the next iteration's loads from overwriting tiles that other
        // threads are still reading.
        __syncthreads();
    }

    // Only threads that map to a real output element store a result.
    if (row < n && col < n) {
        C[row * n + col] = result;
    }
}

// Straightforward matrix multiply C = A * B for square, row-major n x n int
// matrices, reading A and B directly through the device pointers.
//
// Launch layout: 2D grid of 2D blocks, one thread per output element. Threads
// whose (row, col) falls outside the n x n output do nothing.
// A, B, C: device pointers to n * n ints each.
__global__ void matrixMulGlobal(const int *A, const int *B, int *C, int n) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads past the matrix edge have no output element to compute.
    if (row >= n || col >= n) {
        return;
    }

    // Dot product of row `row` of A with column `col` of B.
    const int rowBase = row * n;
    int acc = 0;
    for (int k = 0; k < n; ++k) {
        acc += A[rowBase + k] * B[k * n + col];
    }

    C[rowBase + col] = acc;
}

// Host reference implementation: C = A * B for square, row-major n x n int
// matrices. Every element of C is overwritten.
void matrixMulCPU(const int *A, const int *B, int *C, int n) {
    for (int r = 0; r < n; ++r) {
        const int rowBase = r * n;
        for (int c = 0; c < n; ++c) {
            // Dot product of row r of A with column c of B.
            int dot = 0;
            int k = 0;
            while (k < n) {
                dot += A[rowBase + k] * B[k * n + c];
                ++k;
            }
            C[rowBase + c] = dot;
        }
    }
}

// Writes a row-major rows x cols int matrix to std::cout: every value is
// right-aligned in a 5-character field and followed by a space, every row ends
// with a newline, and one blank line is emitted after the last row.
void printMatrix(const int *matrix, int rows, int cols) {
    int flat = 0;  // running row-major index, equal to r * cols + c
    for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c, ++flat) {
            std::cout << std::setw(5) << matrix[flat] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Multiplies two fixed N x N matrices with the shared-memory kernel, the
// global-memory kernel and the CPU reference, prints wall-clock timings and all
// matrices, and reports whether each GPU result equals the CPU result.
// Returns 0 on success and 1 if any CUDA call or kernel launch fails.
int main() {
    const int A[N][N] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
    const int B[N][N] = {{9, 8, 7}, {6, 5, 4}, {3, 2, 1}};

    int C_cpu[N][N];
    int C_gpu_shared[N][N];
    int C_gpu_global[N][N];

    const auto matrixBytes = N * N * sizeof(int);

    int *d_A = nullptr;
    int *d_B = nullptr;
    int *d_C_shared = nullptr;
    int *d_C_global = nullptr;

    // Releases every device buffer. Pointers that were never allocated are
    // still nullptr, which cudaFree accepts, so this is safe on any exit path.
    auto releaseDeviceBuffers = [&]() {
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C_shared);
        cudaFree(d_C_global);
    };

    const bool allocated =
        cudaSucceeded(cudaMalloc((void **)&d_A, matrixBytes), "cudaMalloc d_A") &&
        cudaSucceeded(cudaMalloc((void **)&d_B, matrixBytes), "cudaMalloc d_B") &&
        cudaSucceeded(cudaMalloc((void **)&d_C_shared, matrixBytes), "cudaMalloc d_C_shared") &&
        cudaSucceeded(cudaMalloc((void **)&d_C_global, matrixBytes), "cudaMalloc d_C_global");
    if (!allocated) {
        releaseDeviceBuffers();
        return 1;
    }

    const bool uploaded =
        cudaSucceeded(cudaMemcpy(d_A, &A[0][0], matrixBytes, cudaMemcpyHostToDevice), "copy A to device") &&
        cudaSucceeded(cudaMemcpy(d_B, &B[0][0], matrixBytes, cudaMemcpyHostToDevice), "copy B to device");
    if (!uploaded) {
        releaseDeviceBuffers();
        return 1;
    }

    // One N x N block covers the whole matrix; matrixMulShared needs a square
    // block no larger than its N x N tiles.
    dim3 threadsPerBlock(N, N);
    dim3 blocksPerGrid(1, 1);

    // Measure GPU execution time for shared memory.
    // NOTE(review): the first timed launch may also include one-time CUDA
    // startup cost; confirm before comparing the two GPU timings directly.
    auto start_gpu_shared = std::chrono::high_resolution_clock::now();
    matrixMulShared<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C_shared, N);
    const cudaError_t sharedLaunch = cudaGetLastError();     // bad launch configuration
    const cudaError_t sharedSync = cudaDeviceSynchronize();  // wait for the GPU; reports execution faults
    auto end_gpu_shared = std::chrono::high_resolution_clock::now();
    if (!cudaSucceeded(sharedLaunch, "matrixMulShared launch") ||
        !cudaSucceeded(sharedSync, "matrixMulShared execution")) {
        releaseDeviceBuffers();
        return 1;
    }
    std::chrono::duration<double> gpu_shared_duration = end_gpu_shared - start_gpu_shared;
    std::cout << "GPU (Shared Memory) Execution Time: " << gpu_shared_duration.count() << " seconds" << std::endl;

    // Measure GPU execution time for global memory
    auto start_gpu_global = std::chrono::high_resolution_clock::now();
    matrixMulGlobal<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C_global, N);
    const cudaError_t globalLaunch = cudaGetLastError();
    const cudaError_t globalSync = cudaDeviceSynchronize();
    auto end_gpu_global = std::chrono::high_resolution_clock::now();
    if (!cudaSucceeded(globalLaunch, "matrixMulGlobal launch") ||
        !cudaSucceeded(globalSync, "matrixMulGlobal execution")) {
        releaseDeviceBuffers();
        return 1;
    }
    std::chrono::duration<double> gpu_global_duration = end_gpu_global - start_gpu_global;
    std::cout << "GPU (Global Memory) Execution Time: " << gpu_global_duration.count() << " seconds" << std::endl;

    // Copy results from GPU to host; without this check a failed copy would
    // leave the host arrays uninitialised and still get printed below.
    const bool downloaded =
        cudaSucceeded(cudaMemcpy(&C_gpu_shared[0][0], d_C_shared, matrixBytes, cudaMemcpyDeviceToHost),
                      "copy shared-memory result to host") &&
        cudaSucceeded(cudaMemcpy(&C_gpu_global[0][0], d_C_global, matrixBytes, cudaMemcpyDeviceToHost),
                      "copy global-memory result to host");
    if (!downloaded) {
        releaseDeviceBuffers();
        return 1;
    }

    // Measure CPU execution time
    auto start_cpu = std::chrono::high_resolution_clock::now();
    matrixMulCPU(&A[0][0], &B[0][0], &C_cpu[0][0], N);
    auto end_cpu = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> cpu_duration = end_cpu - start_cpu;
    std::cout << "CPU Execution Time: " << cpu_duration.count() << " seconds" << std::endl;

    // Print matrices A, B, and C_cpu (result from CPU)
    std::cout << "Matrix A:" << std::endl;
    printMatrix(&A[0][0], N, N);

    std::cout << "Matrix B:" << std::endl;
    printMatrix(&B[0][0], N, N);

    std::cout << "Result from CPU (C_cpu):" << std::endl;
    printMatrix(&C_cpu[0][0], N, N);

    // Print the result from the GPU (C_gpu_shared)
    std::cout << "Result from GPU (Shared Memory) (C_gpu_shared):" << std::endl;
    printMatrix(&C_gpu_shared[0][0], N, N);

    // Print the result from the GPU (C_gpu_global)
    std::cout << "Result from GPU (Global Memory) (C_gpu_global):" << std::endl;
    printMatrix(&C_gpu_global[0][0], N, N);

    // Compare both GPU results element by element against the CPU reference.
    bool resultMatch_shared = true;
    bool resultMatch_global = true;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            if (C_cpu[i][j] != C_gpu_shared[i][j]) {
                resultMatch_shared = false;
            }
            if (C_cpu[i][j] != C_gpu_global[i][j]) {
                resultMatch_global = false;
            }
        }
    }

    if (resultMatch_shared) {
        std::cout << "Results match between CPU and GPU (Shared Memory) implementations." << std::endl;
    } else {
        std::cout << "Results do not match between CPU and GPU (Shared Memory) implementations." << std::endl;
    }

    if (resultMatch_global) {
        std::cout << "Results match between CPU and GPU (Global Memory) implementations." << std::endl;
    } else {
        std::cout << "Results do not match between CPU and GPU (Global Memory) implementations." << std::endl;
    }

    // Free device memory
    releaseDeviceBuffers();

    return 0;
}


GPU (Shared Memory) Execution Time: 0.127965 seconds
GPU (Global Memory) Execution Time: 4.1126e-05 seconds
CPU Execution Time: 5.98e-07 seconds
Matrix A:
    1     2     3 
    4     5     6 
    7     8     9 

Matrix B:
    9     8     7 
    6     5     4 
    3     2     1 

Result from CPU (C_cpu):
   30    24    18 
   84    69    54 
  138   114    90 

Result from GPU (Shared Memory) (C_gpu_shared):
   30    24    18 
   84    69    54 
  138   114    90 

Result from GPU (Global Memory) (C_gpu_global):
   30    24    18 
   84    69    54 
  138   114    90 

Results match between CPU and GPU (Shared Memory) implementations.
Results match between CPU and GPU (Global Memory) implementations.



# **TEST CASE 2**

In [6]:
%%cu
#include <iostream>
#include <iomanip>
#include <cuda_runtime.h>
#include <chrono>

// Matrix dimension used by the host code. It also fixes the size of the
// shared-memory tiles declared inside matrixMulShared.
const int N = 3;

// Tiled matrix multiply C = A * B for square, row-major n x n int matrices,
// staging one tile of A and one tile of B in shared memory per iteration.
//
// Launch requirements:
//   - 2D square thread block (blockDim.x == blockDim.y) with side <= N,
//     because the tiles are N x N and are indexed directly by threadIdx.
//   - One thread per output element; threads mapped outside the n x n output
//     still help load tiles (as zero padding) but write nothing.
// Shared memory: two static N x N int tiles per block.
// A, B, C: device pointers to n * n ints each.
//
// A launch that violates the block-shape requirement returns without touching
// C instead of indexing outside the shared tiles.
__global__ void matrixMulShared(const int *A, const int *B, int *C, int n) {
    __shared__ int tileA[N][N];
    __shared__ int tileB[N][N];

    // The condition is identical for every thread of the launch, so either all
    // threads leave here or none do; no thread is left waiting at a barrier.
    if (blockDim.x != blockDim.y || blockDim.x > static_cast<unsigned int>(N)) {
        return;
    }

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    int result = 0;

    // Walk the shared dimension one tile (blockDim.x columns of A / rows of B)
    // at a time.
    for (int t = 0; t < n; t += blockDim.x) {
        // Each thread loads one element of each tile; positions that fall
        // outside the matrices are filled with 0 so they add nothing below.
        tileA[threadIdx.y][threadIdx.x] = (row < n && t + threadIdx.x < n) ? A[row * n + t + threadIdx.x] : 0;
        tileB[threadIdx.y][threadIdx.x] = (col < n && t + threadIdx.y < n) ? B[(t + threadIdx.y) * n + col] : 0;

        // Both tiles must be completely written before any thread reads them.
        __syncthreads();

        // Accumulate this tile's contribution to the dot product.
        for (int k = 0; k < blockDim.x; ++k) {
            result += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        }

        // Keep the next iteration's loads from overwriting tiles that other
        // threads are still reading.
        __syncthreads();
    }

    // Only threads that map to a real output element store a result.
    if (row < n && col < n) {
        C[row * n + col] = result;
    }
}

// Straightforward matrix multiply C = A * B for square, row-major n x n int
// matrices, reading A and B directly through the device pointers.
//
// Launch layout: 2D grid of 2D blocks, one thread per output element. Threads
// whose (row, col) falls outside the n x n output do nothing.
// A, B, C: device pointers to n * n ints each.
__global__ void matrixMulGlobal(const int *A, const int *B, int *C, int n) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads past the matrix edge have no output element to compute.
    if (row >= n || col >= n) {
        return;
    }

    // Dot product of row `row` of A with column `col` of B.
    const int rowBase = row * n;
    int acc = 0;
    for (int k = 0; k < n; ++k) {
        acc += A[rowBase + k] * B[k * n + col];
    }

    C[rowBase + col] = acc;
}

// Host reference implementation: C = A * B for square, row-major n x n int
// matrices. Every element of C is overwritten.
void matrixMulCPU(const int *A, const int *B, int *C, int n) {
    for (int r = 0; r < n; ++r) {
        const int rowBase = r * n;
        for (int c = 0; c < n; ++c) {
            // Dot product of row r of A with column c of B.
            int dot = 0;
            int k = 0;
            while (k < n) {
                dot += A[rowBase + k] * B[k * n + c];
                ++k;
            }
            C[rowBase + c] = dot;
        }
    }
}

// Writes a row-major rows x cols int matrix to std::cout: every value is
// right-aligned in a 5-character field and followed by a space, every row ends
// with a newline, and one blank line is emitted after the last row.
void printMatrix(const int *matrix, int rows, int cols) {
    int flat = 0;  // running row-major index, equal to r * cols + c
    for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c, ++flat) {
            std::cout << std::setw(5) << matrix[flat] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Multiplies two fixed N x N matrices with the shared-memory kernel, the
// global-memory kernel and the CPU reference, prints wall-clock timings and all
// matrices, and reports whether each GPU result equals the CPU result.
// Returns 0 on success and 1 if any CUDA call or kernel launch fails.
int main() {
    const int A[N][N] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
    const int B[N][N] = {{9, 15, 12}, {52, -52, 15}, {-21, 52, 21}};

    int C_cpu[N][N];
    int C_gpu_shared[N][N];
    int C_gpu_global[N][N];

    const auto matrixBytes = N * N * sizeof(int);

    int *d_A = nullptr;
    int *d_B = nullptr;
    int *d_C_shared = nullptr;
    int *d_C_global = nullptr;

    // Releases every device buffer. Pointers that were never allocated are
    // still nullptr, which cudaFree accepts, so this is safe on any exit path.
    auto releaseDeviceBuffers = [&]() {
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C_shared);
        cudaFree(d_C_global);
    };

    const bool allocated =
        cudaSucceeded(cudaMalloc((void **)&d_A, matrixBytes), "cudaMalloc d_A") &&
        cudaSucceeded(cudaMalloc((void **)&d_B, matrixBytes), "cudaMalloc d_B") &&
        cudaSucceeded(cudaMalloc((void **)&d_C_shared, matrixBytes), "cudaMalloc d_C_shared") &&
        cudaSucceeded(cudaMalloc((void **)&d_C_global, matrixBytes), "cudaMalloc d_C_global");
    if (!allocated) {
        releaseDeviceBuffers();
        return 1;
    }

    const bool uploaded =
        cudaSucceeded(cudaMemcpy(d_A, &A[0][0], matrixBytes, cudaMemcpyHostToDevice), "copy A to device") &&
        cudaSucceeded(cudaMemcpy(d_B, &B[0][0], matrixBytes, cudaMemcpyHostToDevice), "copy B to device");
    if (!uploaded) {
        releaseDeviceBuffers();
        return 1;
    }

    // One N x N block covers the whole matrix; matrixMulShared needs a square
    // block no larger than its N x N tiles.
    dim3 threadsPerBlock(N, N);
    dim3 blocksPerGrid(1, 1);

    // Measure GPU execution time for shared memory.
    // NOTE(review): the first timed launch may also include one-time CUDA
    // startup cost; confirm before comparing the two GPU timings directly.
    auto start_gpu_shared = std::chrono::high_resolution_clock::now();
    matrixMulShared<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C_shared, N);
    const cudaError_t sharedLaunch = cudaGetLastError();     // bad launch configuration
    const cudaError_t sharedSync = cudaDeviceSynchronize();  // wait for the GPU; reports execution faults
    auto end_gpu_shared = std::chrono::high_resolution_clock::now();
    if (!cudaSucceeded(sharedLaunch, "matrixMulShared launch") ||
        !cudaSucceeded(sharedSync, "matrixMulShared execution")) {
        releaseDeviceBuffers();
        return 1;
    }
    std::chrono::duration<double> gpu_shared_duration = end_gpu_shared - start_gpu_shared;
    std::cout << "GPU (Shared Memory) Execution Time: " << gpu_shared_duration.count() << " seconds" << std::endl;

    // Measure GPU execution time for global memory
    auto start_gpu_global = std::chrono::high_resolution_clock::now();
    matrixMulGlobal<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C_global, N);
    const cudaError_t globalLaunch = cudaGetLastError();
    const cudaError_t globalSync = cudaDeviceSynchronize();
    auto end_gpu_global = std::chrono::high_resolution_clock::now();
    if (!cudaSucceeded(globalLaunch, "matrixMulGlobal launch") ||
        !cudaSucceeded(globalSync, "matrixMulGlobal execution")) {
        releaseDeviceBuffers();
        return 1;
    }
    std::chrono::duration<double> gpu_global_duration = end_gpu_global - start_gpu_global;
    std::cout << "GPU (Global Memory) Execution Time: " << gpu_global_duration.count() << " seconds" << std::endl;

    // Copy results from GPU to host; without this check a failed copy would
    // leave the host arrays uninitialised and still get printed below.
    const bool downloaded =
        cudaSucceeded(cudaMemcpy(&C_gpu_shared[0][0], d_C_shared, matrixBytes, cudaMemcpyDeviceToHost),
                      "copy shared-memory result to host") &&
        cudaSucceeded(cudaMemcpy(&C_gpu_global[0][0], d_C_global, matrixBytes, cudaMemcpyDeviceToHost),
                      "copy global-memory result to host");
    if (!downloaded) {
        releaseDeviceBuffers();
        return 1;
    }

    // Measure CPU execution time
    auto start_cpu = std::chrono::high_resolution_clock::now();
    matrixMulCPU(&A[0][0], &B[0][0], &C_cpu[0][0], N);
    auto end_cpu = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> cpu_duration = end_cpu - start_cpu;
    std::cout << "CPU Execution Time: " << cpu_duration.count() << " seconds" << std::endl;

    // Print matrices A, B, and C_cpu (result from CPU)
    std::cout << "Matrix A:" << std::endl;
    printMatrix(&A[0][0], N, N);

    std::cout << "Matrix B:" << std::endl;
    printMatrix(&B[0][0], N, N);

    std::cout << "Result from CPU (C_cpu):" << std::endl;
    printMatrix(&C_cpu[0][0], N, N);

    // Print the result from the GPU (C_gpu_shared)
    std::cout << "Result from GPU (Shared Memory) (C_gpu_shared):" << std::endl;
    printMatrix(&C_gpu_shared[0][0], N, N);

    // Print the result from the GPU (C_gpu_global)
    std::cout << "Result from GPU (Global Memory) (C_gpu_global):" << std::endl;
    printMatrix(&C_gpu_global[0][0], N, N);

    // Compare both GPU results element by element against the CPU reference.
    bool resultMatch_shared = true;
    bool resultMatch_global = true;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            if (C_cpu[i][j] != C_gpu_shared[i][j]) {
                resultMatch_shared = false;
            }
            if (C_cpu[i][j] != C_gpu_global[i][j]) {
                resultMatch_global = false;
            }
        }
    }

    if (resultMatch_shared) {
        std::cout << "Results match between CPU and GPU (Shared Memory) implementations." << std::endl;
    } else {
        std::cout << "Results do not match between CPU and GPU (Shared Memory) implementations." << std::endl;
    }

    if (resultMatch_global) {
        std::cout << "Results match between CPU and GPU (Global Memory) implementations." << std::endl;
    } else {
        std::cout << "Results do not match between CPU and GPU (Global Memory) implementations." << std::endl;
    }

    // Free device memory
    releaseDeviceBuffers();

    return 0;
}


GPU (Shared Memory) Execution Time: 0.000233113 seconds
GPU (Global Memory) Execution Time: 3.6068e-05 seconds
CPU Execution Time: 5.52e-07 seconds
Matrix A:
    1     2     3 
    4     5     6 
    7     8     9 

Matrix B:
    9    15    12 
   52   -52    15 
  -21    52    21 

Result from CPU (C_cpu):
   50    67   105 
  170   112   249 
  290   157   393 

Result from GPU (Shared Memory) (C_gpu_shared):
   50    67   105 
  170   112   249 
  290   157   393 

Result from GPU (Global Memory) (C_gpu_global):
   50    67   105 
  170   112   249 
  290   157   393 

Results match between CPU and GPU (Shared Memory) implementations.
Results match between CPU and GPU (Global Memory) implementations.

