In [1]:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Kaggle". Running its setup...
Updating the package lists...
Installing nvidia-cuda-toolkit, this may take a few minutes...
Source files will be saved in "/tmp/tmpeg1fzzos".


In [10]:
%%writefile StrassenCUDAcppImplementation.cu
#include <vector>
#include <iostream>

// Strassen's algorithm for a single 2x2 tile of integer matrices.
//
// Multiplies the 2x2 tile of `mat1` whose top-left element is at
// (rowsIdx, colsIdx) with the 2x2 tile of `mat2` at the same position and
// stores the 2x2 product into the same tile position of `result`.
//
// Layout: all three buffers are indexed column-major, i.e. element
// (r, c) lives at offset `r + c * rows`.
//
// Parameters:
//   mat1, mat2 - device pointers to the input matrices (rows x cols ints).
//   result     - device pointer to the output matrix (rows x cols ints).
//   rowsIdx    - row of the tile's top-left element.
//   colsIdx    - column of the tile's top-left element.
//   rows, cols - matrix dimensions; `rows` is also the column stride.
//
// Launch: the kernel does not use threadIdx/blockIdx, so every launched
// thread performs the same work; launch it as <<<1,1>>>.
//
// If the requested tile does not fit inside the rows x cols matrix the
// kernel returns without touching memory.
__global__ void strassen(const int* mat1, const int* mat2, int* result, int rowsIdx, int colsIdx, int rows, int cols) {
    // Reject tiles that would read or write outside the matrices.
    if (rowsIdx < 0 || colsIdx < 0 || rowsIdx + 1 >= rows || colsIdx + 1 >= cols) {
        return;
    }

    // Column-major offsets of the four tile elements. Computed in size_t so
    // that colsIdx * rows cannot overflow int for large matrices.
    const size_t stride = static_cast<size_t>(rows);
    const size_t i11 = static_cast<size_t>(rowsIdx) + static_cast<size_t>(colsIdx) * stride;
    const size_t i21 = i11 + 1;       // one row down
    const size_t i12 = i11 + stride;  // one column right
    const size_t i22 = i12 + 1;       // one row down, one column right

    // Load each operand once instead of re-reading global memory per product.
    const int A_11 = mat1[i11], A_12 = mat1[i12], A_21 = mat1[i21], A_22 = mat1[i22];
    const int B_11 = mat2[i11], B_12 = mat2[i12], B_21 = mat2[i21], B_22 = mat2[i22];

    // The seven Strassen products.
    const int M_1 = (A_11 + A_22) * (B_11 + B_22);
    const int M_2 = (A_21 + A_22) * B_11;
    const int M_3 = A_11 * (B_12 - B_22);
    const int M_4 = A_22 * (B_21 - B_11);
    const int M_5 = (A_11 + A_12) * B_22;
    const int M_6 = (A_21 - A_11) * (B_11 + B_12);
    const int M_7 = (A_12 - A_22) * (B_21 + B_22);

    // Recombine into the product tile, written with the same column-major
    // offsets used for the inputs so the tile lands at (rowsIdx, colsIdx).
    result[i11] = M_1 + M_4 - M_5 + M_7;  // C11
    result[i12] = M_3 + M_5;              // C12
    result[i21] = M_2 + M_4;              // C21
    result[i22] = M_1 - M_2 + M_3 + M_6;  // C22
}

//__global__ void printMatrix(std::vector<int> mat1, int rowsIdx, int colsIdx, int rows, int cols){
    //for(int i = rowsIdx; i<rows + colsIdx; i++){
       // for(int j = colsIdx; j<cols + colsIdx; j++){
       //     printf("%d ", mat1[j + i * rows]);
     //   }
   //     printf("\n");
 //   }
//    printf("\n");
//}
//


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << '\n';
    return false;
}

// Multiplies two 2x2 integer matrices on the GPU with the `strassen` kernel
// and prints the product, one matrix row per output line.
//
// Matrices are stored column-major (element (r, c) at index r + c * rows).
// Returns 0 on success and 1 if any CUDA call fails.
int main(){
    const int rows = 2;
    const int cols = 2;
    const auto bytes = sizeof(int) * rows * cols;
    const auto count = bytes / sizeof(int);

    // Host matrices. Each must hold rows * cols elements because that many
    // ints are copied to and from the device below; the trailing zeros fill
    // the second column of each operand.
    std::vector<int> mat1 = {1, 2, 0, 0};
    std::vector<int> mat2 = {1, 2, 0, 0};
    std::vector<int> result(count, 0);

    if (mat1.size() != count || mat2.size() != count) {
        std::cerr << "Input matrices must contain rows * cols elements\n";
        return 1;
    }

    int *d_mat1 = nullptr, *d_mat2 = nullptr, *d_result = nullptr;

    // Allocate device buffers and upload the operands; && short-circuits so
    // nothing runs after the first failure. The zero-filled result is
    // uploaded too so that elements no tile writes to are still defined.
    bool ok = cudaOk(cudaMalloc(&d_mat1, bytes), "cudaMalloc(d_mat1)")
           && cudaOk(cudaMalloc(&d_mat2, bytes), "cudaMalloc(d_mat2)")
           && cudaOk(cudaMalloc(&d_result, bytes), "cudaMalloc(d_result)")
           && cudaOk(cudaMemcpy(d_mat1, mat1.data(), bytes, cudaMemcpyHostToDevice), "copy mat1 to device")
           && cudaOk(cudaMemcpy(d_mat2, mat2.data(), bytes, cudaMemcpyHostToDevice), "copy mat2 to device")
           && cudaOk(cudaMemcpy(d_result, result.data(), bytes, cudaMemcpyHostToDevice), "copy result to device");

    // One launch per 2x2 tile. Each launch multiplies the tile of mat1 at
    // (i, j) with the tile of mat2 at the same (i, j); with rows = cols = 2
    // there is exactly one tile, so this yields the full matrix product.
    for(int i = 0; ok && i < rows; i += 2){
        for(int j = 0; ok && j < cols; j += 2){
            strassen<<<1,1>>>(d_mat1, d_mat2, d_result, i, j, rows, cols);
            // Kernel launches return no status; query it explicitly.
            ok = cudaOk(cudaGetLastError(), "strassen launch");
        }
    }

    // The blocking copy also surfaces any error raised while the kernel ran.
    ok = ok && cudaOk(cudaMemcpy(result.data(), d_result, bytes, cudaMemcpyDeviceToHost), "copy result to host");

    // Release device memory on every path (cudaFree(nullptr) is a no-op).
    const bool freed1 = cudaOk(cudaFree(d_mat1), "cudaFree(d_mat1)");
    const bool freed2 = cudaOk(cudaFree(d_mat2), "cudaFree(d_mat2)");
    const bool freed3 = cudaOk(cudaFree(d_result), "cudaFree(d_result)");

    if (!ok) {
        return 1;
    }

    for(int i = 0; i < rows; i += 1){
        for(int j = 0; j < cols; j += 1){
            std::cout << result[i + j*rows] <<' ';
        }
        std::cout << '\n';
    }

    return (freed1 && freed2 && freed3) ? 0 : 1;
}



Overwriting StrassenCUDAcppImplementation.cu


In [11]:
!nvcc StrassenCUDAcppImplementation.cu -o StrassenCUDAcppImplementation
!./StrassenCUDAcppImplementation

1 0 
2 0 


!nvcc StrassenCUDAcppImplementation.cu -o StrassenCUDAcppImplementation
!./StrassenCUDAcppImplementation