In [23]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


A memory-efficient matrix-transpose kernel: each tile is staged in shared memory so that both the global-memory reads and the global-memory writes are coalesced.

In [24]:
%%writefile trans.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16  // Optimized for shared memory access

// Tiled matrix transpose through shared memory.
//
// d_in  : row-major input,  `height` rows x `width` columns.
// d_out : row-major output, `width` rows x `height` columns, with
//         d_out[c * height + r] == d_in[r * width + c].
//
// Launch contract: the block must be exactly BLOCK_SIZE x BLOCK_SIZE threads
// (indices are derived from BLOCK_SIZE, not blockDim), and the grid must cover
// the input with ceil(width / BLOCK_SIZE) x ceil(height / BLOCK_SIZE) blocks.
// Partial edge tiles are handled by the bounds checks below.
__global__ void matrixTransposeShared(float *d_in, float *d_out, int width, int height) {
    // The extra padding column shifts each tile row by one bank so that the
    // column-wise read in the write-back phase avoids shared memory bank conflicts.
    __shared__ float tile[BLOCK_SIZE][BLOCK_SIZE + 1];

    // Coordinates of the input element this thread loads.
    const int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    const int y = blockIdx.y * BLOCK_SIZE + threadIdx.y;

    // Load phase: consecutive threadIdx.x values read consecutive input addresses.
    // The flat index is computed in size_t so that matrices with more than
    // 2^31 elements do not overflow 32-bit int arithmetic.
    if (x < width && y < height) {
        tile[threadIdx.y][threadIdx.x] = d_in[(size_t)y * width + x];
    }

    // Placed outside the bounds check so every thread in the block reaches it.
    __syncthreads();

    // Store phase: the block indices are swapped so this block writes the
    // transposed tile, while threadIdx.x still walks consecutive output addresses.
    const int transposed_x = blockIdx.y * BLOCK_SIZE + threadIdx.x;  // output column (< height)
    const int transposed_y = blockIdx.x * BLOCK_SIZE + threadIdx.y;  // output row    (< width)

    // This guard passes exactly for the tile entries that were written in the
    // load phase, so no uninitialised shared memory is ever read.
    if (transposed_x < height && transposed_y < width) {
        d_out[(size_t)transposed_y * height + transposed_x] = tile[threadIdx.x][threadIdx.y];
    }
}

// Transposes a row-major host matrix on the GPU.
//
// h_in   : host input,  `height` rows x `width` columns.
// h_out  : host output, `width` rows x `height` columns; must not overlap h_in
//          in a way that matters to the caller (it is written only after the
//          kernel has finished).
// width  : number of columns of the input matrix.
// height : number of rows of the input matrix.
//
// The function allocates temporary device buffers, copies the input over,
// launches matrixTransposeShared and copies the result back. If any CUDA call
// fails, a diagnostic is printed to stderr, the device buffers are released
// and h_out is left unmodified. Non-positive dimensions are treated as an
// empty matrix and the function returns without touching the GPU.
void matrixTransposeHost(float *h_in, float *h_out, int width, int height) {
    if (width <= 0 || height <= 0) {
        return;  // nothing to transpose; also avoids launching a zero-sized grid
    }

    float *d_in = NULL;
    float *d_out = NULL;
    // Widen before multiplying: `width * height` in int overflows for large matrices.
    const size_t size = (size_t)width * (size_t)height * sizeof(float);

    // Each step runs only if every previous step succeeded; `err` keeps the
    // first failure so it can be reported once below.
    cudaError_t err = cudaMalloc((void **)&d_in, size);
    if (err == cudaSuccess) {
        err = cudaMalloc((void **)&d_out, size);
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        const dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
        const dim3 blocks((width + BLOCK_SIZE - 1) / BLOCK_SIZE,
                          (height + BLOCK_SIZE - 1) / BLOCK_SIZE);

        matrixTransposeShared<<<blocks, threads>>>(d_in, d_out, width, height);
        // Kernel launches do not return a status; launch-configuration errors
        // (e.g. a grid that is too large) are only visible through this call.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Blocking copy: waits for the kernel and surfaces any execution error.
        err = cudaMemcpy(h_out, d_out, size, cudaMemcpyDeviceToHost);
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "matrixTransposeHost: CUDA error: %s\n", cudaGetErrorString(err));
    }

    // cudaFree accepts a null pointer, so cleanup is unconditional.
    cudaFree(d_in);
    cudaFree(d_out);
}
// Prints the top-left corner (at most 8 x 8 entries) of a row-major matrix.
//
// matrix : pointer to rows * cols floats stored row by row.
// rows   : number of rows in the matrix.
// cols   : number of columns in the matrix (also the row stride).
//
// Matrices smaller than 8 in either dimension are printed in full rather than
// being read out of bounds.
void printMatrix(float *matrix, int rows, int cols) {
    const int maxShown = 8;
    const int shownRows = rows < maxShown ? rows : maxShown;
    const int shownCols = cols < maxShown ? cols : maxShown;

    for (int i = 0; i < shownRows; i++) {
        for (int j = 0; j < shownCols; j++) {
            printf("%4.1f ", matrix[(size_t)i * cols + j]);
        }
        printf("\n");
    }
}

// Demo driver: fills a 1024 x 1024 matrix with its own flat indices,
// transposes it on the GPU and prints the top-left corner of the input and of
// the result. Returns 0 on success and 1 if host memory cannot be allocated.
int main() {
    const int width = 1024, height = 1024;
    const size_t count = (size_t)width * (size_t)height;

    float *h_in = (float *)malloc(count * sizeof(float));
    // Zero-initialised so that the printout below never reads indeterminate
    // memory, even if the GPU transpose did not write the buffer.
    float *h_out = (float *)calloc(count, sizeof(float));
    if (h_in == NULL || h_out == NULL) {
        fprintf(stderr, "Failed to allocate host memory\n");
        free(h_in);   // free(NULL) is a no-op
        free(h_out);
        return 1;
    }

    // Element (i, j) holds its row-major index, which makes the transpose easy
    // to verify by eye. All values are below 2^24 and therefore exact in float.
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
            h_in[(size_t)i * width + j] = (float)(i * width + j);
        }
    }

    printf("Original Matrix:\n");
    printMatrix(h_in, height, width);

    matrixTransposeHost(h_in, h_out, width, height);

    // The transposed matrix has `width` rows and `height` columns.
    printf("\nTransposed Matrix:\n");
    printMatrix(h_out, width, height);

    free(h_in);
    free(h_out);

    return 0;
}


Overwriting trans.cu


In [25]:
!nvcc -gencode=arch=compute_75,code=sm_75 -o trans trans.cu

In [26]:
!nvprof ./trans

Original Matrix:
 0.0  1.0  2.0  3.0  4.0  5.0  6.0  7.0 
1024.0 1025.0 1026.0 1027.0 1028.0 1029.0 1030.0 1031.0 
2048.0 2049.0 2050.0 2051.0 2052.0 2053.0 2054.0 2055.0 
3072.0 3073.0 3074.0 3075.0 3076.0 3077.0 3078.0 3079.0 
4096.0 4097.0 4098.0 4099.0 4100.0 4101.0 4102.0 4103.0 
5120.0 5121.0 5122.0 5123.0 5124.0 5125.0 5126.0 5127.0 
6144.0 6145.0 6146.0 6147.0 6148.0 6149.0 6150.0 6151.0 
7168.0 7169.0 7170.0 7171.0 7172.0 7173.0 7174.0 7175.0 
==3321== NVPROF is profiling process 3321, command: ./trans

Transposed Matrix:
 0.0 1024.0 2048.0 3072.0 4096.0 5120.0 6144.0 7168.0 
 1.0 1025.0 2049.0 3073.0 4097.0 5121.0 6145.0 7169.0 
 2.0 1026.0 2050.0 3074.0 4098.0 5122.0 6146.0 7170.0 
 3.0 1027.0 2051.0 3075.0 4099.0 5123.0 6147.0 7171.0 
 4.0 1028.0 2052.0 3076.0 4100.0 5124.0 6148.0 7172.0 
 5.0 1029.0 2053.0 3077.0 4101.0 5125.0 6149.0 7173.0 
 6.0 1030.0 2054.0 3078.0 4102.0 5126.0 6150.0 7174.0 
 7.0 1031.0 2055.0 3079.0 4103.0 5127.0 6151.0 7175.0 
==3321== Profiling appl