## CUDA Matrix Multiplication

This project performs matrix multiplication on CUDA using the SGEMM algorithm and compares it with the built-in cuBLAS implementation. Different versions of the code apply different levels of optimization techniques to show the difference in performance.

First, set up the experiment parameters.

In [1]:
import numpy as np
import struct
from numpy.random import *

# Problem dimensions shared by every cell below.
#   N - number of matrix pairs in the batch
#   H - rows of each left-hand matrix (and of each result)
#   W - columns of each right-hand matrix (and of each result)
#   K - inner dimension: columns of the left matrix / rows of the right matrix
N = H = W = K = 128

Generate the data for two different matrices and save them to files for later use.  
Note: the data must be in binary format, and the data type is 32-bit float.

In [56]:
# Generate random float32 test data (values between -1 and 1) and dump it to
# disk as raw, header-less binary. Each file is opened in a `with` block so the
# handle is closed even if the write raises.

# Matrix1: flat buffer holding N matrices of H x K elements.
Matrix1 = (rand(N * H * K) * 2 - 1.0).astype(np.float32)
with open("./data/Matrix1.bin", "wb") as des:
    cnt = des.write(Matrix1)

# Matrix2: flat buffer holding N matrices of K x W elements.
Matrix2 = (rand(N * K * W) * 2 - 1.0).astype(np.float32)
with open("./data/Matrix2.bin", "wb") as des:
    cnt = des.write(Matrix2)

# Matrix2Trans: every K x W slice of Matrix2 transposed to W x K, then
# flattened again (the final reshape yields a contiguous copy).
Matrix2Trans = Matrix2.reshape((N, K, W)).transpose((0, 2, 1)).reshape(-1)
with open("./data/Matrix2Trans.bin", "wb") as des:
    cnt = des.write(Matrix2Trans)

Read the data back in from the files.

In [2]:
# Read the two input matrices back from their raw float32 dumps.
# `with` guarantees each file handle is closed once its bytes have been read.

# Matrix1 -> M1 with shape (N, H, K)
with open("./data/Matrix1.bin", "rb") as src:
    context = src.read()
# struct.unpack raises struct.error if the file does not hold exactly N*H*K floats.
real_context = struct.unpack(str(N * H * K) + 'f', context)
# Build the array directly as float32 to avoid a temporary float64 copy.
M1 = np.array(real_context, dtype=np.float32).reshape((N, H, K))

# Matrix2 -> M2 with shape (N, K, W)
with open("./data/Matrix2.bin", "rb") as src:
    context = src.read()
real_context = struct.unpack(str(N * K * W) + 'f', context)
M2 = np.array(real_context, dtype=np.float32).reshape((N, K, W))

Execute the matrix multiplication in Python to get the reference result.

In [3]:
def MatrixMulBatched(a,b):
    """Multiply two stacks of matrices, one pair at a time.

    Parameters
    ----------
    a : array indexed as (batch, rows, inner)
    b : array indexed as (batch, inner, cols)

    Returns
    -------
    float32 array of shape (a.shape[0], a.shape[1], b.shape[2]) whose
    i-th slice is ``a[i] @ b[i]``.
    """
    # `inner` is not used below; reading shape[2] keeps the requirement that
    # `a` has at least three axes.
    batch, rows, inner = a.shape[0], a.shape[1], a.shape[2]
    cols = b.shape[2]

    result = np.zeros((batch, rows, cols), dtype=np.float32)
    for idx in range(batch):
        # Assignment into the float32 buffer casts each product to float32.
        result[idx] = np.matmul(a[idx], b[idx])
    return result

# Compute the NumPy reference product of the matrices loaded from disk;
# `Output` is the baseline the CUDA result file is compared against.
Output = MatrixMulBatched(M1,M2)
print(Output.shape)

(128, 128, 128)


Test the correctness of a specific output file by comparing it with the Python result.

In [40]:
# Load one CUDA result file and compare it with the NumPy reference `Output`.
# NOTE(review): the variable is called CublasOutput, but the file read here is
# MM_v4_Result.bin - confirm which implementation produced it.
with open("./data/MM_v4_Result.bin", "rb") as src:
    # Read exactly N*H*W native floats; calcsize('f') replaces the magic 4.
    context = src.read(struct.calcsize('f') * N * H * W)
# struct.unpack raises struct.error if the file is shorter than expected.
real_context = struct.unpack(str(N * H * W) + 'f', context)

# Kept as float64 so the sums below are accumulated in double precision.
real = np.array(real_context)
CublasOutput = real.reshape((N, H, W))

# Test correctness: the two totals should nearly agree, and the summed
# absolute element-wise error should be small.
print(np.sum(CublasOutput))
print(np.sum(Output))
err = np.abs(CublasOutput - Output)
print(np.sum(err))

292.91947444703646
292.9212
0.728965970139825


292.91947444703646
292.9212
0.728965970139825


In [22]:
# Print both shapes, then both arrays, one per line, for a visual comparison.
print(CublasOutput.shape, Output.shape, sep="\n")
print(CublasOutput, Output, sep="\n")

(128, 128, 128)
(128, 128, 128)
[[[-2.21413469e+00 -4.55426502e+00  4.55311251e+00 ...  1.79823145e-01
   -4.23783875e+00 -8.85536551e-01]
  [-1.06576252e+00  2.14336777e+00 -2.19085193e+00 ... -5.84645271e-01
    2.93994212e+00 -2.30688047e+00]
  [ 2.86819863e+00  2.03933215e+00  5.06874704e+00 ...  3.17762518e+00
   -9.99716759e-01 -6.95433676e-01]
  ...
  [ 7.44822407e+00  3.16363764e+00  2.38056850e+00 ...  3.30776811e+00
    1.62043440e+00 -2.73796058e+00]
  [ 1.46651304e+00 -1.85272932e+00  1.42630315e+00 ... -4.79458392e-01
   -2.37152958e+00 -5.13682318e+00]
  [-1.96589410e+00  5.11394501e+00 -5.23660421e+00 ... -2.40928483e+00
   -1.15519039e-01  3.95207548e+00]]

 [[ 5.28455114e+00 -2.26028967e+00  1.98828733e+00 ...  3.04576814e-01
   -2.64663792e+00 -9.64100838e+00]
  [ 4.90913534e+00  9.51410294e-01  9.70646679e-01 ... -4.83595461e-01
    1.06334805e+00 -4.31799233e-01]
  [ 4.96638441e+00  4.28658533e+00  4.84426451e+00 ... -1.92319334e+00
   -1.43423092e+00  1.11799133e+0