In [1]:
from numba import cuda, jit, float32
import numpy as np

In [2]:
# List the CUDA devices Numba can see; the value returned by the call is shown as the cell result.
cuda.detect()

Found 1 CUDA devices
id 0             b'Tesla T4'                              [SUPPORTED]
                      compute capability: 7.5
                           pci device id: 4
                              pci bus id: 0
Summary:
	1/1 devices are supported


True

In [3]:
# Small 20 x 20 random problem used to sanity-check the `indexing` kernel.
A = np.random.rand(20, 20)
x, b = np.random.rand(20), np.random.rand(20)
# Output buffer, one slot per row of A.
out = np.zeros(20)

In [4]:
# Threads per block along each axis; `indexing` below uses it as its shared-array length and row stride.
TPB = 10

@jit(nopython=True)
def inner_product_for_grad(x, y, b):
    """Return ``sum(x[k] * y[k]) - b``, i.e. one residual entry of ``A @ x - b``.

    x, y : indexable arrays; only the first ``x.size`` entries of ``y`` are read.
    b    : scalar subtracted from the accumulated dot product.
    """
    length = x.size
    acc = 0.

    # Plain sequential accumulation, element by element.
    for k in range(length):
        acc += x[k] * y[k]

    return acc - b

@cuda.jit
def indexing(A, x, b, out):
    """Write the residual ``A @ x - b`` into ``out``, one row per ``threadIdx.x``.

    Every block walks all row groups itself: the loop runs ``gridDim.x`` times
    and iteration ``j`` handles row ``threadIdx.x + j * TPB``.  Blocks (and
    threads that differ only in ``threadIdx.y``) therefore repeat the same work
    and store the same values.

    A   : 2-D array, one equation per row.
    x   : 1-D array multiplied against each row of A.
    b   : 1-D array, one entry per row of A.
    out : 1-D array receiving the residual.

    Each value is staged through a float32 shared slot before being stored, so
    ``out`` carries roughly single precision.
    """
    sB = cuda.shared.array(shape=(TPB), dtype=float32)

    tx = cuda.threadIdx.x
    bpg = cuda.gridDim.x

    # Rows that exist in both the input matrix and the output buffer.
    n_rows = min(A.shape[0], out.shape[0])

    # Only threads that own a slot in the shared array take part
    # (was a hard-coded 10 that silently duplicated TPB).
    if tx < TPB:
        for j in range(bpg):
            row = tx + j * TPB
            # Skip rows past the end when gridDim.x * TPB over-covers the data.
            if row < n_rows:
                sB[tx] = inner_product_for_grad(A[row, :], x, b[row])
                out[row] = sB[tx]

In [5]:
# Launch on a 2 x 2 grid of TPB x TPB blocks, then show the kernel result
# followed by the NumPy reference, separated by a blank line.
grid_dims, block_dims = (2, 2), (TPB, TPB)
indexing[grid_dims, block_dims](A, x, b, out)
print(out, end="\n\n")
print(A @ x - b)

[3.47429872 3.4129703  3.27829051 3.29575944 3.49402332 3.89749265
 2.40560675 4.14019489 4.48806572 4.25679398 2.85725975 3.26104164
 3.20081925 3.54018331 2.3461616  3.15463948 3.02659059 2.96312451
 2.85414481 3.55413032]

[3.47429866 3.41297041 3.2782904  3.29575954 3.49402336 3.89749268
 2.40560679 4.14019467 4.4880659  4.25679376 2.85725966 3.2610417
 3.20081927 3.54018342 2.3461616  3.15463938 3.02659063 2.96312442
 2.85414481 3.55413036]


In [6]:
# Blocks per grid axis and threads per block axis for the tiled gradient kernel.
BPG = TPB = 32

@cuda.jit
def gradient(A, x, b, out):
    """Compute the least-squares gradient ``out = A.T @ (A @ x - b)``.

    Launch with thread blocks of shape ``(TPB, TPB)``; ``gridDim.x`` must cover
    the columns of ``A`` in steps of ``TPB`` because block ``bx`` produces
    ``out[bx * TPB : (bx + 1) * TPB]``.  Extra blocks along the grid's y axis
    are accepted but do nothing.

    A   : 2-D array (rows x cols).
    x   : 1-D array of length cols.
    b   : 1-D array of length rows.
    out : 1-D array of length cols receiving the gradient.

    The tiles live in float32 shared memory, so the result agrees with a
    float64 NumPy reference only to about single precision.
    """
    sA = cuda.shared.array(shape=(TPB,TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB), dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x

    # Blocks that differ only in blockIdx.y would recompute and rewrite the
    # same outputs; the whole block leaves together, so no barrier is skipped.
    if cuda.blockIdx.y != 0:
        return

    n_rows = A.shape[0]
    n_cols = A.shape[1]
    # Tile count comes from the data, not from the launch configuration.
    n_tiles = (n_rows + TPB - 1) // TPB

    col = tx + bx * TPB  # output entry (column of A) owned by this thread

    tmp = 0.
    for j in range(n_tiles):
        # Stage one TPB x TPB tile of A.T, zero-padded past the matrix edge.
        row = ty + j * TPB
        if col < n_cols and row < n_rows:
            sA[tx, ty] = A[row, col]
        else:
            sA[tx, ty] = 0.

        # One thread per slot fills the residual tile (A @ x - b); previously
        # all TPB threads sharing a tx recomputed the same dot product.
        if ty == 0:
            res_row = tx + j * TPB
            if res_row < n_rows:
                sB[tx] = inner_product_for_grad(A[res_row, :], x, b[res_row])
            else:
                sB[tx] = 0.

        # Barriers are reached unconditionally by every thread of the block.
        cuda.syncthreads()

        for k in range(TPB):
            tmp += sA[tx, k] * sB[k]

        # Do not overwrite the tiles until every thread has finished reading.
        cuda.syncthreads()

    # All ty threads hold the same sum; a single in-range thread stores it.
    if ty == 0 and col < out.shape[0]:
        out[col] = tmp

In [7]:
# Square problem whose side is an exact multiple of the block edge.
n = BPG * TPB
A = np.random.rand(n, n)
b, x = np.random.rand(n), np.random.rand(n)
# Output buffer for the gradient, one entry per column of A.
out = np.zeros(n)

In [8]:
# Launch the tiled gradient kernel on a BPG x BPG grid of TPB x TPB blocks; the result lands in `out`.
gradient[(BPG,BPG),(TPB,TPB)](A, x, b, out)

In [9]:
# NumPy reference for the same quantity the kernel writes to `out`.
A.T@(A@x - b)

array([134288.82738812, 136136.69233882, 136106.71700398, ...,
       134881.82271574, 134138.37363532, 134733.90053899])

In [10]:
# Kernel result, displayed for a side-by-side look at the reference above.
out

array([134288.82765284, 136136.69284191, 136106.7169385 , ...,
       134881.82333575, 134138.37399321, 134733.90082151])

In [11]:
# Euclidean distance between the kernel result and the NumPy reference.
print(np.linalg.norm(out - (A.T@(A@x - b))))

0.007438289479668126


In [12]:
@cuda.jit
def optimizer(A, x, b, out):
    """Compute the least-squares gradient ``out = A.T @ (A @ x - b)``.

    Despite its name this kernel performs no parameter update: it only writes
    the gradient to ``out``; the caller applies the descent step.

    Launch with thread blocks of shape ``(TPB, TPB)``; ``gridDim.x`` must cover
    the columns of ``A`` in steps of ``TPB`` because block ``bx`` produces
    ``out[bx * TPB : (bx + 1) * TPB]``.  Extra blocks along the grid's y axis
    are accepted but do nothing.

    A   : 2-D array (rows x cols).
    x   : 1-D array of length cols.
    b   : 1-D array of length rows.
    out : 1-D array of length cols receiving the gradient.

    The tiles live in float32 shared memory, so the result agrees with a
    float64 NumPy reference only to about single precision.
    """
    sA = cuda.shared.array(shape=(TPB,TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB), dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x

    # Blocks that differ only in blockIdx.y would recompute and rewrite the
    # same outputs; the whole block leaves together, so no barrier is skipped.
    if cuda.blockIdx.y != 0:
        return

    n_rows = A.shape[0]
    n_cols = A.shape[1]
    # Tile count comes from the data, so sizes that are not a multiple of TPB
    # (e.g. 1000 with TPB = 32) no longer read or write past the arrays.
    n_tiles = (n_rows + TPB - 1) // TPB

    col = tx + bx * TPB  # output entry (column of A) owned by this thread

    tmp = 0.
    for j in range(n_tiles):
        # Stage one TPB x TPB tile of A.T, zero-padded past the matrix edge.
        row = ty + j * TPB
        if col < n_cols and row < n_rows:
            sA[tx, ty] = A[row, col]
        else:
            sA[tx, ty] = 0.

        # One thread per slot fills the residual tile (A @ x - b); previously
        # all TPB threads sharing a tx recomputed the same dot product.
        if ty == 0:
            res_row = tx + j * TPB
            if res_row < n_rows:
                sB[tx] = inner_product_for_grad(A[res_row, :], x, b[res_row])
            else:
                sB[tx] = 0.

        # Barriers are reached unconditionally by every thread of the block.
        cuda.syncthreads()

        for k in range(TPB):
            tmp += sA[tx, k] * sB[k]

        # Do not overwrite the tiles until every thread has finished reading.
        cuda.syncthreads()

    # All ty threads hold the same sum; a single in-range thread stores it.
    if ty == 0 and col < out.shape[0]:
        out[col] = tmp

In [13]:
# Host-side problem data: a 1000 x 1000 system and a zeroed gradient buffer.
A = np.random.rand(1000, 1000)
b = np.random.rand(1000)
x = np.random.rand(1000)
out = np.zeros(1000)

# Device copies of the four host arrays.
A_, b_, x_, out_ = (cuda.to_device(host) for host in (A, b, x, out))

# Step size, scaled down by the number of unknowns.
lr = 1e-3 / A.shape[1]

# Launch geometry: TPB threads per block axis, and enough blocks per grid
# axis to span the rows (ceiling division).
TPB = 32
BPG = -(-A.shape[0] // TPB)

In [14]:
# Host reference gradient, then the same computation on the GPU into `out_`.
grad = A.T @ (A @ x - b)
optimizer[(BPG,BPG),(TPB,TPB)](A_,x_,b_,out_)

In [15]:
# Bring the GPU gradient back to the host and measure its distance from the NumPy reference.
out = out_.copy_to_host()
np.linalg.norm(grad - out)

0.00533660276908258

In [16]:
# One gradient-descent step: `x` uses the NumPy gradient, `x_` uses the gradient copied back from the GPU.
# NOTE(review): `x_` was created with cuda.to_device, yet the next cell prints it like a NumPy array,
# so this augmented assignment appears to rebind `x_` to a host ndarray instead of updating device
# memory in place. Confirm before passing `x_` to further kernel launches.
x -= grad * lr
x_ -= out * lr

In [17]:
# First ten entries of each iterate with a blank line between them,
# followed by the distance between the two full vectors.
print(x[:10], x_[:10], sep="\n\n")
print(np.linalg.norm(x - x_))

[0.12342575 0.58319044 0.14282887 0.84894367 0.51489227 0.26059616
 0.26082328 0.03895291 0.58964951 0.10961069]

[0.12342575 0.58319044 0.14282887 0.84894367 0.51489227 0.26059616
 0.26082328 0.03895291 0.58964951 0.10961069]
5.3366027317270625e-09


In [18]:
%%time 
# CPU baseline: 500 NumPy evaluations of the gradient (x is not updated between iterations).
for i in range(500):
    grad = A.T @ (A @ x - b)

CPU times: user 706 ms, sys: 55.5 ms, total: 761 ms
Wall time: 402 ms


In [20]:
# Upload the host `out` array again so `out_` refers to a fresh device buffer.
out_ = cuda.to_device(out)

In [21]:
%%time
# GPU counterpart of the CPU timing loop: 500 gradient evaluations.
for i in range(500):
    optimizer[(BPG,BPG),(TPB,TPB)](A_,x_,b_,out_)
# Kernel launches are queued asynchronously; wait for all of them to finish so
# the surrounding timing measures execution rather than just launch overhead.
cuda.synchronize()

KeyboardInterrupt: ignored

In [None]:
## Using one GPU 
class LeastSquare():
    """Mini-batch gradient descent for ``min_x ||A x - b||^2`` using the `optimizer` kernel.

    Each epoch uploads a random batch of rows of ``A`` (with the matching
    entries of ``b``) and performs a number of descent steps
    ``x <- x - lr * grad`` where ``grad`` is whatever `optimizer` writes into
    its output buffer for that batch.

    Parameters
    ----------
    A : host 2-D array; row ``i`` is paired with ``b[i]``.
    b : host 1-D array.
    epoches : number of batches to draw in `run`.
    TPB : edge length of the square thread block used for launches.

    NOTE(review): `optimizer` sizes its shared tiles from the module-level
    ``TPB``, not from this argument; launching with a different block edge
    leaves part of each tile unwritten. Confirm the two agree.
    NOTE(review): the grid is derived from the full matrix shape while the
    kernel is launched on sampled batches. Confirm against the kernel's
    indexing.
    """

    def __init__(self, A, b, epoches=10, TPB=16):
        self.A = A
        self.b = b
        # Step size scaled down by the number of unknowns.
        self.lr = 1e-3/A.shape[1]
        self.epoches = epoches
        # Current iterate (device), random start.
        self.x = cuda.to_device(np.random.rand(A.shape[1]))
        # Final estimate (device); filled at the end of `run`.
        self.x_hat = cuda.device_array((A.shape[1]))
        # Errors recorded by `check`, in call order.
        self.error_list = []
        # Device buffer the kernel writes the gradient into.
        self.grad = cuda.device_array((A.shape[1]))

        ## About kernel, Configure the blocks
        self.threadsperblock = (TPB,TPB)
        blockspergrid_x = int(np.ceil(A.shape[0] / self.threadsperblock[1]))
        blockspergrid_y = int(np.ceil(A.shape[1] / self.threadsperblock[0]))
        self.blockspergrid = (blockspergrid_x, blockspergrid_y)

    def run(self):
        """Run `epoches` batches of descent and return the estimate as a device array."""
        for i in range(self.epoches):
            A, b = self.initialize()
            self.optimize(A, b, self.x)

        # Publish the final iterate; x_hat was previously returned without
        # ever having been written.
        self.x_hat.copy_to_device(self.x)

        return self.x_hat

    def initialize(self, batch_size=1000):
        """Sample `batch_size` row indices (with replacement) and upload that batch.

        Returns the device copies ``(A_batch, b_batch)``.
        """
        index = np.random.choice(self.A.shape[0], batch_size)
        A = cuda.to_device(self.A[index,:])
        b = cuda.to_device(self.b[index])

        return A, b

    def optimize(self, A, b, x, iters_per_epoch=500):
        """Apply `iters_per_epoch` descent steps to the device array `x` in place.

        A, b : device arrays holding the current batch.
        x    : device array holding the iterate; updated on the device.
        """
        # Host mirror of the iterate; the step itself is applied on the host
        # and pushed back, which keeps the kernel a pure gradient evaluation.
        x_host = x.copy_to_host()

        for i in range(iters_per_epoch):
            # The kernel's fourth argument is its output buffer (previously the
            # bare name `lr` was passed there and no step was ever taken).
            optimizer[self.blockspergrid, self.threadsperblock](A, x, b, self.grad)

            x_host -= self.lr * self.grad.copy_to_host()
            x.copy_to_device(x_host)

    def check(self, x):
        """Record and return ``||b - A @ x||`` for a host vector `x` over the full data."""
        b_hat = self.A @ x
        error = np.linalg.norm(self.b - b_hat)
        self.error_list.append(error)

        return error