<a href="https://colab.research.google.com/github/ArnavMehrotra/ArNet/blob/main/pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and Shared Object Creation


In [232]:
%%shell
# Clone the ArNet repository into ./ArNet unless that directory already exists,
# so re-running this cell does not attempt a second clone.
if [ ! -d ArNet ]; then
  git clone https://github.com/ArnavMehrotra/ArNet
fi



In [233]:
%%shell
# Update the local ArNet checkout from its remote before building.
cd ArNet/
git pull

Already up to date.




In [234]:
import time
import ctypes
import numpy as np
import os

# Exclusive upper bound for the random matrix dimensions drawn by the test functions.
MAX_SIZE = 4096 * 2

In [235]:
# Remove every shared object left in the working directory by earlier builds.
!rm -rf *.so
# The output name embeds the current Unix time, so each run of this cell builds
# and loads a differently named library.
# NOTE(review): presumably this avoids ctypes handing back an already-loaded
# library of the same name — confirm.
so_name = f"ops_{int(time.time())}.so"
# Compile the ArNet CUDA sources into a position-independent shared library.
# NOTE(review): the -gencode flags target compute capability 8.0 only — confirm
# this matches the GPU attached to the runtime.
!nvcc -Xcompiler -fPIC -shared -gencode arch=compute_80,code=sm_80 -o {so_name} ArNet/launch.cu ArNet/kernels.cu ArNet/pipeline.cu
# Handle used by every wrapper and test below.
LIB = ctypes.CDLL(f"./{so_name}")

# CUDA Wrappers

In [236]:
def gemmInt(a: np.array, b: np.array, lib: ctypes.CDLL) -> np.array:
  """Multiply two matrices as int32 through ``lib.launchMultInt``.

  Both operands are converted to C-contiguous int32 arrays before their
  addresses are handed to the library, so transposed or sliced views (and
  other int32-convertible 2-D array-likes) are passed in row-major order.

  Args:
    a: Left operand of shape (j, k).
    b: Right operand of shape (m, n); m must equal k.
    lib: Loaded shared library that exposes ``launchMultInt``.

  Returns:
    A (j, n) int32 array holding what ``launchMultInt`` wrote to the output
    buffer, or None (after printing a message) when the inner dimensions
    differ.
  """
  # np.array(x, dtype=...) can preserve a Fortran-ordered layout, but the raw
  # pointer passed below carries no stride information. Force row-major
  # storage so the library reads the elements in the intended order.
  op1 = np.ascontiguousarray(a, dtype=np.int32)
  op2 = np.ascontiguousarray(b, dtype=np.int32)

  j, k = op1.shape
  m, n = op2.shape

  if m != k:
    print("matrix dimensions do not match")
    return

  out = np.zeros((j, n), dtype=np.int32)

  int_ptr = ctypes.POINTER(ctypes.c_int)
  lib.launchMultInt.argtypes = [int_ptr, int_ptr, int_ptr,
                                ctypes.c_int, ctypes.c_int,
                                ctypes.c_int, ctypes.c_int]

  lib.launchMultInt(op1.ctypes.data_as(int_ptr),
                    op2.ctypes.data_as(int_ptr),
                    out.ctypes.data_as(int_ptr),
                    j, k, m, n)

  # The library filled `out` in place through its pointer.
  return out

def gemm(a: np.array, b: np.array, lib: ctypes.CDLL) -> np.array:
  """Multiply two float32 matrices through ``lib.launchMult``.

  The notebook's test compares the result with ``a @ b``.

  Args:
    a: float32 array of shape (j, k).
    b: float32 array of shape (m, n); m must equal k.
    lib: Loaded shared library that exposes ``launchMult``.

  Returns:
    A (j, n) float32 array holding what ``launchMult`` wrote to the output
    buffer, or None (after printing a message) when an operand is not
    float32 or the inner dimensions differ.
  """
  if a.dtype != np.float32 or b.dtype != np.float32:
    print("data type must be float32")
    # Stop here: the library would otherwise reinterpret a differently typed
    # buffer as float32.
    return

  j, k = a.shape
  m, n = b.shape

  if m != k:
    print("matrix dimensions do not match")
    return

  # The raw data pointer carries no stride information, so transposed or
  # sliced views are copied into row-major storage first (no copy is made
  # when the array is already C-contiguous).
  a = np.ascontiguousarray(a)
  b = np.ascontiguousarray(b)
  out = np.zeros((j, n), dtype=np.float32)

  float_ptr = ctypes.POINTER(ctypes.c_float)
  lib.launchMult.argtypes = [float_ptr, float_ptr, float_ptr,
                             ctypes.c_int, ctypes.c_int,
                             ctypes.c_int, ctypes.c_int]

  lib.launchMult(a.ctypes.data_as(float_ptr),
                 b.ctypes.data_as(float_ptr),
                 out.ctypes.data_as(float_ptr),
                 j, k, m, n)

  # The library filled `out` in place through its pointer.
  return out


def gemm2(a: np.array, b: np.array, lib: ctypes.CDLL) -> np.array:
  """Multiply two float32 matrices through ``lib.launchMult2``.

  Takes the same arguments as ``gemm`` but calls a different library entry
  point; the notebook's test compares the result with ``a @ b`` and labels
  this path "optimized".

  Args:
    a: float32 array of shape (j, k).
    b: float32 array of shape (m, n); m must equal k.
    lib: Loaded shared library that exposes ``launchMult2``.

  Returns:
    A (j, n) float32 array holding what ``launchMult2`` wrote to the output
    buffer, or None (after printing a message) when an operand is not
    float32 or the inner dimensions differ.
  """
  if a.dtype != np.float32 or b.dtype != np.float32:
    print("data type must be float32")
    # Stop here: the library would otherwise reinterpret a differently typed
    # buffer as float32.
    return

  j, k = a.shape
  m, n = b.shape

  if m != k:
    print("matrix dimensions do not match")
    return

  # The raw data pointer carries no stride information, so transposed or
  # sliced views are copied into row-major storage first (no copy is made
  # when the array is already C-contiguous).
  a = np.ascontiguousarray(a)
  b = np.ascontiguousarray(b)
  out = np.zeros((j, n), dtype=np.float32)

  float_ptr = ctypes.POINTER(ctypes.c_float)
  lib.launchMult2.argtypes = [float_ptr, float_ptr, float_ptr,
                              ctypes.c_int, ctypes.c_int,
                              ctypes.c_int, ctypes.c_int]

  lib.launchMult2(a.ctypes.data_as(float_ptr),
                  b.ctypes.data_as(float_ptr),
                  out.ctypes.data_as(float_ptr),
                  j, k, m, n)

  # The library filled `out` in place through its pointer.
  return out

def gradient(a: np.array, y: np.array, lib: ctypes.CDLL) -> np.array:
  """Compute a per-row gradient from scores and labels via ``lib.launchGradient``.

  The notebook's test compares the result with the row-wise softmax of ``a``
  with 1 subtracted at each row's label column.

  Args:
    a: float32 array of shape (j, k).
    y: uint32 array holding one label index per row of ``a`` (the test passes
      shape (j, 1)).
    lib: Loaded shared library that exposes ``launchGradient``.

  Returns:
    A (j, k) float32 array holding what ``launchGradient`` wrote to the
    output buffer, or None (after printing a message) when a dtype is wrong
    or the number of labels differs from the number of rows.
  """
  if a.dtype != np.float32:
    print("data type must be float32")
    return

  if y.dtype != np.uint32:
    print("label index type must be uint32")
    return

  j, k = a.shape

  # The library is only told j and k, so it cannot know how long the label
  # buffer is; require exactly one label per row before handing it over.
  if y.size != j:
    print("matrix dimensions do not match")
    return

  # The raw data pointers carry no stride information, so transposed or
  # sliced views are copied into contiguous storage first (no copy is made
  # when the array is already C-contiguous).
  a = np.ascontiguousarray(a)
  y = np.ascontiguousarray(y)
  out = np.zeros((j, k), dtype=np.float32)

  float_ptr = ctypes.POINTER(ctypes.c_float)
  label_ptr = ctypes.POINTER(ctypes.c_uint32)
  lib.launchGradient.argtypes = [float_ptr, label_ptr, float_ptr,
                                 ctypes.c_int, ctypes.c_int]

  lib.launchGradient(a.ctypes.data_as(float_ptr),
                     y.ctypes.data_as(label_ptr),
                     out.ctypes.data_as(float_ptr),
                     j, k)

  # The library filled `out` in place through its pointer.
  return out

def biasAdd(a: np.array, b: np.array, lib: ctypes.CDLL) -> np.array:
  """Add a bias vector to every row of a matrix via ``lib.launchBiasAdd``.

  The notebook's test compares the result with the broadcast sum ``a + b``.

  Args:
    a: float32 array of shape (j, k).
    b: float32 vector of length k.
    lib: Loaded shared library that exposes ``launchBiasAdd``.

  Returns:
    A (j, k) float32 array holding what ``launchBiasAdd`` wrote to the output
    buffer, or None (after printing a message) when an operand is not
    float32 or ``b`` is not a length-k vector.
  """
  if a.dtype != np.float32 or b.dtype != np.float32:
    print("data type must be float32")
    # Stop here: the library would otherwise reinterpret a differently typed
    # buffer as float32.
    return

  j, k = a.shape

  # Only k is passed to the library for the bias, so it must be a plain
  # vector with exactly one entry per column.
  if b.ndim != 1 or b.shape[0] != k:
    print("matrix dimensions do not match")
    return

  # The raw data pointer carries no stride information, so transposed or
  # sliced views are copied into contiguous storage first (no copy is made
  # when the array is already C-contiguous).
  a = np.ascontiguousarray(a)
  b = np.ascontiguousarray(b)
  out = np.zeros((j, k), dtype=np.float32)

  float_ptr = ctypes.POINTER(ctypes.c_float)
  lib.launchBiasAdd.argtypes = [float_ptr, float_ptr, float_ptr,
                                ctypes.c_int, ctypes.c_int]

  lib.launchBiasAdd(a.ctypes.data_as(float_ptr),
                    b.ctypes.data_as(float_ptr),
                    out.ctypes.data_as(float_ptr),
                    j, k)

  # The library filled `out` in place through its pointer.
  return out

def scalarAdd(a: np.array, s: float, lib: ctypes.CDLL) -> np.array:
  """Add a scalar to every element of a matrix via ``lib.launchScalarAdd``.

  The notebook's test compares the result with ``a + s``.

  Args:
    a: float32 array of shape (j, k).
    s: Scalar passed to the library as a C float.
    lib: Loaded shared library that exposes ``launchScalarAdd``.

  Returns:
    A (j, k) float32 array holding what ``launchScalarAdd`` wrote to the
    output buffer, or None (after printing a message) when ``a`` is not
    float32.
  """
  if a.dtype != np.float32:
    print("data type must be float32")
    # Stop here: the library would otherwise reinterpret a differently typed
    # buffer as float32.
    return

  j, k = a.shape
  N = j * k

  # The raw data pointer carries no stride information, so a transposed or
  # sliced view is copied into row-major storage first (no copy is made when
  # the array is already C-contiguous).
  a = np.ascontiguousarray(a)
  out = np.zeros((j, k), dtype=np.float32)

  float_ptr = ctypes.POINTER(ctypes.c_float)
  lib.launchScalarAdd.argtypes = [float_ptr, float_ptr,
                                  ctypes.c_float, ctypes.c_int]

  lib.launchScalarAdd(a.ctypes.data_as(float_ptr),
                      out.ctypes.data_as(float_ptr),
                      s, N)

  # The library filled `out` in place through its pointer.
  return out

def matAdd(a: np.array, b: np.array, lib: ctypes.CDLL) -> np.array:
  """Add two equally shaped float32 matrices via ``lib.launchAdd``.

  The notebook's test compares the result with ``a + b``.

  Args:
    a: float32 array of shape (j, k).
    b: float32 array of the same shape as ``a``.
    lib: Loaded shared library that exposes ``launchAdd``.

  Returns:
    A (j, k) float32 array holding what ``launchAdd`` wrote to the output
    buffer, or None (after printing a message) when an operand is not
    float32 or the shapes differ.
  """
  if a.dtype != np.float32 or b.dtype != np.float32:
    print("data type must be float32")
    # Stop here: the library would otherwise reinterpret a differently typed
    # buffer as float32.
    return

  j, k = a.shape
  m, n = b.shape

  if m != j or n != k:
    print("matrix dimensions do not match")
    return

  # The raw data pointer carries no stride information, so transposed or
  # sliced views are copied into row-major storage first (no copy is made
  # when the array is already C-contiguous).
  a = np.ascontiguousarray(a)
  b = np.ascontiguousarray(b)
  out = np.zeros((j, k), dtype=np.float32)

  float_ptr = ctypes.POINTER(ctypes.c_float)
  lib.launchAdd.argtypes = [float_ptr, float_ptr, float_ptr,
                            ctypes.c_int, ctypes.c_int]

  lib.launchAdd(a.ctypes.data_as(float_ptr),
                b.ctypes.data_as(float_ptr),
                out.ctypes.data_as(float_ptr),
                j, k)

  # The library filled `out` in place through its pointer.
  return out

def relu(a: np.array, lib: ctypes.CDLL) -> np.array:
  """Apply ``lib.launchRelu`` element-wise to a float32 matrix.

  The notebook's test compares the result with ``np.maximum(a, 0)``.

  Args:
    a: float32 array of shape (j, k).
    lib: Loaded shared library that exposes ``launchRelu``.

  Returns:
    A (j, k) float32 array holding what ``launchRelu`` wrote to the output
    buffer, or None (after printing a message) when ``a`` is not float32.
  """
  if a.dtype != np.float32:
    print("data type must be float32")
    # Stop here: the library would otherwise reinterpret a differently typed
    # buffer as float32.
    return

  j, k = a.shape
  N = j * k

  # The raw data pointer carries no stride information, so a transposed or
  # sliced view is copied into row-major storage first (no copy is made when
  # the array is already C-contiguous).
  a = np.ascontiguousarray(a)
  out = np.zeros((j, k), dtype=np.float32)

  float_ptr = ctypes.POINTER(ctypes.c_float)
  lib.launchRelu.argtypes = [float_ptr, float_ptr, ctypes.c_int]

  lib.launchRelu(a.ctypes.data_as(float_ptr),
                 out.ctypes.data_as(float_ptr),
                 N)

  # The library filled `out` in place through its pointer.
  return out

def softmax(a: np.array, lib: ctypes.CDLL) -> np.array:
  """Apply ``lib.launchSoftmax`` to a float32 matrix.

  The notebook's test compares the result with a NumPy softmax taken along
  each row of ``a``.

  Args:
    a: float32 array of shape (j, k).
    lib: Loaded shared library that exposes ``launchSoftmax``.

  Returns:
    A (j, k) float32 array holding what ``launchSoftmax`` wrote to the output
    buffer, or None (after printing a message) when ``a`` is not float32.
  """
  if a.dtype != np.float32:
    print("data type must be float32")
    # Stop here: the library would otherwise reinterpret a differently typed
    # buffer as float32.
    return

  j, k = a.shape

  # The raw data pointer carries no stride information, so a transposed or
  # sliced view is copied into row-major storage first (no copy is made when
  # the array is already C-contiguous).
  a = np.ascontiguousarray(a)
  out = np.zeros((j, k), dtype=np.float32)

  float_ptr = ctypes.POINTER(ctypes.c_float)
  lib.launchSoftmax.argtypes = [float_ptr, float_ptr,
                                ctypes.c_int, ctypes.c_int]

  lib.launchSoftmax(a.ctypes.data_as(float_ptr),
                    out.ctypes.data_as(float_ptr),
                    j, k)

  # The library filled `out` in place through its pointer.
  return out

# Test Functions

In [237]:
# When True, the test functions below print timing and matrix-shape details.
PRINT = False

def test_gemm(lib: ctypes.CDLL):
  """Check ``gemm`` and ``gemm2`` against NumPy's ``@`` on random matrices.

  Dimensions are drawn from [2, MAX_SIZE) and entries from [0, 100). Each of
  the three products is timed; the timings and shapes are printed only when
  ``PRINT`` is true.

  Args:
    lib: Loaded shared library passed through to ``gemm`` and ``gemm2``.

  Returns:
    True when both library results match the NumPy product within
    rtol=1e-3 / atol=1e-3, otherwise False.
  """
  j = np.random.randint(2, MAX_SIZE)
  k = np.random.randint(2, MAX_SIZE)
  m = k
  n = np.random.randint(2, MAX_SIZE)

  hi = np.random.rand(j, k).astype(np.float32) * 100
  hello = np.random.rand(m, n).astype(np.float32) * 100

  # perf_counter is monotonic and high resolution; time.time() is wall-clock
  # time that can be adjusted while a measurement is in flight.
  start = time.perf_counter()
  correct = hi @ hello
  end = time.perf_counter()

  if PRINT: print(f"Total time for numpy: {(end - start)*1000:.3f} ms")

  start = time.perf_counter()
  test = gemm(hi, hello, lib)
  end = time.perf_counter()

  if PRINT: print(f"Total time for young arn: {(end - start)*1000:.3f} ms")

  start = time.perf_counter()
  test2 = gemm2(hi, hello, lib)
  end = time.perf_counter()

  if PRINT:
    print(f"Total time for young arn (optimized): {(end - start)*1000:.3f} ms")
    print("gemm")
    print(f"input 1: {j}x{k} matrix")
    print(f"input 2: {m}x{n} matrix")
    print(f"output: {j}x{n} matrix")
    print("\n")

  good = np.allclose(correct, test, rtol=1e-3, atol=1e-3) and np.allclose(correct, test2, rtol=1e-3, atol=1e-3)

  return good

def test_biasAdd(lib: ctypes.CDLL):
  """Compare ``biasAdd`` with NumPy's broadcast ``a + b`` on random inputs.

  Draws a (j, k) matrix and a length-k vector with dimensions in
  [2, MAX_SIZE) and returns whether the two results agree within
  rtol=1e-3 / atol=1e-3. Shapes are printed only when ``PRINT`` is true.
  """
  j, k = (np.random.randint(2, MAX_SIZE) for _ in range(2))

  matrix = np.random.rand(j, k).astype(np.float32)
  bias = np.random.rand(k).astype(np.float32)

  expected = matrix + bias
  actual = biasAdd(matrix, bias, lib)
  good = np.allclose(expected, actual, rtol=1e-3, atol=1e-3)

  if PRINT:
    report = (
      "bias add",
      f"input 1: {j}x{k} matrix",
      f"input 2: 1x{k} matrix",
      f"output: {j}x{k} matrix",
      "\n",
    )
    for line in report:
      print(line)

  return good

def test_scalarAdd(lib: ctypes.CDLL):
  """Compare ``scalarAdd`` with NumPy's ``a + s`` on random inputs.

  Draws a (j, k) matrix with dimensions in [2, MAX_SIZE) and one random
  scalar, and returns whether the two results agree within
  rtol=1e-3 / atol=1e-3. The shape is printed only when ``PRINT`` is true.
  """
  j, k = (np.random.randint(2, MAX_SIZE) for _ in range(2))

  matrix = np.random.rand(j, k).astype(np.float32)
  offset = np.random.rand()

  expected = matrix + offset
  actual = scalarAdd(matrix, offset, lib)
  good = np.allclose(expected, actual, rtol=1e-3, atol=1e-3)

  if PRINT:
    for line in ("scalar add", f"input: {j}x{k} matrix", "\n"):
      print(line)

  return good

def test_matAdd(lib: ctypes.CDLL):
  """Compare ``matAdd`` with NumPy's ``a + b`` on two random matrices.

  Both operands have the same (j, k) shape with dimensions in
  [2, MAX_SIZE). Returns whether the results agree within
  rtol=1e-3 / atol=1e-3. Shapes are printed only when ``PRINT`` is true.
  """
  j, k = (np.random.randint(2, MAX_SIZE) for _ in range(2))

  left = np.random.rand(j, k).astype(np.float32)
  right = np.random.rand(j, k).astype(np.float32)

  expected = left + right
  actual = matAdd(left, right, lib)
  good = np.allclose(expected, actual, rtol=1e-3, atol=1e-3)

  if PRINT:
    report = (
      "mat add",
      f"input 1: {j}x{k} matrix",
      f"input 2: {j}x{k} matrix",
      f"output: {j}x{k} matrix",
      "\n",
    )
    for line in report:
      print(line)

  return good

def test_relu(lib: ctypes.CDLL):
  """Compare ``relu`` with ``np.maximum(a, 0)`` on a random normal matrix.

  The (j, k) input has dimensions in [2, MAX_SIZE). Returns whether the
  results agree within rtol=1e-3 / atol=1e-3. The shape is printed only when
  ``PRINT`` is true.
  """
  j, k = (np.random.randint(2, MAX_SIZE) for _ in range(2))

  matrix = np.random.randn(j, k).astype(np.float32)

  expected = np.maximum(matrix, 0)
  actual = relu(matrix, lib)
  good = np.allclose(expected, actual, rtol=1e-3, atol=1e-3)

  if PRINT:
    for line in ("relu", f"input: {j}x{k} matrix", "\n"):
      print(line)

  return good

def numpy_softmax(Z):
    """Reference softmax along axis 1, used to check the library results.

    Each row's maximum is subtracted before exponentiating so that large
    inputs do not overflow; every row of the result sums to 1.
    """
    row_peak = np.max(Z, axis=1, keepdims=True)
    weights = np.exp(np.subtract(Z, row_peak))
    row_total = np.sum(weights, axis=1, keepdims=True)
    return np.divide(weights, row_total)

def test_softmax(lib: ctypes.CDLL):
  """Compare ``softmax`` with ``numpy_softmax`` on a random normal matrix.

  The (j, k) input has dimensions in [2, MAX_SIZE) and is scaled by 100.
  Returns whether the results agree within rtol=1e-3 / atol=1e-3. The shape
  is printed only when ``PRINT`` is true.
  """
  j, k = (np.random.randint(2, MAX_SIZE) for _ in range(2))

  scores = np.random.randn(j, k).astype(np.float32) * 100

  actual = softmax(scores, lib)
  reference = numpy_softmax(scores)
  good = np.allclose(actual, reference, rtol=1e-3, atol=1e-3)

  if PRINT:
    for line in ("softmax", f"input: {j}x{k} matrix", "\n"):
      print(line)

  return good

def test_gradient(lib: ctypes.CDLL):
  """Compare ``gradient`` with a NumPy reference on random scores and labels.

  The reference is ``numpy_softmax`` of the (j, k) scores with 1 subtracted
  at each row's label column; labels are uint32 of shape (j, 1). Returns
  whether the results agree within rtol=1e-3 / atol=1e-3. Shapes are printed
  only when ``PRINT`` is true.
  """
  j, k = (np.random.randint(2, MAX_SIZE) for _ in range(2))

  scores = np.random.rand(j, k).astype(np.float32)
  labels = np.random.randint(0, k, size=(j, 1)).astype(np.uint32)

  actual = gradient(scores, labels, lib)

  reference = numpy_softmax(scores)
  row_index = np.arange(j)
  # Subtract 1 at each row's label column.
  reference[row_index, np.squeeze(labels)] -= 1

  good = np.allclose(actual, reference, rtol=1e-3, atol=1e-3)

  if PRINT:
    report = (
      "gradient",
      f"input 1: {j}x{k} matrix",
      f"input 2: {j}x1 matrix",
      "\n\n",
    )
    for line in report:
      print(line)

  return good

def test_mlp(lib: ctypes.CDLL):
    """Compare ``lib.forward_pass`` with a NumPy two-layer reference.

    The reference is ``numpy_softmax(relu(x @ w1 + b1) @ w2 + b2)`` for a
    4x8 input, a 16-unit hidden layer and 3 outputs. Returns whether the
    library output agrees within rtol=1e-3 / atol=1e-3.
    """
    j, k = 4, 8
    m, n = 16, 3

    x = np.random.rand(j, k).astype(np.float32) * 10
    w1 = np.random.rand(k, m).astype(np.float32)
    b1 = np.random.rand(m).astype(np.float32) * 10
    w2 = np.random.rand(m, n).astype(np.float32)
    b2 = np.random.rand(n).astype(np.float32) * 10

    out = np.zeros(j * n, dtype=np.float32)

    float_ptr = ctypes.POINTER(ctypes.c_float)

    def as_float_ptr(array):
        # Pointer to the array's data, typed for the library call.
        return array.ctypes.data_as(float_ptr)

    # Six float buffers (five inputs and the output) followed by four sizes.
    lib.forward_pass.argtypes = [float_ptr] * 6 + [ctypes.c_int] * 4

    out_ptr = as_float_ptr(out)
    input_ptrs = [as_float_ptr(tensor) for tensor in (x, w1, b1, w2, b2)]
    lib.forward_pass(*input_ptrs, out_ptr, j, k, m, n)

    test = np.ctypeslib.as_array(out_ptr, (j, n))

    hidden = np.maximum(x @ w1 + b1, 0)
    correct = numpy_softmax(hidden @ w2 + b2)

    return np.allclose(test, correct, rtol=1e-3, atol=1e-3)

def run_tests(lib: ctypes.CDLL):
    """Run every check in order, asserting on the first one that fails.

    Each check receives ``lib``; a falsy result raises AssertionError with
    that check's message. A success line is printed once all have passed.
    """
    suite = (
        (test_gemm, "gemm failed"),
        (test_matAdd, "matAdd failed"),
        (test_scalarAdd, "scalarAdd failed"),
        (test_relu, "relu failed"),
        (test_softmax, "softmax failed"),
        (test_gradient, "gradient failed"),
        (test_biasAdd, "biasAdd failed"),
        (test_mlp, "mlp failed"),
    )

    for check, failure_message in suite:
        assert check(lib), failure_message

    print("All tests passed!!")

# Model Usage

In [238]:
# Run the whole suite against the library loaded above; an AssertionError
# names the first check that fails.
run_tests(LIB)

All tests passed!!
