In [None]:
!pip install py-cpuinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting py-cpuinfo
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Installing collected packages: py-cpuinfo
Successfully installed py-cpuinfo-9.0.0


In [None]:
import math
import numpy as np
from numba import cuda, jit, float64
import time
import pandas as pd
import cpuinfo

print('CPU:', cpuinfo.get_cpu_info()['brand_raw'])
print('GPU:', cuda.get_current_device())

CPU: Intel(R) Xeon(R) CPU @ 2.30GHz
GPU: <CUDA device 0 'b'Tesla T4''>


In [None]:
N0 = 100
A0 = np.random.randn(N0, N0)
B0 = np.random.randn(N0, N0)
C0 = np.zeros((N0, N0))

In [None]:
N1 = 1000
A1 = np.random.randn(N1, N1)
B1 = np.random.randn(N1, N1)
C1 = np.zeros((N1, N1))

In [None]:
N2 = 2000
A2 = np.random.randn(N2, N2)
B2 = np.random.randn(N2, N2)
C2 = np.zeros((N2, N2))

In [None]:
def CPU_matmul_dot(A, B):
  C = np.dot(A,B)

s1 = time.time()
CPU_matmul_dot(A0, B0)
host_0 = C0
end0 = time.time() - s1
print(' -----------------------------------')
print('| Matmul on CPU (np.dot): {:.3f}'.format(end0), 'sec.|')
print(' -----------------------------------')

s2 = time.time()
CPU_matmul_dot(A1, B1)
end1 = time.time() - s2
print(' -----------------------------------')
print('| Matmul on CPU (np.dot): {:.3f}'.format(end1), 'sec.|')
print(' -----------------------------------')

s3 = time.time()
CPU_matmul_dot(A2, B2)
end2 = time.time() - s3
print(' -----------------------------------')
print('| Matmul on CPU (np.dot): {:.3f}'.format(end2), 'sec.|')
print(' -----------------------------------')

 -----------------------------------
| Matmul on CPU (np.dot): 0.008 sec.|
 -----------------------------------
 -----------------------------------
| Matmul on CPU (np.dot): 0.157 sec.|
 -----------------------------------
 -----------------------------------
| Matmul on CPU (np.dot): 1.387 sec.|
 -----------------------------------


In [8]:
def CPU_matmul(A, B, C):
  for i in range(C.shape[0]):
    for j in range(C.shape[1]):
      sum = 0
      for k in range(A.shape[1]):
        sum += A[i, k] * B[k, j]
      C[i, j] = sum

host_1 = C0

s4 = time.time()
CPU_matmul(A0, B0, C0)
end3 = time.time() - s4
print(' --------------------------')
print('| Matmul on CPU: {:.3f}'.format(end3), 'sec.|')
print(' --------------------------')

s5 = time.time()
CPU_matmul(A1, B1, C1)
end4 = time.time() - s5
print(' --------------------------')
print('| Matmul on CPU: {:.3f}'.format(end4), 'sec.|')
print(' --------------------------')

s6 = time.time()
CPU_matmul(A2, B2, C2)
end5 = time.time() - s6
print(' ---------------------------')
print('| Matmul on CPU: {:.3f}'.format(end5), 'sec.|')
print(' ---------------------------')

 --------------------------
| Matmul on CPU: 0.518 sec.|
 --------------------------
 --------------------------
| Matmul on CPU: 436.071 sec.|
 --------------------------


KeyboardInterrupt: ignored

Выполнять задачу выше до конца оказалось слишком долго, решил прервать выполнение

In [9]:
def cpu_matmul(A, B, C):
  for i in range(C.shape[0]):
    for j in range(C.shape[1]):
      sum = 0
      for k in range(A.shape[1]):
        sum += A[i, k] * B[k, j]
      C[i, j] = sum

In [10]:
@cuda.jit
def GPU_matmul_cuda_jit(A, B, C):
  i, j = cuda.grid(2)
  if i < C.shape[0] and j < C.shape[1]:
    sum = 0
    for k in range(A.shape[1]):
      sum += A[i, k] * B[k, j]
      C[i, j] = sum


def host_naive(A, B, C):
  d_A = cuda.to_device(A)
  d_B = cuda.to_device(B)
  d_C = cuda.device_array(C.shape, np.float64)

  threadsperblock = (32, 32)
  blockspergrid_x = math.ceil(A.shape[0]/threadsperblock[0])
  blockspergrid_y = math.ceil(B.shape[1]/threadsperblock[1])
  blockspergrid = (blockspergrid_x, blockspergrid_y)

  GPU_matmul_cuda_jit[blockspergrid, threadsperblock](d_A, d_B, d_C)
  return d_C.copy_to_host()


cuda.select_device(0) #выбор устройства GPU
host_naive(A0, B0, C0)
s10 = time.time()
host_naive(A0, B0, C0)
host_3 = C0
end9 = time.time() - s10
print(' --------------------------')
print('| Matmul on GPU: {:.3f}'.format(end9), 'sec.|')
print(' --------------------------')

cuda.select_device(0)
s11 = time.time()
host_naive(A1, B1, C1)
end10 = time.time() - s11
print(' --------------------------')
print('| Matmul on GPU: {:.3f}'.format(end10), 'sec.|')
print(' --------------------------')

cuda.select_device(0)
s12 = time.time()
host_naive(A2, B2, C2)
end11 = time.time() - s12
print(' --------------------------')
print('| Matmul on GPU: {:.3f}'.format(end11), 'sec.|')
print(' --------------------------')



 --------------------------
| Matmul on GPU: 0.002 sec.|
 --------------------------
 --------------------------
| Matmul on GPU: 0.200 sec.|
 --------------------------
 --------------------------
| Matmul on GPU: 0.753 sec.|
 --------------------------
