In [6]:
import time

import torch

print(torch.__version__)

# Device selection: use Apple's Metal (MPS) backend when it is available and
# fall back to the CPU otherwise, so the cell still runs on other machines.
mps_available = torch.backends.mps.is_available()
device_cpu = torch.device("cpu")
device_mps = torch.device("mps" if mps_available else "cpu")

# Model definition: the same layer shape on each device.
model_cpu = torch.nn.Linear(1000, 1000).to(device_cpu)
model_mps = torch.nn.Linear(1000, 1000).to(device_mps)

# Input data.
inputs_cpu = torch.randn(1000, 1000).to(device_cpu)
inputs_mps = torch.randn(1000, 1000).to(device_mps)


def _sync(device):
    """Wait for queued work on `device` to finish; does nothing unless it is MPS."""
    # torch.mps.synchronize() is only meaningful (and only safe to call) when
    # the tensors really live on the MPS device, not on the CPU fallback.
    if device.type == "mps":
        torch.mps.synchronize()


def _time_forward(model, inputs, n_iters=1000):
    """Return (last_output, elapsed_seconds) for `n_iters` forward passes.

    One untimed warm-up pass runs first so one-off setup cost is not charged
    to the measurement. The device is synchronized before the clock is read,
    because MPS work is queued asynchronously.
    """
    device = inputs.device
    outputs = model(inputs)  # warm-up, not timed
    _sync(device)

    start = time.perf_counter()  # monotonic, high-resolution timer
    for _ in range(n_iters):
        outputs = model(inputs)
    _sync(device)  # make sure the device has finished computing
    return outputs, time.perf_counter() - start


# CPU timing.
outputs_cpu, cpu_time = _time_forward(model_cpu, inputs_cpu)
print(f"CPU Time: {cpu_time:.4f} seconds")

# MPS timing (this measures the CPU again when MPS is unavailable).
outputs_mps, mps_time = _time_forward(model_mps, inputs_mps)
print(f"MPS Time: {mps_time:.4f} seconds")


2.2.1
CPU Time: 11.3047 seconds
MPS Time: 1.1047 seconds


# torch.autograd.grad

In [4]:
import torch
print(torch.__version__)

# Two independent scalar leaves tracked by autograd.
x = torch.tensor(2.0, requires_grad=True)
z = torch.tensor(3.0, requires_grad=True)

# y = x^2 + z^3
y = x.pow(2) + z.pow(3)

# Partial derivatives dy/dx = 2x and dy/dz = 3z^2, seeded with an upstream
# gradient of one (same shape as the scalar output).
grad_x, grad_z = torch.autograd.grad(
    outputs=y,
    inputs=(x, z),
    grad_outputs=torch.ones_like(y),
)
for partial in (grad_x, grad_z):
    print(partial)


2.2.1
tensor(4.)
tensor(27.)


In [2]:
# Second derivative of y = x^3, evaluated at x = 2
x = torch.tensor(2.0, requires_grad=True)
y = x.pow(3)

# create_graph=True records the first-derivative computation in the autograd
# graph; without it the result could not be differentiated a second time.
grad_x = torch.autograd.grad(outputs=y, inputs=x, create_graph=True)
# grad_x is a 1-tuple holding dy/dx = 3x^2; differentiating it again gives 6x.
grad2_x = torch.autograd.grad(outputs=grad_x, inputs=x)

print(grad2_x)

(tensor(12.),)


In [3]:
x = torch.tensor(2.0, requires_grad=True)
z = torch.tensor(3.0, requires_grad=True)

# y depends on x only; z never enters the computation.
y = x.pow(2)

# Request dy/dx and dy/dz together. Because y does not depend on z,
# allow_unused=True is needed: autograd then returns None for z instead of
# raising an error.
grads = torch.autograd.grad(outputs=y, inputs=(x, z), allow_unused=True)
grad_x, grad_z = grads

print(grad_x)
print(grad_z)


tensor(4.)
None


# torch.autograd.functional.jacobian

In [4]:
# f: R^2 -> R^3 | y(x) = (y1(x1,x2),y2(x1,x2),y3(x1,x2))

# Wrong approach: building the result with torch.tensor([...]) creates a brand-new tensor, so the computation graph back to x is not tracked.
#def func(x):
    #return torch.tensor([
        #x[0] ** 2,   # y1(x1,x2) = x1^2
        #x[1] ** 3,   # y2(x1,x2) = x2^3
        #x[0] * x[1]  # y3(x1,x2) = x1 * x2
    #])

def func(x):
    """Map x = (x1, x2) to y = (x1^2, x2^3, x1 * x2) as one 1-D tensor.

    The components are joined with torch.stack, so the result is computed
    from x by tensor operations rather than copied into a fresh tensor.
    """
    x1, x2 = x[0], x[1]
    components = (x1 ** 2, x2 ** 3, x1 * x2)
    return torch.stack(components)


# Point at which the Jacobian is evaluated.
x = torch.tensor([2.0, 3.0], requires_grad=True)

# J[i, j] = d func(x)[i] / d x[j]: three outputs by two inputs.
J = torch.autograd.functional.jacobian(func=func, inputs=x)

print(J)


tensor([[ 4.,  0.],
        [ 0., 27.],
        [ 3.,  2.]])


# Taichi

In [None]:
import taichi as ti
import taichi.math as tm

# Ask Taichi for a GPU backend.
ti.init(arch=ti.gpu)

# Canvas of (2 * n) x n float values, one per pixel.
n = 320
pixels = ti.field(shape=(2 * n, n), dtype=float)

# The Julia-set demo below sits inside a string literal, so none of it runs.
"""
@ti.func
def complex_sqr(z):  # complex square of a 2D vector
    return tm.vec2(z[0] * z[0] - z[1] * z[1], 2 * z[0] * z[1])

@ti.kernel
def paint(t: float):
    for i, j in pixels:  # Parallelized over all pixels
        c = tm.vec2(-0.8, tm.cos(t) * 0.2)
        z = tm.vec2(i / n - 1, j / n - 0.5) * 2
        iterations = 0
        while z.norm() < 20 and iterations < 50:
            z = complex_sqr(z) + c
            iterations += 1
        pixels[i, j] = 1 - iterations * 0.02

gui = ti.GUI("Julia Set", res=(n * 2, n))

i = 0
while gui.running:
    paint(i * 0.03)
    gui.set_image(pixels)
    gui.show()
    i += 1
    
"""

[Taichi] Starting on arch=metal


KeyboardInterrupt: 

: 

In [4]:
import taichi as ti
import taichi.math as tm

# Ask Taichi for a GPU backend.
ti.init(arch=ti.gpu)

# Canvas of (2 * n) x n float values, one per pixel.
n = 320
pixels = ti.field(shape=(2 * n, n), dtype=float)

# Inspect the Python-scope type of a value built with tm.vec2.
sample_vec = tm.vec2(-0.8, tm.cos(0) * 0.2)
type(sample_vec)

[Taichi] Starting on arch=metal


taichi.lang.matrix.Vector

In [6]:
# Note that: 
#For loops located at the outermost scope in a Taichi kernel are automatically parallelized.

# Counter-example: the for loop is at the outermost scope of the kernel, so it
# is the loop Taichi parallelizes, and `break` is rejected there.
# NOTE(review): `x` is not defined in this cell; presumably a Taichi field
# created elsewhere - confirm before running.
@ti.kernel
def foo():
    for i in x:
        ...
        break # Error! `break` is not allowed in the parallelized outermost loop

# Valid variant: the `break` sits in the inner loop, which is not at the
# outermost scope of the kernel, so it is accepted.
# NOTE(review): this reuses the name `foo`, so it replaces the kernel defined
# just before it in the same cell.
@ti.kernel
def foo():
    for i in x:
        for j in range(10):
            ...
            break # OK! the inner loop is an ordinary loop

# Taichi 4 Python

In [14]:
"""Count the prime numbers in the range [1, n], via traditional python code"""

# Checks if a positive integer is a prime number
def is_prime(n: int) -> bool:
    """Return True when ``n`` is a prime number, False otherwise.

    Integers below 2 (0, 1 and all negatives) are never prime. For larger
    ``n``, trial division by every k in [2, isqrt(n)] is enough, because a
    composite number always has a divisor no greater than its square root.
    """
    import math  # local import: this cell has no import line of its own

    if n < 2:
        return False
    # math.isqrt is exact for arbitrarily large ints, whereas int(n ** 0.5)
    # goes through a float and can be off by one for very large n.
    return all(n % k != 0 for k in range(2, math.isqrt(n) + 1))

# Traverses the range between 2 and n
# Counts the primes according to the return of is_prime()
def count_primes(n: int) -> int:
    """Count the integers k with 2 <= k < n for which is_prime(k) is true.

    The upper bound is exclusive: n itself is never tested.
    """
    return sum(1 for candidate in range(2, n) if is_prime(candidate))

print(count_primes(1_000_000))

78498


In [None]:
import taichi as ti
# Initialize Taichi on the CPU backend.
ti.init(arch=ti.cpu)

# Taichi counterpart of the plain-Python is_prime: trial division of n by every
# k in [2, int(n ** 0.5)]. The verdict is kept in `result` and returned once at
# the end of the function.
# NOTE(review): this reuses the name is_prime, so running the cell rebinds the
# pure-Python function of the same name defined earlier in the notebook.
@ti.func
def is_prime(n: int):
    result = True
    for k in range(2, int(n ** 0.5) + 1):
        if n % k == 0:
            # A divisor was found, so n is composite; stop searching.
            result = False
            break
    return result

# Kernel entry point: counts the k in [2, n) for which is_prime(k) holds and
# returns that count to Python.
@ti.kernel
def count_primes(n: int) -> int:
    count = 0
    for k in range(2, n): # Outermost loop of the kernel, so Taichi parallelizes it
        if is_prime(k):
            # Open question from the author: is sharing `count` across parallel
            # iterations a problem?
            # NOTE(review): Taichi's documentation describes augmented
            # assignments such as += as atomic inside parallel loops, and the
            # recorded output (78498) matches the pure-Python run - confirm
            # against the Taichi docs.
            count += 1

    return count


print(count_primes(1000000))

[Taichi] Starting on arch=x64
78498


# Taichi for Pytorch

$$y(t) = \sum_{u=0}^{t} w\bigl(T-1-(t-u)\bigr) \cdot k(u+T-1) + \epsilon$$

Here $w$ is the convolution kernel, $k$ is the input sequence, and $\epsilon$ is a bias term.

In [20]:
def run_formula_very_slow(w, k, B, C, T, eps):
    """Reference implementation of the per-channel convolution, one scalar at a time.

    For every batch b, channel c and time step t it computes

        out[b, c, t] = eps + sum_{u = t-T+1}^{t} w[c, 0, (T-1)-(t-u)] * k[b, c, u+T-1]

    Args:
        w: kernel, read at w[c, 0, j] for j in [0, T-1].
        k: input, read at k[b, c, i] for i in [0, 2T-2].
        B: number of batches.
        C: number of channels.
        T: output length (also the number of kernel taps that are read).
        eps: bias added to every output element; a Python number or a
            0-dim tensor. It is never modified.

    Returns:
        A new CPU tensor of shape (B, C, T).
    """
    out = torch.empty((B, C, T), device='cpu')
    for b in range(B):
        for c in range(C):
            for t in range(T):
                s = eps
                for u in range(t - T + 1, t + 1):
                    # Out-of-place add on purpose: `s += ...` would update eps
                    # itself in place when eps is a tensor, so the bias would
                    # grow from one output element to the next.
                    s = s + w[c, 0, (T - 1) - (t - u)] * k[b, c, u + T - 1]
                out[b, c, t] = s
    return out


In [21]:
# Taichi kernel computing, for every index (b, c, t) of `out`:
#   out[b, c, t] = eps + sum over u in [t-T+1, t] of
#                  w[c, 0, (T-1)-(t-u)] * k[b, c, u+T-1]
# T is not a parameter; the kernel reads it from the enclosing scope.
# NOTE(review): a later cell defines another kernel under this same name.
@ti.kernel
def taichi_forward_v0(
        out: ti.types.ndarray(ndim=3),# type: ignore
        w: ti.types.ndarray(ndim=3), # type: ignore
        k: ti.types.ndarray(ndim=3),# type: ignore
        eps: ti.f32):# type: ignore

    for b, c, t in out:  # Outermost loop: parallelized over all elements of out
        s = eps
        for u in range(t-T+1, t+1):  # Convolution sum for one output element
            s += w[c, 0, (T-1)-(t-u)] * k[b, c, u+T-1]
        out[b, c, t] = s


In [26]:
import torch
import torch.nn.functional as F

# B batches, C channels, sequences of length T.
B, C, T = 2, 3, 5
x = torch.rand((B, C, T))  # input, shape (B, C, T)
# conv1d weights are laid out as (out_channels, in_channels / groups,
# kernel_size); here that is (C, 1, 3): one length-3 filter per channel.
w = torch.rand((C, 1, 3))

# groups=C splits the convolution per channel: each channel is filtered only
# by its own kernel.
y = F.conv1d(input=x, weight=w, groups=C)

# Without padding the length shrinks by kernel_size - 1 = 2, so the result is
# (B, C, T - 2) = (2, 3, 3).
print(y.shape)


torch.Size([2, 3, 3])


In [None]:
import taichi as ti

# Initialize Taichi on the CPU backend.
ti.init(arch=ti.cpu)

# Problem sizes: batches, channels, output length, kernel taps, and the bias.
B, C, T = 2, 3, 5
T_filter = 3
eps = 0.1

# Taichi ndarrays for the kernel arguments. The input is T_filter - 1 elements
# longer than the output.
w = ti.ndarray(shape=(C, 1, T_filter), dtype=ti.f32)  # convolution kernel
k = ti.ndarray(shape=(B, C, T + T_filter - 1), dtype=ti.f32)  # input
out = ti.ndarray(shape=(B, C, T), dtype=ti.f32)  # output

# Taichi kernel computing, for every index (b, c, t) of `out`:
#   out[b, c, t] = eps + sum over u in [t-T_filter+1, t] of
#                  w[c, 0, (T_filter-1)-(t-u)] * k[b, c, u+T_filter-1]
# Terms whose k index falls outside [0, k.shape[2]) are skipped.
# T_filter is not a parameter; the kernel reads it from the enclosing scope.
@ti.kernel
def taichi_forward_v0(
        out: ti.types.ndarray(ndim=3),
        w: ti.types.ndarray(ndim=3),
        k: ti.types.ndarray(ndim=3),
        eps: ti.f32):

    for b, c, t in out:  # Outermost loop: one iteration per element of out
        s = eps
        for u in range(t - T_filter + 1, t + 1): 
            # Bounds check on the k index before it is read.
            if 0 <= u + T_filter - 1 < k.shape[2]: 
                s += w[c, 0, (T_filter - 1) - (t - u)] * k[b, c, u + T_filter - 1]
        out[b, c, t] = s

# NOTE(review): w and k are not filled with data anywhere in this cell, and
# `out` is not read back afterwards - confirm this call is only a smoke test.
taichi_forward_v0(out, w, k, eps)

[Taichi] Starting on arch=x64


In [None]:
# Differentiable Programming