In [1]:
!nvidia-smi

Sat Apr 19 17:22:39 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 546.30                 Driver Version: 546.30       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   51C    P8               8W /  60W |   1761MiB /  4096MiB |     27%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import torch
from torch import nn

torch.device('cpu'), torch.device('cuda'), torch.device('cuda:1')

(device(type='cpu'), device(type='cuda'), device(type='cuda', index=1))

In [3]:
torch.cuda.device_count()

1

In [4]:
def try_gpu(i=0):  #@save
    """如果存在，则返回gpu(i)，否则返回cpu()"""
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')

def try_all_gpus():  #@save
    """返回所有可用的GPU，如果没有GPU，则返回[cpu(),]"""
    devices = [torch.device(f'cuda:{i}')
             for i in range(torch.cuda.device_count())]
    return devices if devices else [torch.device('cpu')]

try_gpu(), try_gpu(10), try_all_gpus()

(device(type='cuda', index=0),
 device(type='cpu'),
 [device(type='cuda', index=0)])

In [5]:
x = torch.tensor([1, 2, 3])
x.device

device(type='cpu')

In [6]:
X = torch.ones(2, 3, device=try_gpu())
X

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')

In [12]:
Y = torch.rand(2, 3, device=try_gpu(0))
Y

tensor([[0.5172, 0.9758, 0.3490],
        [0.5197, 0.5803, 0.9919]], device='cuda:0')

In [13]:
Z = X.cuda(0)
print(X)
print(Z)

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')
tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')


In [14]:
Y + Z

tensor([[1.5172, 1.9758, 1.3490],
        [1.5197, 1.5803, 1.9919]], device='cuda:0')

In [15]:
Z.cuda(0) is Z

True

In [16]:
net = nn.Sequential(nn.Linear(3, 1))
net = net.to(device=try_gpu())

In [17]:
net(X)

tensor([[-0.4756],
        [-0.4756]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [18]:
net[0].weight.data.device

device(type='cuda', index=0)

In [19]:
import torch
import time

def benchmark(device, matrix_size, num_tests=10):
    """设备性能基准测试"""
    times = []
    for _ in range(num_tests):
        # 数据生成与传输
        x = torch.randn(matrix_size, matrix_size, device=device)
        y = torch.randn(matrix_size, matrix_size, device=device)
        
        # 预热GPU缓存
        if 'cuda' in device:
            torch.cuda.synchronize()
        
        # 核心计算
        start = time.time()
        torch.mm(x, y)
        if 'cuda' in device:
            torch.cuda.synchronize()
        end = time.time()
        times.append(end - start)
    
    return sum(times)/num_tests

# 大矩阵测试（10000x10000）
gpu_large = benchmark('cuda:0', 10000)  # 约0.21秒
cpu_large = benchmark('cpu', 10000)      # 约45.3秒

# 小矩阵测试（10x10）
gpu_small = benchmark('cuda:0', 10)     # 约0.0003秒
cpu_small = benchmark('cpu', 10)         # 约0.0001秒

In [20]:
def record_frobenius(matrix_size=100, num_matrices=1000):
    """逐矩阵记录Frobenius范数"""
    frobenius_norms = []
    
    for _ in range(num_matrices):
        # GPU计算
        a = torch.randn(matrix_size, matrix_size, device='cuda')
        b = torch.randn(matrix_size, matrix_size, device='cuda')
        
        start = time.time()
        c = torch.mm(a, b)
        torch.cuda.synchronize()
        elapsed = time.time() - start
        
        # 范数计算与数据回传
        norm = torch.norm(c).cpu().item()
        frobenius_norms.append( (elapsed, norm) )
    
    return frobenius_norms

# 输出示例：[ (0.0012, 1567.32), (0.0011, 1498.56), ... ]
record_frobenius() 

[(0.007342815399169922, 1022.8128051757812),
 (0.001420736312866211, 1008.9824829101562),
 (0.0010159015655517578, 1024.6712646484375),
 (0.0010018348693847656, 1011.4466552734375),
 (0.0, 997.5259399414062),
 (0.0, 973.1089477539062),
 (0.0, 1012.9213256835938),
 (0.0009970664978027344, 971.174560546875),
 (0.0, 990.505126953125),
 (0.0015058517456054688, 1012.052001953125),
 (0.0, 996.0418090820312),
 (0.0, 986.8997192382812),
 (0.0, 1004.6051025390625),
 (0.0, 981.0098876953125),
 (0.0, 1007.9601440429688),
 (0.0, 1003.186279296875),
 (0.0, 1016.82421875),
 (0.0010218620300292969, 1010.0655517578125),
 (0.0, 996.8704833984375),
 (0.0, 989.4841918945312),
 (0.0, 1020.1539916992188),
 (0.0015091896057128906, 1023.864013671875),
 (0.0, 998.1849975585938),
 (0.008657693862915039, 976.645751953125),
 (0.0010013580322265625, 996.19580078125),
 (0.0025129318237304688, 1011.9094848632812),
 (0.001329660415649414, 992.6055908203125),
 (0.0, 1017.6384887695312),
 (0.0, 985.7005615234375),
 (0