In [None]:
from collections.abc import Callable, Iterable
from typing import Optional
import torch
import math

class SGD(torch.optim.Optimizer):
    """Gradient descent whose step size decays as ``lr / sqrt(t + 1)``.

    ``t`` is a per-parameter update counter stored in ``self.state[p]["t"]``;
    it is treated as 0 until the parameter has been updated once.

    Args:
        params: Parameters (or parameter groups) to optimise; forwarded
            unchanged to ``torch.optim.Optimizer``.
        lr: Base learning rate. Must not be negative.

    Raises:
        ValueError: If ``lr`` is negative.
    """

    def __init__(self, params, lr=1e-3):
        if lr < 0:
            raise ValueError(f"Invalid learning rate: {lr}")
        # The base class keeps the parameters together with this dict of
        # per-group defaults (further custom entries could be added to it).
        super().__init__(params, {"lr": lr})

    def step(self, closure: Optional[Callable] = None):
        """Apply one update to every parameter that has a gradient.

        Args:
            closure: Optional callable; when given it is invoked once before
                the update and its return value is returned.

        Returns:
            The closure's return value, or ``None`` when no closure is given.
        """
        result = closure() if closure is not None else None
        for param_group in self.param_groups:
            base_lr = param_group["lr"]
            for param in param_group["params"]:
                if param.grad is not None:
                    param_state = self.state[param]
                    # Number of updates already applied to this parameter.
                    step_count = param_state.get("t", 0)
                    decayed_lr = base_lr / math.sqrt(step_count + 1)
                    # In-place update of the underlying weight tensor.
                    param.data -= decayed_lr * param.grad.data
                    param_state["t"] = step_count + 1
        return result


In [242]:
# Run 10 SGD updates on the mean squared entry of a random 10x10 matrix,
# printing the loss measured before each update.
weights = torch.nn.Parameter(torch.randn(10, 10) * 5)
opt = SGD([weights], lr=200)
for t in range(10):
    opt.zero_grad()  # Clear gradients left over from the previous iteration.
    loss = weights.pow(2).mean()  # Scalar objective.
    print(loss.item())
    loss.backward()  # Backward pass: fills in weights.grad.
    opt.step()  # Apply one optimizer update.

28.92365264892578
260.3128662109375
870.2637329101562
1492.09423828125
1492.0941162109375
928.5170288085938
372.0384826660156
97.47351837158203
16.72380828857422
1.8582000732421875


In [None]:
# class SGD(torch.optim.Optimizer):
#     def __init__(self, params, lr=1e-3):
#         if lr < 0:
#             raise ValueError(f"Invalid learning rate: {lr}")
#         # 模型内部存储：学习率
#         defaults = {"lr": lr}
#         # 父类中存储：各个参数，学习率（还可以传入其他自己想要传入的内容，以字典形式）
#         super().__init__(params, defaults)
        
#     def step(self, closure: Optional[Callable] = None): # 
#         loss = None if closure is None else closure()
#         for group in self.param_groups:
#             lr = group["lr"] # Get the learning rate.
#             for p in group["params"]:
#                 if p.grad is None:
#                     continue
#                 state = self.state[p] # Get state associated with p.
#                 t = state.get("t", 0) # Get iteration number from the state, or initial value.
#                 grad = p.grad.data # Get the gradient of loss with respect to p.
#                 p.data -= lr / math.sqrt(t + 1) * grad # Update weight tensor in-place.
#                 state["t"] = t + 1 # Increment iteration number.
#         return loss

from typing import Any, Dict, Tuple
from torch import Tensor

class AdamW(torch.optim.Optimizer):
    def __init__(self, 
                 params: Iterable[Tensor] | Iterable[Dict[str, Any]] | Iterable[Tuple[str, Tensor]], 
                 lr: float = 1e-3,
                 beta1:float = 0.9,
                 beta2:float = 0.95,
                 eps:float = 1e-8,
                 weight_decay:float = 0.1
                 ) -> None:
        if lr < 0 or weight_decay < 0 or eps < 0:
            raise ValueError(f"these params should be positive, \
                             now lr: {lr}, weight decay:{weight_decay},\
                             eps: {eps}")
        if (beta1 >= 1 or beta1 <= 0) or (beta2 >= 1 or beta2 <= 0):
            raise ValueError(f"beta1 and beta2 should be in (0,1), now: {beta1}, {beta2}")
        defaults = {
            "lr":lr,
            "beta1":beta1,
            "beta2":beta2,
            "eps":eps,
            "weight_decay":weight_decay
        }
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure: Optional[Callable] = None): 
        loss = None
        if closure is not None:
            with torch.enable_grad():
                closure()
        for group in self.param_groups:
            lr = group["lr"]
            beta1 = group["beta1"]
            beta2 = group["beta2"]
            eps = group["eps"]
            weight_decay = group["weight_decay"]
            for p in group["params"]:
                if p.grad is None:
                    continue
                g = p.grad  # 获得梯度
                state = self.state[p] # 获得其他信息
                # self.state是一个字典
                # 其中每个kv对是 参数-参数信息，参数信息也是一个字典
                # 参数信息可以随自己心意存储，例如"t"存储时间步

                #最初：m和v为0，t没有
                # 当伪代码中t=1的循环时，做的事：
                # 0. 计算梯度
                # 1. 更新m（用0作为初值）；更新v（用0作为初值）
                # 2. 用当前的t=1来计算当前的学习率
                # 3. 用当前学习率、当前m和当前v更新参数
                # 4. 用权重衰减更新参数
                # 5. t自增1

                t = state.get("t", 1)

                m = state.get("first_moment",torch.zeros_like(p)) # 第一次循环时先置0再更新，后续循环都是获取上一轮值再更新
                m = beta1 * m + (1 - beta1) * g
                v = state.get("second_moment",torch.zeros_like(p))
                v = beta2 * v + (1 - beta2) * (g ** 2)
                lr_t = lr * math.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)

                p -= lr_t * m / (torch.sqrt(v) + eps)
                p -= lr * weight_decay * p

                state[t] = t + 1
        return loss


NameError: name 'torch' is not defined

In [1]:
import torch
# Lay the integers 0..11 out as a 3x4 tensor and display it.
x = torch.arange(0, 12).reshape((3, 4))
x

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])

In [10]:
# Rebind x to its floating-point version, then show the norm taken over all
# of its elements.
x = x.float()
x.norm()

tensor(22.4944)

In [5]:
# Element-wise square of x.
x.pow(2)

tensor([[  0,   1,   4,   9],
        [ 16,  25,  36,  49],
        [ 64,  81, 100, 121]])

In [7]:
# Element-wise x^2 / (x + 1).
x.pow(2) / (x + 1)

tensor([[ 0.0000,  0.5000,  1.3333,  2.2500],
        [ 3.2000,  4.1667,  5.1429,  6.1250],
        [ 7.1111,  8.1000,  9.0909, 10.0833]])