In [112]:
import numpy.typing as npt
import numpy as np
import torch
from typing import BinaryIO, IO
import os

In [113]:
def run_get_batch(
    dataset: npt.NDArray, batch_size: int, context_length: int, device: str
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Sample a batch of language-modeling inputs together with their next-token labels.

    This definition is only a placeholder that states the intended contract; calling
    it always raises NotImplementedError.

    Args:
        dataset (np.array): 1D numpy array holding the integer token IDs of the dataset.
        batch_size (int): Number of examples to sample.
        context_length (int): Number of tokens in each sampled example.
        device (str): PyTorch device string (e.g., 'cpu' or 'cuda:0') on which the
            sampled inputs and labels are to be placed.

    Returns:
        A pair of torch.LongTensors, each of shape (batch_size, context_length): first the
        sampled input sequences, then the corresponding language modeling labels.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()

In [114]:
# Toy configuration for trying out the batch-sampling logic by hand.
dataset: npt.NDArray = np.arange(10)  # token IDs 0..9
data_len = dataset.shape[0]
batch_size, context_length = 10, 3
device = "cpu"

In [115]:
np.random.seed(42)
# Start indices are drawn from 0 up to data_len - context_length - 1 (`high` is exclusive).
# The last possible input window therefore ends at index data_len - 2,
# and the label for that final position sits at index data_len - 1,
# which is exactly the last valid index of the dataset.
start_pos = np.random.randint(low=0, high=data_len - context_length, size=batch_size)
start_pos

array([6, 3, 4, 6, 2, 4, 4, 6, 1, 2])

In [116]:
# Each input is a window of context_length tokens; its labels are the same window
# shifted one token to the right.
seqs = [dataset[slice(begin, begin + context_length)] for begin in start_pos]
labels = [dataset[slice(begin + 1, begin + context_length + 1)] for begin in start_pos]

In [117]:
# Show the sampled input windows on the first line and their labels on the second.
print(seqs, labels, sep="\n")

[array([6, 7, 8]), array([3, 4, 5]), array([4, 5, 6]), array([6, 7, 8]), array([2, 3, 4]), array([4, 5, 6]), array([4, 5, 6]), array([6, 7, 8]), array([1, 2, 3]), array([2, 3, 4])]
[array([7, 8, 9]), array([4, 5, 6]), array([5, 6, 7]), array([7, 8, 9]), array([3, 4, 5]), array([5, 6, 7]), array([5, 6, 7]), array([7, 8, 9]), array([2, 3, 4]), array([3, 4, 5])]


In [118]:
# Stack the per-example windows along a new leading (batch) axis.
seqs_np = np.stack(seqs, axis=0)

In [119]:
# Wrap the numpy batch as a tensor, then place it on the requested device.
seqs_tensor = torch.from_numpy(seqs_np).to(device)
seqs_tensor.device

device(type='cpu')

In [120]:
def run_get_batch(
    dataset: npt.NDArray, batch_size: int, context_length: int, device: str
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Sample a random batch of language-modeling inputs and next-token labels.

    Start positions are drawn uniformly (with replacement) from NumPy's global
    random state, so callers who need reproducibility should call
    ``np.random.seed(...)`` themselves before sampling; this function does not
    reseed, which would make every batch identical.

    Args:
        dataset (np.array): 1D numpy array of integer token IDs in the dataset.
        batch_size (int): Desired batch size to sample.
        context_length (int): Desired context length of each sampled example.
        device (str): PyTorch device string (e.g., 'cpu' or 'cuda:0') indicating the device
            to place the sampled input sequences and labels on.

    Returns:
        Tuple of torch.LongTensors of shape (batch_size, context_length). The first item
        is the sampled input sequences, and the second item is the same windows shifted
        right by one token (the language modeling labels).

    Raises:
        ValueError: If the dataset has fewer than context_length + 1 tokens, i.e. there
            is no window that still leaves room for its final label.
    """
    # Use the length of the dataset actually passed in, not a notebook global.
    num_tokens = len(dataset)
    if num_tokens <= context_length:
        raise ValueError(
            f"dataset needs at least context_length + 1 = {context_length + 1} tokens, "
            f"got {num_tokens}"
        )

    # `high` is exclusive, so the largest start index is num_tokens - context_length - 1.
    # The last input window then ends at index num_tokens - 2 and its final label
    # sits at index num_tokens - 1, the last valid index of the dataset.
    start_pos = np.random.randint(0, num_tokens - context_length, size=batch_size)

    # (batch_size, context_length) index grid: row i is start_pos[i] + [0, 1, ..., context_length - 1].
    input_idx = start_pos[:, None] + np.arange(context_length)

    # Gather both windows in one vectorized step and force int64 so the result is a
    # LongTensor regardless of the dataset's storage dtype (e.g. int32 or uint16).
    seqs_np = np.asarray(dataset[input_idx], dtype=np.int64)
    labels_np = np.asarray(dataset[input_idx + 1], dtype=np.int64)

    seqs_tensor = torch.from_numpy(seqs_np).to(device)
    labels_tensor = torch.from_numpy(labels_np).to(device)
    return seqs_tensor, labels_tensor

In [None]:
from cs336_basics.mymodule import Linear
from cs336_basics.myoptimizer import MyAdamW
# Throwaway model and optimizer instances, presumably for trying out checkpoint
# saving — TODO confirm.
# NOTE(review): MyAdamW() is called with no arguments. If MyAdamW subclasses
# torch.optim.Optimizer it would normally need the parameters to optimize
# (e.g. test_module.parameters()); confirm its constructor accepts zero arguments.
test_module = Linear(10,20)
test_opt = MyAdamW()

Linear()

In [None]:
def my_save_checkpoint(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    iteration: int,
    out: str | os.PathLike | BinaryIO | IO[bytes],
):
    """
    Given a model, optimizer, and an iteration number, serialize them to disk.

    Args:
        model (torch.nn.Module): Serialize the state of this model.
        optimizer (torch.optim.Optimizer): Serialize the state of this optimizer.
        iteration (int): Serialize this value, which represents the number of training iterations
            we've completed.
        out (str | os.PathLike | BinaryIO | IO[bytes]): Path or file-like object to serialize the model, optimizer, and iteration to.
    """
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'iteration': iteration
    }, out)