## single_gpu

In [1]:
import torch
# DataLoader batches and iterates a dataset; Dataset is the dataset base type
from torch.utils.data import DataLoader, Dataset
# Functional API (stateless ops such as the cross_entropy loss)
import torch.nn.functional as F
from datautils import MyTrainDataset

In [3]:

class Trainer:
    """Minimal single-device training loop with periodic checkpointing.

    Args:
        model: Network to train; it is moved to ``gpu_id`` on construction.
        train_data: Loader yielding ``(source, target)`` batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        gpu_id: Device passed to ``.to()`` for the model and for every batch.
        save_every: A checkpoint is written after each epoch whose index is a
            multiple of this value (epoch 0 included). ``0`` disables
            checkpointing instead of raising ``ZeroDivisionError``.
        checkpoint_path: File the model ``state_dict`` is written to. The
            same file is overwritten on every save.
    """

    def __init__(
        self,
        model: torch.nn.Module,
        train_data: DataLoader,
        optimizer: torch.optim.Optimizer,
        gpu_id: int,
        save_every: int,
        checkpoint_path: str = "checkpoint.pt",
    ) -> None:
        self.gpu_id = gpu_id
        self.model = model.to(self.gpu_id)
        self.train_data = train_data
        self.optimizer = optimizer
        self.save_every = save_every
        self.checkpoint_path = checkpoint_path

    def _run_batch(self, source, target):
        """Run one optimization step on a batch already placed on the device."""
        self.optimizer.zero_grad()
        output = self.model(source)
        loss = F.cross_entropy(output, target)
        loss.backward()
        self.optimizer.step()

    def _run_epoch(self, epoch):
        """Train on every batch of ``train_data`` once.

        The progress line is printed when the first batch arrives, so the
        batch size is read from the data actually being trained on. Pulling a
        separate probe batch would load data that is thrown away and draw an
        extra shuffle order each epoch. An empty loader prints nothing.
        """
        for step, (source, target) in enumerate(self.train_data):
            if step == 0:
                print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batchsize: {len(source)} | Steps: {len(self.train_data)}")
            source = source.to(self.gpu_id)
            target = target.to(self.gpu_id)
            self._run_batch(source, target)

    def _save_checkpoint(self, epoch):
        """Write the model's ``state_dict`` to ``checkpoint_path``."""
        torch.save(self.model.state_dict(), self.checkpoint_path)
        print(f"Epoch {epoch} | Training checkpoint saved at {self.checkpoint_path}")

    def train(self, epochs: int):
        """Run ``epochs`` epochs, checkpointing every ``save_every`` epochs."""
        for epoch in range(epochs):
            self._run_epoch(epoch)
            # A falsy save_every (0) means "never save"; this also guards the modulo.
            if self.save_every and epoch % self.save_every == 0:
                self._save_checkpoint(epoch)

In [2]:
def load_train_obj():
    """Build the dataset, model and optimizer for a training run.

    Returns:
        A ``(dataset, model, optimizer)`` tuple: a ``MyTrainDataset``
        constructed with the argument 2048, a ``Linear(20, 1)`` model, and an
        SGD optimizer (lr=1e-3) over that model's parameters.
    """
    dataset = MyTrainDataset(2048)
    print("数据加载完成")  # runtime message: "data loading finished"

    net = torch.nn.Linear(in_features=20, out_features=1)
    sgd = torch.optim.SGD(net.parameters(), lr=1e-3)
    print("优化器加载完成")  # runtime message: "optimizer ready"

    return dataset, net, sgd

def prepare_dataLoader(dataset: Dataset, batch_size: int):
    """Wrap ``dataset`` in a shuffling ``DataLoader`` with pinned memory.

    Args:
        dataset: Dataset to draw samples from.
        batch_size: Number of samples per batch.

    Returns:
        The configured ``DataLoader``.
    """
    # pin_memory=True asks the loader to place batches in page-locked host
    # memory, which makes the later host-to-GPU copy faster.
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True,
    )
    return loader

def main(device, total_epochs, save_every, batch_size):
    """Assemble the training objects and run the full training loop.

    Args:
        device: Device forwarded to ``Trainer`` as its ``gpu_id``.
        total_epochs: Number of epochs passed to ``Trainer.train``.
        save_every: Checkpoint interval forwarded to ``Trainer``.
        batch_size: Batch size used when building the data loader.
    """
    dataset, model, optimizer = load_train_obj()
    loader = prepare_dataLoader(dataset, batch_size)
    Trainer(model, loader, optimizer, device, save_every).train(total_epochs)

In [5]:
# Run configuration. Uncomment the main(...) call below to start training.
total_epochs = 10  # number of epochs passed to Trainer.train
save_every = 2  # checkpoint interval, in epochs
batch_size = 1  # samples per batch for the DataLoader
# Left disabled: device=0 is handed to .to(), presumably CUDA device index 0 — confirm a GPU is available.
# main(device=0, total_epochs=total_epochs, save_every=save_every, batch_size=batch_size)