## 1.背景介绍

本实验主要研究 PyTorch 分布式训练框架下的单机多卡（Single Machine Multi-GPU）并行训练方法，重点探索 torch.distributed 和 torch.nn.parallel.DistributedDataParallel (DDP) 的使用。

随着深度学习模型规模的不断增长，单张 GPU 的计算能力往往难以满足高效训练的需求，因此，利用多张 GPU 进行数据并行加速训练成为一种常见的优化方案。

本实验采用 PyTorch 框架，实现一个基于 Transformer 结构的 NLP 模型，并使用 DDP 进行多 GPU 训练，以验证其性能和正确性。

## 2.实验目的
掌握 PyTorch 分布式训练框架的基本概念，包括 torch.distributed、DistributedDataParallel 等核心 API。

实现基于 DDP 的数据并行训练，在单机多 GPU 设备上进行深度学习模型训练，并理解其工作原理。

理解 DistributedSampler 在多 GPU 训练中的作用，确保数据在多个进程间均匀分配。


## 3.硬件要求

2 张 GPU（如 RTX 4090、V100、A100 等）；若没有 GPU，也可以在 CPU 上运行（代码会自动改用 gloo 后端）。


## 4.技术原理

本实验的核心是 PyTorch 的 DistributedDataParallel (DDP)，它用于在多个 GPU 设备之间高效地并行化训练过程。其基本原理如下：

### 数据并行（Data Parallelism）

- 采用单机多 GPU 数据并行策略，每张 GPU 运行一个独立的训练进程（进程可由 torchrun 或 torch.multiprocessing 启动和管理）。

- 训练数据集在多个 GPU 之间划分，每张 GPU 仅处理部分数据，并行计算梯度。

- DistributedSampler 将数据集划分为互不重叠的子集并分配给各个进程，使每张 GPU 在同一个 epoch 内处理不同的数据，从而保证训练效率和正确性（当样本数无法被进程数整除时，默认会补充少量重复样本，以对齐各进程的数据量）。

### DDP 训练流程

每个 GPU 进程独立计算梯度。

PyTorch DDP 通过梯度同步（All-Reduce）机制，在反向传播过程中对所有 GPU 上的梯度求平均并同步，使各进程以相同的梯度更新参数，从而确保模型参数在所有 GPU 上保持一致。

## 5.实验流程

### 环境配置



In [20]:
!pip install torch



### 5.1 Transformer 模型定义

In [11]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import DistributedSampler
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    A table of shape ``(1, max_len, d_model)`` is precomputed once and stored
    as the buffer ``pe`` (registered with ``register_buffer``, so it is not a
    trainable parameter). Even feature columns hold sines and odd feature
    columns hold cosines of the position scaled by a per-column frequency.

    Args:
        d_model: Size of the feature dimension. Both even and odd sizes are
            supported.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even column,
        # so the frequency vector is trimmed to the number of odd columns.
        # For even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Leading singleton dimension lets the table broadcast over the batch.
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Dimension 1 of ``x`` is treated as the position axis and the last
        dimension must match ``d_model`` for the addition to broadcast.
        """
        return x + self.pe[:, :x.size(1)]

class MultiHeadSelfAttention(nn.Module):
    """Scaled dot-product self-attention computed over several heads.

    Queries, keys and values are separate linear projections of the same
    input. Each projection is split into ``num_heads`` slices of width
    ``d_model // num_heads``, attention is evaluated per head, and the heads
    are concatenated and passed through a final linear layer.

    Args:
        d_model: Feature size of the input and output.
        num_heads: Number of attention heads; must divide ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.num_heads = num_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, batch_size):
        """Reshape ``(batch, seq, d_model)`` into ``(batch, heads, seq, d_k)``."""
        per_head = projected.view(batch_size, -1, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Input whose first dimension is the batch and whose last
                dimension is ``d_model``.
            mask: Optional tensor broadcastable to the attention scores.
                Entries equal to 0 mark key positions that must not be
                attended to; every other entry leaves the score untouched.

        Returns:
            A tensor with the same shape as ``x``.
        """
        batch_size = x.size(0)

        queries = self._split_heads(self.q_linear(x), batch_size)
        keys = self._split_heads(self.k_linear(x), batch_size)
        values = self._split_heads(self.v_linear(x), batch_size)

        scores = (queries @ keys.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            # A large negative score drives the softmax weight to ~0.
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = scores.softmax(dim=-1)
        context = weights @ values

        # Move heads back next to the feature axis and flatten them together.
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.num_heads * self.d_k)
        return self.out_linear(merged)

class FeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    Args:
        d_model: Feature size of the input and of the output.
        d_ff: Width of the hidden layer between the two linear maps.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Expand ``x`` to ``d_ff`` features, rectify, and project back."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

class TransformerBlock(nn.Module):
    """One post-norm Transformer layer: self-attention then feed-forward.

    Each sub-layer output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalised. The same dropout
    module is used after both sub-layers.

    Args:
        d_model: Feature size of the layer input and output.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadSelfAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = FeedForward(d_model, d_ff)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the layer on ``x``; ``mask`` is forwarded to the attention."""
        attended = self.dropout(self.attention(x, mask))
        hidden = self.norm1(x + attended)

        transformed = self.dropout(self.ffn(hidden))
        return self.norm2(hidden + transformed)

class Transformer(nn.Module):
    """Token-level language model built from a stack of Transformer blocks.

    Pipeline: token embedding -> positional encoding -> ``num_layers``
    ``TransformerBlock`` layers -> linear projection to vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids; also the size of the output
            logits.
        d_model: Embedding / hidden feature size.
        num_heads: Attention heads per block.
        d_ff: Hidden width of each block's feed-forward sub-layer.
        num_layers: Number of stacked blocks.
        max_len: Number of positions precomputed by the positional encoding.
    """

    def __init__(self, vocab_size, d_model=512, num_heads=8, d_ff=2048, num_layers=6, max_len=5000):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([TransformerBlock(d_model, num_heads, d_ff) for _ in range(num_layers)])
        self.out_linear = nn.Linear(d_model, vocab_size)

    def forward(self, x, mask=None):
        """Compute vocabulary logits for every position of ``x``.

        Args:
            x: Integer token ids indexed as ``(batch, seq_len)``.
            mask: Optional attention mask passed unchanged to every block.
                The attention layer blocks positions where the mask equals 0.
                When omitted, a causal mask is built so that each position
                attends only to itself and earlier positions.

        Returns:
            Logits with a trailing dimension of size ``vocab_size``.
        """
        x = self.embedding(x)
        x = self.pos_encoding(x)
        if mask is None:
            seq_len = x.shape[1]
            # Lower-triangular ones: row i keeps columns 0..i. The attention
            # layer masks entries equal to 0, so the strictly-upper (future)
            # part must be the zero part. Built on x's device so it can be
            # combined with the attention scores on GPU as well as CPU.
            mask = torch.tril(torch.ones(seq_len, seq_len, device=x.device))
        for layer in self.layers:
            x = layer(x, mask)
        return self.out_linear(x)

# Build a small demonstration model and print its module tree.
model = Transformer(
    vocab_size=1000,
    d_model=32,
    num_heads=8,
    d_ff=64,
    num_layers=2,
    max_len=5000,
)

print(model)

Transformer(
  (embedding): Embedding(1000, 32)
  (pos_encoding): PositionalEncoding()
  (layers): ModuleList(
    (0-1): 2 x TransformerBlock(
      (attention): MultiHeadSelfAttention(
        (q_linear): Linear(in_features=32, out_features=32, bias=True)
        (k_linear): Linear(in_features=32, out_features=32, bias=True)
        (v_linear): Linear(in_features=32, out_features=32, bias=True)
        (out_linear): Linear(in_features=32, out_features=32, bias=True)
      )
      (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (ffn): FeedForward(
        (fc1): Linear(in_features=32, out_features=64, bias=True)
        (fc2): Linear(in_features=64, out_features=32, bias=True)
      )
      (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (out_linear): Linear(in_features=32, out_features=1000, bias=True)
)


### 5.2 数据集定义

In [13]:
class NLPDataset(Dataset):
    """Synthetic token dataset in which sample ``i`` is the id ``i`` repeated.

    Args:
        size: Number of samples; sample indices run from 0 to ``size - 1``.
        length: Number of tokens in every sample.
    """

    def __init__(self, size, length):
        # Sample i is a 1-D tensor of `length` copies of the value i.
        self.data = [torch.full((length,), sample_id) for sample_id in range(size)]

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.data[idx]

# Create a tiny demo dataset (12 samples of 10 tokens) and print each sample.
dataset = NLPDataset(size=12, length=10)
for sample_idx in range(len(dataset)):
    print(dataset[sample_idx])

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3])
tensor([4, 4, 4, 4, 4, 4, 4, 4, 4, 4])
tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5])
tensor([6, 6, 6, 6, 6, 6, 6, 6, 6, 6])
tensor([7, 7, 7, 7, 7, 7, 7, 7, 7, 7])
tensor([8, 8, 8, 8, 8, 8, 8, 8, 8, 8])
tensor([9, 9, 9, 9, 9, 9, 9, 9, 9, 9])


### 5.3 训练核心代码实现

In [1]:
def train(rank, world_size):
    """Train the toy Transformer language model with DDP in one worker process.

    Every participating process calls this function with its own ``rank``.
    The function builds a rank-specific shard of the synthetic dataset, joins
    the process group, wraps the model in ``DistributedDataParallel`` and runs
    next-token training with SGD. Only rank 0 prints the loss.

    Args:
        rank: Index of this process; also used as the CUDA device index when
            CUDA is available.
        world_size: Total number of participating processes.

    Note:
        ``init_process_group`` is called without ``init_method``, so the
        rendezvous settings must come from the launcher's environment
        (presumably MASTER_ADDR / MASTER_PORT as set by torchrun -- confirm
        against the launch script).
    """
    VOCAB_SIZE = 100
    D_MODEL = 12
    NUM_HEADS = 4
    D_FF = 24
    NUM_LAYERS = 2
    MAX_LEN = 100

    DATASET_SIZE = 12
    DATASET_LENGTH = 10
    BATCH_SIZE = 2

    dataset = NLPDataset(size=DATASET_SIZE, length=DATASET_LENGTH)
    # The sampler hands each rank its own subset of indices, so shuffling is
    # done by the sampler rather than by the DataLoader.
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, sampler=sampler)

    use_cuda = torch.cuda.is_available()
    backend = "nccl" if use_cuda else "gloo"
    device = torch.device(f"cuda:{rank}" if use_cuda else "cpu")
    if use_cuda:
        # Bind this process to its own GPU before any collective runs so that
        # NCCL and default CUDA allocations do not all land on device 0.
        torch.cuda.set_device(device)

    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
    try:
        model = Transformer(
            vocab_size=VOCAB_SIZE,
            d_model=D_MODEL,
            num_heads=NUM_HEADS,
            d_ff=D_FF,
            num_layers=NUM_LAYERS,
            max_len=MAX_LEN,
        ).to(device)
        model = DDP(model, device_ids=[rank] if use_cuda else None)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(model.parameters(), lr=0.01)

        for epoch in range(200):
            # Without this call the sampler reuses the same shuffled order in
            # every epoch.
            sampler.set_epoch(epoch)
            for batch, data in enumerate(dataloader):
                # Next-token objective: inputs are tokens [0, L-1), targets
                # are the same sequence shifted left by one position.
                label = data[:, 1:].to(device)
                data = data[:, :-1].to(device)
                optimizer.zero_grad()
                outputs = model(data)
                loss = criterion(outputs.view(-1, VOCAB_SIZE), label.view(-1))
                # DDP synchronises gradients across ranks during backward.
                loss.backward()
                optimizer.step()

                if rank == 0:
                    print(f"Epoch {epoch}, Batch {batch}, Loss: {loss.item()}")
    finally:
        # Always leave the process group, even if training raised, so the
        # process does not exit with the group still initialised.
        dist.destroy_process_group()

### 5.4 启动训练

In [6]:
!bash run.sh

W0325 14:29:34.541000 218441 site-packages/torch/distributed/run.py:793] 
W0325 14:29:34.541000 218441 site-packages/torch/distributed/run.py:793] *****************************************
W0325 14:29:34.541000 218441 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0325 14:29:34.541000 218441 site-packages/torch/distributed/run.py:793] *****************************************
Epoch 0, Batch 0, Loss: 4.67319917678833
Epoch 0, Batch 1, Loss: 4.784787654876709
Epoch 0, Batch 2, Loss: 4.949789524078369
Epoch 1, Batch 0, Loss: 4.614452838897705
Epoch 1, Batch 1, Loss: 4.766570568084717
Epoch 1, Batch 2, Loss: 4.924740314483643
Epoch 2, Batch 0, Loss: 4.537784099578857
Epoch 2, Batch 1, Loss: 4.693002700805664
Epoch 2, Batch 2, Loss: 4.897914886474609
Epoch 3, Batch 0, Loss: 4.5058460235