### 自定义 Dataset

In [4]:
import torch
from torch.utils.data import Dataset

# 自定义数据集类
class MyDataset(Dataset):
    """Map-style dataset that pairs each input sample with its target label.

    Args:
        X: Sized, indexable collection of input features (for example a
            nested list or a tensor).
        Y: Sized, indexable collection of target labels; must have the same
            number of entries as ``X``.

    Raises:
        ValueError: If ``X`` and ``Y`` do not have the same length.
    """

    def __init__(self, X, Y):
        # Fail fast: a length mismatch would otherwise only surface as an
        # IndexError in the middle of iteration (or go unnoticed entirely
        # when Y is the longer of the two).
        if len(X) != len(Y):
            raise ValueError(
                f"X and Y must have the same length, got {len(X)} and {len(Y)}"
            )
        self.X = X
        self.Y = Y

    def __len__(self):
        """Return the number of samples, taken from ``X``."""
        return len(self.X)

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a new tensor that does not alias the stored data."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning;
            # detach().clone() is the recommended equivalent.
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at ``idx`` as tensors."""
        return self._to_tensor(self.X[idx]), self._to_tensor(self.Y[idx])

# Toy data: four two-feature samples and one integer label per sample.
X = [[1, 2], [3, 4], [5, 6], [7, 8]]
Y = [1, 0, 4, 5]
dataset = MyDataset(X, Y)

# Print the dataset size, then the (features, label) pairs at indices 1 to 3,
# one per line.
print(len(dataset))
print(*(dataset[position] for position in (1, 2, 3)), sep="\n")

4
(tensor([3, 4]), tensor(0))
(tensor([5, 6]), tensor(4))
(tensor([7, 8]), tensor(5))


### 使用 DataLoader 加载数据

In [1]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class DataSetTest(Dataset):
    """Minimal map-style dataset over two parallel indexable collections."""

    def __init__(self, data, labels):
        """Store the samples in ``data`` and their matching ``labels``."""
        self.data, self.labels = data, labels

    def __len__(self):
        """Return the number of samples, taken from ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at ``idx``."""
        return self.data[idx], self.labels[idx]

# 100 samples with 5 features each, drawn by torch.randn.
data = torch.randn(100, 5)
# 100 integer labels, each either 0 or 1. The `labes` spelling is kept as-is
# because other notebook cells may refer to this name.
labes = torch.randint(0, 2, (100,))

dataset = DataSetTest(data, labes)

# f-strings interpolate the values; passing them as extra print() arguments
# left the literal "{}" placeholder in the output.
print(f"数据集大小 {len(dataset)}")
print(f"第 0 个 样本 {dataset[0]}")
print(f"第 1 个 样本 {dataset[1]}")

数据集大小 {} 100
第 0 个 样本 {} (tensor([ 1.3002, -0.1806, -0.4974, -0.7626,  0.8662]), tensor(0))
第 1 个岩本样本 {} (tensor([ 0.2172, -0.2023,  1.5252,  0.5274, -0.0121]), tensor(0))


DataLoader 是 PyTorch 提供的数据加载器，用于批量加载数据集，主要功能包括：
1. 批量加载：通过设置 batch_size。
2. 数据打乱：通过设置 shuffle=True。
3. 多进程加速：通过设置 num_workers（用于加载数据的子进程数量）。
4. 迭代访问：方便地按批次访问数据。

In [2]:
# Serve the dataset in shuffled batches of 10 samples.
dataloader = DataLoader(dataset, shuffle=True, batch_size=10)

# Pairing the loader with range(3) ends the loop once three batches are shown.
for batch_idx, (batch_data, batch_labels) in zip(range(3), dataloader):
    print(f"批次 {batch_idx + 1}")
    print("数据:", batch_data)
    print("标签:", batch_labels)

批次 1
数据: tensor([[ 0.4088, -0.0193, -2.5040, -0.7485, -0.1598],
        [ 0.5485, -0.7560, -0.1211,  1.3577,  0.6904],
        [-0.7922, -0.1496, -0.2619,  1.2570, -1.0274],
        [ 1.3522,  0.2476,  1.2301,  0.0330,  2.6778],
        [-0.1759,  1.2569, -0.0472,  0.8821, -1.2578],
        [ 1.3820, -0.2695,  0.9457, -1.0306, -0.2549],
        [-0.8625, -0.5142,  1.1555, -1.2188, -0.4447],
        [ 0.1385,  0.2428,  0.4461, -0.5696,  0.5841],
        [ 0.4559,  0.1202, -0.3654,  0.9894, -1.1270],
        [ 0.7394,  0.3705,  2.1455,  1.4369,  1.6942]])
标签: tensor([0, 0, 0, 1, 0, 0, 1, 0, 0, 1])
批次 2
数据: tensor([[ 0.4967, -0.4638,  0.6870,  0.0507,  0.1016],
        [ 0.3237, -1.9158,  0.6137,  1.3116, -0.8802],
        [ 0.0528, -0.6937,  0.5170, -1.1847, -0.3016],
        [ 1.4723,  0.5431, -0.0420, -1.1302, -1.5780],
        [ 0.2952,  1.8302,  0.3652, -0.6943, -1.4074],
        [ 0.5288,  1.5299,  0.2080,  0.7779,  0.9117],
        [ 0.4510, -1.3475,  1.7125, -0.6555,  1.5589],
   

In [3]:
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# Preprocessing pipeline: convert each image to a tensor, then normalize it
# with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

# MNIST training split, stored under ./data (download=True is requested).
train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

# Wrap the training set in a loader yielding shuffled batches of 32 samples.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)

# Fetch a single batch and inspect it.
data_iter = iter(train_loader)
images, labels = next(data_iter)
print(f"批次图像大小: {images.shape}")  # shape is [batch_size, 1, 28, 28]
print(f"批次标签: {labels}")

100%|██████████| 9.91M/9.91M [00:09<00:00, 1.08MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 164kB/s]
100%|██████████| 1.65M/1.65M [00:01<00:00, 836kB/s] 
100%|██████████| 4.54k/4.54k [00:00<00:00, 1.85MB/s]

批次图像大小: torch.Size([32, 1, 28, 28])
批次标签: tensor([2, 3, 8, 6, 0, 9, 3, 4, 5, 1, 0, 6, 7, 6, 9, 6, 5, 9, 6, 3, 6, 0, 6, 4,
        8, 5, 7, 7, 8, 2, 6, 8])



