# 开始训练
- 首先需要加载数据集（从本地加载与从 huggingface 加载的方式不一样!）

In [7]:
from diffusers import DiffusionPipeline
import torch

# 检查是否可以使用 MPS（Apple 的 GPU 加速）
device = 'mps' if torch.backends.mps.is_available() else 'cpu'

print("检查 Mac mps 是否可用～～～")
print(torch.backends.mps.is_available())  # 检查 MPS 是否可用
print(torch.backends.mps.is_built())      # 检查 PyTorch 是否构建了 MPS 支持


# 从预训练模型加载 DiffusionPipeline
pipeline = DiffusionPipeline.from_pretrained(
    'model/diffsion_from_scratch.params', safety_checker=None
)


# 【工具类】获取调度器和分词器
scheduler = pipeline.scheduler # scheduler 是往图片中添加噪声的方法
tokenizer = pipeline.tokenizer

# 释放 pipeline 资源
del pipeline

# 打印设备、调度器和分词器
device, scheduler, tokenizer


Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


检查 Mac mps 是否可用～～～
True
True


  return torch.load(checkpoint_file, map_location="cpu")
The config attributes {'scaling_factor': 0.18215} were passed to AutoencoderKL, but are not expected and will be ignored. Please verify your config.json configuration file.
You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


('mps',
 PNDMScheduler {
   "_class_name": "PNDMScheduler",
   "_diffusers_version": "0.12.1",
   "beta_end": 0.012,
   "beta_schedule": "scaled_linear",
   "beta_start": 0.00085,
   "clip_sample": false,
   "num_train_timesteps": 1000,
   "prediction_type": "epsilon",
   "set_alpha_to_one": false,
   "skip_prk_steps": true,
   "steps_offset": 1,
   "trained_betas": null
 },
 CLIPTokenizer(name_or_path='model/diffsion_from_scratch.params/tokenizer', vocab_size=49408, model_max_length=77, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
 	49406: AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
 	49407: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
 })

In [2]:
from datasets import load_dataset
import torchvision
import pandas as pd # 用于查看数据集
from PIL import Image
import io  # 确保导入 io 模块



# 加载宝可梦 parquet 数据集
# dataset = load_dataset(path='Datasets/lansinuotediffsion_from_scratch', split='train')
data_path = 'Datasets/lansinuotediffsion_from_scratch/train-00000-of-00001-4f5339e7acda17d8.parquet'
dataset = load_dataset('parquet', data_files=data_path, split='train')
df = pd.read_parquet(data_path)

# 查看前几行数据
print(df.head())

# 查看数据集的基本信息
print("👀", df.info())


# 图像增强模块
compose = torchvision.transforms.Compose([
    # 将图像调整为 512x512 像素，使用双线性插值
    torchvision.transforms.Resize(
        512, interpolation=torchvision.transforms.InterpolationMode.BILINEAR),
    # 从图像中心裁剪出 512x512 像素的部分
    torchvision.transforms.CenterCrop(512),
    # torchvison.transforms.RandomHorizontalFlip(),  # 可选：随机水平翻转图像
    # 将图像转换为张量格式
    torchvision.transforms.ToTensor(),
    # 对张量进行标准化，均值和标准差均为 0.5
    torchvision.transforms.Normalize([0.5], [0.5]),
])

# # 分别对数据集中的图片和文本进行编码
# def f(data):
#     # 应用图像增强
#     pixel_values = [compose(i) for i in data['image']]

#     # 文字编码
#     input_ids = tokenizer.batch_encode_plus(data['text'],
#                                             padding='max_length',
#                                             truncation=True,
#                                             max_length=77).input_ids

#     return {'pixel_values': pixel_values, 'input_ids': input_ids}

# 分别对数据集中的图片和文本进行编码
def f(data):
    # 🔥🔥 从字典中提取字节数据并转换为图像对象 => 从本地 dataset 加载才需要
    images = [Image.open(io.BytesIO(image_data['bytes'])) for image_data in data['image']]
    pixel_values = [compose(image) for image in images]

    # 文字编码
    input_ids = tokenizer.batch_encode_plus(data['text'],
                                            padding='max_length',
                                            truncation=True,
                                            max_length=77).input_ids

    return {'pixel_values': pixel_values, 'input_ids': input_ids}


# 对数据集应用编码函数 f
dataset = dataset.map(f,
                      batched=True,   # 启用批处理
                      batch_size=100, # 每次处理 100 个样本
                      num_proc=1,     # 使用 1 个进程
                      remove_columns=['image', 'text'])  # 移除原始的图像和文本列

# 设置数据集格式为 PyTorch
dataset.set_format(type='torch')

# 返回处理后的数据集和第一个样本
dataset, dataset[0]


                                               image  \
0  {'bytes': b'\xff\xd8\xff\xe1#\rExif\x00\x00MM\...   
1  {'bytes': b'\xff\xd8\xff\xe1#\x98Exif\x00\x00M...   
2  {'bytes': b'\xff\xd8\xff\xe1\x18\x82Exif\x00\x...   
3  {'bytes': b'\xff\xd8\xff\xe1\x1a\x06Exif\x00\x...   
4  {'bytes': b'\xff\xd8\xff\xe1\x1d\xe3Exif\x00\x...   

                                                text  
0         a drawing of a green pokemon with red eyes  
1             a green and yellow toy with a red nose  
2  a red and white ball with an angry look on its...  
3           a cartoon ball with a smile on it's face  
4          a bunch of balls with faces drawn on them  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 833 entries, 0 to 832
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   833 non-null    object
 1   text    833 non-null    object
dtypes: object(2)
memory usage: 13.1+ KB
👀 None


(Dataset({
     features: ['pixel_values', 'input_ids'],
     num_rows: 833
 }),
 {'pixel_values': tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
           [1., 1., 1.,  ..., 1., 1., 1.],
           [1., 1., 1.,  ..., 1., 1., 1.],
           ...,
           [1., 1., 1.,  ..., 1., 1., 1.],
           [1., 1., 1.,  ..., 1., 1., 1.],
           [1., 1., 1.,  ..., 1., 1., 1.]],
  
          [[1., 1., 1.,  ..., 1., 1., 1.],
           [1., 1., 1.,  ..., 1., 1., 1.],
           [1., 1., 1.,  ..., 1., 1., 1.],
           ...,
           [1., 1., 1.,  ..., 1., 1., 1.],
           [1., 1., 1.,  ..., 1., 1., 1.],
           [1., 1., 1.,  ..., 1., 1., 1.]],
  
          [[1., 1., 1.,  ..., 1., 1., 1.],
           [1., 1., 1.,  ..., 1., 1., 1.],
           [1., 1., 1.,  ..., 1., 1., 1.],
           ...,
           [1., 1., 1.,  ..., 1., 1., 1.],
           [1., 1., 1.,  ..., 1., 1., 1.],
           [1., 1., 1.,  ..., 1., 1., 1.]]]),
  'input_ids': tensor([49406,   320,  3610,   539,   320,  1901,  9528

In [3]:
# 定义loader - 批量加载数据
# 将数据集分成更小的批次（batch），每次处理一批数据。这样，模型在每次迭代时只需处理一部分数据，减少内存占用，并且可以更快地进行训练
def collate_fn(data):
    pixel_values = [i['pixel_values'] for i in data]
    input_ids = [i['input_ids'] for i in data]

    pixel_values = torch.stack(pixel_values).to(device)
    input_ids = torch.stack(input_ids).to(device)

    return {'pixel_values': pixel_values, 'input_ids': input_ids}


loader = torch.utils.data.DataLoader(dataset,
                                     shuffle=True,
                                     collate_fn=collate_fn,
                                     batch_size=1)

len(loader), next(iter(loader))

(833,
 {'pixel_values': tensor([[[[1., 1., 1.,  ..., 1., 1., 1.],
            [1., 1., 1.,  ..., 1., 1., 1.],
            [1., 1., 1.,  ..., 1., 1., 1.],
            ...,
            [1., 1., 1.,  ..., 1., 1., 1.],
            [1., 1., 1.,  ..., 1., 1., 1.],
            [1., 1., 1.,  ..., 1., 1., 1.]],
  
           [[1., 1., 1.,  ..., 1., 1., 1.],
            [1., 1., 1.,  ..., 1., 1., 1.],
            [1., 1., 1.,  ..., 1., 1., 1.],
            ...,
            [1., 1., 1.,  ..., 1., 1., 1.],
            [1., 1., 1.,  ..., 1., 1., 1.],
            [1., 1., 1.,  ..., 1., 1., 1.]],
  
           [[1., 1., 1.,  ..., 1., 1., 1.],
            [1., 1., 1.,  ..., 1., 1., 1.],
            [1., 1., 1.,  ..., 1., 1., 1.],
            ...,
            [1., 1., 1.,  ..., 1., 1., 1.],
            [1., 1., 1.,  ..., 1., 1., 1.],
            [1., 1., 1.,  ..., 1., 1., 1.]]]], device='mps:0'),
  'input_ids': tensor([[49406,   320,  7651,  6575,   593,   902,  4932,  3184,  1488, 49407,
           49

In [6]:
# jupiter 加载模型的命令, 加载之前生成的几个模型
%run 1.encoder.ipynb
%run 2.vae.ipynb
%run 3.unet.ipynb

# 准备训练 (只训练 U-Net, 固定 VAE 和 Encoder, 因此下面就冻结其他两个模型的参数)
# 冻结编码器模型的所有参数，使其在训练时不更新
encoder.requires_grad_(False)
# 冻结 VAE 模型的所有参数，使其在训练时不更新
vae.requires_grad_(False)
# 使 U-Net 模型的参数可训练，即在训练时更新
unet.requires_grad_(True)

# 设置编码器和 VAE 为评估模式 (即冻结它们的行为，防止 BatchNorm 和 Dropout 等操作影响)
encoder.eval()
vae.eval()
# 设置 U-Net 为训练模式（启用 BatchNorm 和 Dropout 等训练时特有的操作）
unet.train()

# 将模型移到指定的设备（CPU 或 GPU）
encoder.to(device)
vae.to(device)
unet.to(device)

# 设置优化器，只优化 U-Net 的参数
# 使用 AdamW 优化器，学习率为 1e-5，beta1 和 beta2 是动量项的衰减率，weight_decay 是权重衰减系数，eps 是为数值稳定性添加的小数值
optimizer = torch.optim.AdamW(unet.parameters(),
                              lr=1e-5,
                              betas=(0.9, 0.999),
                              weight_decay=0.01,
                              eps=1e-8)

# 使用均方误差 (MSE) 作为损失函数，用于衡量模型预测与目标之间的差异
criterion = torch.nn.MSELoss()

# 输出优化器和损失函数的配置
optimizer, criterion

['.DS_Store', 'diffsion_from_scratch.params', 'diffsion_from_scratch.unet']
✅ Model loaded successfully!
tensor([[[-0.3488,  0.0139, -0.0409,  ..., -0.4707, -0.2910,  0.0627],
         [ 0.6009, -0.4915,  1.0705,  ...,  0.0032,  0.5970, -0.4605],
         [ 0.5848, -1.8402,  0.6390,  ...,  0.3736,  0.1611,  1.0529],
         ...,
         [ 0.7383, -0.1099,  1.2613,  ...,  0.2626, -0.2641,  0.3401],
         [ 1.1845, -0.1865,  1.5217,  ...,  0.2758,  0.1133,  0.1809],
         [ 0.9668, -0.5271,  1.4090,  ..., -0.0710,  0.1474, -0.2603]]],
       grad_fn=<NativeLayerNormBackward0>)
tensor([[[-0.3488,  0.0139, -0.0409,  ..., -0.4707, -0.2910,  0.0627],
         [ 0.6009, -0.4915,  1.0705,  ...,  0.0032,  0.5970, -0.4605],
         [ 0.5848, -1.8402,  0.6390,  ...,  0.3736,  0.1611,  1.0529],
         ...,
         [ 0.7383, -0.1099,  1.2613,  ...,  0.2626, -0.2641,  0.3401],
         [ 1.1845, -0.1865,  1.5217,  ...,  0.2758,  0.1133,  0.1809],
         [ 0.9668, -0.5271,  1.4090,  ...

Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
The config attributes {'scaling_factor': 0.18215} were passed to AutoencoderKL, but are not expected and will be ignored. Please verify your config.json configuration file.
  return torch.load(checkpoint_file, map_location="cpu")


['T_destination', '__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_apply', '_attention_op', '_backward_hooks', '_backward_pre_hooks', '_buffers', '_call_impl', '_compiled_call_impl', '_forward_hooks', '_forward_hooks_always_called', '_forward_hooks_with_kwargs', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_get_backward_hooks', '_get_backward_pre_hooks', '_get_name', '_is_full_backward_hook', '_load_from_state_dict', '_load_state_dict_post_hooks', '_load_state_dict_pre_hooks', '_maybe_warn_non_full_backward_hook', '_modules', '_named_members', '_non_persistent_buffers_set', '_parameters', '_register_load_s

Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


(AdamW (
 Parameter Group 0
     amsgrad: False
     betas: (0.9, 0.999)
     capturable: False
     differentiable: False
     eps: 1e-08
     foreach: None
     fused: None
     lr: 1e-05
     maximize: False
     weight_decay: 0.01
 ),
 MSELoss())

In [9]:
# 如果模型预测的结果和真实答案非常接近，loss 就会很小；如果预测结果和真实答案相差很远，loss 就会很大
def get_loss(data): # 入参是一批【图片数据】和【文本数据】
    with torch.no_grad():
        #文字编码
        #[1, 77] -> [1, 77, 768]
        out_encoder = encoder(data['input_ids'])

        #抽取图像特征图
        #[1, 3, 512, 512] -> [1, 4, 64, 64]
        out_vae = vae.encoder(data['pixel_values'])
        out_vae = vae.sample(out_vae)

        #0.18215 = vae.config.scaling_factor
        out_vae = out_vae * 0.18215

    # 随机数,unet的计算目标
    noise = torch.randn_like(out_vae)


    # 往特征图中添加噪声
    #1000 = scheduler.num_train_timesteps
    #1 = batch size
    # 添加随机噪声
    noise_step = torch.randint(0, 1000, (1, )).long().to(device)
    out_vae_noise = scheduler.add_noise(out_vae, noise, noise_step)

    # 根据文字信息,把特征图中的噪声计算出来
    out_unet = unet(out_vae=out_vae_noise,
                    out_encoder=out_encoder,
                    time=noise_step)

    # 计算 mse loss
    #[1, 4, 64, 64],[1, 4, 64, 64]
    return criterion(out_unet, noise)


get_loss({
    'input_ids': torch.ones(1, 77, device=device).long(),
    'pixel_values': torch.randn(1, 3, 512, 512, device=device)
})

tensor(0.0096, device='mps:0', grad_fn=<MseLossBackward0>)

In [11]:
import os

# 👇👇 开始训练
def train():
    loss_sum = 0
    # 🚀🚀 训练 400 个 epoch
    for epoch in range(400):
        # 遍历每一个 batch 的数据
        for i, data in enumerate(loader):
            # 计算损失并做反向传播（loss.backward），但不立即更新模型参数
            loss = get_loss(data) / 4  # 每 4 个批次做一次参数的调整
            loss.backward()
            loss_sum += loss.item()

            # 每 4 个批次更新一次模型参数
            if (epoch * len(loader) + i) % 4 == 0:
                # 梯度裁剪，防止梯度爆炸
                torch.nn.utils.clip_grad_norm_(unet.parameters(), 1.0)
                # 更新模型参数
                optimizer.step()
                # 清零优化器的梯度
                optimizer.zero_grad()

        # 每 10 个 epoch 输出一次累计的损失值并重置
        if epoch % 10 == 0:
            print(epoch, loss_sum)
            loss_sum = 0

    # 训练结束后保存模型到指定路径
    # 检查目录是否存在，如果不存在则创建
    os.makedirs('saves', exist_ok=True)
    torch.save(unet.to('cpu'), 'saves/unet.model')


train()


0 10.482576073292876
10 103.78535157235456
20 101.80729839394917
30 97.18179716335726
40 95.27005371931591
50 93.52235668250069
60 91.23645739001222
70 85.15300659097556
80 86.9126384588817
90 81.5698299240612
100 78.39926671008288
110 75.35619182675873
120 74.10794699968392
130 71.74995687346382
140 68.43087464362179
150 63.701912391486985
160 64.13840378901659
170 59.5763320050537
180 58.09928199512069
190 54.098192290555744
200 53.715583391978726
210 49.144029857572605
220 47.92628015137598
230 45.43078308462282
240 42.47678213690597
250 42.57540527023593
260 38.39479267646675
270 36.950618647257215
280 37.183194903125695
290 33.844317765288
300 32.76033932725841
310 30.12229502759874
320 30.798989617975167
330 30.148435787028575
340 27.67200335358939
350 27.441537938197143
360 26.668230306680925
370 26.568718082347914
380 24.796474545077217
390 24.26365733845887


RuntimeError: Parent directory saves does not exist.

In [None]:
from transformers import PreTrainedModel, PretrainedConfig

# 包装类
class Model(PreTrainedModel):
    # 定义配置类，这里使用预训练配置
    config_class = PretrainedConfig

    def __init__(self, config):
        # 初始化父类的构造函数
        super().__init__(config)
        # 将 U-Net 模型加载到 CPU 上并作为类的一个属性
        self.unet = unet.to('cpu')

# 👉 保存模型到 Hugging Face Hub
Model(PretrainedConfig()).push_to_hub(
    # 指定要上传到 Hugging Face Hub 的模型库 ID
    repo_id='lansinuote/diffsion_from_scratch.unet',
    # 使用从文件中读取的认证令牌进行授权
    use_auth_token=open('/root/hub_token.txt').read().strip()
)

