## **pip install**

In [None]:
%pip -q install --upgrade pip

# 1) PyTorch + CUDA (for Colab's CUDA 12.x runtime)
%pip -q install torch torchvision --index-url https://download.pytorch.org/whl/cu121

# 2) Stable Diffusion core libraries
%pip -q install diffusers[torch] transformers accelerate safetensors

# 3) Text tokenizer / preprocessing (recommended)
%pip -q install sentencepiece ftfy

# 4) Memory / speed optimization (recommended)
%pip -q install xformers

# 5) Optional: low-memory optimizers (only when needed)
# %pip -q install bitsandbytes

# 6) Optional: image utilities (saving / processing convenience)
%pip -q install pillow opencv-python einops

# 7) Optional: adds a perceptual loss
%pip -q install lpips

## **데이터 학습파일 압축 풀기**

In [None]:
!unzip -q "/content/drive/MyDrive/Stable Diffusion/ceramic.zip" -d "/content/"

## **import문**

In [None]:
# PyTorch 기본
import torch
import torch.nn as nn
import torch.nn.functional as F

# Hugging Face diffusers / Stable Diffusion 관련
from diffusers import StableDiffusionPipeline, AutoencoderKL, DPMSolverMultistepScheduler

# 텍스트 토크나이징 & 전처리
from transformers import CLIPTokenizer, CLIPTextModel

# 실행 최적화
import accelerate
import xformers

# 유틸리티
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import os

# (선택) 진행률 표시
from tqdm.auto import tqdm

## **런타임 확인**

In [None]:
# Runtime sanity check: report whether CUDA is visible and which library versions are installed.
import torch, diffusers, transformers, accelerate, safetensors, xformers
print("CUDA available:", torch.cuda.is_available())
print("Torch:", torch.__version__)
print("Diffusers:", diffusers.__version__)
print("Transformers:", transformers.__version__)
print("Accelerate:", accelerate.__version__)

## **def**

In [None]:
def kl_loss(mu, logvar, reduction="mean"):
    """KL divergence between N(mu, exp(logvar)) and the standard normal.

    The element-wise term 0.5 * (mu^2 + exp(logvar) - 1 - logvar) is summed
    over every non-batch dimension, giving one value per sample.

    Args:
        mu: mean tensor of shape [B, ...] (at least 2 dimensions).
        logvar: log-variance tensor, broadcastable with ``mu``.
        reduction: "mean" averages the per-sample values over the batch,
            "sum" adds them up, and any other value returns the per-sample
            vector of shape [B] unreduced.

    Returns:
        A scalar tensor for "mean"/"sum", otherwise a tensor of shape [B].
    """
    kl = 0.5 * (mu.pow(2) + logvar.exp() - 1.0 - logvar)
    # Flatten everything after the batch axis so the function is not tied to
    # 4-D [B,C,H,W] latents; for 4-D input this equals sum(dim=[1, 2, 3]).
    kl = kl.flatten(start_dim=1).sum(dim=1)
    if reduction == "mean":
        return kl.mean()
    if reduction == "sum":
        return kl.sum()
    return kl

In [None]:
def pil_to_tensor(image_pil, size=512, device=None, dtype=torch.float32):
    """Convert a PIL image into a [1,3,size,size] tensor scaled to [-1,1].

    Non-RGB images are converted to RGB first, then resized to a square of
    ``size`` pixels with Lanczos resampling. The result is cast to ``dtype``
    and moved to ``device`` when one is given.
    """
    rgb = image_pil if image_pil.mode == "RGB" else image_pil.convert("RGB")
    resized = rgb.resize((size, size), Image.LANCZOS)

    pixels = np.array(resized).astype(np.float32) / 255.0   # [0,1], HWC
    pixels = pixels * 2.0 - 1.0                              # [0,1] -> [-1,1]

    # HWC -> CHW, then add the batch axis.
    tensor = torch.from_numpy(pixels).permute(2, 0, 1).unsqueeze(0)

    target = {"dtype": dtype}
    if device is not None:
        target["device"] = device
    return tensor.to(**target)

In [None]:
def tensor_to_pil(x):
    """Convert a [B,3,H,W] tensor in [-1,1] into a list of PIL images.

    Values outside [-1,1] are clamped. The tensor is detached first so that
    outputs still attached to an autograd graph can be converted, and the
    8-bit values are rounded to nearest instead of truncated.

    Returns:
        list of ``PIL.Image`` objects, one per batch element.
    """
    # detach(): Tensor.numpy() raises on tensors that require grad.
    # float(): do the rescale in fp32 even when the input is fp16.
    x = (x.detach().float().clamp(-1, 1) + 1) / 2
    # round(): .byte() alone truncates, which biases every pixel downwards.
    x = (x * 255).round().byte().cpu().permute(0, 2, 3, 1).numpy()
    return [Image.fromarray(img) for img in x]

## **클래스**

#### **인코더 (VAE)**

In [None]:
SCALE = 0.18215


class VAEEncoder(nn.Module):
    """Convolutional VAE encoder producing Stable-Diffusion-style scaled latents.

    Input:  x in [-1,1], [B,3,H,W] (e.g. 512x512).
    Output: z = (mu + sigma * eps) * scale_factor, plus mu and logvar.
            Three stride-2 convolutions reduce the resolution by 8, so a
            512x512 input yields [B,latent_channels,64,64] tensors.
    """

    def __init__(self, in_channels=3, latent_channels=4, base=64, scale_factor=SCALE):
        super().__init__()
        self.scale_factor = scale_factor
        top = base * 8

        # Full-resolution feature extraction.
        self.block1 = self._refine(in_channels, base)
        # Three halving stages: 512 -> 256 -> 128 -> 64 for a 512 input.
        self.down1 = self._halve(base, base * 2)
        self.down2 = self._halve(base * 2, base * 4)
        self.down3 = self._halve(base * 4, top)
        # Feature refinement at the latent resolution.
        self.block2 = self._refine(top, top)

        # Latent heads.
        self.to_mu = nn.Conv2d(top, latent_channels, kernel_size=3, stride=1, padding=1)
        self.to_logvar = nn.Conv2d(top, latent_channels, kernel_size=3, stride=1, padding=1)

    @staticmethod
    def _refine(c_in, c_out):
        """Two resolution-preserving 3x3 conv + SiLU layers."""
        return nn.Sequential(
            nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1),
            nn.SiLU(),
            nn.Conv2d(c_out, c_out, kernel_size=3, stride=1, padding=1),
            nn.SiLU(),
        )

    @staticmethod
    def _halve(c_in, c_out):
        """One stride-2 4x4 conv + SiLU that halves the spatial size."""
        return nn.Sequential(
            nn.Conv2d(c_in, c_out, kernel_size=4, stride=2, padding=1),
            nn.SiLU(),
        )

    def forward(self, x, sample=True):
        """Encode ``x`` and return ``(z_scaled, mu, logvar)``.

        With ``sample=True`` the latent is drawn via the reparameterization
        trick; otherwise the mean is used directly. Only ``z`` is multiplied
        by ``scale_factor``; ``mu`` and ``logvar`` are returned unscaled.
        """
        h = x
        for stage in (self.block1, self.down1, self.down2, self.down3, self.block2):
            h = stage(h)

        mu = self.to_mu(h)
        logvar = self.to_logvar(h)

        if not sample:
            z = mu
        else:
            sigma = torch.exp(0.5 * logvar)
            noise = torch.randn_like(sigma)
            z = mu + noise * sigma

        # Apply the Stable Diffusion compatible latent scale.
        return z * self.scale_factor, mu, logvar

VAE 인코더는 대략 위와 같은 방식으로 작동하는 것으로 보임.

SiLU : Sigmoid Linear Unit의 줄임말로, 요즘 딥러닝에서 자주 쓰이는 활성화 함수. 다른 이름으로는 Swish라고도 부름.

### **디코더 (DCGAN)**

In [None]:
class DCGANGenerator(nn.Module):
    """DCGAN-style decoder that upsamples a latent map by 8x into an image."""

    def __init__(self, latent_dim=4, ngf=64, out_channels=3, expect_scaled_latent=True, scale_factor=0.18215):
        """
        latent_dim: number of latent channels (4 for SD-compatible latents).
        ngf: base feature width of the generator.
        out_channels: number of image channels produced.
        expect_scaled_latent: when True the input z is assumed to be multiplied
            by ``scale_factor`` already, and is divided by it in ``forward``.
        scale_factor: latent scale to undo; ``None`` disables the division.
        """
        super().__init__()
        self.expect_scaled_latent = expect_scaled_latent
        self.scale_factor = scale_factor

        layers = []
        # Three transposed-conv stages, each doubling the resolution
        # (64 -> 128 -> 256 -> 512 for a 64x64 latent).
        widths = [latent_dim, ngf * 8, ngf * 4, ngf * 2]
        for c_in, c_out in zip(widths[:-1], widths[1:]):
            layers += [
                nn.ConvTranspose2d(c_in, c_out, 4, 2, 1, bias=False),
                nn.BatchNorm2d(c_out),
                nn.ReLU(True),
            ]
        # Finishing convolutions at full resolution (detail refinement).
        layers += [
            nn.Conv2d(ngf * 2, ngf, 3, 1, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            nn.Conv2d(ngf, out_channels, 3, 1, 1, bias=False),
            nn.Tanh(),
        ]
        self.main = nn.Sequential(*layers)

    def forward(self, z):
        """Decode latent ``z`` into an image in [-1,1] (Tanh output)."""
        # SD convention: latents are divided by the scale factor before decoding.
        undo_scale = self.expect_scaled_latent and self.scale_factor is not None
        latent = z / self.scale_factor if undo_scale else z
        return self.main(latent)


# Instantiate the (untrained) generator on the available device and report its parameter dtype.
device = "cuda" if torch.cuda.is_available() else "cpu"
gen = DCGANGenerator().to(device).float().eval()   # keep the generator pinned to FP32
print("gen dtype:", next(gen.parameters()).dtype)

gen dtype: torch.float32


### **DCGAN 판별자 (Discriminator) — 적대적 손실 계산에 사용**

In [None]:
class DCGANDiscriminator(nn.Module):
    """DCGAN-style discriminator that returns one real/fake logit per image.

    Five stride-2 convolutions shrink the input by 32x, then a 4x4 valid
    convolution produces the score map. The input must be at least 128x128
    so that the final 4x4 kernel fits.

    Args:
        in_channels: number of image channels.
        ndf: base feature width.
    """

    def __init__(self, in_channels=3, ndf=64):
        super().__init__()
        self.main = nn.Sequential(
            # 512 -> 256
            nn.Conv2d(in_channels, ndf, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),

            # 256 -> 128
            nn.Conv2d(ndf, ndf*2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf*2), nn.LeakyReLU(0.2, inplace=True),

            # 128 -> 64
            nn.Conv2d(ndf*2, ndf*4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf*4), nn.LeakyReLU(0.2, inplace=True),

            # 64 -> 32
            nn.Conv2d(ndf*4, ndf*8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf*8), nn.LeakyReLU(0.2, inplace=True),

            # 32 -> 16
            nn.Conv2d(ndf*8, ndf*16, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf*16), nn.LeakyReLU(0.2, inplace=True),

            # 4x4 valid conv: 16 -> 13x13 patch logits for a 512 input,
            # and exactly 1x1 only for a 128 input.
            nn.Conv2d(ndf*16, 1, 4, 1, 0, bias=False)
        )

    def forward(self, x):
        """Return logits of shape [B].

        The score map is averaged per sample. A plain ``view(-1)`` would
        return B*13*13 values for 512x512 inputs and no longer line up with
        a [B] target vector. For a 1x1 score map the mean is the single
        value itself.
        """
        scores = self.main(x)
        return scores.flatten(start_dim=1).mean(dim=1)


### **BasicBlock**

In [None]:
class BasicBlock(nn.Module):
    """Residual block with two 3x3 convolutions (the ResNet18/34 building block)."""

    expansion = 1  # output-channel multiplier; 1 for ResNet18/34

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = planes * self.expansion

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # A 1x1 projection on the skip path is only needed when the main path
        # changes the spatial size or the channel count.
        needs_projection = stride != 1 or in_planes != out_planes
        if needs_projection:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.downsample = None

        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return relu(F(x) + shortcut(x))."""
        shortcut = x if self.downsample is None else self.downsample(x)
        residual = self.conv1(x)
        residual = self.relu(self.bn1(residual))
        residual = self.bn2(self.conv2(residual))
        return self.relu(residual + shortcut)

### **ResNet18**

In [None]:
class ResNet18Binary(nn.Module):
    """ResNet18 ([2,2,2,2] BasicBlocks) with a ``num_classes``-logit head.

    With the default ``num_classes=1`` the single logit is meant to be used
    with ``BCEWithLogitsLoss``.
    """

    def __init__(self, num_classes=1):
        super().__init__()
        self.in_planes = 64  # running input width, updated by _make_layer

        # Stem: 7x7 stride-2 conv followed by a stride-2 max-pool.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four stages of two blocks each; stages 2-4 halve the resolution.
        self.layer1 = self._make_layer(64, 2, stride=1)
        self.layer2 = self._make_layer(128, 2, stride=2)
        self.layer3 = self._make_layer(256, 2, stride=2)
        self.layer4 = self._make_layer(512, 2, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * BasicBlock.expansion, num_classes)

    def _make_layer(self, planes, blocks, stride):
        """Build one stage: the first block applies ``stride``, the rest use 1."""
        stage = []
        for block_stride in [stride] + [1] * (blocks - 1):
            stage.append(BasicBlock(self.in_planes, planes, block_stride))
            # Every block after the first consumes the expanded width.
            self.in_planes = planes * BasicBlock.expansion
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return logits of shape [B, num_classes]."""
        feats = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            feats = stage(feats)
        pooled = torch.flatten(self.avgpool(feats), 1)
        return self.fc(pooled)

## **인코딩 (GAN 방식일 때 필요할 듯)**

In [None]:
# Shape smoke test for the (untrained) VAEEncoder.
# Pick the device at runtime instead of hard-coding "cuda", so the cell also
# runs on CPU-only runtimes.
device = "cuda" if torch.cuda.is_available() else "cpu"
encoder = VAEEncoder(latent_channels=4).to(device).eval()

# Dummy input (batch=1); random values are enough for a shape check.
x = torch.randn(1, 3, 512, 512, device=device)

with torch.no_grad():
    z, mu, logvar = encoder(x, sample=True)

print("z shape:", z.shape)        # [1, 4, 64, 64]
print("mu/logvar:", mu.shape, logvar.shape)

z shape: torch.Size([1, 4, 64, 64])
mu/logvar: torch.Size([1, 4, 64, 64]) torch.Size([1, 4, 64, 64])


## **디코딩**

### **DCGAN 디코더 (학습 필요)**

In [None]:
# Decode Stable Diffusion latents with the (untrained) DCGAN decoder.
# One device is chosen at runtime and used for every component, instead of
# mixing a hard-coded "cuda" with the `device` variable.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialise the DCGAN decoder.
dcgan_decoder = DCGANGenerator(latent_dim=4).to(device)
dcgan_decoder.eval()

# Stable Diffusion pipeline used only to produce latents (example).
# fp16 is only selected on GPU; on CPU the pipeline stays in fp32.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32
).to(device)

prompt = "dog, ultra detailed"
with torch.no_grad():
    latents = pipe(prompt, output_type="latent").images  # latents: [B,4,64,64]

    # The DCGAN decoder has fp32 weights, so cast the latents to match.
    latents = latents.to(device=device, dtype=torch.float32)
    print("latents dtype:", latents.dtype)

    # Decode with the DCGAN: three 2x upsampling stages give [B,3,512,512].
    decoded = dcgan_decoder(latents)

    # Convert to PIL images with the shared helper.
    pil_images = tensor_to_pil(decoded)
    pil_images[0].show()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

safety_checker/model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

text_encoder/model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

unet/diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

vae/diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

latents dtype: torch.float32


### **VAE 디코더 아키텍처 구현 + 가중치 불러오기**

In [None]:
MODEL_ID = "runwayml/stable-diffusion-v1-5"  # SD 1.x 계열
DEVICE   = "cuda" if torch.cuda.is_available() else "cpu"

def _get_vae_config(model_id=MODEL_ID):
    """
    diffusers >=0.20: AutoencoderKL.load_config 사용 가능.
    하위 버전 호환을 위해 from_pretrained로 config만 가져오는 폴백 포함.
    """
    cfg = None
    if hasattr(AutoencoderKL, "load_config"):
        try:
            cfg = AutoencoderKL.load_config(model_id, subfolder="vae")
        except Exception:
            cfg = None
    if cfg is None:
        # weights까지 받아오지만 config만 복제해 사용
        _tmp = AutoencoderKL.from_pretrained(model_id, subfolder="vae")
        cfg = _tmp.config
        del _tmp
    return cfg

class SDVAEDecoder(nn.Module):
    """Decoder half of an ``AutoencoderKL`` built from a VAE config.

    - Consists of ``post_quant_conv`` + ``decoder`` taken from an
      ``AutoencoderKL`` instantiated with ``from_config``. No pretrained
      weights are loaded by this class.
    - Undoes the latent scaling factor (default 0.18215) before decoding.

    Args:
        cfg: VAE config (dict or config object); fetched with
            ``_get_vae_config(MODEL_ID)`` when ``None``.
        scaling_factor: explicit override for the latent scaling factor.
    """

    def __init__(self, cfg=None, scaling_factor=None):
        super().__init__()
        if cfg is None:
            cfg = _get_vae_config(MODEL_ID)
        # Build the full VAE from its configuration only.
        vae_like = AutoencoderKL.from_config(cfg)
        # Keep only the sub-modules the decoding path needs.
        self.post_quant_conv = vae_like.post_quant_conv
        self.decoder         = vae_like.decoder

        self.scaling_factor = self._resolve_scaling_factor(cfg, scaling_factor)

    @staticmethod
    def _resolve_scaling_factor(cfg, override=None, default=0.18215):
        """Pick the scaling factor: explicit override > config value > default.

        ``cfg`` may be a mapping or an object with attributes. A plain
        ``getattr`` on a dict never sees its keys, so mappings are read with
        ``.get`` instead.
        """
        if override is not None:
            return override
        if isinstance(cfg, dict):
            value = cfg.get("scaling_factor")
        else:
            value = getattr(cfg, "scaling_factor", None)
        return default if value is None else value

    @torch.no_grad()
    def forward(self, z_scaled, return_dict=False):
        """Decode scaled latents into images.

        Args:
            z_scaled: latent tensor that already has ``scaling_factor``
                multiplied in (e.g. [B,4,64,64] for SD 1.x).
            return_dict: when True, return ``{"sample": x}`` instead of ``x``.

        Returns:
            The decoded tensor (e.g. [B,3,512,512] for 64x64 latents).

        Note:
            The method runs under ``torch.no_grad()``, so no gradients flow
            through this wrapper.
        """
        # Unscale -> post_quant_conv -> decoder.
        z = z_scaled / z_scaled.new_tensor(self.scaling_factor)
        # Skip the projection when the built VAE does not provide one.
        if self.post_quant_conv is not None:
            z = self.post_quant_conv(z)
        dec_out = self.decoder(z)
        # Accept either a raw tensor or an output object exposing `.sample`.
        x = dec_out.sample if hasattr(dec_out, "sample") else dec_out
        if return_dict:
            return {"sample": x}
        return x

# Decoder instance with matching architecture (weights not loaded yet).
decoder_same = SDVAEDecoder().to(DEVICE).eval()
print("Decoder ready (architecture matched).")
print("scaling_factor:", decoder_same.scaling_factor)

# 1) Extract only the decoder-side weights from the reference VAE.
print("[1/3] Fetching VAE weights from:", MODEL_ID)
vae_ref = AutoencoderKL.from_pretrained(
    MODEL_ID, subfolder="vae", torch_dtype=torch.float16
).eval()  # loading on CPU is fine; only the state_dict is used
sd_full = vae_ref.state_dict()

decoder_sd = {k: v.cpu() for k, v in sd_full.items()
              if k.startswith(("post_quant_conv.", "decoder."))}

print(f" - extracted {len(decoder_sd)} decoder keys")

# 2) Inject the weights into the decoder instance created above.
decoder_same = decoder_same.to(DEVICE)  # ensure on device
missing, unexpected = decoder_same.load_state_dict(decoder_sd, strict=False)
print("[2/3] load_state_dict(strict=False)")
print(" - missing keys   :", len(missing))
print(" - unexpected keys:", len(unexpected))
# A missing key here is a decoder parameter left at its random initial value
# (encoder keys were filtered out above), so a partial load is an error.
if missing or unexpected:
    raise RuntimeError(
        f"Decoder weights did not load cleanly: missing={list(missing)}, unexpected={list(unexpected)}"
    )

# 3) Settle dtype/device.
#    fp16 is used for inference on GPU (speed/memory); CPU stays in fp32.
decoder_same = decoder_same.to(dtype=torch.float16 if DEVICE=="cuda" else torch.float32).eval()
print("[3/3] decoder dtype:", next(decoder_same.parameters()).dtype, " device:", DEVICE)

# ---- quick sanity decode (uses `latents` if available) ----

def to_pil_batch(x):
    """Convert a [B,3,H,W] tensor in [-1,1] into a list of PIL images.

    Values outside [-1,1] are clamped. The tensor is detached so outputs that
    still carry autograd history can be converted, the rescale runs in fp32,
    and 8-bit values are rounded to nearest rather than truncated.
    """
    x01 = (x.detach().float().clamp(-1, 1) + 1) / 2
    # round() before byte(): byte() alone truncates towards zero.
    x_u8 = (x01 * 255).round().byte().cpu().permute(0, 2, 3, 1).numpy()
    return [Image.fromarray(arr) for arr in x_u8]

with torch.no_grad():
    # Reuse `latents` if an earlier cell defined it; otherwise fall back to a
    # random latent that only checks shapes.
    if 'latents' in globals():
        z = latents.to(DEVICE, dtype=next(decoder_same.parameters()).dtype)
    else:
        z = torch.randn(1, 4, 64, 64, device=DEVICE, dtype=next(decoder_same.parameters()).dtype)
        print("(!) `latents` not found; using random latent for shape check")

    # Enable autocast only when the decoder runs in fp16 on CUDA.
    use_amp = (DEVICE=="cuda" and next(decoder_same.parameters()).dtype==torch.float16)
    ctx = torch.autocast(device_type="cuda", enabled=use_amp)

    with ctx:
        x = decoder_same(z)  # [-1,1], [B,3,512,512]

# Save the first decoded image for a quick visual check.
imgs = to_pil_batch(x)
out_path = "/content/vae_decoder_loaded.png"
imgs[0].save(out_path)
print("Saved:", out_path, " | shape:", x.shape)


Decoder ready (architecture matched).
scaling_factor: 0.18215
[1/3] Fetching VAE weights from: runwayml/stable-diffusion-v1-5
 - extracted 140 decoder keys
[2/3] load_state_dict(strict=False)
 - missing keys   : 0 
 - unexpected keys: 0
[3/3] decoder dtype: torch.float16  device: cuda
Saved: /content/vae_decoder_loaded.png  | shape: torch.Size([1, 3, 512, 512])


In [None]:
# === Prompt -> UNet diffusion produces `num_total` latents -> decode with decoder_same -> matplotlib grid -> save PNGs ===
import math
import os
import torch
import matplotlib.pyplot as plt
from diffusers import StableDiffusionPipeline

# 0) Settings
prompt = "one simple circle plate, ceramic, ultra detailed"
num_total  = 1200
batch_size = 16
steps      = 30
guidance   = 7.5
base_seed  = None  # set an int for reproducible samples; None draws fresh random samples each run

# 1) Prepare the pipeline (loaded only once per kernel)
if 'pipe' not in globals():
    pipe = StableDiffusionPipeline.from_pretrained(
        MODEL_ID,
        torch_dtype=(torch.float16 if DEVICE == "cuda" else torch.float32),
        safety_checker=None
    ).to(DEVICE)
    pipe.set_progress_bar_config(disable=True)

# 2) Generate `num_total` latents in batches of at most `batch_size`
#    (output_type='latent' returns the latents before the VAE decode step).
latents_list = []
with torch.no_grad():
    for start in range(0, num_total, batch_size):
        # The last batch is shrunk so that exactly `num_total` samples are made.
        cur_batch = min(batch_size, num_total - start)

        # Optional seeding: one generator per image, each with its own seed.
        if base_seed is not None:
            gens = [
                torch.Generator(device=DEVICE).manual_seed(base_seed + start + k)
                for k in range(cur_batch)
            ]
        else:
            gens = None  # unseeded

        out = pipe(
            prompt=[prompt]*cur_batch,
            num_inference_steps=steps,
            guidance_scale=guidance,
            num_images_per_prompt=1,
            output_type="latent",
            generator=gens,
        )
        # With output_type='latent' the latents are returned in out.images:
        # [B, 4, 64, 64], already multiplied by the scaling factor.
        z_scaled_batch = out.images
        latents_list.append(z_scaled_batch)

# 3) Decode (decoder_same expects scaled latents as input)
imgs = []
use_amp = (DEVICE=="cuda" and next(decoder_same.parameters()).dtype==torch.float16)
with torch.no_grad(), torch.autocast(device_type="cuda", enabled=use_amp):
    for z_scaled in latents_list:
        # Match the decoder's parameter dtype/device.
        z_scaled = z_scaled.to(DEVICE, dtype=next(decoder_same.parameters()).dtype, non_blocking=True)
        x = decoder_same(z_scaled)  # [-1,1], [B,3,512,512]
        imgs.extend(to_pil_batch(x))

# 4) Show everything in a matplotlib grid (10 columns, as many rows as needed)
cols = 10
rows = max(1, math.ceil(len(imgs) / cols))
plt.figure(figsize=(20, 2*rows))
for i, img in enumerate(imgs[:num_total]):
    plt.subplot(rows, cols, i+1)
    plt.imshow(img)
    plt.axis("off")
plt.tight_layout()
plt.show()

print(f"샘플 개수: {len(imgs)} | steps={steps}, guidance={guidance}, seed_base={base_seed}")

# 5) Save every sample as an individual PNG
save_dir = "/content/generated"
os.makedirs(save_dir, exist_ok=True)

for i, img in enumerate(imgs):
    img.save(os.path.join(save_dir, f"sample_{i:03d}.png"))
print("개별 저장 완료:", len(imgs), "장")

개인이 가중치 없이 직접 학습하는건 너무 비현실적.

따라서 디코더 아키텍처를 구현 후 사전학습된 가중치를 주입해서 작동하도록 구현.

간단하게 구현한다면 그냥 모델로 가져와서 프롬프트 넣고 실행도 가능.

### **VAE 아키텍처 구현 (가중치 없음)**

In [None]:
MODEL_ID = "runwayml/stable-diffusion-v1-5"  # SD 1.x 계열
DEVICE   = "cuda" if torch.cuda.is_available() else "cpu"

def _get_vae_config(model_id=MODEL_ID):
    """
    diffusers >=0.20: AutoencoderKL.load_config 사용 가능.
    하위 버전 호환을 위해 from_pretrained로 config만 가져오는 폴백 포함.
    """
    cfg = None
    if hasattr(AutoencoderKL, "load_config"):
        try:
            cfg = AutoencoderKL.load_config(model_id, subfolder="vae")
        except Exception:
            cfg = None
    if cfg is None:
        # weights까지 받아오지만 config만 복제해 사용
        _tmp = AutoencoderKL.from_pretrained(model_id, subfolder="vae")
        cfg = _tmp.config
        del _tmp
    return cfg

class SDVAEDecoder(nn.Module):
    """Decoder half of an ``AutoencoderKL`` built from a VAE config.

    - Consists of ``post_quant_conv`` + ``decoder`` taken from an
      ``AutoencoderKL`` instantiated with ``from_config``. No pretrained
      weights are loaded by this class.
    - Undoes the latent scaling factor (default 0.18215) before decoding.

    Args:
        cfg: VAE config (dict or config object); fetched with
            ``_get_vae_config(MODEL_ID)`` when ``None``.
        scaling_factor: explicit override for the latent scaling factor.
    """

    def __init__(self, cfg=None, scaling_factor=None):
        super().__init__()
        if cfg is None:
            cfg = _get_vae_config(MODEL_ID)
        # Build the full VAE from its configuration only.
        vae_like = AutoencoderKL.from_config(cfg)
        # Keep only the sub-modules the decoding path needs.
        self.post_quant_conv = vae_like.post_quant_conv
        self.decoder         = vae_like.decoder

        self.scaling_factor = self._resolve_scaling_factor(cfg, scaling_factor)

    @staticmethod
    def _resolve_scaling_factor(cfg, override=None, default=0.18215):
        """Pick the scaling factor: explicit override > config value > default.

        ``cfg`` may be a mapping or an object with attributes. A plain
        ``getattr`` on a dict never sees its keys, so mappings are read with
        ``.get`` instead.
        """
        if override is not None:
            return override
        if isinstance(cfg, dict):
            value = cfg.get("scaling_factor")
        else:
            value = getattr(cfg, "scaling_factor", None)
        return default if value is None else value

    @torch.no_grad()
    def forward(self, z_scaled, return_dict=False):
        """Decode scaled latents into images.

        Args:
            z_scaled: latent tensor that already has ``scaling_factor``
                multiplied in (e.g. [B,4,64,64] for SD 1.x).
            return_dict: when True, return ``{"sample": x}`` instead of ``x``.

        Returns:
            The decoded tensor (e.g. [B,3,512,512] for 64x64 latents).

        Note:
            The method runs under ``torch.no_grad()``, so no gradients flow
            through this wrapper.
        """
        # Unscale -> post_quant_conv -> decoder.
        z = z_scaled / z_scaled.new_tensor(self.scaling_factor)
        # Skip the projection when the built VAE does not provide one.
        if self.post_quant_conv is not None:
            z = self.post_quant_conv(z)
        dec_out = self.decoder(z)
        # Accept either a raw tensor or an output object exposing `.sample`.
        x = dec_out.sample if hasattr(dec_out, "sample") else dec_out
        if return_dict:
            return {"sample": x}
        return x


# ---- Create the decoder instance (same architecture, no weights loaded) ----
# NOTE(review): this rebinds `decoder_same` to a freshly constructed SDVAEDecoder;
# no state_dict is loaded in this cell, so a previously weight-loaded
# `decoder_same` would be replaced — confirm that is intended before running.
decoder_same = SDVAEDecoder().to(DEVICE).eval()
print("Decoder ready (architecture matched).")
print("scaling_factor:", decoder_same.scaling_factor)

# ---- quick sanity check ----
# A random fp32 latent is decoded once just to exercise the forward pass.
with torch.no_grad():
    z = torch.randn(1, 4, 64, 64, device=DEVICE, dtype=torch.float32)
    x = decoder_same(z)  # [-1,1], [B,3,512,512]

def to_pil_batch(x):
    """Convert a [B,3,H,W] tensor in [-1,1] into a list of PIL images.

    Values outside [-1,1] are clamped. The tensor is detached so outputs that
    still carry autograd history can be converted, the rescale runs in fp32,
    and 8-bit values are rounded to nearest rather than truncated.
    """
    x01 = (x.detach().float().clamp(-1, 1) + 1) / 2
    # round() before byte(): byte() alone truncates towards zero.
    x_u8 = (x01 * 255).round().byte().cpu().permute(0, 2, 3, 1).numpy()
    return [Image.fromarray(arr) for arr in x_u8]

with torch.no_grad():
    # Reuse `latents` if an earlier cell defined it; otherwise fall back to a
    # random latent that only checks shapes.
    if 'latents' in globals():
        z = latents.to(DEVICE, dtype=next(decoder_same.parameters()).dtype)
    else:
        z = torch.randn(1, 4, 64, 64, device=DEVICE, dtype=next(decoder_same.parameters()).dtype)
        print("(!) `latents` not found; using random latent for shape check")

    # Enable autocast only when the decoder runs in fp16 on CUDA.
    use_amp = (DEVICE=="cuda" and next(decoder_same.parameters()).dtype==torch.float16)
    ctx = torch.autocast(device_type="cuda", enabled=use_amp)

    with ctx:
        x = decoder_same(z)  # [-1,1], [B,3,512,512]

imgs = to_pil_batch(x)
out_path = "/content/images/vae_decoder_loaded.png"
# Create the target folder first: on a fresh runtime /content/images does not
# exist and PIL's save() would raise FileNotFoundError.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
imgs[0].save(out_path)
print("Saved:", out_path, " | shape:", x.shape)


Decoder ready (architecture matched).
scaling_factor: 0.18215
Saved: /content/images/vae_decoder_loaded.png  | shape: torch.Size([1, 3, 512, 512])


### **※ 프롬프트 작성 후 디코더 실행! ※**

## **VAE학습**

In [None]:
# 공통 임포트 & 경로
import os, json, math, time
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms, utils
from PIL import Image

from diffusers.models import AutoencoderKL  # 캐시 생성(교사 인코더)에서만 사용

# !pip -q install lpips
try:
    import lpips
    _has_lpips = True
except:
    _has_lpips = False

MODEL_ID = "runwayml/stable-diffusion-v1-5"
DEVICE   = "cuda" if torch.cuda.is_available() else "cpu"

# Image folders for training/validation (data that already exists)
data_root = "/content/data_root"   # e.g. /content/data_root/train, /content/data_root/val
lat_root  = "/content/latent_cache"  # folder where the z_scaled cache will be stored
Path(lat_root).mkdir(parents=True, exist_ok=True)

# Preprocessing: resize the shorter side to img_size, center-crop to a square,
# convert to a tensor and rescale from [0,1] to [-1,1].
img_size = 512
to_tensor_pm1 = transforms.Compose([
    transforms.Resize(img_size, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(img_size),
    transforms.ToTensor(),
    transforms.Lambda(lambda t: t*2.0-1.0),  # [-1,1]
])

def denorm01(x):
    """Map a tensor from [-1,1] to [0,1], clamping out-of-range values first."""
    clamped = x.clamp(-1, 1)
    return (clamped + 1) / 2


## **ResNet18 학습**

In [None]:
from pathlib import Path
SRC_REAL = Path("/content/painted-ceramic-dataset-master/data/plate")  # source real images
SRC_FAKE = Path("/content/generated")                                  # source fake images (png)
DST_REAL = Path("/content/data/real_images_std")                       # output folder for standardized real images
DST_FAKE = Path("/content/data/fake_images_std")                       # output folder for standardized fake images

In [None]:
# === 표준화: 모든 이미지를 RGB, SIZE×SIZE, JPEG로 재인코딩 (원본 유지) ===
import os
from pathlib import Path
from PIL import Image

SIZE = 224           # ResNet input size (can be changed to 512 if needed)
FORMAT = "JPEG"      # may be switched to "PNG"
JPEG_QUALITY = 95    # JPEG quality

def standardize_dir(src_dir: Path, dst_dir: Path, size=SIZE, to_format=FORMAT):
    """Re-encode every image under ``src_dir`` as RGB, ``size`` x ``size``.

    The source tree is scanned recursively and the originals are left
    untouched. Outputs are written flat into ``dst_dir``. The output name is
    built from the path relative to ``src_dir`` (sub-directories joined with
    "_"), so equally named files in different sub-folders no longer overwrite
    each other; remaining clashes get a numeric suffix.

    Args:
        src_dir: folder that is scanned recursively for images.
        dst_dir: output folder (created if missing).
        size: edge length of the square output images.
        to_format: "JPEG" writes .jpg files; anything else writes .png.

    Unreadable files are reported with a "skip:" line and do not abort the run.
    """
    dst_dir.mkdir(parents=True, exist_ok=True)
    exts = {".jpg",".jpeg",".png",".webp",".bmp",".tiff"}
    as_jpeg = to_format.upper() == "JPEG"
    out_ext = ".jpg" if as_jpeg else ".png"
    used_stems = set()
    cnt = 0
    # sorted(): makes the name assignment deterministic across runs.
    for p in sorted(src_dir.rglob("*")):
        if p.suffix.lower() not in exts or not p.is_file():
            continue
        base = "_".join(p.relative_to(src_dir).with_suffix("").parts)
        stem, k = base, 1
        while stem in used_stems:   # e.g. x.jpg and x.png in the same folder
            stem = f"{base}_{k}"
            k += 1
        used_stems.add(stem)
        out = dst_dir / (stem + out_ext)
        try:
            # Context manager closes the source file handle promptly.
            with Image.open(p) as src_img:
                img = src_img.convert("RGB").resize((size, size), Image.BICUBIC)
            if as_jpeg:
                img.save(out, "JPEG", quality=JPEG_QUALITY, optimize=True)
            else:
                img.save(out, "PNG", optimize=True)
            cnt += 1
        except Exception as e:
            print("skip:", p, e)
    print(f"{src_dir} → {dst_dir} : {cnt} files")

# Standardize both sources into their *_std folders (originals are kept).
print("Standardizing REAL...")
standardize_dir(SRC_REAL, DST_REAL)
print("Standardizing FAKE...")
standardize_dir(SRC_FAKE, DST_FAKE)


Standardizing REAL...
/content/painted-ceramic-dataset-master/data/plate → /content/data/real_images_std : 1298 files
Standardizing FAKE...
/content/generated → /content/data/fake_images_std : 1200 files


In [None]:
# === 표준화된 이미지를 ImageFolder 구조로 복사 분할 (train/val/test=0.8/0.1/0.1) ===
import os, random, shutil
from pathlib import Path
from typing import List

TARGET_ROOT = Path("/content/data_real")  # ImageFolder root (created by this cell)
SPLIT = {"train": 0.8, "val": 0.1, "test": 0.1}  # train/val/test fractions
SEED = 42
random.seed(SEED)  # seeds the shuffles performed below

def list_images(d: Path):
    """Return the image files directly inside ``d`` (non-recursive), sorted.

    Files are matched by extension, case-insensitively. The result is sorted
    because ``glob`` order depends on the filesystem; a stable order is
    needed for a seeded shuffle of the list to be reproducible.
    """
    exts = {".jpg",".jpeg",".png",".webp",".bmp",".tiff"}
    return sorted(p for p in d.glob("*") if p.suffix.lower() in exts and p.is_file())

# Collect the standardized images of both classes and shuffle each list
# (uses the module-level `random` state seeded above).
real_list = list_images(DST_REAL)
fake_list = list_images(DST_FAKE)
random.shuffle(real_list); random.shuffle(fake_list)

def split_list(items, split_cfg):
    """Cut ``items`` into consecutive (train, val, test) slices.

    The train and val sizes are the fractions in ``split_cfg`` truncated to
    integers; the test slice receives whatever remains.
    """
    total = len(items)
    train_end = int(total * split_cfg["train"])
    val_end = train_end + int(total * split_cfg["val"])
    return items[:train_end], items[train_end:val_end], items[val_end:]

real_tr, real_va, real_te = split_list(real_list, SPLIT)
fake_tr, fake_va, fake_te = split_list(fake_list, SPLIT)

def copy_into(split_name, real_split, fake_split):
    """Copy one split into TARGET_ROOT/<split_name>/{real,fake}/.

    Files are renamed to ``<label>_<index><ext>`` so names never clash. A
    class folder is only created when its list is non-empty.
    """
    for label, paths in (("real", real_split), ("fake", fake_split)):
        if not paths:
            continue
        class_dir = TARGET_ROOT / split_name / label
        class_dir.mkdir(parents=True, exist_ok=True)
        for i, p in enumerate(paths):
            shutil.copy2(p, class_dir / f"{label}_{i:06d}{p.suffix.lower()}")

# Start from an empty tree so re-running the cell never mixes old and new files.
# (The former first copy pass wrote the real images into the "fake" folder and
# had to be wiped and redone; only the single correct pass remains.)
shutil.rmtree(TARGET_ROOT, ignore_errors=True)

copy_into("train", real_tr, fake_tr)
copy_into("val",   real_va, fake_va)
copy_into("test",  real_te, fake_te)

# Report how many files ended up in each split/class folder.
print("✅ ImageFolder ready at:", TARGET_ROOT)
for split in ["train","val","test"]:
    pr = TARGET_ROOT/split/"real"; pf = TARGET_ROOT/split/"fake"
    nr = len(list(pr.glob("*"))) if pr.exists() else 0
    nf = len(list(pf.glob("*"))) if pf.exists() else 0
    print(f"{split:5s} | real: {nr:4d} | fake: {nf:4d}")


✅ ImageFolder ready at: /content/data_real
train | real: 1038 | fake:  960
val   | real:  129 | fake:  120
test  | real:  131 | fake:  120


In [None]:
# === DataLoader ===
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
data_root = str(TARGET_ROOT)

# Train and eval use the same pipeline: tensor conversion followed by
# ImageNet mean/std normalization (no augmentation).
train_tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        [0.485, 0.456, 0.406],
        [0.229, 0.224, 0.225],
    ),
])
eval_tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        [0.485, 0.456, 0.406],
        [0.229, 0.224, 0.225],
    ),
])

train_set = datasets.ImageFolder(f"{data_root}/train", transform=train_tf)
val_set = datasets.ImageFolder(f"{data_root}/val", transform=eval_tf)
test_set = datasets.ImageFolder(f"{data_root}/test", transform=eval_tf)

print("class_to_idx:", train_set.class_to_idx)  # e.g. {'fake':0, 'real':1}

batch_size = 64  # 128-256 also fits on an A100 (adjust to available memory)
train_loader = DataLoader(
    train_set, batch_size=batch_size, shuffle=True,
    num_workers=2, pin_memory=True,
)
val_loader = DataLoader(
    val_set, batch_size=batch_size, shuffle=False,
    num_workers=2, pin_memory=True,
)
test_loader = DataLoader(
    test_set, batch_size=batch_size, shuffle=False,
    num_workers=2, pin_memory=True,
)

# pos_weight (positive class = real = 1) compensates for class imbalance.
idx_map = train_set.class_to_idx
neg = train_set.targets.count(idx_map['fake'])
pos = train_set.targets.count(idx_map['real'])
pos_weight = torch.tensor([neg / max(pos, 1)], device=DEVICE, dtype=torch.float32)
print(f"train counts -> fake(0)={neg}, real(1)={pos}, pos_weight={pos_weight.item():.3f}")


class_to_idx: {'fake': 0, 'real': 1}
train counts -> fake(0)=960, real(1)=1038, pos_weight=0.925


In [None]:
# === ResNet18 학습 ===
import torch
import torch.nn as nn
from torchvision.models import resnet18, ResNet18_Weights
from tqdm import tqdm

# Number of epochs the cosine schedule is spread over.
# Keep this equal to `epochs` in the training loop below: CosineAnnealingLR
# keeps following the cosine past T_max, so a shorter horizon makes the
# learning rate climb back up and oscillate for the rest of training.
NUM_EPOCHS = 100

# ImageNet-pretrained ResNet18 with the classifier replaced by a single logit.
model = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
in_features = model.fc.in_features
model.fc = nn.Linear(in_features, 1)  # binary classification (one logit)
model = model.to(DEVICE)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
# The scheduler is stepped once per batch, so T_max is counted in batches.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=max(1, NUM_EPOCHS * len(train_loader))
)

def run_epoch(loader, train_mode=True):
    model.train() if train_mode else model.eval()
    total_loss, total_correct, total = 0.0, 0, 0
    for imgs, labels in tqdm(loader, disable=False):
        imgs = imgs.to(DEVICE)
        labels = labels.to(DEVICE).float().unsqueeze(1)  # [B,1]
        if train_mode:
            optimizer.zero_grad(set_to_none=True)
        with torch.set_grad_enabled(train_mode):
            logits = model(imgs)
            loss = criterion(logits, labels)
            if train_mode:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
        total_loss += loss.item() * imgs.size(0)
        preds = (torch.sigmoid(logits) > 0.5).long()
        total_correct += (preds == labels.long()).sum().item()
        total += labels.size(0)
    return total_loss/total, total_correct/total

epochs, best_val = 100, float('inf')
best_path = "/content/resnet18_real_vs_fake_best.pth"

for ep in range(1, epochs+1):
    tr_loss, tr_acc = run_epoch(train_loader, True)
    val_loss, val_acc = run_epoch(val_loader, False)
    print(f"[{ep:02d}/{epochs}] Train {tr_loss:.4f}/{tr_acc:.3f} | Val {val_loss:.4f}/{val_acc:.3f}")
    if val_loss < best_val:
        best_val = val_loss
        torch.save(model.state_dict(), best_path)
        print("  ↳ best saved:", best_path)


100%|██████████| 32/32 [00:02<00:00, 14.62it/s]
100%|██████████| 4/4 [00:00<00:00,  7.45it/s]


[01/100] Train 0.1111/0.954 | Val 0.6221/0.904
  ↳ best saved: /content/resnet18_real_vs_fake_best.pth


100%|██████████| 32/32 [00:02<00:00, 14.60it/s]
100%|██████████| 4/4 [00:00<00:00,  7.27it/s]


[02/100] Train 0.0433/0.982 | Val 0.1630/0.944
  ↳ best saved: /content/resnet18_real_vs_fake_best.pth


100%|██████████| 32/32 [00:02<00:00, 14.55it/s]
100%|██████████| 4/4 [00:00<00:00,  7.26it/s]


[03/100] Train 0.0300/0.988 | Val 0.0814/0.976
  ↳ best saved: /content/resnet18_real_vs_fake_best.pth


100%|██████████| 32/32 [00:02<00:00, 14.60it/s]
100%|██████████| 4/4 [00:00<00:00,  7.37it/s]


[04/100] Train 0.0221/0.993 | Val 0.0451/0.976
  ↳ best saved: /content/resnet18_real_vs_fake_best.pth


100%|██████████| 32/32 [00:02<00:00, 14.26it/s]
100%|██████████| 4/4 [00:00<00:00,  7.39it/s]


[05/100] Train 0.0115/0.995 | Val 0.0775/0.972


100%|██████████| 32/32 [00:02<00:00, 14.51it/s]
100%|██████████| 4/4 [00:00<00:00,  7.24it/s]


[06/100] Train 0.0074/0.998 | Val 0.0652/0.980


100%|██████████| 32/32 [00:02<00:00, 14.32it/s]
100%|██████████| 4/4 [00:00<00:00,  7.33it/s]


[07/100] Train 0.0027/0.999 | Val 0.0539/0.976


100%|██████████| 32/32 [00:02<00:00, 14.38it/s]
100%|██████████| 4/4 [00:00<00:00,  7.39it/s]


[08/100] Train 0.0020/0.999 | Val 0.0737/0.980


100%|██████████| 32/32 [00:02<00:00, 14.38it/s]
100%|██████████| 4/4 [00:00<00:00,  7.40it/s]


[09/100] Train 0.0006/1.000 | Val 0.0625/0.980


100%|██████████| 32/32 [00:02<00:00, 14.13it/s]
100%|██████████| 4/4 [00:00<00:00,  7.57it/s]


[10/100] Train 0.0008/1.000 | Val 0.0697/0.980


100%|██████████| 32/32 [00:02<00:00, 14.45it/s]
100%|██████████| 4/4 [00:00<00:00,  7.41it/s]


[11/100] Train 0.0009/0.999 | Val 0.0688/0.980


100%|██████████| 32/32 [00:02<00:00, 14.72it/s]
100%|██████████| 4/4 [00:00<00:00,  7.41it/s]


[12/100] Train 0.0004/1.000 | Val 0.0632/0.980


100%|██████████| 32/32 [00:02<00:00, 14.42it/s]
100%|██████████| 4/4 [00:00<00:00,  7.42it/s]


[13/100] Train 0.0005/1.000 | Val 0.0579/0.980


100%|██████████| 32/32 [00:02<00:00, 14.43it/s]
100%|██████████| 4/4 [00:00<00:00,  7.44it/s]


[14/100] Train 0.0003/1.000 | Val 0.0611/0.980


100%|██████████| 32/32 [00:02<00:00, 14.19it/s]
100%|██████████| 4/4 [00:00<00:00,  7.36it/s]


[15/100] Train 0.0002/1.000 | Val 0.0562/0.980


100%|██████████| 32/32 [00:02<00:00, 14.54it/s]
100%|██████████| 4/4 [00:00<00:00,  7.42it/s]


[16/100] Train 0.0005/1.000 | Val 0.0525/0.980


100%|██████████| 32/32 [00:02<00:00, 14.78it/s]
100%|██████████| 4/4 [00:00<00:00,  7.38it/s]


[17/100] Train 0.0013/1.000 | Val 0.0673/0.976


100%|██████████| 32/32 [00:02<00:00, 14.73it/s]
100%|██████████| 4/4 [00:00<00:00,  7.42it/s]


[18/100] Train 0.0380/0.989 | Val 0.6567/0.940


100%|██████████| 32/32 [00:02<00:00, 14.80it/s]
100%|██████████| 4/4 [00:00<00:00,  7.45it/s]


[19/100] Train 0.0661/0.977 | Val 0.1353/0.956


100%|██████████| 32/32 [00:02<00:00, 14.21it/s]
100%|██████████| 4/4 [00:00<00:00,  7.45it/s]


[20/100] Train 0.0411/0.984 | Val 0.2695/0.948


100%|██████████| 32/32 [00:02<00:00, 14.87it/s]
100%|██████████| 4/4 [00:00<00:00,  7.46it/s]


[21/100] Train 0.0321/0.988 | Val 0.1729/0.948


100%|██████████| 32/32 [00:02<00:00, 14.61it/s]
100%|██████████| 4/4 [00:00<00:00,  7.45it/s]


[22/100] Train 0.0301/0.987 | Val 0.1709/0.944


100%|██████████| 32/32 [00:02<00:00, 14.44it/s]
100%|██████████| 4/4 [00:00<00:00,  7.39it/s]


[23/100] Train 0.0128/0.994 | Val 0.0517/0.972


100%|██████████| 32/32 [00:02<00:00, 14.64it/s]
100%|██████████| 4/4 [00:00<00:00,  7.35it/s]


[24/100] Train 0.0023/1.000 | Val 0.0510/0.980


100%|██████████| 32/32 [00:02<00:00, 14.64it/s]
100%|██████████| 4/4 [00:00<00:00,  7.33it/s]


[25/100] Train 0.0108/0.997 | Val 0.0548/0.976


100%|██████████| 32/32 [00:02<00:00, 14.42it/s]
100%|██████████| 4/4 [00:00<00:00,  7.40it/s]


[26/100] Train 0.0050/0.998 | Val 0.0421/0.984
  ↳ best saved: /content/resnet18_real_vs_fake_best.pth


100%|██████████| 32/32 [00:02<00:00, 14.35it/s]
100%|██████████| 4/4 [00:00<00:00,  7.42it/s]


[27/100] Train 0.0019/1.000 | Val 0.0609/0.980


100%|██████████| 32/32 [00:02<00:00, 14.26it/s]
100%|██████████| 4/4 [00:00<00:00,  7.37it/s]


[28/100] Train 0.0011/0.999 | Val 0.0559/0.984


100%|██████████| 32/32 [00:02<00:00, 14.31it/s]
100%|██████████| 4/4 [00:00<00:00,  7.46it/s]


[29/100] Train 0.0013/0.999 | Val 0.0446/0.984


100%|██████████| 32/32 [00:02<00:00, 14.84it/s]
100%|██████████| 4/4 [00:00<00:00,  7.45it/s]


[30/100] Train 0.0005/1.000 | Val 0.0467/0.980


100%|██████████| 32/32 [00:02<00:00, 14.55it/s]
100%|██████████| 4/4 [00:00<00:00,  7.34it/s]


[31/100] Train 0.0014/0.999 | Val 0.0484/0.984


100%|██████████| 32/32 [00:02<00:00, 14.52it/s]
100%|██████████| 4/4 [00:00<00:00,  7.39it/s]


[32/100] Train 0.0009/1.000 | Val 0.0467/0.984


100%|██████████| 32/32 [00:02<00:00, 14.23it/s]
100%|██████████| 4/4 [00:00<00:00,  7.44it/s]


[33/100] Train 0.0005/1.000 | Val 0.0511/0.984


100%|██████████| 32/32 [00:02<00:00, 14.96it/s]
100%|██████████| 4/4 [00:00<00:00,  7.37it/s]


[34/100] Train 0.0009/0.999 | Val 0.0642/0.980


100%|██████████| 32/32 [00:02<00:00, 14.55it/s]
100%|██████████| 4/4 [00:00<00:00,  7.38it/s]


[35/100] Train 0.0009/0.999 | Val 0.0529/0.976


100%|██████████| 32/32 [00:02<00:00, 14.24it/s]
100%|██████████| 4/4 [00:00<00:00,  7.34it/s]


[36/100] Train 0.0002/1.000 | Val 0.0611/0.976


100%|██████████| 32/32 [00:02<00:00, 14.27it/s]
100%|██████████| 4/4 [00:00<00:00,  7.45it/s]


[37/100] Train 0.0003/1.000 | Val 0.0663/0.976


100%|██████████| 32/32 [00:02<00:00, 14.22it/s]
100%|██████████| 4/4 [00:00<00:00,  7.45it/s]


[38/100] Train 0.0007/1.000 | Val 0.0681/0.972


100%|██████████| 32/32 [00:02<00:00, 14.87it/s]
100%|██████████| 4/4 [00:00<00:00,  7.48it/s]


[39/100] Train 0.0123/0.995 | Val 0.0427/0.988


100%|██████████| 32/32 [00:02<00:00, 14.58it/s]
100%|██████████| 4/4 [00:00<00:00,  7.41it/s]


[40/100] Train 0.0181/0.993 | Val 0.1331/0.972


100%|██████████| 32/32 [00:02<00:00, 14.51it/s]
100%|██████████| 4/4 [00:00<00:00,  7.49it/s]


[41/100] Train 0.0164/0.994 | Val 0.0392/0.980
  ↳ best saved: /content/resnet18_real_vs_fake_best.pth


100%|██████████| 32/32 [00:02<00:00, 14.37it/s]
100%|██████████| 4/4 [00:00<00:00,  7.52it/s]


[42/100] Train 0.0161/0.994 | Val 0.1796/0.960


100%|██████████| 32/32 [00:02<00:00, 14.77it/s]
100%|██████████| 4/4 [00:00<00:00,  7.41it/s]


[43/100] Train 0.0193/0.992 | Val 0.0399/0.988


100%|██████████| 32/32 [00:02<00:00, 14.51it/s]
100%|██████████| 4/4 [00:00<00:00,  7.54it/s]


[44/100] Train 0.0129/0.997 | Val 0.0582/0.984


100%|██████████| 32/32 [00:02<00:00, 14.70it/s]
100%|██████████| 4/4 [00:00<00:00,  7.41it/s]


[45/100] Train 0.0038/0.999 | Val 0.0450/0.980


100%|██████████| 32/32 [00:02<00:00, 14.52it/s]
100%|██████████| 4/4 [00:00<00:00,  7.43it/s]


[46/100] Train 0.0028/0.999 | Val 0.0516/0.984


100%|██████████| 32/32 [00:02<00:00, 14.49it/s]
100%|██████████| 4/4 [00:00<00:00,  7.46it/s]


[47/100] Train 0.0048/0.998 | Val 0.0675/0.980


100%|██████████| 32/32 [00:02<00:00, 14.73it/s]
100%|██████████| 4/4 [00:00<00:00,  7.35it/s]


[48/100] Train 0.0003/1.000 | Val 0.0583/0.980


100%|██████████| 32/32 [00:02<00:00, 14.62it/s]
100%|██████████| 4/4 [00:00<00:00,  7.43it/s]


[49/100] Train 0.0004/1.000 | Val 0.0607/0.980


100%|██████████| 32/32 [00:02<00:00, 14.66it/s]
100%|██████████| 4/4 [00:00<00:00,  7.19it/s]


[50/100] Train 0.0003/1.000 | Val 0.0541/0.984


100%|██████████| 32/32 [00:02<00:00, 14.42it/s]
100%|██████████| 4/4 [00:00<00:00,  7.52it/s]


[51/100] Train 0.0007/0.999 | Val 0.0577/0.980


100%|██████████| 32/32 [00:02<00:00, 14.53it/s]
100%|██████████| 4/4 [00:00<00:00,  7.44it/s]


[52/100] Train 0.0012/0.999 | Val 0.0614/0.980


100%|██████████| 32/32 [00:02<00:00, 14.36it/s]
100%|██████████| 4/4 [00:00<00:00,  7.41it/s]


[53/100] Train 0.0003/1.000 | Val 0.0554/0.980


100%|██████████| 32/32 [00:02<00:00, 14.47it/s]
100%|██████████| 4/4 [00:00<00:00,  7.46it/s]


[54/100] Train 0.0009/1.000 | Val 0.0581/0.980


100%|██████████| 32/32 [00:02<00:00, 14.59it/s]
100%|██████████| 4/4 [00:00<00:00,  7.54it/s]


[55/100] Train 0.0058/0.999 | Val 0.0437/0.984


100%|██████████| 32/32 [00:02<00:00, 14.61it/s]
100%|██████████| 4/4 [00:00<00:00,  7.35it/s]


[56/100] Train 0.0061/0.997 | Val 0.0398/0.984


100%|██████████| 32/32 [00:02<00:00, 14.90it/s]
100%|██████████| 4/4 [00:00<00:00,  7.41it/s]


[57/100] Train 0.0013/0.999 | Val 0.0537/0.980


100%|██████████| 32/32 [00:02<00:00, 14.36it/s]
100%|██████████| 4/4 [00:00<00:00,  7.45it/s]


[58/100] Train 0.0023/0.999 | Val 0.0709/0.980


100%|██████████| 32/32 [00:02<00:00, 14.63it/s]
100%|██████████| 4/4 [00:00<00:00,  7.19it/s]


[59/100] Train 0.0036/0.999 | Val 0.0564/0.984


100%|██████████| 32/32 [00:02<00:00, 14.49it/s]
100%|██████████| 4/4 [00:00<00:00,  7.48it/s]


[60/100] Train 0.0039/0.998 | Val 0.0830/0.976


100%|██████████| 32/32 [00:02<00:00, 14.76it/s]
100%|██████████| 4/4 [00:00<00:00,  7.35it/s]


[61/100] Train 0.0133/0.995 | Val 0.1784/0.948


100%|██████████| 32/32 [00:02<00:00, 14.37it/s]
100%|██████████| 4/4 [00:00<00:00,  7.50it/s]


[62/100] Train 0.0181/0.994 | Val 0.0238/0.988
  ↳ best saved: /content/resnet18_real_vs_fake_best.pth


100%|██████████| 32/32 [00:02<00:00, 14.66it/s]
100%|██████████| 4/4 [00:00<00:00,  7.47it/s]


[63/100] Train 0.0139/0.996 | Val 0.0382/0.980


100%|██████████| 32/32 [00:02<00:00, 14.20it/s]
100%|██████████| 4/4 [00:00<00:00,  7.33it/s]


[64/100] Train 0.0115/0.997 | Val 0.0425/0.984


100%|██████████| 32/32 [00:02<00:00, 14.49it/s]
100%|██████████| 4/4 [00:00<00:00,  7.25it/s]


[65/100] Train 0.0025/1.000 | Val 0.0226/0.992
  ↳ best saved: /content/resnet18_real_vs_fake_best.pth


100%|██████████| 32/32 [00:02<00:00, 14.78it/s]
100%|██████████| 4/4 [00:00<00:00,  7.43it/s]


[66/100] Train 0.0006/1.000 | Val 0.0215/0.992
  ↳ best saved: /content/resnet18_real_vs_fake_best.pth


100%|██████████| 32/32 [00:02<00:00, 14.60it/s]
100%|██████████| 4/4 [00:00<00:00,  7.41it/s]


[67/100] Train 0.0005/1.000 | Val 0.0254/0.988


100%|██████████| 32/32 [00:02<00:00, 14.58it/s]
100%|██████████| 4/4 [00:00<00:00,  7.29it/s]


[68/100] Train 0.0006/1.000 | Val 0.0224/0.984


100%|██████████| 32/32 [00:02<00:00, 14.57it/s]
100%|██████████| 4/4 [00:00<00:00,  7.43it/s]


[69/100] Train 0.0003/1.000 | Val 0.0184/0.996
  ↳ best saved: /content/resnet18_real_vs_fake_best.pth


100%|██████████| 32/32 [00:02<00:00, 14.79it/s]
100%|██████████| 4/4 [00:00<00:00,  7.34it/s]


[70/100] Train 0.0002/1.000 | Val 0.0182/0.992
  ↳ best saved: /content/resnet18_real_vs_fake_best.pth


100%|██████████| 32/32 [00:02<00:00, 14.45it/s]
100%|██████████| 4/4 [00:00<00:00,  7.52it/s]


[71/100] Train 0.0002/1.000 | Val 0.0216/0.984


100%|██████████| 32/32 [00:02<00:00, 14.85it/s]
100%|██████████| 4/4 [00:00<00:00,  7.54it/s]


[72/100] Train 0.0003/1.000 | Val 0.0195/0.988


100%|██████████| 32/32 [00:02<00:00, 14.51it/s]
100%|██████████| 4/4 [00:00<00:00,  7.40it/s]


[73/100] Train 0.0003/1.000 | Val 0.0250/0.984


100%|██████████| 32/32 [00:02<00:00, 14.53it/s]
100%|██████████| 4/4 [00:00<00:00,  7.41it/s]


[74/100] Train 0.0002/1.000 | Val 0.0218/0.984


100%|██████████| 32/32 [00:02<00:00, 14.36it/s]
100%|██████████| 4/4 [00:00<00:00,  7.43it/s]


[75/100] Train 0.0002/1.000 | Val 0.0215/0.988


100%|██████████| 32/32 [00:02<00:00, 14.40it/s]
100%|██████████| 4/4 [00:00<00:00,  7.56it/s]


[76/100] Train 0.0003/1.000 | Val 0.0182/0.984


100%|██████████| 32/32 [00:02<00:00, 14.60it/s]
100%|██████████| 4/4 [00:00<00:00,  7.39it/s]


[77/100] Train 0.0009/1.000 | Val 0.0336/0.984


100%|██████████| 32/32 [00:02<00:00, 14.54it/s]
100%|██████████| 4/4 [00:00<00:00,  7.39it/s]


[78/100] Train 0.0007/1.000 | Val 0.0330/0.984


100%|██████████| 32/32 [00:02<00:00, 14.43it/s]
100%|██████████| 4/4 [00:00<00:00,  7.40it/s]


[79/100] Train 0.0005/1.000 | Val 0.0434/0.992


100%|██████████| 32/32 [00:02<00:00, 14.16it/s]
100%|██████████| 4/4 [00:00<00:00,  7.40it/s]


[80/100] Train 0.0001/1.000 | Val 0.0446/0.992


100%|██████████| 32/32 [00:02<00:00, 14.43it/s]
100%|██████████| 4/4 [00:00<00:00,  7.27it/s]


[81/100] Train 0.0000/1.000 | Val 0.0427/0.980


100%|██████████| 32/32 [00:02<00:00, 14.49it/s]
100%|██████████| 4/4 [00:00<00:00,  7.32it/s]


[82/100] Train 0.0000/1.000 | Val 0.0412/0.992


100%|██████████| 32/32 [00:02<00:00, 14.72it/s]
100%|██████████| 4/4 [00:00<00:00,  7.51it/s]


[83/100] Train 0.0000/1.000 | Val 0.0375/0.980


100%|██████████| 32/32 [00:02<00:00, 14.70it/s]
100%|██████████| 4/4 [00:00<00:00,  7.32it/s]


[84/100] Train 0.0000/1.000 | Val 0.0417/0.984


100%|██████████| 32/32 [00:02<00:00, 14.56it/s]
100%|██████████| 4/4 [00:00<00:00,  7.42it/s]


[85/100] Train 0.0000/1.000 | Val 0.0402/0.980


100%|██████████| 32/32 [00:02<00:00, 14.27it/s]
100%|██████████| 4/4 [00:00<00:00,  7.40it/s]


[86/100] Train 0.0000/1.000 | Val 0.0475/0.980


100%|██████████| 32/32 [00:02<00:00, 14.78it/s]
100%|██████████| 4/4 [00:00<00:00,  7.43it/s]


[87/100] Train 0.0000/1.000 | Val 0.0413/0.980


100%|██████████| 32/32 [00:02<00:00, 14.72it/s]
100%|██████████| 4/4 [00:00<00:00,  7.46it/s]


[88/100] Train 0.0001/1.000 | Val 0.0436/0.984


100%|██████████| 32/32 [00:02<00:00, 14.68it/s]
100%|██████████| 4/4 [00:00<00:00,  7.34it/s]


[89/100] Train 0.0000/1.000 | Val 0.0412/0.980


100%|██████████| 32/32 [00:02<00:00, 14.57it/s]
100%|██████████| 4/4 [00:00<00:00,  7.42it/s]


[90/100] Train 0.0000/1.000 | Val 0.0372/0.980


100%|██████████| 32/32 [00:02<00:00, 14.51it/s]
100%|██████████| 4/4 [00:00<00:00,  7.44it/s]


[91/100] Train 0.0000/1.000 | Val 0.0378/0.980


100%|██████████| 32/32 [00:02<00:00, 14.71it/s]
100%|██████████| 4/4 [00:00<00:00,  7.43it/s]


[92/100] Train 0.0000/1.000 | Val 0.0502/0.980


100%|██████████| 32/32 [00:02<00:00, 14.75it/s]
100%|██████████| 4/4 [00:00<00:00,  7.31it/s]


[93/100] Train 0.0000/1.000 | Val 0.0369/0.980


100%|██████████| 32/32 [00:02<00:00, 14.63it/s]
100%|██████████| 4/4 [00:00<00:00,  7.40it/s]


[94/100] Train 0.0000/1.000 | Val 0.0363/0.980


100%|██████████| 32/32 [00:02<00:00, 14.54it/s]
100%|██████████| 4/4 [00:00<00:00,  7.30it/s]


[95/100] Train 0.0000/1.000 | Val 0.0364/0.980


100%|██████████| 32/32 [00:02<00:00, 14.51it/s]
100%|██████████| 4/4 [00:00<00:00,  7.40it/s]


[96/100] Train 0.0000/1.000 | Val 0.0653/0.988


100%|██████████| 32/32 [00:02<00:00, 14.44it/s]
100%|██████████| 4/4 [00:00<00:00,  7.48it/s]


[97/100] Train 0.0000/1.000 | Val 0.0459/0.980


100%|██████████| 32/32 [00:02<00:00, 14.76it/s]
100%|██████████| 4/4 [00:00<00:00,  7.28it/s]


[98/100] Train 0.0000/1.000 | Val 0.0515/0.984


100%|██████████| 32/32 [00:02<00:00, 14.58it/s]
100%|██████████| 4/4 [00:00<00:00,  7.32it/s]


[99/100] Train 0.0000/1.000 | Val 0.0433/0.980


100%|██████████| 32/32 [00:02<00:00, 14.75it/s]
100%|██████████| 4/4 [00:00<00:00,  7.31it/s]

[100/100] Train 0.0000/1.000 | Val 0.0467/0.976





In [None]:
# === Test evaluation ===
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_auc_score, classification_report

# Load the best checkpoint. The file holds a plain state_dict, so
# weights_only=True is sufficient and avoids unpickling arbitrary objects.
model.load_state_dict(
    torch.load("/content/resnet18_real_vs_fake_best.pth", map_location=DEVICE, weights_only=True)
)
model.eval()

all_labels, all_probs, all_preds = [], [], []

with torch.no_grad():
    for imgs, labels in tqdm(test_loader, disable=False):
        imgs = imgs.to(DEVICE)
        labels = labels.to(DEVICE)
        logits = model(imgs)
        probs = torch.sigmoid(logits).squeeze(1)   # [B,1] -> [B]
        preds = (probs > 0.5).long()
        all_labels.append(labels.cpu().numpy())
        all_probs.append(probs.cpu().numpy())
        all_preds.append(preds.cpu().numpy())

# Stitch the per-batch arrays into one array per quantity.
y_true = np.concatenate(all_labels)
y_prob = np.concatenate(all_probs)
y_pred = np.concatenate(all_preds)

acc = accuracy_score(y_true, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
cm = confusion_matrix(y_true, y_pred)
try:
    auc = roc_auc_score(y_true, y_prob)
except ValueError as err:
    # Raised e.g. when y_true contains a single class; report why instead of
    # silently hiding the failure, and keep unrelated errors visible.
    print(f"ROC-AUC undefined: {err}")
    auc = float('nan')

print("\n=== Test Metrics ===")
print(f"Accuracy : {acc:.3f}")
print(f"Precision: {prec:.3f} | Recall: {rec:.3f} | F1: {f1:.3f}")
print(f"ROC-AUC  : {auc:.3f}")
print("Confusion Matrix (rows=true [0=fake,1=real], cols=pred):\n", cm)
print("\nClassification Report:\n", classification_report(y_true, y_pred, digits=3, zero_division=0))


100%|██████████| 4/4 [00:00<00:00,  7.46it/s]


=== Test Metrics ===
Accuracy : 0.992
Precision: 0.992 | Recall: 0.992 | F1: 0.992
ROC-AUC  : 1.000
Confusion Matrix (rows=true [0=fake,1=real], cols=pred):
 [[119   1]
 [  1 130]]

Classification Report:
               precision    recall  f1-score   support

           0      0.992     0.992     0.992       120
           1      0.992     0.992     0.992       131

    accuracy                          0.992       251
   macro avg      0.992     0.992     0.992       251
weighted avg      0.992     0.992     0.992       251






## **프롬프트**

In [None]:
# Text conditioning for the sample.
prompt = "a single plate, ceramic, ultra detailed, one color background, no shadow"
negative_prompt = "blurry, lowres, text, watermark, logo, distorted"

# Sampler settings.
num_inference_steps = 30        # 20-35 recommended (use 20 for speed)
guidance_scale = 7.0            # 6.5-8.0 recommended (too high over-constrains / adds noise)
height, width = 512, 512        # SD 1.x default is 512 (other sizes: mind UNet tiling / VRAM)
seed = 42                       # keep fixed when reproducibility is needed

# Seeded generator so repeated runs draw the same initial noise.
generator = torch.Generator(device).manual_seed(seed)

sampling_kwargs = dict(
    prompt=prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=num_inference_steps,
    guidance_scale=guidance_scale,
    height=height,
    width=width,
    generator=generator,
    output_type="latent",       # return latents only, skip image decoding
)

with torch.inference_mode():
    out = pipe(**sampling_kwargs)

# Per the original note: expected shape [B,4,64,64] with scaling already
# applied — TODO confirm against the pipeline in use.
latents = out.images


  0%|          | 0/30 [00:00<?, ?it/s]

## **DCGAN 이미지 저장 (작동 X)**

In [None]:
# Decode the sampled latents with `decoder_same` and save the first image.
with torch.no_grad():
    dtype = next(decoder_same.parameters()).dtype
    z = latents.to(device, dtype=dtype)       # match the decoder's parameter dtype
    x = decoder_same(z)                       # expected in [-1,1] per the original note
x01 = ((x.clamp(-1,1) + 1) / 2)               # rescale to [0,1]

# Build the PIL images from this cell's decoder output instead of relying on a
# `pil_images` variable left over from another cell.
# NOTE(review): assumes x is [B,3,H,W] (RGB, channels-first) — confirm against decoder_same.
x_uint8 = (x01.detach().float().cpu().permute(0, 2, 3, 1) * 255).round().to(torch.uint8).numpy()
pil_images = [Image.fromarray(frame) for frame in x_uint8]

save_path = "/content/images/test1.png"
# The target folder does not exist on a fresh runtime; create it before saving.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
pil_images[0].save(save_path)
print("Saved:", save_path)

FileNotFoundError: [Errno 2] No such file or directory: '/content/images/test1.png'

## **디버깅용**

In [None]:
# Debug helper: report the dtype of the sampled latents (None when no
# `latents` variable exists yet) and of the decoder's first parameter.
if 'latents' in globals():
    print("latents:", latents.dtype)
else:
    print("latents:", None)

try:
    p = next(dcgan_decoder.parameters())
    print("decoder weight:", p.dtype)
except Exception as e:
    # Best-effort diagnostic: show whatever went wrong (e.g. decoder not defined).
    print("decoder:", e)

latents: torch.float32
decoder weight: torch.float32


In [None]:
import shutil

# Folder holding the saved images.
folder_path = "/content/generated"

# Archive base name; make_archive appends the ".zip" extension itself.
zip_name = "/content/fake"

# Pack the whole folder into <zip_name>.zip.
shutil.make_archive(base_name=zip_name, format='zip', root_dir=folder_path)

print(f"압축 완료: {zip_name}.zip")

압축 완료: /content/fake.zip
