In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import torch_directml
import timm  # timm 라이브러리 사용
import gc  # Garbage Collector 사용

def free_memory():
    """Reclaim unreferenced Python objects and cached CUDA memory.

    Runs a full garbage-collection pass and then, when a CUDA device is
    usable, asks PyTorch to release the unused blocks held by its CUDA
    caching allocator.

    NOTE(review): the CUDA step only affects CUDA devices; whether it has any
    effect for the DirectML device used in this file should be confirmed.
    """
    # Collect unreachable Python objects (including reference cycles) first so
    # that any tensors they own are released before the cache is emptied.
    gc.collect()
    # `torch.has_cuda` is deprecated and warns on access; query CUDA
    # availability through the supported API instead.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Select the DirectML device that the model and every batch are moved to.
device = torch_directml.device()

# Preprocessing applied to every MNIST image before it reaches the model.
transform = transforms.Compose([
    # MNIST is single-channel; replicate it to the 3 channels the model takes.
    transforms.Grayscale(num_output_channels=3),
    # Upscale to the 224x224 input size of the ViT used below.
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    # Map pixel values from [0, 1] to [-1, 1].
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

# MNIST train / test splits (downloaded into ./data when missing).
train_dataset = datasets.MNIST('./data', train=True, transform=transform, download=True)
test_dataset = datasets.MNIST('./data', train=False, transform=transform, download=True)

# Mini-batches of 8; only the training data is shuffled.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)

# Pretrained ViT-Base (16x16 patches, 224x224 input) from timm.
vit_model = timm.create_model('vit_base_patch16_224', pretrained=True)
# Replace the classification head with one that has 10 outputs (MNIST digits).
vit_model.head = nn.Linear(in_features=vit_model.head.in_features, out_features=10)
vit_model = vit_model.to(device)

# Loss function and optimizer.
criterion = nn.CrossEntropyLoss()
# NOTE(review): lr=1e-3 may be high for fine-tuning every weight of a
# pretrained ViT; confirm that training actually converges with it.
optimizer = optim.Adam(params=vit_model.parameters(), lr=1e-3)

# 학습 함수 정의
def train(model, device, train_loader, optimizer, criterion, epoch):
    """Run one pass of gradient updates over ``train_loader``.

    Args:
        model: Module to optimise; it is switched to training mode.
        device: Device each batch is moved to before the forward pass.
        train_loader: Iterable of ``(inputs, labels)`` batches that also
            exposes ``.dataset`` (used for the progress denominator).
        optimizer: Optimizer that updates the model's parameters.
        criterion: Callable ``criterion(output, target)`` returning the loss.
        epoch: Epoch number, used only in the progress message.
    """
    model.train()
    for step, batch in enumerate(train_loader):
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Standard update: clear old gradients, forward, backward, step.
        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        # Release garbage and cached memory after every step.
        free_memory()

        # Report progress only on every 100th batch.
        if step % 100 != 0:
            continue
        seen = step * len(inputs)
        total = len(train_loader.dataset)
        print(f'Epoch: {epoch} [{seen}/{total}] '
              f'Loss: {loss.item():.6f}')

# 평가 함수 정의
def test(model, device, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()  # 배치 손실 더하기
            pred = output.argmax(dim=1, keepdim=True)  # 가장 높은 확률의 클래스 예측
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    print(f'\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} '
          f'({100. * correct / len(test_loader.dataset):.2f}%)\n')

# Run training: each epoch makes one pass over the training set and is
# followed by an evaluation on the test set.
num_epochs = 1
for epoch in range(1, num_epochs + 1):  # epochs are numbered from 1 in the progress log
    train(vit_model, device, train_loader, optimizer, criterion, epoch)
    test(vit_model, device, test_loader, criterion)


  torch._foreach_lerp_(device_exp_avgs, device_grads, 1 - beta1)
  if torch.has_cuda:


Epoch: 1 [0/60000] Loss: 2.354912
