In [12]:
import os
# Restrict which GPUs this process may see. This is set before `import torch`
# further down in this cell.
# NOTE(review): the variable is presumably only honoured if it is set before
# CUDA is initialised, so changing the GPU list likely requires a kernel
# restart rather than re-running a later cell -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import time
import random

import numpy as np
import torch
import torch.nn as nn
from torchvision.models import resnet152
from torchvision.datasets import CIFAR10, CIFAR100
from torchvision.transforms import v2
from torch.utils.data import DataLoader

# Fix every RNG seed (Python, NumPy, PyTorch CPU and all CUDA devices)
# so that repeated runs start from the same random state.
seed = 0
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed)

# Turn off the cuDNN autotuner and request deterministic cuDNN kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [13]:
# Dataset
# Convert each PIL image to a float32 tensor scaled to [0, 1].
# `v2.ToTensor()` is deprecated in torchvision.transforms.v2; the documented
# replacement is `v2.ToImage()` followed by `v2.ToDtype(..., scale=True)`.
transform = v2.Compose([
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True)
])

# To reproduce the CIFAR-10 runs, swap the dataset class below:
# dataset = CIFAR10(
#     root="./data",
#     transform=transform,
#     download=True
# )
dataset = CIFAR100(
    root="./data",
    transform=transform,
    download=True
)

# batch_size=2056 is kept as-is because the recorded results further down
# are labelled with this exact value.
dataloader = DataLoader(
    dataset,
    batch_size=2056,
    shuffle=True,
    num_workers=4,
    drop_last=False  # the final, smaller batch is kept
)

Files already downloaded and verified


In [14]:
# Sanity check: number of samples in the loaded dataset.
len(dataset)

50000

In [15]:
# Model: ImageNet-pretrained ResNet-152 with its classifier head replaced.
model = resnet152(weights="IMAGENET1K_V1")
in_features = model.fc.in_features
# Size the new head from the dataset itself so that switching between
# CIFAR-10 and CIFAR-100 cannot leave a stale, hand-edited class count behind.
num_classes = len(dataset.classes)
model.fc = nn.Linear(in_features, num_classes)

# Loss and optimizer
lr = 0.0001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=lr,
    betas=(0.9, 0.999),
    weight_decay=0.0001
)

In [16]:
# Task 1: measure the training time as a function of the number of GPUs.
#
# Experiment settings
# The set of usable GPUs is chosen through CUDA_VISIBLE_DEVICES in the first
# cell. Re-assigning the variable here, after torch has been imported and may
# already have initialised CUDA, is not a reliable way to change the GPU
# count; edit the first cell and restart the kernel instead.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# Guard against re-running this cell: wrapping an already wrapped model would
# nest DataParallel instances inside each other.
if not isinstance(model, nn.DataParallel):
    model = nn.DataParallel(model)
model = model.to(device)

cuda


In [None]:
# Experiment: train for a fixed number of epochs and report wall-clock time.

# CUDA work is queued asynchronously, so wait for pending kernels before
# reading the clock; perf_counter() is the monotonic timer meant for intervals.
if device.type == "cuda":
    torch.cuda.synchronize()
start_time = time.perf_counter()

epochs = 5
for epoch in range(epochs):
    train_loss = 0.0
    model.train()
    for img, label in dataloader:
        optimizer.zero_grad()
        img, label = img.to(device), label.to(device)
        p_label = model(img)
        loss = criterion(p_label, label)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Mean of the per-batch mean losses. With drop_last=False the final,
    # smaller batch carries the same weight as a full batch.
    train_loss /= len(dataloader)
    print(f"epoch: {epoch+1}, train loss: {train_loss}")

# Make sure the last optimizer step has actually finished on the GPU before
# stopping the timer.
if device.type == "cuda":
    torch.cuda.synchronize()
end_time = time.perf_counter()
print(f"学習時間: {end_time - start_time:.1f}seconds")


epoch: 1, train loss: 3.965126323699951
epoch: 2, train loss: 2.5156394481658935
epoch: 3, train loss: 1.6912783765792847
epoch: 4, train loss: 1.0810856628417969
epoch: 5, train loss: 0.6142773771286011
学習時間: 45.7seconds


### CIFAR 10結果

In [18]:
# Batch 512, ResNet101

# GPU 1
# epoch: 1, train loss: 1.0761167759798012
# epoch: 2, train loss: 0.4412334415377403
# epoch: 3, train loss: 0.17696894462011298
# epoch: 4, train loss: 0.08636364623980254
# epoch: 5, train loss: 0.07115460871433725
# 学習時間: 49.6seconds

# GPU 2
# epoch: 1, train loss: 1.1051142185318226
# epoch: 2, train loss: 0.49897604785403427
# epoch: 3, train loss: 0.2556265328003436
# epoch: 4, train loss: 0.14925496857993456
# epoch: 5, train loss: 0.09898413162754506
# 学習時間: 105.9seconds

# GPU 4
# epoch: 1, train loss: 1.1051142185318226
# epoch: 2, train loss: 0.49897604785403427
# epoch: 3, train loss: 0.2556265328003436
# epoch: 4, train loss: 0.14925496857993456
# epoch: 5, train loss: 0.09898413162754506
# 学習時間: 106.3seconds

In [19]:
# Batch 1024, ResNet152

# GPU 1
# epoch: 1, train loss: 1.1836567788707966
# epoch: 2, train loss: 0.47651875201536686
# epoch: 3, train loss: 0.17617712489196233
# epoch: 4, train loss: 0.059560982624487
# epoch: 5, train loss: 0.03227344140106318
# 学習時間: 64.1seconds

# GPU2
# epoch: 1, train loss: 1.1858141397943303
# epoch: 2, train loss: 0.4831528505500482
# epoch: 3, train loss: 0.18664904820675754
# epoch: 4, train loss: 0.07077507781130928
# epoch: 5, train loss: 0.04813307694786666
# 学習時間: 66.0seconds

# GPU4
# epoch: 1, train loss: 1.1858141397943303
# epoch: 2, train loss: 0.4831528505500482
# epoch: 3, train loss: 0.18664904820675754
# epoch: 4, train loss: 0.07077507781130928
# epoch: 5, train loss: 0.04813307694786666
# 学習時間: 62.5seconds

In [20]:
# Batch 2056, ResNet152

# GPU 2
# epoch: 1, train loss: 1.3980751490592958
# epoch: 2, train loss: 0.6175890874862671
# epoch: 3, train loss: 0.29048303663730624
# epoch: 4, train loss: 0.10367942631244659
# epoch: 5, train loss: 0.038534104675054554
# 学習時間: 45.3seconds

# おまけ GPU 1
# epoch: 1, train loss: 1.3995373821258545
# epoch: 2, train loss: 0.6193314146995544
# epoch: 3, train loss: 0.2864131611585617
# epoch: 4, train loss: 0.09750533431768417
# epoch: 5, train loss: 0.03211120568215847
# 学習時間: 85.2seconds

# おまけ GPU 4（GPU2と全く同じ）
# epoch: 1, train loss: 1.3980751490592958
# epoch: 2, train loss: 0.6175890874862671
# epoch: 3, train loss: 0.29048303663730624
# epoch: 4, train loss: 0.10367942631244659
# epoch: 5, train loss: 0.038534104675054554
# 学習時間: 45.3seconds

### CIFAR 100結果

In [None]:
# Batch 1024, GPU1
# epoch: 1, train loss: 3.5558944672954325
# epoch: 2, train loss: 1.9490852112672767
# epoch: 3, train loss: 1.1722316109404272
# epoch: 4, train loss: 0.6205556100728561
# epoch: 5, train loss: 0.2922739988687087
# 学習時間: 61.9seconds

# Batch 1024, GPU2
# epoch: 1, train loss: 3.5558944672954325
# epoch: 2, train loss: 1.9490852112672767
# epoch: 3, train loss: 1.1722316109404272
# epoch: 4, train loss: 0.6205556100728561
# epoch: 5, train loss: 0.2922739988687087
# 学習時間: 63.5seconds

# Batch 1024, GPU4
# epoch: 1, train loss: 3.6306635311671664
# epoch: 2, train loss: 2.0852587101410847
# epoch: 3, train loss: 1.325660406326761
# epoch: 4, train loss: 0.7813601311372251
# epoch: 5, train loss: 0.4197270438379171

# 学習時間: 92.7seconds

In [None]:
# Batch 2056, GPU1
# epoch: 1, train loss: 3.942271032333374
# epoch: 2, train loss: 2.461391625404358
# epoch: 3, train loss: 1.6243526172637939
# epoch: 4, train loss: 1.0069246244430543
# epoch: 5, train loss: 0.5506624889373779
# 学習時間: 86.5seconds

# Batch 2056, GPU2
# epoch: 1, train loss: 3.965126323699951
# epoch: 2, train loss: 2.5156394481658935
# epoch: 3, train loss: 1.6912783765792847
# epoch: 4, train loss: 1.0810856628417969
# epoch: 5, train loss: 0.6142773771286011
# 学習時間: 49.1seconds

# Batch 2056, GPU4
# epoch: 1, train loss: 3.965126323699951
# epoch: 2, train loss: 2.5156394481658935
# epoch: 3, train loss: 1.6912783765792847
# epoch: 4, train loss: 1.0810856628417969
# epoch: 5, train loss: 0.6142773771286011
# 学習時間: 45.7seconds
