In [1]:
import torch
import torch.nn as nn
import numpy as np
import torchvision.datasets as dataset
import torchvision.transforms as transform
from torch.utils.data import DataLoader
import time

In [2]:
# Training dataset 다운로드
cifar10_train = dataset.CIFAR10(root = "./", # 데이터셋을 저장할 위치
                            train = True,
                            transform = transform.ToTensor(),
                            download = True)
# Testing dataset 다운로드
cifar10_test = dataset.CIFAR10(root = "./",
                            train = False,
                            transform = transform.ToTensor(),
                            download = True)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:37<00:00, 4579125.40it/s]


Extracting ./cifar-10-python.tar.gz to ./
Files already downloaded and verified


In [3]:
def infer_dataset_size(data):
    if isinstance(data, (list, tuple, dict, torch.utils.data.Dataset)):
        return len(data)
    elif isinstance(data, torch.Tensor) or isinstance(data, np.ndarray):
        return data.shape[0]
    elif hasattr(data, '__len__'):
        return len(data)
    elif hasattr(data, 'shape'):
        return data.shape[0]
    else:
        raise TypeError("지원하지 않는 데이터셋 타입입니다.")


In [4]:
train_len = infer_dataset_size(cifar10_train)
print("Training 데이터 크기:", train_len)

Training 데이터 크기: 50000


In [5]:
class ResNet (nn.Module):
  def __init__ (self):
    super(ResNet, self).__init__()

    self.conv1_1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)    # Convolution: [3x3x3] x 16, s1, p1
    self.conv1_2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)   # Convolution: [3x3x16] x 32, s1, p1

    self.conv2_1 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding=1)   # Convolution: [3x3x32] x 32, s1, p1
    self.conv2_2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)   # Convolution: [3x3x64] x 64, s1, p1

    self.conv3_1 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)  # Convolution: [3x3x64] x 128, s1, p1
    self.conv3_2 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1) # Convolution: [3x3x128] x 256, s1, p1

    self.fc1 = nn.Linear(4096, 512)   # Fully connected layer: 4096 x 512
    self.fc2 = nn.Linear(512, 256)    # Fully connected layer: 512 x 256
    self.fc3 = nn.Linear(256, 10)     # Fully connected layer: 256 x 10

    self.conv1_skip = nn.Conv2d(in_channels=3,out_channels=32,kernel_size=3,padding=1)
    self.conv2_skip = nn.Conv2d(in_channels=32,out_channels=64,kernel_size=3,padding=1)
    self.conv3_skip = nn.Conv2d(in_channels=64,out_channels=256,kernel_size=3,padding=1)

    # 파라미터를 가지지 않은 layer는 한 번만 선언해도 문제 없음
    self.relu = nn.ReLU()
    self.avgPool2d = nn.AvgPool2d(kernel_size=2, stride=2)


  def forward(self, x):

    input_feature1 = x

    # convolution layers
    out = self.relu(self.conv1_1(x))
    out = self.relu(self.conv1_2(out))
    input_skip1 = self.relu(self.conv1_skip(input_feature1))
    out = torch.add(out, input_skip1)
    out = self.avgPool2d(out)
  
    input_feature2 = out

    out = self.relu(self.conv2_1(out))
    out = self.relu(self.conv2_2(out))
    input_skip2 = self.relu(self.conv2_skip(input_feature2))
    out = torch.add(out,input_skip2)
    out = self.avgPool2d(out)

    

    input_feature3 = out

    out = self.relu(self.conv3_1(out))
    out = self.relu(self.conv3_2(out))
    input_skip3 = self.relu(self.conv3_skip(input_feature3))
    out = torch.add(out,input_skip3)
    out = self.avgPool2d(out)

    # 평탄화
    out = out.reshape(-1, 4096)

    # fully connected layers
    out = self.relu(self.fc1(out))
    out = self.relu(self.fc2(out))
    out = self.fc3(out)

    return out

In [6]:
# Hyper-parameters 지정
batch_size = 100
learning_rate = 0.1
training_epochs = 20
loss_function = nn.CrossEntropyLoss()
network = ResNet()
optimizer = torch.optim.SGD(network.parameters(), lr = learning_rate)
data_loader = DataLoader(dataset=cifar10_train,
                         batch_size=batch_size,
                         shuffle=True,
                         drop_last=True)

# 학습을 위한 반복문 진행
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda:0'

network = network.to(device)
# 학습 루프
for epoch in range(training_epochs):
    start_time = time.time()  # ⏱️ 에포크 시작 시간 기록

    avg_cost = 0
    total_batch = len(data_loader)

    for img, label in data_loader:
        img = img.to(device)
        label = label.to(device)

        pred = network(img)
        loss = loss_function(pred, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        avg_cost += loss / total_batch

    end_time = time.time()  # ⏱️ 에포크 종료 시간
    epoch_time = end_time - start_time

    print(f'Epoch: {epoch+1:2d} | Loss = {avg_cost:.4f} | Time = {epoch_time:.2f} sec')

print('Learning finished!')

# 정확도 평가
network = network.to('cpu')
with torch.no_grad():
    img_test = torch.tensor(np.transpose(cifar10_test.data, (0, 3, 1, 2))) / 255.
    label_test = torch.tensor(cifar10_test.targets)

    prediction = network(img_test)
    correct_prediction = torch.argmax(prediction, 1) == label_test
    accuracy = correct_prediction.float().mean()
    print('Accuracy:', accuracy.item())

Epoch:  1 | Loss = 2.1677 | Time = 5.81 sec
Epoch:  2 | Loss = 1.8378 | Time = 5.10 sec
Epoch:  3 | Loss = 1.5857 | Time = 4.96 sec
Epoch:  4 | Loss = 1.4419 | Time = 5.02 sec
Epoch:  5 | Loss = 1.3238 | Time = 4.50 sec
Epoch:  6 | Loss = 1.2104 | Time = 4.51 sec
Epoch:  7 | Loss = 1.1208 | Time = 4.34 sec
Epoch:  8 | Loss = 1.0192 | Time = 4.73 sec
Epoch:  9 | Loss = 0.9276 | Time = 5.16 sec
Epoch: 10 | Loss = 0.8355 | Time = 5.50 sec
Epoch: 11 | Loss = 0.7336 | Time = 5.19 sec
Epoch: 12 | Loss = 0.6475 | Time = 5.11 sec
Epoch: 13 | Loss = 0.5427 | Time = 5.51 sec
Epoch: 14 | Loss = 0.4473 | Time = 5.22 sec
Epoch: 15 | Loss = 0.3515 | Time = 5.08 sec
Epoch: 16 | Loss = 0.2748 | Time = 4.95 sec
Epoch: 17 | Loss = 0.2058 | Time = 4.83 sec
Epoch: 18 | Loss = 0.1675 | Time = 5.06 sec
Epoch: 19 | Loss = 0.1301 | Time = 4.78 sec
Epoch: 20 | Loss = 0.1113 | Time = 4.59 sec
Learning finished!
Accuracy: 0.6843000054359436


In [58]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.profiler
import time

# model = ResNet().cuda()
model = ResNet().cuda()
model.train()  # 학습 모드

# 더미 입력과 레이블 준비
dummy_input = torch.randn(100, 3, 32, 32).cuda()  # 배치 사이즈 100
dummy_target = torch.randint(0, 10, (100,)).cuda()

# 손실 함수와 옵티마이저
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# 프로파일링 시작
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA],
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
    with_flops=True,   # (선택) FLOPS도 계산
) as prof:
    # 학습 루프 한 번
    start_time = time.time()
    optimizer.zero_grad()
    output = model(dummy_input)
    loss = criterion(output, dummy_target)
    loss.backward()
    optimizer.step()
    end_time = time.time()

# 결과 출력
print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20))
t = (end_time - start_time) * 500
print(f'Time = {t:.2f} sec')




-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  Total MFLOPs  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution         6.52%     512.400us         6.52%     512.400us      56.933us     163.319ms        93.35%     163.319ms      18.147ms           0 

In [None]:
import pandas as pd

# 프로파일링 결과 문자열
profile_table_str = prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=100)

# 줄 단위로 분리
lines = profile_table_str.split('\n')

# 유효한 데이터 줄만 필터링
data_lines = []
for line in lines:
    line = line.strip()
    if not line or line.startswith('-') or 'Self CPU time total' in line or 'Self CUDA time total' in line:
        continue
    data_lines.append(line)

# 첫 번째 줄은 헤더, 나머지는 데이터
header_line = data_lines[0]
data_lines = data_lines[1:]

# 헤더 컬럼명 파싱 (공백 기준 split) + strip 추가
columns = [col.strip() for col in header_line.split('  ') if col.strip()]

# 데이터 파싱
parsed_data = []
for line in data_lines:
    fields = [field.strip() for field in line.split('  ') if field.strip()]
    if len(fields) == len(columns):
        parsed_data.append(fields)

# DataFrame 생성
df = pd.DataFrame(parsed_data, columns=columns)

# 🔥 시간 컬럼 리스트
time_columns = ['Self CPU', 'CPU total', 'CPU time avg', 'Self CUDA', 'CUDA total', 'CUDA time avg']

# 시간 컬럼 처리 함수
def convert_to_ms(x):
    if isinstance(x, str):
        x = x.strip()
        if x.endswith('us'):
            return float(x.replace('us', '')) / 1000  # us(마이크로초) -> ms(밀리초)
        elif x.endswith('ms'):
            return float(x.replace('ms', ''))          # ms는 그대로
        elif x.endswith('s'):
            return float(x.replace('s', '')) * 1000     # s(초) -> ms로 변환
        else:
            return float(x)
    else:
        return x  # 이미 숫자형이면 그대로


# 모든 시간 컬럼에 적용
for col in time_columns:
    if col in df.columns:
        df[col] = df[col].apply(convert_to_us)

# 결과 확인
df


Unnamed: 0,Name,Self CPU %,Self CPU,CPU total %,CPU total,CPU time avg,Self CUDA,Self CUDA %,CUDA total,CUDA time avg,CPU Mem,Self CPU Mem,CUDA Mem,Self CUDA Mem,# of Calls,Total MFLOPs
0,aten::cudnn_convolution,6.52%,512.4,6.52%,512.4,56.933,163319.0,93.35%,163319.0,18147.000,0 b,0 b,62.50 Mb,62.50 Mb,9,--
1,aten::convolution_backward,13.68%,1076.0,17.56%,1381.0,153.411,3256.0,1.86%,3604.0,400.444,0 b,0 b,24.70 Mb,22.41 Mb,9,--
2,aten::clamp_min,2.47%,193.9,2.47%,193.9,17.627,1418.0,0.81%,1418.0,128.909,0 b,0 b,62.79 Mb,62.79 Mb,11,--
3,autograd::engine::evaluate_function: torch::au...,3.85%,302.9,11.20%,880.7,36.696,1037.0,0.59%,1737.0,72.375,0 b,0 b,0 b,0 b,24,--
4,aten::add,0.68%,53.2,0.68%,53.2,17.733,880.0,0.50%,880.0,293.333,0 b,0 b,25.00 Mb,25.00 Mb,3,6.554
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,aten::empty_like,0.21%,16.8,0.29%,23.0,23.000,3.0,0.00%,5.0,5.000,0 b,0 b,512 b,0 b,1,--
60,aten::zero_,0.19%,15.3,0.41%,32.0,32.000,3.0,0.00%,22.0,22.000,0 b,0 b,0 b,0 b,1,--
61,autograd::engine::evaluate_function: LogSoftma...,0.19%,15.0,0.56%,43.7,43.700,3.0,0.00%,30.0,30.000,0 b,0 b,-4.00 Kb,-8.00 Kb,1,--
62,aten::empty_strided,0.08%,6.2,0.08%,6.2,6.200,2.0,0.00%,2.0,2.000,0 b,0 b,512 b,512 b,1,--


In [52]:
df_reduced = df[['Name','Self CUDA', 'CUDA total']].copy()
df_reduced

Unnamed: 0,Name,Self CUDA,CUDA total
0,aten::cudnn_convolution,259293.0,259293.0
1,aten::add_,5404.0,5404.0
2,aten::convolution_backward,3248.0,3595.0
3,aten::clamp_min,1145.0,1145.0
4,aten::threshold_backward,1121.0,1121.0
...,...,...,...
60,aten::empty_strided,2.0,2.0
61,NllLossBackward0,2.0,75.0
62,aten::resize_,1.0,1.0
63,[memory],0.0,0.0


In [53]:

top20_df = df_reduced.sort_values(by='Self CUDA', ascending=False).head(20)

# 결과 출력
top20_df

Unnamed: 0,Name,Self CUDA,CUDA total
0,aten::cudnn_convolution,259293.0,259293.0
1,aten::add_,5404.0,5404.0
2,aten::convolution_backward,3248.0,3595.0
3,aten::clamp_min,1145.0,1145.0
4,aten::threshold_backward,1121.0,1121.0
5,autograd::engine::evaluate_function: torch::au...,840.0,1326.0
6,aten::reshape,825.0,839.0
7,aten::avg_pool2d_backward,373.0,373.0
8,aten::sum,333.0,346.0
9,aten::_foreach_add_,272.0,334.0
