In [1]:
import os
import torch
from torchvision import datasets, transforms

torch.manual_seed(27)

<torch._C.Generator at 0x1b4191baaf0>

## DataLoader

> PyTorch에서 DataLoader 만들기

- `torch.utils.data.DataLoader` 사용

- `torch.utils.data.DataLoader`는 기본적으로 아래 두 가지 인수를 받는다.

    1. `torch.utils.data.dataset.Dataset` : data generator

    2. batch_size

- `torch.utils.data.DataLoader`의 기능은 generator에서 batch_size 만큼의 데이터 샘플을 얻는 것이다.

In [2]:
batch_size = 32

train_loader = torch.utils.data.DataLoader(
    datasets.MNIST(
        root='dataset/', train=True, download=True,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize(mean=(0.5,), std=(0.5,))]
        )
    ),
    batch_size=batch_size,
    shuffle=True
)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to dataset/MNIST\raw\train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:10<00:00, 904882.61it/s] 


Extracting dataset/MNIST\raw\train-images-idx3-ubyte.gz to dataset/MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to dataset/MNIST\raw\train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 149310.60it/s]


Extracting dataset/MNIST\raw\train-labels-idx1-ubyte.gz to dataset/MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to dataset/MNIST\raw\t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:01<00:00, 951744.37it/s] 


Extracting dataset/MNIST\raw\t10k-images-idx3-ubyte.gz to dataset/MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to dataset/MNIST\raw\t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 2277409.30it/s]

Extracting dataset/MNIST\raw\t10k-labels-idx1-ubyte.gz to dataset/MNIST\raw






In [3]:
x, y = next(iter(train_loader))
x.shape, y.shape

(torch.Size([32, 1, 28, 28]), torch.Size([32]))

In [4]:
generator = datasets.MNIST(
    root='dataset/', train=True, download=True,
    transform=transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize(mean=(0.5,), std=(0.5,))]
    )
)
generator

Dataset MNIST
    Number of datapoints: 60000
    Root location: dataset/
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.5,), std=(0.5,))
           )

In [5]:
# generator는 데이터 샘플을 하나씩 가져온다.

x, y = next(iter(generator))
x.shape, y

(torch.Size([1, 28, 28]), 5)

## Local Data

> 로컬에 있는 데이터를 torch.utils.data.dataset.Dataset 으로 만들기


### - ImageFolder

- ImageFolder : 간단하게 로컬에 있는 이미지 데이터셋을 불러올 수 있다.

    디렉토리 구조가 다음과 같아야 한다.

    - dataset
        - class0</br>
            － xx.png</br>
            － yy.png</br>
            － ...</br>
        - class1</br>
            － xx.png</br>
            － yy.png</br>
            － ...</br>
        - class2</br>
            － xx.png</br>
            － yy.png</br>
            － ...</br>

In [9]:
train_dir = "D:/zbDS/Project/Part8_DL/_dataset/mnist_png/training/"
test_dir = "D:/zbDS/Project/Part8_DL/_dataset/mnist_png/testing/"

In [10]:
os.listdir(train_dir)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [11]:
os.listdir(train_dir + "9")[:10]

['10003.png',
 '10004.png',
 '10023.png',
 '10028.png',
 '10038.png',
 '10043.png',
 '10047.png',
 '1005.png',
 '10055.png',
 '10059.png']

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  return torch._C._cuda_getDeviceCount() > 0


In [13]:
train_dataset = datasets.ImageFolder(
	root=train_dir,
    transform=transforms.Compose(
		[transforms.ToTensor(), transforms.Normalize(0.5, 0.5)]
	)
)
test_dataset = datasets.ImageFolder(
	root=test_dir,
    transform=transforms.Compose(
		[transforms.ToTensor(), transforms.Normalize(0.5, 0.5)]
	)
)

In [14]:
x, y = next(iter(train_dataset))
x.shape, y

(torch.Size([3, 28, 28]), 0)

In [15]:
train_loader = torch.utils.data.DataLoader(
	train_dataset, batch_size=32, shuffle=True
)
test_loader = torch.utils.data.DataLoader(
	test_dataset, batch_size=32
)

In [16]:
x, y = next(iter(train_loader))
x.shape, y.shape

(torch.Size([32, 3, 28, 28]), torch.Size([32]))

## Custom dataset

### - Dataset sub-class

> 직접 `torch.utils.data.Dataset`을 상속받아서 데이터셋 구현하기

In [17]:
import os
from glob import glob

import torch
from torchvision import datasets, transforms

from PIL import Image

In [18]:
cifar_dir = 'D:/zbDS/Project/Part8_DL/_dataset/cifar/'

os.listdir(cifar_dir)

['labels.txt', 'test', 'test_dataset.csv', 'train', 'train_dataset.csv']

In [19]:
train_dir = cifar_dir + 'train'
test_dir = cifar_dir + 'test'

os.listdir(train_dir)[:10]

['0_frog.png',
 '10000_automobile.png',
 '10001_frog.png',
 '10002_frog.png',
 '10003_ship.png',
 '10004_ship.png',
 '10005_cat.png',
 '10006_deer.png',
 '10007_frog.png',
 '10008_airplane.png']

- path parsing이 필요

In [20]:
os.path.join(cifar_dir, 'labels.text')

'D:/zbDS/Project/Part8_DL/_dataset/cifar/labels.text'

In [22]:
with open(os.path.join(cifar_dir, 'labels.txt'), 'r') as f:
    label_list = f.read()
    
label_list

'airplane\nautomobile\nbird\ncat\ndeer\ndog\nfrog\nhorse\nship\ntruck\n'

In [24]:
with open(os.path.join(cifar_dir, 'labels.txt'), 'r') as f:
    label_list = f.read().strip().split("\n")

print(label_list)

['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']


In [25]:
label_list.index("deer")

4

In [26]:
train_paths = glob(train_dir + "/*.png")
test_paths = glob(test_dir + "/*.png")

In [27]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data_paths, transform=None):
        super(Dataset).__init__()
        self.data_paths = data_paths
        self.transform = transform
    
    def __len__(self, ):
        return len(self.data_paths)
    
    
    # index를 입력하면 해당하는 데이터를 파싱해서 이미지와 레이블로 만들어 주는 메서드
    def __getitem__(self, idx):
        path = self.data_paths[idx]
        image = Image.open(path)
        label_name = path.split(".png")[0].split("_")[-1].strip()
        label = label_list.index(label_name)
        
        if self.transform:
            image = self.transform(image)
            
        return image, label

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [30]:
batch_size = 32

train_loader = torch.utils.data.DataLoader(
    Dataset(train_paths, transform=transforms.ToTensor()),
    batch_size=batch_size,
    shuffle=True
)
test_loader = torch.utils.data.DataLoader(
    Dataset(test_paths, transform=transforms.ToTensor()),
    batch_size=batch_size
)

In [31]:
x, y = next(iter(train_loader))
x.shape, y.shape

(torch.Size([32, 3, 32, 32]), torch.Size([32]))