In [10]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset

In [7]:
filename = '../data/iris.csv'

In [8]:
irisDF = pd.read_csv(filename, encoding='utf8')
irisDF.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [12]:
featureNP = irisDF[irisDF.columns[:-1]].values
targetNP = irisDF[irisDF.columns[-1]].values

In [94]:
from sklearn.preprocessing import LabelEncoder

encoded_targetNP = LabelEncoder().fit_transform(targetNP)
# encoded_targetNP = encoded_targetNP.reshape(-1, 1)
print(encoded_targetNP.shape)

(150,)


In [95]:
encoded_targetNP[:5]

array([0, 0, 0, 0, 0])

In [104]:
class CustomDataset(Dataset):
    # 초기화 함수
    def __init__(self, x_data, y_data):
        super().__init__()
        
        # x, y 데이터가 DF라면 ndarray로 변환해서 저장
        self.x_data = x_data.values if isinstance(x_data, pd.DataFrame) else x_data
        self.y_data = y_data.values if isinstance(y_data, pd.DataFrame) else y_data
        
        self.classes = np.unique(self.y_data).tolist()
        self.nclasses = len(self.classes)
        
        # ndarray -> FloatTensor로 변환
        self.x_data = torch.FloatTensor(self.x_data)
        self.y_data = torch.LongTensor(self.y_data)
        
    # 총 데이터의 개수 리턴
    def __len__(self):
        return self.x_data.shape[0]
    
    # 인덱스에 해당하는 입출력 데이터를 파이토치의 Tensor 형태로 리턴
    def __getitem__(self, idx):
        x = self.x_data[idx]
        y = self.y_data[idx]
        return x, y

In [105]:
irisDS = CustomDataset(featureNP, encoded_targetNP)
irisDS[75]

(tensor([6.6000, 3.0000, 4.4000, 1.4000]), tensor(1))

In [106]:
len(irisDS)

150

In [107]:
irisDS.nclasses, irisDS.classes

(3, [0, 1, 2])

DataLoader
- batch_size, shuffle, drop_last, sampler

In [108]:
### 로딩된 데이터 확인 함수
def print_batch_data(loader, epochs, batch_size=1, shuffle=False, drop_last=False, sampler=None) -> None:
    print(f'[설정값] batch_size : {batch_size}, shuffle : {shuffle}, drop_last : {drop_last}, sampler : {sampler}')
    # print(f'[loader] : {loader}')
    
    for ep in range(epochs):
        print(f'[{ep} EPOCHS]', '='*5, f'batch : {len(loader)}개')
        for (feature, label) in loader:
            # print(f'label 타입 : {type(label)}')
            print(feature.shape, label.shape, label)

In [109]:
### [기본사용] batch_size=1, shuffle=False, drop_last=False, sampler=None
loader = DataLoader(irisDS, batch_size=22, shuffle=True, drop_last=True)

for (feature, label) in loader:
    print(feature.shape, label.shape, label.bincount())     # tensor.bincount() : DF.value_counts() 와 동일 (정수형만)

torch.Size([22, 4]) torch.Size([22]) tensor([9, 8, 5])
torch.Size([22, 4]) torch.Size([22]) tensor([ 6, 12,  4])
torch.Size([22, 4]) torch.Size([22]) tensor([7, 7, 8])
torch.Size([22, 4]) torch.Size([22]) tensor([10,  5,  7])
torch.Size([22, 4]) torch.Size([22]) tensor([ 7,  3, 12])
torch.Size([22, 4]) torch.Size([22]) tensor([5, 8, 9])


In [110]:
print_batch_data(loader=loader, epochs=2)

[설정값] batch_size : 1, shuffle : False, drop_last : False, sampler : None
[0 EPOCHS] ===== batch : 6개
torch.Size([22, 4]) torch.Size([22]) tensor([1, 2, 1, 1, 0, 2, 2, 1, 2, 1, 2, 1, 1, 2, 0, 0, 2, 2, 1, 0, 0, 0])
torch.Size([22, 4]) torch.Size([22]) tensor([1, 1, 0, 2, 2, 1, 0, 0, 0, 0, 0, 2, 0, 2, 1, 1, 0, 2, 0, 0, 2, 2])
torch.Size([22, 4]) torch.Size([22]) tensor([2, 2, 2, 0, 0, 0, 2, 0, 2, 2, 0, 2, 1, 2, 1, 0, 2, 0, 1, 0, 0, 1])
torch.Size([22, 4]) torch.Size([22]) tensor([1, 0, 0, 2, 1, 0, 1, 1, 1, 1, 0, 1, 0, 2, 0, 1, 0, 0, 1, 0, 2, 2])
torch.Size([22, 4]) torch.Size([22]) tensor([1, 2, 2, 0, 1, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 0, 1, 1, 1, 2, 2, 0])
torch.Size([22, 4]) torch.Size([22]) tensor([0, 1, 2, 2, 2, 1, 2, 1, 0, 1, 2, 0, 2, 1, 2, 0, 1, 1, 2, 0, 0, 2])
[1 EPOCHS] ===== batch : 6개
torch.Size([22, 4]) torch.Size([22]) tensor([1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 0, 1, 0, 1, 1, 2, 0])
torch.Size([22, 4]) torch.Size([22]) tensor([1, 0, 2, 0, 0, 0, 0, 2, 2, 2, 2, 1, 2, 1, 

과제 : 복원추출, 가중치 부여, WeightedSampler 사용해서 DataLoader 만들기