## Dataset

In [41]:
import torch
from torch.utils.data import Dataset

In [42]:
class CustomDataset_jy(Dataset):
  def __init__(self, data, labels, transform=None):
    self.data = torch.tensor(data).float()
    # 작아서 이렇게 한번에 넣고 처리함
    # 만약, 일반적인 경우에서도 이렇게 하면 부하가 많이 가해져서 위험할  수도 있음
    self.labels = torch.tensor(labels).float()
    self.transform = None # transform 설정

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    sample = self.data[idx]
    label = self.labels[idx]
    if self.transform: # instance별로 transform 적용
      sample = self.transform(sample)
    return sample, label

x = [0.5,  14.0, 15.0, 28.0, 11.0,  8.0,  3.0, -4.0,  6.0, 13.0, 21.0]
y = [35.7, 55.9, 58.2, 81.9, 56.3, 48.9, 33.9, 21.8, 48.4, 60.4, 68.4]

dataset = CustomDataset_jy(x, y)

In [43]:
i = iter(dataset)
print(next(i))
# print(i.__next__())

(tensor(0.5000), tensor(35.7000))


## DataLoader

In [44]:
from torch.utils.data import DataLoader

In [None]:
data_loader = DataLoader(
    dataset,     # torch.utils.data.Dataset의 instance
    batch_size,  # batch의 샘플수
    shffule,     # boolean, 셔플링을 할지 여부(순서를 랜덤하게)
    num_workers, # 데이터로딩에 사용되는 sub-process의 수 (CPU의 core수를 넘으면 안됨.)
    pin_memory,  # boolean, GPU memory 영역을 예약할지 여부(pin).
    drop_last,   # boolean, 마지막 batch가 샘플의 수가 맞지 않을 경우 dorp할지 여부.
    collate_fn,  # callable, 샘플 리스트를 배치로 변환하는 함수
                 # None: 기본 collate_fn 사용 (텐서 자동 스택, 딕셔너리 처리 등)
                 # 커스텀 함수: 가변 길이 패딩, 배치 단위 전처리 등에 사용
    )

위에는 그냥 두기

In [45]:
data_loader = DataLoader(
    dataset,
    batch_size = 4,
    shuffle = True,
)

for batch_idx, (data, labels) in enumerate(data_loader):
    print(f'{batch_idx=}')
    print(f'{data.shape} | {data=}')
    print(f'{labels.shape} | {labels=}')

    # training ...

batch_idx=0
torch.Size([4]) | data=tensor([15.0000, 28.0000,  6.0000,  0.5000])
torch.Size([4]) | labels=tensor([58.2000, 81.9000, 48.4000, 35.7000])
batch_idx=1
torch.Size([4]) | data=tensor([13., 11., -4.,  3.])
torch.Size([4]) | labels=tensor([60.4000, 56.3000, 21.8000, 33.9000])
batch_idx=2
torch.Size([3]) | data=tensor([ 8., 21., 14.])
torch.Size([3]) | labels=tensor([48.9000, 68.4000, 55.9000])


## 이 아래는 dataset을 list로 해서 load로 .....

In [46]:
class ListDataset_jy(Dataset):
    def __init__(self, transform=None):
        self.data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
        self.labels = [1, 2, 3]
        self.transform = transform  # transform을 초기화할 때 인자로 받기

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        label = self.labels[idx]
        if self.transform:  # transform이 있을 때만 적용
            sample = self.transform(sample)
        return sample, label

In [47]:
dataset = ListDataset_jy()
print(dataset[0])

([1, 2, 3], 1)


In [48]:
for idx, i in enumerate(dataset):
  print(f"{idx = :02} | {i = }")

idx = 00 | i = ([1, 2, 3], 1)
idx = 01 | i = ([4, 5, 6], 2)
idx = 02 | i = ([7, 8, 9], 3)


In [49]:
dl = DataLoader(dataset, batch_size=2, )

In [50]:
for idx, i in enumerate(dl):
  print(i)

[[tensor([1, 4]), tensor([2, 5]), tensor([3, 6])], tensor([1, 2])]
[[tensor([7]), tensor([8]), tensor([9])], tensor([3])]


In [51]:
dl = DataLoader(dataset, batch_size=2, shuffle=False)

In [52]:
for idx, i in enumerate(dl):
  print(f"{idx = :02} | {i = }")

idx = 00 | i = [[tensor([1, 4]), tensor([2, 5]), tensor([3, 6])], tensor([1, 2])]
idx = 01 | i = [[tensor([7]), tensor([8]), tensor([9])], tensor([3])]


## Preprocessing: Dataset ( raw data )
boston hoousing data 사용



In [56]:
%%writefile data.csv
ID,Name,Age,Income,SignUpDate
1,John Doe,28,60000,2021-01-01
2,Jane Smith,34,70000,2021-01-15
3,Alice Johnson,45,80000,2021-02-01

Overwriting data.csv


In [54]:
import pandas as pd

In [63]:
# df = pd.read_csv(' ./data.csv',) #위에 wirtefile 저걸로 할 경우엔 이 코드로, 아니면 아래처럼 링크로
# df = pd.read_csv(' ./data.csv', header=None)
# df = pd.read_csv(' ./data.csv', index_col= )

In [69]:
# CSV 파일 읽기 및 파라미터로 데이터 처리
df = pd.read_csv(
    'data.csv',
    index_col='ID',                   # 'ID' 열을 인덱스로 사용
    parse_dates=['SignUpDate'],       # 'SignUpDate' 열을 날짜 형식으로 파싱
    dtype={'Age': 'int32', 'Income': 'float32'},  # 데이터 타입 지정
    na_values={'Income': ['NA', '?']} # 'Income' 열에서 NA 또는 ?을 결측치로 인식
)

# 데이터 확인
print(df)

# 나이와 연봉 열만 선택
age_income_df = df[['Age', 'Income']]

# 데이터 출력
print(age_income_df)

             Name  Age   Income SignUpDate
ID                                        
1        John Doe   28  60000.0 2021-01-01
2      Jane Smith   34  70000.0 2021-01-15
3   Alice Johnson   45  80000.0 2021-02-01
    Age   Income
ID              
1    28  60000.0
2    34  70000.0
3    45  80000.0


In [76]:
# 다음 URL은 이를 처리한 데이터임.
data_url = 'https://blog.kakaocdn.net/dn/bBaIM3/btsGBsBNUDl/irTKKK1MF1Y1o6JHpGn3n1/boston.csv?attach=1&knm=tfile.csv'

# read_csv
df = pd.read_csv(
    data_url,      # url or file path
    skiprows = 0,  # num of rows to skip
    header = 0,    # header row, None인 경우 header가 없음
    sep = ','      # separator
)

In [77]:
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RN,AGE,DIS,RAD,TAX,PIRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [78]:
tmp_raw = df.values
tmp_raw.shape

(506, 14)

In [81]:
type(tmp_raw)

numpy.ndarray

In [83]:
x_raw = tmp_raw[:,:13]
y_raw = tmp_raw[:,13]

print(x_raw.shape, x_raw.ndim)
print(y_raw.shape, y_raw.ndim)

(506, 13) 2
(506,) 1


In [80]:
y_raw = y_raw.reshape(-1,1)
print(y_raw.shape)

(506, 1)


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, 1 to 3
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Name        3 non-null      object        
 1   Age         3 non-null      int32         
 2   Income      3 non-null      float32       
 3   SignUpDate  3 non-null      datetime64[ns]
dtypes: datetime64[ns](1), float32(1), int32(1), object(1)
memory usage: 96.0+ bytes


In [74]:
df.columns

Index(['Name', 'Age', 'Income', 'SignUpDate'], dtype='object')

## from original data : 생략가능
Boston data는 약간의 처리가 필요함.

이는 아래와 같은 특성이 있기 때문임.
* 하나의 데이터가 2라인으로 분포하고 있음. 이를 하나의 라인으로 처리.
* 짝수행의 데이터에서는 3개 열 외는 NaN이므로 제거해야함.
* 짝수행의 3번째 열이 바로 집값에 해당함.

이를 반영하여 506개의 sample을 가지며, 하나의 sample은 13개의 feature를 가지는 input data인 `x_raw`와 이에 대응하는 label값을 가지는 `y_raw`를 생성해야함.

In [75]:
data_url = 'http://lib.stat.cmu.edu/datasets/boston'

df = pd.read_csv(
    data_url,
    skiprows = 22,
    header = None,
    sep = '\s+'
)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.00632,18.00,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3
1,396.90000,4.98,24.00,,,,,,,,
2,0.02731,0.00,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8
3,396.90000,9.14,21.60,,,,,,,,
4,0.02729,0.00,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8
...,...,...,...,...,...,...,...,...,...,...,...
1007,396.90000,5.64,23.90,,,,,,,,
1008,0.10959,0.00,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0
1009,393.45000,6.48,22.00,,,,,,,,
1010,0.04741,0.00,11.93,0.0,0.573,6.030,80.8,2.5050,1.0,273.0,21.0


In [84]:
# df.head(3)
df.tail(3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RN,AGE,DIS,RAD,TAX,PIRATIO,B,LSTAT,MEDV
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,11.9


In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RN       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PIRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [86]:
type(df.values)

numpy.ndarray

In [87]:
import numpy as np

In [88]:
tmp_raw = df.values
x_raw = np.concatenate([tmp_raw[::2,:], tmp_raw[1::2,:2]], axis=1)
y_raw = tmp_raw[1::2,2].reshape(-1,1)

print(x_raw.shape)
print(y_raw.shape)

(253, 16)
(253, 1)
