# utils.data.DataLoader

In [None]:
# Getting toy dataset
!wget https://github.com/DeepStudio-TW/torch-dataloader-tutorial/raw/main/data.csv

In [1]:
import torch
import time
import pandas as pd
from torch.utils import data
import numpy as np

## Declair Dataset Class

In [2]:
class NumDataset(data.Dataset):
    '''Init: 使用變數宣告- data來源、總量、preprocess方法...等等'''
    def __init__(self,fname,number_length):
        super().__init__()
        self.df=pd.read_csv(fname).head(number_length)
        self.number_length=number_length
        
        self.data=self.df.data.values
        self.label=self.df.label.values
    '''一定要宣告長度，自訂義'''
    def __len__(self):
        return self.number_length
    '''定義回傳一筆資料的方式，input會是某個index, 輸出data, 以及label'''
    def __getitem__(self, idx):
        data=self.data[idx]
        label=self.label[idx]
        return data,label

In [3]:
'''之後就可以用這個dataset class 來query'''
dataset=NumDataset("data.csv",6)

## Declair Dataloader Class

In [4]:
'''Data Loader這個class 可以把data load成torch tensor, 預設沒有shuffle,且batch size為1'''
loader=data.DataLoader(dataset)
'''可iterate，每次會依序call 包含dataset的 __getitem__'''
for d,l in loader:
    print(f"{d}{type(d)}|{l} {type(l)}")

tensor([0.3315], dtype=torch.float64)<class 'torch.Tensor'>|tensor([0]) <class 'torch.Tensor'>
tensor([0.2033], dtype=torch.float64)<class 'torch.Tensor'>|tensor([1]) <class 'torch.Tensor'>
tensor([-1.5153], dtype=torch.float64)<class 'torch.Tensor'>|tensor([2]) <class 'torch.Tensor'>
tensor([-0.9327], dtype=torch.float64)<class 'torch.Tensor'>|tensor([3]) <class 'torch.Tensor'>
tensor([-0.7553], dtype=torch.float64)<class 'torch.Tensor'>|tensor([4]) <class 'torch.Tensor'>
tensor([-0.1205], dtype=torch.float64)<class 'torch.Tensor'>|tensor([5]) <class 'torch.Tensor'>


In [5]:
'''直接丟值進去也可以'''
loader=data.DataLoader([(x,y) for x,y in zip(np.random.randn(5),np.arange(0,5))])
'''出來一樣變tensor'''
for d,l in loader:
    print(f"{d}{type(d)}|{l} {type(l)}")

tensor([-1.9256], dtype=torch.float64)<class 'torch.Tensor'>|tensor([0]) <class 'torch.Tensor'>
tensor([-0.6081], dtype=torch.float64)<class 'torch.Tensor'>|tensor([1]) <class 'torch.Tensor'>
tensor([-1.2712], dtype=torch.float64)<class 'torch.Tensor'>|tensor([2]) <class 'torch.Tensor'>
tensor([0.6443], dtype=torch.float64)<class 'torch.Tensor'>|tensor([3]) <class 'torch.Tensor'>
tensor([0.4731], dtype=torch.float64)<class 'torch.Tensor'>|tensor([4]) <class 'torch.Tensor'>


## Batch

In [6]:
'''Data Loader這個class 可以把data load成torch tensor, 預設沒有shuffle,且batch size為1'''
loader=data.DataLoader(dataset,batch_size=2)
'''可設batch參數，一次讀多一點'''
for d,l in loader:
    print(f"{d}{type(d)}|{l} {type(l)}")

tensor([0.3315, 0.2033], dtype=torch.float64)<class 'torch.Tensor'>|tensor([0, 1]) <class 'torch.Tensor'>
tensor([-1.5153, -0.9327], dtype=torch.float64)<class 'torch.Tensor'>|tensor([2, 3]) <class 'torch.Tensor'>
tensor([-0.7553, -0.1205], dtype=torch.float64)<class 'torch.Tensor'>|tensor([4, 5]) <class 'torch.Tensor'>


In [8]:
dataset_=NumDataset("data.csv",5000)

In [9]:
loader=data.DataLoader(dataset_)
onset=time.time()
for d,l in loader:
    d=d.cuda() # 把tensor丟進GPU
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 3.6720798015594482  s


In [10]:
'''若開batch，讀取速度本身會加快一點'''
loader=data.DataLoader(dataset_,batch_size=4)
onset=time.time()
for d,l in loader:
    d=d.cuda()
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 0.11053586006164551  s


In [11]:
'''加大batch size會更快'''
loader=data.DataLoader(dataset_,batch_size=50)
onset=time.time()
for d,l in loader:
    d=d.cuda()
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 0.018474578857421875  s


In [12]:
'''但加大batch size到一定程度，邊際效應遞減'''
loader=data.DataLoader(dataset_,batch_size=100)
onset=time.time()
for d,l in loader:
    d=d.cuda()
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 0.014531612396240234  s


In [13]:
'''Drop last可以把不滿batch_size的丟掉'''
loader=data.DataLoader(dataset,batch_size=2,drop_last=True)
for d,l in loader:
    print(f"{d}{type(d)}|{l} {type(l)}")

tensor([0.3315, 0.2033], dtype=torch.float64)<class 'torch.Tensor'>|tensor([0, 1]) <class 'torch.Tensor'>
tensor([-1.5153, -0.9327], dtype=torch.float64)<class 'torch.Tensor'>|tensor([2, 3]) <class 'torch.Tensor'>
tensor([-0.7553, -0.1205], dtype=torch.float64)<class 'torch.Tensor'>|tensor([4, 5]) <class 'torch.Tensor'>


## Shuffle

In [14]:
'''Shuffle可以將資料打亂'''
loader=data.DataLoader(dataset,shuffle=True)
for d,l in loader:
    print(f"{d}{type(d)}|{l} {type(l)}")

tensor([-0.7553], dtype=torch.float64)<class 'torch.Tensor'>|tensor([4]) <class 'torch.Tensor'>
tensor([-0.9327], dtype=torch.float64)<class 'torch.Tensor'>|tensor([3]) <class 'torch.Tensor'>
tensor([-0.1205], dtype=torch.float64)<class 'torch.Tensor'>|tensor([5]) <class 'torch.Tensor'>
tensor([0.3315], dtype=torch.float64)<class 'torch.Tensor'>|tensor([0]) <class 'torch.Tensor'>
tensor([-1.5153], dtype=torch.float64)<class 'torch.Tensor'>|tensor([2]) <class 'torch.Tensor'>
tensor([0.2033], dtype=torch.float64)<class 'torch.Tensor'>|tensor([1]) <class 'torch.Tensor'>


In [15]:
'''Shuffle不會多花太多時間'''
dataset_=NumDataset("data.csv",5000)
loader=data.DataLoader(dataset_,shuffle=True,batch_size=4)
onset=time.time()
for d,l in loader:
    d=d.cuda() # 把tensor丟進GPU
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 0.12830543518066406  s


## Workers

In [16]:
'''Workers number決定多線程執行的程度，預設是0就是沒有分，1以上會去多開執行緒'''
'''在commanand line上打"ps"可以觀察到多開執行緒這件事，開執行緒本身很慢，所以在執行loading很大時才看得出效果'''
dataset__=NumDataset("data.csv",500000)
loader=data.DataLoader(dataset__,batch_size=5000)
onset=time.time()
for d,l in loader:
    d=d.cuda()
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")


time elapsed: 1.184007167816162  s


In [17]:
loader=data.DataLoader(dataset__,num_workers=1,batch_size=5000)
onset=time.time()
for d,l in loader:
    d=d.cuda()
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 1.383861780166626  s


In [18]:
loader=data.DataLoader(dataset__,num_workers=4,batch_size=5000)
onset=time.time()
for d,l in loader:
    d=d.cuda()
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 0.6054277420043945  s


**Batch size調小**

In [19]:
'''在資料loading很快時(batch size很小，或是資料大小很小時)，開執行緒只是阻礙'''
loader=data.DataLoader(dataset__,batch_size=10)
onset=time.time()
for d,l in loader:
    d=d.cuda()
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 4.99813437461853  s


In [20]:
loader=data.DataLoader(dataset__,num_workers=1,batch_size=10)
onset=time.time()
for d,l in loader:
    d=d.cuda()
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 49.45866084098816  s


In [21]:
loader=data.DataLoader(dataset__,num_workers=4,batch_size=10)
onset=time.time()
for d,l in loader:
    d=d.cuda()
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 41.144562005996704  s


**Persistancce workers**

In [23]:
'''剛剛不是說開workers也是要時間嗎?'''
'''使用persistant worker可以再第二次使用時加快速度'''
loader=data.DataLoader(dataset__,num_workers=4,batch_size=5000,persistent_workers=True)
onset=time.time()
for d,l in loader:
    d=d.cuda()
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 0.5976366996765137  s


In [24]:
onset=time.time()
for d,l in loader:
    d=d.cuda()
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 0.38565492630004883  s


In [25]:
'''第二次使用不見得要全部跑完才算，可以load一組就好'''
loader=data.DataLoader(dataset__,num_workers=4,batch_size=5000,persistent_workers=True)
for d,l in loader:
    break
onset=time.time()
for d,l in loader:
    d=d.cuda()
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 0.4409351348876953  s


## Prefetch

In [34]:
'''預讀取資料，在一筆做完前就開始讀下一筆，設定連續讀數筆batch之後再慢慢一筆筆assign給output，預設預讀2筆'''
'''num_workers一定要大於1'''

'''在資料少的情況下prefetch也可能成為阻礙速度的原因'''
loader=data.DataLoader(dataset__,prefetch_factor=1,num_workers=4,batch_size=5000)
onset=time.time()
for d,l in loader:
    d=d.cuda()
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 0.6133651733398438  s


In [35]:
'''但要是 data很多的話這些prefetch就會有點用，資料不多的時候也可以把workers，prefetch關掉'''
loader=data.DataLoader(dataset__+dataset__+dataset__+dataset__+dataset__+dataset__,prefetch_factor=2,num_workers=4,batch_size=5000,persistent_workers=True)
for d,l in loader:
    break
onset=time.time()
for d,l in loader:
    d=d.cuda()
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 4.1368043422698975  s


In [36]:
loader=data.DataLoader(dataset__+dataset__+dataset__+dataset__+dataset__+dataset__,prefetch_factor=4,num_workers=4,batch_size=5000,persistent_workers=True)
for d,l in loader:
    break
onset=time.time()
for d,l in loader:
    d=d.cuda()
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 3.779249429702759  s


## Pin memory

In [37]:
'''先把要丟到GPU的Tensor存到暫存空間，從CPU丟到GPU會變快'''
loader=data.DataLoader(dataset__+dataset__+dataset__+dataset__+dataset__+dataset__,prefetch_factor=4,num_workers=4,batch_size=5000,pin_memory=True)
onset=time.time()
for d,l in loader:
    d=d.cuda()
    l=l.cuda()
    pass
print("time elapsed:",time.time()-onset," s")

time elapsed: 4.087360143661499  s
